Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'slab/for-7.0/sheaves' into slab/for-next

Merge series "slab: replace cpu (partial) slabs with sheaves".

The percpu sheaves caching layer was introduced as opt-in but the goal
was to eventually move all caches to them. This is the next step,
enabling sheaves for all caches (except the two bootstrap ones) and then
removing the per cpu (partial) slabs and lots of associated code.

Besides the lower locking overhead and much more likely fastpath when
freeing, this removes the rather complicated code related to the cpu
slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all its
complications for PREEMPT_RT or kmalloc_nolock().

The lockless slab freelist+counters update operation using
try_cmpxchg128/64 remains and is crucial for freeing remote NUMA objects
and to allow flushing objects from sheaves to slabs mostly without the
node list_lock.

Link: https://lore.kernel.org/all/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/

+997 -1786
-6
include/linux/slab.h
··· 57 57 #endif 58 58 _SLAB_OBJECT_POISON, 59 59 _SLAB_CMPXCHG_DOUBLE, 60 - #ifdef CONFIG_SLAB_OBJ_EXT 61 60 _SLAB_NO_OBJ_EXT, 62 - #endif 63 61 #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 64 62 _SLAB_OBJ_EXT_IN_OBJ, 65 63 #endif ··· 239 241 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ 240 242 241 243 /* Slab created using create_boot_cache */ 242 - #ifdef CONFIG_SLAB_OBJ_EXT 243 244 #define SLAB_NO_OBJ_EXT __SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT) 244 - #else 245 - #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED 246 - #endif 247 245 248 246 #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 249 247 #define SLAB_OBJ_EXT_IN_OBJ __SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
-11
mm/Kconfig
··· 247 247 out which slabs are relevant to a particular load. 248 248 Try running: slabinfo -DA 249 249 250 - config SLUB_CPU_PARTIAL 251 - default y 252 - depends on SMP && !SLUB_TINY 253 - bool "Enable per cpu partial caches" 254 - help 255 - Per cpu partial caches accelerate objects allocation and freeing 256 - that is local to a processor at the price of more indeterminism 257 - in the latency of the free. On overflow these caches will be cleared 258 - which requires the taking of locks that may cause latency spikes. 259 - Typically one would choose no for a realtime system. 260 - 261 250 config RANDOM_KMALLOC_CACHES 262 251 default n 263 252 depends on !SLUB_TINY
+1
mm/internal.h
··· 846 846 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order); 847 847 #define alloc_frozen_pages_nolock(...) \ 848 848 alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__)) 849 + void free_frozen_pages_nolock(struct page *page, unsigned int order); 849 850 850 851 extern void zone_pcp_reset(struct zone *zone); 851 852 extern void zone_pcp_disable(struct zone *zone);
+5
mm/page_alloc.c
··· 2981 2981 __free_frozen_pages(page, order, FPI_NONE); 2982 2982 } 2983 2983 2984 + void free_frozen_pages_nolock(struct page *page, unsigned int order) 2985 + { 2986 + __free_frozen_pages(page, order, FPI_TRYLOCK); 2987 + } 2988 + 2984 2989 /* 2985 2990 * Free a batch of folios 2986 2991 */
+17 -40
mm/slab.h
··· 21 21 # define system_has_freelist_aba() system_has_cmpxchg128() 22 22 # define try_cmpxchg_freelist try_cmpxchg128 23 23 # endif 24 - #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg128 25 24 typedef u128 freelist_full_t; 26 25 #else /* CONFIG_64BIT */ 27 26 # ifdef system_has_cmpxchg64 28 27 # define system_has_freelist_aba() system_has_cmpxchg64() 29 28 # define try_cmpxchg_freelist try_cmpxchg64 30 29 # endif 31 - #define this_cpu_try_cmpxchg_freelist this_cpu_try_cmpxchg64 32 30 typedef u64 freelist_full_t; 33 31 #endif /* CONFIG_64BIT */ 34 32 ··· 77 79 struct kmem_cache *slab_cache; 78 80 union { 79 81 struct { 80 - union { 81 - struct list_head slab_list; 82 - struct { /* For deferred deactivate_slab() */ 83 - struct llist_node llnode; 84 - void *flush_freelist; 85 - }; 86 - #ifdef CONFIG_SLUB_CPU_PARTIAL 87 - struct { 88 - struct slab *next; 89 - int slabs; /* Nr of slabs left */ 90 - }; 91 - #endif 92 - }; 82 + struct list_head slab_list; 93 83 /* Double-word boundary */ 94 84 struct freelist_counters; 95 85 }; ··· 182 196 return PAGE_SIZE << slab_order(slab); 183 197 } 184 198 185 - #ifdef CONFIG_SLUB_CPU_PARTIAL 186 - #define slub_percpu_partial(c) ((c)->partial) 187 - 188 - #define slub_set_percpu_partial(c, p) \ 189 - ({ \ 190 - slub_percpu_partial(c) = (p)->next; \ 191 - }) 192 - 193 - #define slub_percpu_partial_read_once(c) READ_ONCE(slub_percpu_partial(c)) 194 - #else 195 - #define slub_percpu_partial(c) NULL 196 - 197 - #define slub_set_percpu_partial(c, p) 198 - 199 - #define slub_percpu_partial_read_once(c) NULL 200 - #endif // CONFIG_SLUB_CPU_PARTIAL 201 - 202 199 /* 203 200 * Word size structure that can be atomically updated or read and that 204 201 * contains both the order and the number of objects that a slab of the ··· 195 226 * Slab cache management. 196 227 */ 197 228 struct kmem_cache { 198 - struct kmem_cache_cpu __percpu *cpu_slab; 199 - struct lock_class_key lock_key; 200 229 struct slub_percpu_sheaves __percpu *cpu_sheaves; 201 230 /* Used for retrieving partial slabs, etc. */ 202 231 slab_flags_t flags; ··· 203 236 unsigned int object_size; /* Object size without metadata */ 204 237 struct reciprocal_value reciprocal_size; 205 238 unsigned int offset; /* Free pointer offset */ 206 - #ifdef CONFIG_SLUB_CPU_PARTIAL 207 - /* Number of per cpu partial objects to keep around */ 208 - unsigned int cpu_partial; 209 - /* Number of per cpu partial slabs to keep around */ 210 - unsigned int cpu_partial_slabs; 211 - #endif 212 239 unsigned int sheaf_capacity; 213 240 struct kmem_cache_order_objects oo; 214 241 ··· 243 282 unsigned int usersize; /* Usercopy region size */ 244 283 #endif 245 284 285 + #ifdef CONFIG_SLUB_STATS 286 + struct kmem_cache_stats __percpu *cpu_stats; 287 + #endif 288 + 246 289 struct kmem_cache_node *node[MAX_NUMNODES]; 247 290 }; 291 + 292 + /* 293 + * Every cache has !NULL s->cpu_sheaves but they may point to the 294 + * bootstrap_sheaf temporarily during init, or permanently for the boot caches 295 + * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This 296 + * helper distinguishes whether cache has real non-bootstrap sheaves. 297 + */ 298 + static inline bool cache_has_sheaves(struct kmem_cache *s) 299 + { 300 + /* Test CONFIG_SLUB_TINY for code elimination purposes */ 301 + return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity; 302 + } 248 303 249 304 #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY) 250 305 #define SLAB_SUPPORTS_SYSFS 1
+3 -6
mm/slab_common.c
··· 1604 1604 return false; 1605 1605 1606 1606 s = slab->slab_cache; 1607 - if (s->cpu_sheaves) { 1608 - if (likely(!IS_ENABLED(CONFIG_NUMA) || 1609 - slab_nid(slab) == numa_mem_id())) 1610 - return __kfree_rcu_sheaf(s, obj); 1611 - } 1607 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) 1608 + return __kfree_rcu_sheaf(s, obj); 1612 1609 1613 1610 return false; 1614 1611 } ··· 2109 2112 */ 2110 2113 void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) 2111 2114 { 2112 - if (s->cpu_sheaves) { 2115 + if (cache_has_sheaves(s)) { 2113 2116 flush_rcu_sheaves_on_cache(s); 2114 2117 rcu_barrier(); 2115 2118 }
+971 -1723
mm/slub.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 /* 3 - * SLUB: A slab allocator that limits cache line use instead of queuing 4 - * objects in per cpu and per node lists. 3 + * SLUB: A slab allocator with low overhead percpu array caches and mostly 4 + * lockless freeing of objects to slabs in the slowpath. 5 5 * 6 - * The allocator synchronizes using per slab locks or atomic operations 7 - * and only uses a centralized lock to manage a pool of partial slabs. 6 + * The allocator synchronizes using spin_trylock for percpu arrays in the 7 + * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing. 8 + * Uses a centralized lock to manage a pool of partial slabs. 8 9 * 9 10 * (C) 2007 SGI, Christoph Lameter 10 11 * (C) 2011 Linux Foundation, Christoph Lameter 12 + * (C) 2025 SUSE, Vlastimil Babka 11 13 */ 12 14 13 15 #include <linux/mm.h> ··· 55 53 56 54 /* 57 55 * Lock order: 58 - * 1. slab_mutex (Global Mutex) 59 - * 2. node->list_lock (Spinlock) 60 - * 3. kmem_cache->cpu_slab->lock (Local lock) 61 - * 4. slab_lock(slab) (Only on some arches) 62 - * 5. object_map_lock (Only for debugging) 56 + * 0. cpu_hotplug_lock 57 + * 1. slab_mutex (Global Mutex) 58 + * 2a. kmem_cache->cpu_sheaves->lock (Local trylock) 59 + * 2b. node->barn->lock (Spinlock) 60 + * 2c. node->list_lock (Spinlock) 61 + * 3. slab_lock(slab) (Only on some arches) 62 + * 4. object_map_lock (Only for debugging) 63 63 * 64 64 * slab_mutex 65 65 * ··· 82 78 * C. slab->objects -> Number of objects in slab 83 79 * D. slab->frozen -> frozen state 84 80 * 81 + * SL_partial slabs 82 + * 83 + * Slabs on node partial list have at least one free object. A limited number 84 + * of slabs on the list can be fully free (slab->inuse == 0), until we start 85 + * discarding them. These slabs are marked with SL_partial, and the flag is 86 + * cleared while removing them, usually to grab their freelist afterwards. 87 + * This clearing also exempts them from list management. Please see 88 + * __slab_free() for more details. 89 + * 90 + * Full slabs 91 + * 92 + * For caches without debugging enabled, full slabs (slab->inuse == 93 + * slab->objects and slab->freelist == NULL) are not placed on any list. 94 + * The __slab_free() freeing the first object from such a slab will place 95 + * it on the partial list. Caches with debugging enabled place such slab 96 + * on the full list and use different allocation and freeing paths. 97 + * 85 98 * Frozen slabs 86 99 * 87 - * If a slab is frozen then it is exempt from list management. It is 88 - * the cpu slab which is actively allocated from by the processor that 89 - * froze it and it is not on any list. The processor that froze the 90 - * slab is the one who can perform list operations on the slab. Other 91 - * processors may put objects onto the freelist but the processor that 92 - * froze the slab is the only one that can retrieve the objects from the 93 - * slab's freelist. 94 - * 95 - * CPU partial slabs 96 - * 97 - * The partially empty slabs cached on the CPU partial list are used 98 - * for performance reasons, which speeds up the allocation process. 99 - * These slabs are not frozen, but are also exempt from list management, 100 - * by clearing the SL_partial flag when moving out of the node 101 - * partial list. Please see __slab_free() for more details. 100 + * If a slab is frozen then it is exempt from list management. It is used to 101 + * indicate a slab that has failed consistency checks and thus cannot be 102 + * allocated from anymore - it is also marked as full. 
Any previously 103 + * allocated objects will be simply leaked upon freeing instead of attempting 104 + * to modify the potentially corrupted freelist and metadata. 102 105 * 103 106 * To sum up, the current scheme is: 104 - * - node partial slab: SL_partial && !frozen 105 - * - cpu partial slab: !SL_partial && !frozen 106 - * - cpu slab: !SL_partial && frozen 107 - * - full slab: !SL_partial && !frozen 107 + * - node partial slab: SL_partial && !full && !frozen 108 + * - taken off partial list: !SL_partial && !full && !frozen 109 + * - full slab, not on any list: !SL_partial && full && !frozen 110 + * - frozen due to inconsistency: !SL_partial && full && frozen 108 111 * 109 - * list_lock 112 + * node->list_lock (spinlock) 110 113 * 111 114 * The list_lock protects the partial and full list on each node and 112 115 * the partial slab counter. If taken then no new slabs may be added or ··· 123 112 * 124 113 * The list_lock is a centralized lock and thus we avoid taking it as 125 114 * much as possible. As long as SLUB does not have to handle partial 126 - * slabs, operations can continue without any centralized lock. F.e. 127 - * allocating a long series of objects that fill up slabs does not require 128 - * the list lock. 115 + * slabs, operations can continue without any centralized lock. 129 116 * 130 117 * For debug caches, all allocations are forced to go through a list_lock 131 118 * protected region to serialize against concurrent validation. 132 119 * 133 - * cpu_slab->lock local lock 120 + * cpu_sheaves->lock (local_trylock) 134 121 * 135 - * This locks protect slowpath manipulation of all kmem_cache_cpu fields 136 - * except the stat counters. This is a percpu structure manipulated only by 137 - * the local cpu, so the lock protects against being preempted or interrupted 138 - * by an irq. Fast path operations rely on lockless operations instead. 122 + * This lock protects fastpath operations on the percpu sheaves. On !RT it 123 + * only disables preemption and does no atomic operations. As long as the main 124 + * or spare sheaf can handle the allocation or free, there is no other 125 + * overhead. 139 126 * 140 - * On PREEMPT_RT, the local lock neither disables interrupts nor preemption 141 - * which means the lockless fastpath cannot be used as it might interfere with 142 - * an in-progress slow path operations. In this case the local lock is always 143 - * taken but it still utilizes the freelist for the common operations. 127 + * node->barn->lock (spinlock) 144 128 * 145 - * lockless fastpaths 129 + * This lock protects the operations on per-NUMA-node barn. It can quickly 130 + * serve an empty or full sheaf if available, and avoid more expensive refill 131 + * or flush operation. 146 132 * 147 - * The fast path allocation (slab_alloc_node()) and freeing (do_slab_free()) 148 - * are fully lockless when satisfied from the percpu slab (and when 149 - * cmpxchg_double is possible to use, otherwise slab_lock is taken). 150 - * They also don't disable preemption or migration or irqs. They rely on 151 - * the transaction id (tid) field to detect being preempted or moved to 152 - * another cpu. 133 + * Lockless freeing 134 + * 135 + * Objects may have to be freed to their slabs when they are from a remote 136 + * node (where we want to avoid filling local sheaves with remote objects) 137 + * or when there are too many full sheaves. 
On architectures supporting 138 + * cmpxchg_double this is done by a lockless update of slab's freelist and 139 + * counters, otherwise slab_lock is taken. This only needs to take the 140 + * list_lock if it's a first free to a full slab, or when a slab becomes empty 141 + * after the free. 153 142 * 154 143 * irq, preemption, migration considerations 155 144 * 156 - * Interrupts are disabled as part of list_lock or local_lock operations, or 145 + * Interrupts are disabled as part of list_lock or barn lock operations, or 157 146 * around the slab_lock operation, in order to make the slab allocator safe 158 147 * to use in the context of an irq. 148 + * Preemption is disabled as part of local_trylock operations. 149 + * kmalloc_nolock() and kfree_nolock() are safe in NMI context but see 150 + * their limitations. 159 151 * 160 - * In addition, preemption (or migration on PREEMPT_RT) is disabled in the 161 - * allocation slowpath, bulk allocation, and put_cpu_partial(), so that the 162 - * local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer 163 - * doesn't have to be revalidated in each section protected by the local lock. 164 - * 165 - * SLUB assigns one slab for allocation to each processor. 166 - * Allocations only occur from these slabs called cpu slabs. 152 + * SLUB assigns two object arrays called sheaves for caching allocations and 153 + * frees on each cpu, with a NUMA node shared barn for balancing between cpus. 154 + * Allocations and frees are primarily served from these sheaves. 167 155 * 168 156 * Slabs with free elements are kept on a partial list and during regular 169 157 * operations no list for full slabs is used. If an object in a full slab is ··· 170 160 * We track full slabs for debugging purposes though because otherwise we 171 161 * cannot scan all objects. 172 162 * 173 - * Slabs are freed when they become empty. Teardown and setup is 174 - * minimal so we rely on the page allocators per cpu caches for 175 - * fast frees and allocs. 176 - * 177 - * slab->frozen The slab is frozen and exempt from list processing. 178 - * This means that the slab is dedicated to a purpose 179 - * such as satisfying allocations for a specific 180 - * processor. Objects may be freed in the slab while 181 - * it is frozen but slab_free will then skip the usual 182 - * list operations. It is up to the processor holding 183 - * the slab to integrate the slab into the slab lists 184 - * when the slab is no longer needed. 185 - * 186 - * One use of this flag is to mark slabs that are 187 - * used for allocations. Then such a slab becomes a cpu 188 - * slab. The cpu slab may be equipped with an additional 189 - * freelist that allows lockless access to 190 - * free objects in addition to the regular freelist 191 - * that requires the slab lock. 163 + * Slabs are freed when they become empty. Teardown and setup is minimal so we 164 + * rely on the page allocators per cpu caches for fast frees and allocs. 192 165 * 193 166 * SLAB_DEBUG_FLAGS Slab requires special handling due to debug 194 167 * options set. This moves slab handling out of ··· 194 201 SL_pfmemalloc = PG_active, /* Historical reasons for this bit */ 195 202 }; 196 203 197 - /* 198 - * We could simply use migrate_disable()/enable() but as long as it's a 199 - * function call even on !PREEMPT_RT, use inline preempt_disable() there. 
200 - */ 201 - #ifndef CONFIG_PREEMPT_RT 202 - #define slub_get_cpu_ptr(var) get_cpu_ptr(var) 203 - #define slub_put_cpu_ptr(var) put_cpu_ptr(var) 204 - #define USE_LOCKLESS_FAST_PATH() (true) 205 - #else 206 - #define slub_get_cpu_ptr(var) \ 207 - ({ \ 208 - migrate_disable(); \ 209 - this_cpu_ptr(var); \ 210 - }) 211 - #define slub_put_cpu_ptr(var) \ 212 - do { \ 213 - (void)(var); \ 214 - migrate_enable(); \ 215 - } while (0) 216 - #define USE_LOCKLESS_FAST_PATH() (false) 217 - #endif 218 - 219 204 #ifndef CONFIG_SLUB_TINY 220 205 #define __fastpath_inline __always_inline 221 206 #else ··· 212 241 static DEFINE_STATIC_KEY_FALSE(strict_numa); 213 242 #endif 214 243 215 - /* Structure holding parameters for get_partial() call chain */ 244 + /* Structure holding parameters for get_from_partial() call chain */ 216 245 struct partial_context { 217 246 gfp_t flags; 218 247 unsigned int orig_size; 219 - void *object; 248 + }; 249 + 250 + /* Structure holding parameters for get_partial_node_bulk() */ 251 + struct partial_bulk_context { 252 + gfp_t flags; 253 + unsigned int min_objects; 254 + unsigned int max_objects; 255 + struct list_head slabs; 220 256 }; 221 257 222 258 static inline bool kmem_cache_debug(struct kmem_cache *s) ··· 237 259 p += s->red_left_pad; 238 260 239 261 return p; 240 - } 241 - 242 - static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) 243 - { 244 - #ifdef CONFIG_SLUB_CPU_PARTIAL 245 - return !kmem_cache_debug(s); 246 - #else 247 - return false; 248 - #endif 249 262 } 250 263 251 264 /* ··· 329 360 static inline void debugfs_slab_add(struct kmem_cache *s) { } 330 361 #endif 331 362 363 + enum add_mode { 364 + ADD_TO_HEAD, 365 + ADD_TO_TAIL, 366 + }; 367 + 332 368 enum stat_item { 333 - ALLOC_PCS, /* Allocation from percpu sheaf */ 334 - ALLOC_FASTPATH, /* Allocation from cpu slab */ 335 - ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 336 - FREE_PCS, /* Free to percpu sheaf */ 369 + ALLOC_FASTPATH, /* Allocation from percpu sheaves */ 370 + ALLOC_SLOWPATH, /* Allocation from partial or new slab */ 337 371 FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ 338 372 FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ 339 - FREE_FASTPATH, /* Free to cpu slab */ 340 - FREE_SLOWPATH, /* Freeing not to cpu slab */ 341 - FREE_FROZEN, /* Freeing to frozen slab */ 373 + FREE_FASTPATH, /* Free to percpu sheaves */ 374 + FREE_SLOWPATH, /* Free to a slab */ 342 375 FREE_ADD_PARTIAL, /* Freeing moves slab to partial list */ 343 376 FREE_REMOVE_PARTIAL, /* Freeing removes last object */ 344 - ALLOC_FROM_PARTIAL, /* Cpu slab acquired from node partial list */ 345 - ALLOC_SLAB, /* Cpu slab acquired from page allocator */ 346 - ALLOC_REFILL, /* Refill cpu slab from slab freelist */ 347 - ALLOC_NODE_MISMATCH, /* Switching cpu slab */ 377 + ALLOC_SLAB, /* New slab acquired from page allocator */ 378 + ALLOC_NODE_MISMATCH, /* Requested node different from cpu sheaf */ 348 379 FREE_SLAB, /* Slab freed to the page allocator */ 349 - CPUSLAB_FLUSH, /* Abandoning of the cpu slab */ 350 - DEACTIVATE_FULL, /* Cpu slab was full when deactivated */ 351 - DEACTIVATE_EMPTY, /* Cpu slab was empty when deactivated */ 352 - DEACTIVATE_TO_HEAD, /* Cpu slab was moved to the head of partials */ 353 - DEACTIVATE_TO_TAIL, /* Cpu slab was moved to the tail of partials */ 354 - DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */ 355 - DEACTIVATE_BYPASS, /* Implicit deactivation */ 356 380 ORDER_FALLBACK, /* Number of times fallback was necessary */ 357 - 
CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */ 358 381 CMPXCHG_DOUBLE_FAIL, /* Failures of slab freelist update */ 359 - CPU_PARTIAL_ALLOC, /* Used cpu partial on alloc */ 360 - CPU_PARTIAL_FREE, /* Refill cpu partial on free */ 361 - CPU_PARTIAL_NODE, /* Refill cpu partial from node partial */ 362 - CPU_PARTIAL_DRAIN, /* Drain cpu partial to node partial */ 363 382 SHEAF_FLUSH, /* Objects flushed from a sheaf */ 364 383 SHEAF_REFILL, /* Objects refilled to a sheaf */ 365 384 SHEAF_ALLOC, /* Allocation of an empty sheaf */ ··· 364 407 NR_SLUB_STAT_ITEMS 365 408 }; 366 409 367 - struct freelist_tid { 368 - union { 369 - struct { 370 - void *freelist; /* Pointer to next available object */ 371 - unsigned long tid; /* Globally unique transaction id */ 372 - }; 373 - freelist_full_t freelist_tid; 374 - }; 375 - }; 376 - 377 - /* 378 - * When changing the layout, make sure freelist and tid are still compatible 379 - * with this_cpu_cmpxchg_double() alignment requirements. 380 - */ 381 - struct kmem_cache_cpu { 382 - struct freelist_tid; 383 - struct slab *slab; /* The slab from which we are allocating */ 384 - #ifdef CONFIG_SLUB_CPU_PARTIAL 385 - struct slab *partial; /* Partially allocated slabs */ 386 - #endif 387 - local_trylock_t lock; /* Protects the fields above */ 388 410 #ifdef CONFIG_SLUB_STATS 411 + struct kmem_cache_stats { 389 412 unsigned int stat[NR_SLUB_STAT_ITEMS]; 390 - #endif 391 413 }; 414 + #endif 392 415 393 416 static inline void stat(const struct kmem_cache *s, enum stat_item si) 394 417 { ··· 377 440 * The rmw is racy on a preemptible kernel but this is acceptable, so 378 441 * avoid this_cpu_add()'s irq-disable overhead. 379 442 */ 380 - raw_cpu_inc(s->cpu_slab->stat[si]); 443 + raw_cpu_inc(s->cpu_stats->stat[si]); 381 444 #endif 382 445 } 383 446 ··· 385 448 void stat_add(const struct kmem_cache *s, enum stat_item si, int v) 386 449 { 387 450 #ifdef CONFIG_SLUB_STATS 388 - raw_cpu_add(s->cpu_slab->stat[si], v); 451 + raw_cpu_add(s->cpu_stats->stat[si], v); 389 452 #endif 390 453 } 391 454 ··· 474 537 static nodemask_t slab_nodes; 475 538 476 539 /* 477 - * Workqueue used for flush_cpu_slab(). 540 + * Workqueue used for flushing cpu and kfree_rcu sheaves. 478 541 */ 479 542 static struct workqueue_struct *flushwq; 480 543 ··· 531 594 ptr_addr = (unsigned long)object + s->offset; 532 595 p = *(freeptr_t *)(ptr_addr); 533 596 return freelist_ptr_decode(s, p, ptr_addr); 534 - } 535 - 536 - static void prefetch_freepointer(const struct kmem_cache *s, void *object) 537 - { 538 - prefetchw(object + s->offset); 539 - } 540 - 541 - /* 542 - * When running under KMSAN, get_freepointer_safe() may return an uninitialized 543 - * pointer value in the case the current thread loses the race for the next 544 - * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in 545 - * slab_alloc_node() will fail, so the uninitialized value won't be used, but 546 - * KMSAN will still check all arguments of cmpxchg because of imperfect 547 - * handling of inline assembly. 548 - * To work around this problem, we apply __no_kmsan_checks to ensure that 549 - * get_freepointer_safe() returns initialized memory. 
550 - */ 551 - __no_kmsan_checks 552 - static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 553 - { 554 - unsigned long freepointer_addr; 555 - freeptr_t p; 556 - 557 - if (!debug_pagealloc_enabled_static()) 558 - return get_freepointer(s, object); 559 - 560 - object = kasan_reset_tag(object); 561 - freepointer_addr = (unsigned long)object + s->offset; 562 - copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p)); 563 - return freelist_ptr_decode(s, p, freepointer_addr); 564 597 } 565 598 566 599 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) ··· 596 689 return x.x & OO_MASK; 597 690 } 598 691 599 - #ifdef CONFIG_SLUB_CPU_PARTIAL 600 - static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 601 - { 602 - unsigned int nr_slabs; 603 - 604 - s->cpu_partial = nr_objects; 605 - 606 - /* 607 - * We take the number of objects but actually limit the number of 608 - * slabs on the per cpu partial list, in order to limit excessive 609 - * growth of the list. For simplicity we assume that the slabs will 610 - * be half-full. 611 - */ 612 - nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo)); 613 - s->cpu_partial_slabs = nr_slabs; 614 - } 615 - 616 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 617 - { 618 - return s->cpu_partial_slabs; 619 - } 620 - #else 621 - #ifdef SLAB_SUPPORTS_SYSFS 622 - static inline void 623 - slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects) 624 - { 625 - } 626 - #endif 627 - 628 - static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s) 629 - { 630 - return 0; 631 - } 632 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 633 - 634 692 /* 635 693 * If network-based swap is enabled, slub must keep track of whether memory 636 694 * were allocated from pfmemalloc reserves. 
··· 651 779 if (slab->freelist == old->freelist && 652 780 slab->counters == old->counters) { 653 781 slab->freelist = new->freelist; 654 - slab->counters = new->counters; 782 + /* prevent tearing for the read in get_partial_node_bulk() */ 783 + WRITE_ONCE(slab->counters, new->counters); 655 784 ret = true; 656 785 } 657 786 slab_unlock(slab); ··· 672 799 { 673 800 bool ret; 674 801 675 - if (USE_LOCKLESS_FAST_PATH()) 802 + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) 676 803 lockdep_assert_irqs_disabled(); 677 804 678 805 if (s->flags & __CMPXCHG_DOUBLE) ··· 1051 1178 p->handle = handle; 1052 1179 #endif 1053 1180 p->addr = addr; 1054 - p->cpu = smp_processor_id(); 1181 + p->cpu = raw_smp_processor_id(); 1055 1182 p->pid = current->pid; 1056 1183 p->when = jiffies; 1057 1184 } ··· 1213 1340 add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); 1214 1341 1215 1342 WARN_ON(1); 1216 - } 1217 - 1218 - static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 1219 - void **freelist, void *nextfree) 1220 - { 1221 - if ((s->flags & SLAB_CONSISTENCY_CHECKS) && 1222 - !check_valid_pointer(s, slab, nextfree) && freelist) { 1223 - object_err(s, slab, *freelist, "Freechain corrupt"); 1224 - *freelist = NULL; 1225 - slab_fix(s, "Isolate corrupted freechain"); 1226 - return true; 1227 - } 1228 - 1229 - return false; 1230 1343 } 1231 1344 1232 1345 static void __slab_err(struct slab *slab) ··· 2026 2167 int objects) {} 2027 2168 static inline void dec_slabs_node(struct kmem_cache *s, int node, 2028 2169 int objects) {} 2029 - static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, 2030 - void **freelist, void *nextfree) 2031 - { 2032 - return false; 2033 - } 2034 2170 #endif /* CONFIG_SLUB_DEBUG */ 2035 2171 2036 2172 /* ··· 2726 2872 return object; 2727 2873 } 2728 2874 2729 - static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2875 + static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, 2876 + unsigned int capacity) 2730 2877 { 2731 2878 struct slab_sheaf *sheaf; 2732 2879 size_t sheaf_size; ··· 2745 2890 if (s->flags & SLAB_KMALLOC) 2746 2891 gfp |= __GFP_NO_OBJ_EXT; 2747 2892 2748 - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2893 + sheaf_size = struct_size(sheaf, objects, capacity); 2749 2894 sheaf = kzalloc(sheaf_size, gfp); 2750 2895 2751 2896 if (unlikely(!sheaf)) ··· 2758 2903 return sheaf; 2759 2904 } 2760 2905 2906 + static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, 2907 + gfp_t gfp) 2908 + { 2909 + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); 2910 + } 2911 + 2761 2912 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) 2762 2913 { 2763 2914 kfree(sheaf); ··· 2771 2910 stat(s, SHEAF_FREE); 2772 2911 } 2773 2912 2774 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 2775 - size_t size, void **p); 2776 - 2913 + static unsigned int 2914 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 2915 + unsigned int max); 2777 2916 2778 2917 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, 2779 2918 gfp_t gfp) ··· 2784 2923 if (!to_fill) 2785 2924 return 0; 2786 2925 2787 - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, 2788 - &sheaf->objects[sheaf->size]); 2926 + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, 2927 + to_fill); 2789 2928 2790 2929 sheaf->size += filled; 2791 2930 ··· 2986 3125 { 2987 3126 int cpu; 2988 3127 3128 + /* 3129 + * We may be unwinding cache 
creation that failed before or during the 3130 + * allocation of this. 3131 + */ 3132 + if (!s->cpu_sheaves) 3133 + return; 3134 + 3135 + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ 3136 + if (!cache_has_sheaves(s)) 3137 + goto free_pcs; 3138 + 2989 3139 for_each_possible_cpu(cpu) { 2990 3140 struct slub_percpu_sheaves *pcs; 2991 3141 2992 3142 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2993 3143 2994 - /* can happen when unwinding failed create */ 3144 + /* This can happen when unwinding failed cache creation. */ 2995 3145 if (!pcs->main) 2996 3146 continue; 2997 3147 ··· 3024 3152 } 3025 3153 } 3026 3154 3155 + free_pcs: 3027 3156 free_percpu(s->cpu_sheaves); 3028 3157 s->cpu_sheaves = NULL; 3029 3158 } 3030 3159 3031 - static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) 3160 + static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, 3161 + bool allow_spin) 3032 3162 { 3033 3163 struct slab_sheaf *empty = NULL; 3034 3164 unsigned long flags; ··· 3038 3164 if (!data_race(barn->nr_empty)) 3039 3165 return NULL; 3040 3166 3041 - spin_lock_irqsave(&barn->lock, flags); 3167 + if (likely(allow_spin)) 3168 + spin_lock_irqsave(&barn->lock, flags); 3169 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3170 + return NULL; 3042 3171 3043 3172 if (likely(barn->nr_empty)) { 3044 3173 empty = list_first_entry(&barn->sheaves_empty, ··· 3118 3241 * change. 3119 3242 */ 3120 3243 static struct slab_sheaf * 3121 - barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) 3244 + barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, 3245 + bool allow_spin) 3122 3246 { 3123 3247 struct slab_sheaf *full = NULL; 3124 3248 unsigned long flags; ··· 3127 3249 if (!data_race(barn->nr_full)) 3128 3250 return NULL; 3129 3251 3130 - spin_lock_irqsave(&barn->lock, flags); 3252 + if (likely(allow_spin)) 3253 + spin_lock_irqsave(&barn->lock, flags); 3254 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3255 + return NULL; 3131 3256 3132 3257 if (likely(barn->nr_full)) { 3133 3258 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, ··· 3151 3270 * barn. But if there are too many full sheaves, reject this with -E2BIG. 
3152 3271 */ 3153 3272 static struct slab_sheaf * 3154 - barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) 3273 + barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, 3274 + bool allow_spin) 3155 3275 { 3156 3276 struct slab_sheaf *empty; 3157 3277 unsigned long flags; ··· 3163 3281 if (!data_race(barn->nr_empty)) 3164 3282 return ERR_PTR(-ENOMEM); 3165 3283 3166 - spin_lock_irqsave(&barn->lock, flags); 3284 + if (likely(allow_spin)) 3285 + spin_lock_irqsave(&barn->lock, flags); 3286 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3287 + return ERR_PTR(-EBUSY); 3167 3288 3168 3289 if (likely(barn->nr_empty)) { 3169 3290 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, ··· 3470 3585 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 3471 3586 } 3472 3587 3473 - static void __free_slab(struct kmem_cache *s, struct slab *slab) 3588 + static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) 3474 3589 { 3475 3590 struct page *page = slab_page(slab); 3476 3591 int order = compound_order(page); ··· 3481 3596 __ClearPageSlab(page); 3482 3597 mm_account_reclaimed_pages(pages); 3483 3598 unaccount_slab(slab, order, s); 3484 - free_frozen_pages(page, order); 3599 + if (allow_spin) 3600 + free_frozen_pages(page, order); 3601 + else 3602 + free_frozen_pages_nolock(page, order); 3603 + } 3604 + 3605 + static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) 3606 + { 3607 + /* 3608 + * Since it was just allocated, we can skip the actions in 3609 + * discard_slab() and free_slab(). 3610 + */ 3611 + __free_slab(s, slab, false); 3485 3612 } 3486 3613 3487 3614 static void rcu_free_slab(struct rcu_head *h) 3488 3615 { 3489 3616 struct slab *slab = container_of(h, struct slab, rcu_head); 3490 3617 3491 - __free_slab(slab->slab_cache, slab); 3618 + __free_slab(slab->slab_cache, slab, true); 3492 3619 } 3493 3620 3494 3621 static void free_slab(struct kmem_cache *s, struct slab *slab) ··· 3516 3619 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) 3517 3620 call_rcu(&slab->rcu_head, rcu_free_slab); 3518 3621 else 3519 - __free_slab(s, slab); 3622 + __free_slab(s, slab, true); 3520 3623 } 3521 3624 3522 3625 static void discard_slab(struct kmem_cache *s, struct slab *slab) ··· 3544 3647 * Management of partially allocated slabs. 3545 3648 */ 3546 3649 static inline void 3547 - __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) 3650 + __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) 3548 3651 { 3549 3652 n->nr_partial++; 3550 - if (tail == DEACTIVATE_TO_TAIL) 3653 + if (mode == ADD_TO_TAIL) 3551 3654 list_add_tail(&slab->slab_list, &n->partial); 3552 3655 else 3553 3656 list_add(&slab->slab_list, &n->partial); ··· 3555 3658 } 3556 3659 3557 3660 static inline void add_partial(struct kmem_cache_node *n, 3558 - struct slab *slab, int tail) 3661 + struct slab *slab, enum add_mode mode) 3559 3662 { 3560 3663 lockdep_assert_held(&n->list_lock); 3561 - __add_partial(n, slab, tail); 3664 + __add_partial(n, slab, mode); 3562 3665 } 3563 3666 3564 3667 static inline void remove_partial(struct kmem_cache_node *n, ··· 3609 3712 return object; 3610 3713 } 3611 3714 3612 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); 3613 - 3614 3715 /* 3615 3716 * Called only for kmem_cache_debug() caches to allocate from a freshly 3616 3717 * allocated slab. 
Allocate a single object instead of whole freelist ··· 3624 3729 void *object; 3625 3730 3626 3731 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { 3627 - /* Unlucky, discard newly allocated slab */ 3628 - defer_deactivate_slab(slab, NULL); 3732 + /* Unlucky, discard newly allocated slab. */ 3733 + free_new_slab_nolock(s, slab); 3629 3734 return NULL; 3630 3735 } 3631 3736 ··· 3651 3756 if (slab->inuse == slab->objects) 3652 3757 add_full(s, n, slab); 3653 3758 else 3654 - add_partial(n, slab, DEACTIVATE_TO_HEAD); 3759 + add_partial(n, slab, ADD_TO_HEAD); 3655 3760 3656 3761 inc_slabs_node(s, nid, slab->objects); 3657 3762 spin_unlock_irqrestore(&n->list_lock, flags); ··· 3659 3764 return object; 3660 3765 } 3661 3766 3662 - #ifdef CONFIG_SLUB_CPU_PARTIAL 3663 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); 3664 - #else 3665 - static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, 3666 - int drain) { } 3667 - #endif 3668 3767 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); 3669 3768 3670 - /* 3671 - * Try to allocate a partial slab from a specific node. 3672 - */ 3673 - static struct slab *get_partial_node(struct kmem_cache *s, 3674 - struct kmem_cache_node *n, 3675 - struct partial_context *pc) 3769 + static bool get_partial_node_bulk(struct kmem_cache *s, 3770 + struct kmem_cache_node *n, 3771 + struct partial_bulk_context *pc, 3772 + bool allow_spin) 3676 3773 { 3677 - struct slab *slab, *slab2, *partial = NULL; 3774 + struct slab *slab, *slab2; 3775 + unsigned int total_free = 0; 3678 3776 unsigned long flags; 3679 - unsigned int partial_slabs = 0; 3777 + 3778 + /* Racy check to avoid taking the lock unnecessarily. */ 3779 + if (!n || data_race(!n->nr_partial)) 3780 + return false; 3781 + 3782 + INIT_LIST_HEAD(&pc->slabs); 3783 + 3784 + if (allow_spin) 3785 + spin_lock_irqsave(&n->list_lock, flags); 3786 + else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3787 + return false; 3788 + 3789 + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3790 + struct freelist_counters flc; 3791 + unsigned int slab_free; 3792 + 3793 + if (!pfmemalloc_match(slab, pc->flags)) 3794 + continue; 3795 + 3796 + /* 3797 + * determine the number of free objects in the slab racily 3798 + * 3799 + * slab_free is a lower bound due to possible subsequent 3800 + * concurrent freeing, so the caller may get more objects than 3801 + * requested and must handle that 3802 + */ 3803 + flc.counters = data_race(READ_ONCE(slab->counters)); 3804 + slab_free = flc.objects - flc.inuse; 3805 + 3806 + /* we have already min and this would get us over the max */ 3807 + if (total_free >= pc->min_objects 3808 + && total_free + slab_free > pc->max_objects) 3809 + break; 3810 + 3811 + remove_partial(n, slab); 3812 + 3813 + list_add(&slab->slab_list, &pc->slabs); 3814 + 3815 + total_free += slab_free; 3816 + if (total_free >= pc->max_objects) 3817 + break; 3818 + } 3819 + 3820 + spin_unlock_irqrestore(&n->list_lock, flags); 3821 + return total_free > 0; 3822 + } 3823 + 3824 + /* 3825 + * Try to allocate object from a partial slab on a specific node. 3826 + */ 3827 + static void *get_from_partial_node(struct kmem_cache *s, 3828 + struct kmem_cache_node *n, 3829 + struct partial_context *pc) 3830 + { 3831 + struct slab *slab, *slab2; 3832 + unsigned long flags; 3833 + void *object = NULL; 3680 3834 3681 3835 /* 3682 3836 * Racy check. If we mistakenly see no partial slabs then we 3683 3837 * just allocate an empty slab. 
If we mistakenly try to get a 3684 - * partial slab and there is none available then get_partial() 3838 + * partial slab and there is none available then get_from_partial() 3685 3839 * will return NULL. 3686 3840 */ 3687 3841 if (!n || !n->nr_partial) ··· 3741 3797 else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3742 3798 return NULL; 3743 3799 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3800 + 3801 + struct freelist_counters old, new; 3802 + 3744 3803 if (!pfmemalloc_match(slab, pc->flags)) 3745 3804 continue; 3746 3805 3747 3806 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 3748 - void *object = alloc_single_from_partial(s, n, slab, 3807 + object = alloc_single_from_partial(s, n, slab, 3749 3808 pc->orig_size); 3750 - if (object) { 3751 - partial = slab; 3752 - pc->object = object; 3809 + if (object) 3753 3810 break; 3754 - } 3755 3811 continue; 3756 3812 } 3757 3813 3758 - remove_partial(n, slab); 3814 + /* 3815 + * get a single object from the slab. This might race against 3816 + * __slab_free(), which however has to take the list_lock if 3817 + * it's about to make the slab fully free. 3818 + */ 3819 + do { 3820 + old.freelist = slab->freelist; 3821 + old.counters = slab->counters; 3759 3822 3760 - if (!partial) { 3761 - partial = slab; 3762 - stat(s, ALLOC_FROM_PARTIAL); 3823 + new.freelist = get_freepointer(s, old.freelist); 3824 + new.counters = old.counters; 3825 + new.inuse++; 3763 3826 3764 - if ((slub_get_cpu_partial(s) == 0)) { 3765 - break; 3766 - } 3767 - } else { 3768 - put_cpu_partial(s, slab, 0); 3769 - stat(s, CPU_PARTIAL_NODE); 3827 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); 3770 3828 3771 - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { 3772 - break; 3773 - } 3774 - } 3829 + object = old.freelist; 3830 + if (!new.freelist) 3831 + remove_partial(n, slab); 3832 + 3833 + break; 3775 3834 } 3776 3835 spin_unlock_irqrestore(&n->list_lock, flags); 3777 - return partial; 3836 + return object; 3778 3837 } 3779 3838 3780 3839 /* 3781 - * Get a slab from somewhere. Search in increasing NUMA distances. 3840 + * Get an object from somewhere. Search in increasing NUMA distances. 3782 3841 */ 3783 - static struct slab *get_any_partial(struct kmem_cache *s, 3784 - struct partial_context *pc) 3842 + static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) 3785 3843 { 3786 3844 #ifdef CONFIG_NUMA 3787 3845 struct zonelist *zonelist; 3788 3846 struct zoneref *z; 3789 3847 struct zone *zone; 3790 3848 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 3791 - struct slab *slab; 3792 3849 unsigned int cpuset_mems_cookie; 3793 3850 3794 3851 /* ··· 3824 3879 3825 3880 if (n && cpuset_zone_allowed(zone, pc->flags) && 3826 3881 n->nr_partial > s->min_partial) { 3827 - slab = get_partial_node(s, n, pc); 3828 - if (slab) { 3882 + 3883 + void *object = get_from_partial_node(s, n, pc); 3884 + 3885 + if (object) { 3829 3886 /* 3830 3887 * Don't check read_mems_allowed_retry() 3831 3888 * here - if mems_allowed was updated in ··· 3835 3888 * between allocation and the cpuset 3836 3889 * update 3837 3890 */ 3838 - return slab; 3891 + return object; 3839 3892 } 3840 3893 } 3841 3894 } ··· 3845 3898 } 3846 3899 3847 3900 /* 3848 - * Get a partial slab, lock it and return it. 
3901 + * Get an object from a partial slab 3849 3902 */ 3850 - static struct slab *get_partial(struct kmem_cache *s, int node, 3851 - struct partial_context *pc) 3903 + static void *get_from_partial(struct kmem_cache *s, int node, 3904 + struct partial_context *pc) 3852 3905 { 3853 - struct slab *slab; 3854 3906 int searchnode = node; 3907 + void *object; 3855 3908 3856 3909 if (node == NUMA_NO_NODE) 3857 3910 searchnode = numa_mem_id(); 3858 3911 3859 - slab = get_partial_node(s, get_node(s, searchnode), pc); 3860 - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3861 - return slab; 3912 + object = get_from_partial_node(s, get_node(s, searchnode), pc); 3913 + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3914 + return object; 3862 3915 3863 - return get_any_partial(s, pc); 3864 - } 3865 - 3866 - #ifdef CONFIG_PREEMPTION 3867 - /* 3868 - * Calculate the next globally unique transaction for disambiguation 3869 - * during cmpxchg. The transactions start with the cpu number and are then 3870 - * incremented by CONFIG_NR_CPUS. 3871 - */ 3872 - #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 3873 - #else 3874 - /* 3875 - * No preemption supported therefore also no need to check for 3876 - * different cpus. 3877 - */ 3878 - #define TID_STEP 1 3879 - #endif /* CONFIG_PREEMPTION */ 3880 - 3881 - static inline unsigned long next_tid(unsigned long tid) 3882 - { 3883 - return tid + TID_STEP; 3884 - } 3885 - 3886 - #ifdef SLUB_DEBUG_CMPXCHG 3887 - static inline unsigned int tid_to_cpu(unsigned long tid) 3888 - { 3889 - return tid % TID_STEP; 3890 - } 3891 - 3892 - static inline unsigned long tid_to_event(unsigned long tid) 3893 - { 3894 - return tid / TID_STEP; 3895 - } 3896 - #endif 3897 - 3898 - static inline unsigned int init_tid(int cpu) 3899 - { 3900 - return cpu; 3901 - } 3902 - 3903 - static inline void note_cmpxchg_failure(const char *n, 3904 - const struct kmem_cache *s, unsigned long tid) 3905 - { 3906 - #ifdef SLUB_DEBUG_CMPXCHG 3907 - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 3908 - 3909 - pr_info("%s %s: cmpxchg redo ", n, s->name); 3910 - 3911 - if (IS_ENABLED(CONFIG_PREEMPTION) && 3912 - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { 3913 - pr_warn("due to cpu change %d -> %d\n", 3914 - tid_to_cpu(tid), tid_to_cpu(actual_tid)); 3915 - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { 3916 - pr_warn("due to cpu running other code. Event %ld->%ld\n", 3917 - tid_to_event(tid), tid_to_event(actual_tid)); 3918 - } else { 3919 - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 3920 - actual_tid, tid, next_tid(tid)); 3921 - } 3922 - #endif 3923 - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 3924 - } 3925 - 3926 - static void init_kmem_cache_cpus(struct kmem_cache *s) 3927 - { 3928 - #ifdef CONFIG_PREEMPT_RT 3929 - /* 3930 - * Register lockdep key for non-boot kmem caches to avoid 3931 - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() 3932 - */ 3933 - bool finegrain_lockdep = !init_section_contains(s, 1); 3934 - #else 3935 - /* 3936 - * Don't bother with different lockdep classes for each 3937 - * kmem_cache, since we only use local_trylock_irqsave(). 
3938 - */ 3939 - bool finegrain_lockdep = false; 3940 - #endif 3941 - int cpu; 3942 - struct kmem_cache_cpu *c; 3943 - 3944 - if (finegrain_lockdep) 3945 - lockdep_register_key(&s->lock_key); 3946 - for_each_possible_cpu(cpu) { 3947 - c = per_cpu_ptr(s->cpu_slab, cpu); 3948 - local_trylock_init(&c->lock); 3949 - if (finegrain_lockdep) 3950 - lockdep_set_class(&c->lock, &s->lock_key); 3951 - c->tid = init_tid(cpu); 3952 - } 3953 - } 3954 - 3955 - /* 3956 - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, 3957 - * unfreezes the slabs and puts it on the proper list. 3958 - * Assumes the slab has been already safely taken away from kmem_cache_cpu 3959 - * by the caller. 3960 - */ 3961 - static void deactivate_slab(struct kmem_cache *s, struct slab *slab, 3962 - void *freelist) 3963 - { 3964 - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 3965 - int free_delta = 0; 3966 - void *nextfree, *freelist_iter, *freelist_tail; 3967 - int tail = DEACTIVATE_TO_HEAD; 3968 - unsigned long flags = 0; 3969 - struct freelist_counters old, new; 3970 - 3971 - if (READ_ONCE(slab->freelist)) { 3972 - stat(s, DEACTIVATE_REMOTE_FREES); 3973 - tail = DEACTIVATE_TO_TAIL; 3974 - } 3975 - 3976 - /* 3977 - * Stage one: Count the objects on cpu's freelist as free_delta and 3978 - * remember the last object in freelist_tail for later splicing. 3979 - */ 3980 - freelist_tail = NULL; 3981 - freelist_iter = freelist; 3982 - while (freelist_iter) { 3983 - nextfree = get_freepointer(s, freelist_iter); 3984 - 3985 - /* 3986 - * If 'nextfree' is invalid, it is possible that the object at 3987 - * 'freelist_iter' is already corrupted. So isolate all objects 3988 - * starting at 'freelist_iter' by skipping them. 3989 - */ 3990 - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) 3991 - break; 3992 - 3993 - freelist_tail = freelist_iter; 3994 - free_delta++; 3995 - 3996 - freelist_iter = nextfree; 3997 - } 3998 - 3999 - /* 4000 - * Stage two: Unfreeze the slab while splicing the per-cpu 4001 - * freelist to the head of slab's freelist. 4002 - */ 4003 - do { 4004 - old.freelist = READ_ONCE(slab->freelist); 4005 - old.counters = READ_ONCE(slab->counters); 4006 - VM_BUG_ON(!old.frozen); 4007 - 4008 - /* Determine target state of the slab */ 4009 - new.counters = old.counters; 4010 - new.frozen = 0; 4011 - if (freelist_tail) { 4012 - new.inuse -= free_delta; 4013 - set_freepointer(s, freelist_tail, old.freelist); 4014 - new.freelist = freelist; 4015 - } else { 4016 - new.freelist = old.freelist; 4017 - } 4018 - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 4019 - 4020 - /* 4021 - * Stage three: Manipulate the slab list based on the updated state. 4022 - */ 4023 - if (!new.inuse && n->nr_partial >= s->min_partial) { 4024 - stat(s, DEACTIVATE_EMPTY); 4025 - discard_slab(s, slab); 4026 - stat(s, FREE_SLAB); 4027 - } else if (new.freelist) { 4028 - spin_lock_irqsave(&n->list_lock, flags); 4029 - add_partial(n, slab, tail); 4030 - spin_unlock_irqrestore(&n->list_lock, flags); 4031 - stat(s, tail); 4032 - } else { 4033 - stat(s, DEACTIVATE_FULL); 4034 - } 4035 - } 4036 - 4037 - /* 4038 - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock 4039 - * can be acquired without a deadlock before invoking the function. 4040 - * 4041 - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is 4042 - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), 4043 - * and kmalloc() is not used in an unsupported context. 4044 - * 4045 - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). 4046 - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but 4047 - * lockdep_assert() will catch a bug in case: 4048 - * #1 4049 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() 4050 - * or 4051 - * #2 4052 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() 4053 - * 4054 - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt 4055 - * disabled context. The lock will always be acquired and if needed it 4056 - * block and sleep until the lock is available. 4057 - * #1 is possible in !PREEMPT_RT only. 4058 - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: 4059 - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> 4060 - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) 4061 - * 4062 - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B 4063 - */ 4064 - #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) 4065 - #define local_lock_cpu_slab(s, flags) \ 4066 - local_lock_irqsave(&(s)->cpu_slab->lock, flags) 4067 - #else 4068 - #define local_lock_cpu_slab(s, flags) \ 4069 - do { \ 4070 - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ 4071 - lockdep_assert(__l); \ 4072 - } while (0) 4073 - #endif 4074 - 4075 - #define local_unlock_cpu_slab(s, flags) \ 4076 - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) 4077 - 4078 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4079 - static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 4080 - { 4081 - struct kmem_cache_node *n = NULL, *n2 = NULL; 4082 - struct slab *slab, *slab_to_discard = NULL; 4083 - unsigned long flags = 0; 4084 - 4085 - while (partial_slab) { 4086 - slab = partial_slab; 4087 - partial_slab = slab->next; 4088 - 4089 - n2 = get_node(s, slab_nid(slab)); 4090 - if (n != n2) { 4091 - if (n) 4092 - spin_unlock_irqrestore(&n->list_lock, flags); 4093 - 4094 - n = n2; 4095 - spin_lock_irqsave(&n->list_lock, flags); 4096 - } 4097 - 4098 - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { 4099 - slab->next = slab_to_discard; 4100 - slab_to_discard = slab; 4101 - } else { 4102 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 4103 - stat(s, FREE_ADD_PARTIAL); 4104 - } 4105 - } 4106 - 4107 - if (n) 4108 - spin_unlock_irqrestore(&n->list_lock, flags); 4109 - 4110 - while (slab_to_discard) { 4111 - slab = slab_to_discard; 4112 - slab_to_discard = slab_to_discard->next; 4113 - 4114 - stat(s, DEACTIVATE_EMPTY); 4115 - discard_slab(s, slab); 4116 - stat(s, FREE_SLAB); 4117 - } 4118 - } 4119 - 4120 - /* 4121 - * Put all the cpu partial slabs to the node partial list. 
4122 - */ 4123 - static void put_partials(struct kmem_cache *s) 4124 - { 4125 - struct slab *partial_slab; 4126 - unsigned long flags; 4127 - 4128 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4129 - partial_slab = this_cpu_read(s->cpu_slab->partial); 4130 - this_cpu_write(s->cpu_slab->partial, NULL); 4131 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4132 - 4133 - if (partial_slab) 4134 - __put_partials(s, partial_slab); 4135 - } 4136 - 4137 - static void put_partials_cpu(struct kmem_cache *s, 4138 - struct kmem_cache_cpu *c) 4139 - { 4140 - struct slab *partial_slab; 4141 - 4142 - partial_slab = slub_percpu_partial(c); 4143 - c->partial = NULL; 4144 - 4145 - if (partial_slab) 4146 - __put_partials(s, partial_slab); 4147 - } 4148 - 4149 - /* 4150 - * Put a slab into a partial slab slot if available. 4151 - * 4152 - * If we did not find a slot then simply move all the partials to the 4153 - * per node partial list. 4154 - */ 4155 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 4156 - { 4157 - struct slab *oldslab; 4158 - struct slab *slab_to_put = NULL; 4159 - unsigned long flags; 4160 - int slabs = 0; 4161 - 4162 - local_lock_cpu_slab(s, flags); 4163 - 4164 - oldslab = this_cpu_read(s->cpu_slab->partial); 4165 - 4166 - if (oldslab) { 4167 - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { 4168 - /* 4169 - * Partial array is full. Move the existing set to the 4170 - * per node partial list. Postpone the actual unfreezing 4171 - * outside of the critical section. 4172 - */ 4173 - slab_to_put = oldslab; 4174 - oldslab = NULL; 4175 - } else { 4176 - slabs = oldslab->slabs; 4177 - } 4178 - } 4179 - 4180 - slabs++; 4181 - 4182 - slab->slabs = slabs; 4183 - slab->next = oldslab; 4184 - 4185 - this_cpu_write(s->cpu_slab->partial, slab); 4186 - 4187 - local_unlock_cpu_slab(s, flags); 4188 - 4189 - if (slab_to_put) { 4190 - __put_partials(s, slab_to_put); 4191 - stat(s, CPU_PARTIAL_DRAIN); 4192 - } 4193 - } 4194 - 4195 - #else /* CONFIG_SLUB_CPU_PARTIAL */ 4196 - 4197 - static inline void put_partials(struct kmem_cache *s) { } 4198 - static inline void put_partials_cpu(struct kmem_cache *s, 4199 - struct kmem_cache_cpu *c) { } 4200 - 4201 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 4202 - 4203 - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 4204 - { 4205 - unsigned long flags; 4206 - struct slab *slab; 4207 - void *freelist; 4208 - 4209 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4210 - 4211 - slab = c->slab; 4212 - freelist = c->freelist; 4213 - 4214 - c->slab = NULL; 4215 - c->freelist = NULL; 4216 - c->tid = next_tid(c->tid); 4217 - 4218 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4219 - 4220 - if (slab) { 4221 - deactivate_slab(s, slab, freelist); 4222 - stat(s, CPUSLAB_FLUSH); 4223 - } 4224 - } 4225 - 4226 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 4227 - { 4228 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4229 - void *freelist = c->freelist; 4230 - struct slab *slab = c->slab; 4231 - 4232 - c->slab = NULL; 4233 - c->freelist = NULL; 4234 - c->tid = next_tid(c->tid); 4235 - 4236 - if (slab) { 4237 - deactivate_slab(s, slab, freelist); 4238 - stat(s, CPUSLAB_FLUSH); 4239 - } 4240 - 4241 - put_partials_cpu(s, c); 4242 - } 4243 - 4244 - static inline void flush_this_cpu_slab(struct kmem_cache *s) 4245 - { 4246 - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 4247 - 4248 - if (c->slab) 4249 - flush_slab(s, c); 4250 - 4251 - put_partials(s); 4252 - } 4253 - 
4254 - static bool has_cpu_slab(int cpu, struct kmem_cache *s) 4255 - { 4256 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4257 - 4258 - return c->slab || slub_percpu_partial(c); 3916 + return get_from_any_partial(s, pc); 4259 3917 } 4260 3918 4261 3919 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4262 3920 { 4263 3921 struct slub_percpu_sheaves *pcs; 4264 3922 4265 - if (!s->cpu_sheaves) 3923 + if (!cache_has_sheaves(s)) 4266 3924 return false; 4267 3925 4268 3926 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); ··· 3876 4324 } 3877 4325 3878 4326 /* 3879 - * Flush cpu slab. 4327 + * Flush percpu sheaves 3880 4328 * 3881 4329 * Called from CPU work handler with migration disabled. 3882 4330 */ 3883 - static void flush_cpu_slab(struct work_struct *w) 4331 + static void flush_cpu_sheaves(struct work_struct *w) 3884 4332 { 3885 4333 struct kmem_cache *s; 3886 4334 struct slub_flush_work *sfw; ··· 3889 4337 3890 4338 s = sfw->s; 3891 4339 3892 - if (s->cpu_sheaves) 4340 + if (cache_has_sheaves(s)) 3893 4341 pcs_flush_all(s); 3894 - 3895 - flush_this_cpu_slab(s); 3896 4342 } 3897 4343 3898 4344 static void flush_all_cpus_locked(struct kmem_cache *s) ··· 3903 4353 3904 4354 for_each_online_cpu(cpu) { 3905 4355 sfw = &per_cpu(slub_flush, cpu); 3906 - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { 4356 + if (!has_pcs_used(cpu, s)) { 3907 4357 sfw->skip = true; 3908 4358 continue; 3909 4359 } 3910 - INIT_WORK(&sfw->work, flush_cpu_slab); 4360 + INIT_WORK(&sfw->work, flush_cpu_sheaves); 3911 4361 sfw->skip = false; 3912 4362 sfw->s = s; 3913 4363 queue_work_on(cpu, flushwq, &sfw->work); ··· 3992 4442 mutex_lock(&slab_mutex); 3993 4443 3994 4444 list_for_each_entry(s, &slab_caches, list) { 3995 - if (!s->cpu_sheaves) 4445 + if (!cache_has_sheaves(s)) 3996 4446 continue; 3997 4447 flush_rcu_sheaves_on_cache(s); 3998 4448 } ··· 4013 4463 4014 4464 mutex_lock(&slab_mutex); 4015 4465 list_for_each_entry(s, &slab_caches, list) { 4016 - __flush_cpu_slab(s, cpu); 4017 - if (s->cpu_sheaves) 4466 + if (cache_has_sheaves(s)) 4018 4467 __pcs_flush_all_cpu(s, cpu); 4019 4468 } 4020 4469 mutex_unlock(&slab_mutex); 4021 4470 return 0; 4022 - } 4023 - 4024 - /* 4025 - * Check if the objects in a per cpu structure fit numa 4026 - * locality expectations. 4027 - */ 4028 - static inline int node_match(struct slab *slab, int node) 4029 - { 4030 - #ifdef CONFIG_NUMA 4031 - if (node != NUMA_NO_NODE && slab_nid(slab) != node) 4032 - return 0; 4033 - #endif 4034 - return 1; 4035 4471 } 4036 4472 4037 4473 #ifdef CONFIG_SLUB_DEBUG ··· 4192 4656 return true; 4193 4657 } 4194 4658 4195 - static inline bool 4196 - __update_cpu_freelist_fast(struct kmem_cache *s, 4197 - void *freelist_old, void *freelist_new, 4198 - unsigned long tid) 4199 - { 4200 - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4201 - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4202 - 4203 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4204 - &old.freelist_tid, new.freelist_tid); 4205 - } 4206 - 4207 4659 /* 4208 - * Check the slab->freelist and either transfer the freelist to the 4209 - * per cpu freelist or deactivate the slab. 4660 + * Get the slab's freelist and do not freeze it. 4210 4661 * 4211 - * The slab is still frozen if the return value is not NULL. 4662 + * Assumes the slab is isolated from node partial list and not frozen. 4212 4663 * 4213 - * If this function returns NULL then the slab has been unfrozen. 
4664 + * Assumes this is performed only for caches without debugging so we 4665 + * don't need to worry about adding the slab to the full list. 4214 4666 */ 4215 - static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4216 - { 4217 - struct freelist_counters old, new; 4218 - 4219 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4220 - 4221 - do { 4222 - old.freelist = slab->freelist; 4223 - old.counters = slab->counters; 4224 - 4225 - new.freelist = NULL; 4226 - new.counters = old.counters; 4227 - 4228 - new.inuse = old.objects; 4229 - new.frozen = old.freelist != NULL; 4230 - 4231 - 4232 - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4233 - 4234 - return old.freelist; 4235 - } 4236 - 4237 - /* 4238 - * Freeze the partial slab and return the pointer to the freelist. 4239 - */ 4240 - static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4667 + static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) 4241 4668 { 4242 4669 struct freelist_counters old, new; 4243 4670 ··· 4210 4711 4211 4712 new.freelist = NULL; 4212 4713 new.counters = old.counters; 4213 - VM_BUG_ON(new.frozen); 4714 + VM_WARN_ON_ONCE(new.frozen); 4214 4715 4215 4716 new.inuse = old.objects; 4216 - new.frozen = 1; 4217 4717 4218 - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4718 + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); 4219 4719 4220 4720 return old.freelist; 4221 - } 4222 - 4223 - /* 4224 - * Slow path. The lockless freelist is empty or we need to perform 4225 - * debugging duties. 4226 - * 4227 - * Processing is still very fast if new objects have been freed to the 4228 - * regular freelist. In that case we simply take over the regular freelist 4229 - * as the lockless freelist and zap the regular freelist. 4230 - * 4231 - * If that is not working then we fall back to the partial lists. We take the 4232 - * first element of the freelist as the object to allocate now and move the 4233 - * rest of the freelist to the lockless freelist. 4234 - * 4235 - * And if we were unable to get a new slab from the partial slab lists then 4236 - * we need to allocate a new slab. This is the slowest path since it involves 4237 - * a call to the page allocator and the setup of a new slab. 4238 - * 4239 - * Version of __slab_alloc to use when we know that preemption is 4240 - * already disabled (which is the case for bulk allocation). 4241 - */ 4242 - static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4243 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4244 - { 4245 - bool allow_spin = gfpflags_allow_spinning(gfpflags); 4246 - void *freelist; 4247 - struct slab *slab; 4248 - unsigned long flags; 4249 - struct partial_context pc; 4250 - bool try_thisnode = true; 4251 - 4252 - stat(s, ALLOC_SLOWPATH); 4253 - 4254 - reread_slab: 4255 - 4256 - slab = READ_ONCE(c->slab); 4257 - if (!slab) { 4258 - /* 4259 - * if the node is not online or has no normal memory, just 4260 - * ignore the node constraint 4261 - */ 4262 - if (unlikely(node != NUMA_NO_NODE && 4263 - !node_isset(node, slab_nodes))) 4264 - node = NUMA_NO_NODE; 4265 - goto new_slab; 4266 - } 4267 - 4268 - if (unlikely(!node_match(slab, node))) { 4269 - /* 4270 - * same as above but node_match() being false already 4271 - * implies node != NUMA_NO_NODE. 4272 - * 4273 - * We don't strictly honor pfmemalloc and NUMA preferences 4274 - * when !allow_spin because: 4275 - * 4276 - * 1. 
Most kmalloc() users allocate objects on the local node, 4277 - * so kmalloc_nolock() tries not to interfere with them by 4278 - * deactivating the cpu slab. 4279 - * 4280 - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 4281 - * unnecessary slab allocations even when n->partial list 4282 - * is not empty. 4283 - */ 4284 - if (!node_isset(node, slab_nodes) || 4285 - !allow_spin) { 4286 - node = NUMA_NO_NODE; 4287 - } else { 4288 - stat(s, ALLOC_NODE_MISMATCH); 4289 - goto deactivate_slab; 4290 - } 4291 - } 4292 - 4293 - /* 4294 - * By rights, we should be searching for a slab page that was 4295 - * PFMEMALLOC but right now, we are losing the pfmemalloc 4296 - * information when the page leaves the per-cpu allocator 4297 - */ 4298 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4299 - goto deactivate_slab; 4300 - 4301 - /* must check again c->slab in case we got preempted and it changed */ 4302 - local_lock_cpu_slab(s, flags); 4303 - 4304 - if (unlikely(slab != c->slab)) { 4305 - local_unlock_cpu_slab(s, flags); 4306 - goto reread_slab; 4307 - } 4308 - freelist = c->freelist; 4309 - if (freelist) 4310 - goto load_freelist; 4311 - 4312 - freelist = get_freelist(s, slab); 4313 - 4314 - if (!freelist) { 4315 - c->slab = NULL; 4316 - c->tid = next_tid(c->tid); 4317 - local_unlock_cpu_slab(s, flags); 4318 - stat(s, DEACTIVATE_BYPASS); 4319 - goto new_slab; 4320 - } 4321 - 4322 - stat(s, ALLOC_REFILL); 4323 - 4324 - load_freelist: 4325 - 4326 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4327 - 4328 - /* 4329 - * freelist is pointing to the list of objects to be used. 4330 - * slab is pointing to the slab from which the objects are obtained. 4331 - * That slab must be frozen for per cpu allocations to work. 4332 - */ 4333 - VM_BUG_ON(!c->slab->frozen); 4334 - c->freelist = get_freepointer(s, freelist); 4335 - c->tid = next_tid(c->tid); 4336 - local_unlock_cpu_slab(s, flags); 4337 - return freelist; 4338 - 4339 - deactivate_slab: 4340 - 4341 - local_lock_cpu_slab(s, flags); 4342 - if (slab != c->slab) { 4343 - local_unlock_cpu_slab(s, flags); 4344 - goto reread_slab; 4345 - } 4346 - freelist = c->freelist; 4347 - c->slab = NULL; 4348 - c->freelist = NULL; 4349 - c->tid = next_tid(c->tid); 4350 - local_unlock_cpu_slab(s, flags); 4351 - deactivate_slab(s, slab, freelist); 4352 - 4353 - new_slab: 4354 - 4355 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4356 - while (slub_percpu_partial(c)) { 4357 - local_lock_cpu_slab(s, flags); 4358 - if (unlikely(c->slab)) { 4359 - local_unlock_cpu_slab(s, flags); 4360 - goto reread_slab; 4361 - } 4362 - if (unlikely(!slub_percpu_partial(c))) { 4363 - local_unlock_cpu_slab(s, flags); 4364 - /* we were preempted and partial list got empty */ 4365 - goto new_objects; 4366 - } 4367 - 4368 - slab = slub_percpu_partial(c); 4369 - slub_set_percpu_partial(c, slab); 4370 - 4371 - if (likely(node_match(slab, node) && 4372 - pfmemalloc_match(slab, gfpflags)) || 4373 - !allow_spin) { 4374 - c->slab = slab; 4375 - freelist = get_freelist(s, slab); 4376 - VM_BUG_ON(!freelist); 4377 - stat(s, CPU_PARTIAL_ALLOC); 4378 - goto load_freelist; 4379 - } 4380 - 4381 - local_unlock_cpu_slab(s, flags); 4382 - 4383 - slab->next = NULL; 4384 - __put_partials(s, slab); 4385 - } 4386 - #endif 4387 - 4388 - new_objects: 4389 - 4390 - pc.flags = gfpflags; 4391 - /* 4392 - * When a preferred node is indicated but no __GFP_THISNODE 4393 - * 4394 - * 1) try to get a partial slab from target node only by having 4395 - * __GFP_THISNODE in pc.flags for get_partial() 
4396 - * 2) if 1) failed, try to allocate a new slab from target node with 4397 - * GPF_NOWAIT | __GFP_THISNODE opportunistically 4398 - * 3) if 2) failed, retry with original gfpflags which will allow 4399 - * get_partial() try partial lists of other nodes before potentially 4400 - * allocating new page from other nodes 4401 - */ 4402 - if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4403 - && try_thisnode)) { 4404 - if (unlikely(!allow_spin)) 4405 - /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4406 - pc.flags = gfpflags | __GFP_THISNODE; 4407 - else 4408 - pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4409 - } 4410 - 4411 - pc.orig_size = orig_size; 4412 - slab = get_partial(s, node, &pc); 4413 - if (slab) { 4414 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4415 - freelist = pc.object; 4416 - /* 4417 - * For debug caches here we had to go through 4418 - * alloc_single_from_partial() so just store the 4419 - * tracking info and return the object. 4420 - * 4421 - * Due to disabled preemption we need to disallow 4422 - * blocking. The flags are further adjusted by 4423 - * gfp_nested_mask() in stack_depot itself. 4424 - */ 4425 - if (s->flags & SLAB_STORE_USER) 4426 - set_track(s, freelist, TRACK_ALLOC, addr, 4427 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4428 - 4429 - return freelist; 4430 - } 4431 - 4432 - freelist = freeze_slab(s, slab); 4433 - goto retry_load_slab; 4434 - } 4435 - 4436 - slub_put_cpu_ptr(s->cpu_slab); 4437 - slab = new_slab(s, pc.flags, node); 4438 - c = slub_get_cpu_ptr(s->cpu_slab); 4439 - 4440 - if (unlikely(!slab)) { 4441 - if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4442 - && try_thisnode) { 4443 - try_thisnode = false; 4444 - goto new_objects; 4445 - } 4446 - slab_out_of_memory(s, gfpflags, node); 4447 - return NULL; 4448 - } 4449 - 4450 - stat(s, ALLOC_SLAB); 4451 - 4452 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4453 - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4454 - 4455 - if (unlikely(!freelist)) { 4456 - /* This could cause an endless loop. Fail instead. */ 4457 - if (!allow_spin) 4458 - return NULL; 4459 - goto new_objects; 4460 - } 4461 - 4462 - if (s->flags & SLAB_STORE_USER) 4463 - set_track(s, freelist, TRACK_ALLOC, addr, 4464 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4465 - 4466 - return freelist; 4467 - } 4468 - 4469 - /* 4470 - * No other reference to the slab yet so we can 4471 - * muck around with it freely without cmpxchg 4472 - */ 4473 - freelist = slab->freelist; 4474 - slab->freelist = NULL; 4475 - slab->inuse = slab->objects; 4476 - slab->frozen = 1; 4477 - 4478 - inc_slabs_node(s, slab_nid(slab), slab->objects); 4479 - 4480 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4481 - /* 4482 - * For !pfmemalloc_match() case we don't load freelist so that 4483 - * we don't make further mismatched allocations easier. 
4484 - */ 4485 - deactivate_slab(s, slab, get_freepointer(s, freelist)); 4486 - return freelist; 4487 - } 4488 - 4489 - retry_load_slab: 4490 - 4491 - local_lock_cpu_slab(s, flags); 4492 - if (unlikely(c->slab)) { 4493 - void *flush_freelist = c->freelist; 4494 - struct slab *flush_slab = c->slab; 4495 - 4496 - c->slab = NULL; 4497 - c->freelist = NULL; 4498 - c->tid = next_tid(c->tid); 4499 - 4500 - local_unlock_cpu_slab(s, flags); 4501 - 4502 - if (unlikely(!allow_spin)) { 4503 - /* Reentrant slub cannot take locks, defer */ 4504 - defer_deactivate_slab(flush_slab, flush_freelist); 4505 - } else { 4506 - deactivate_slab(s, flush_slab, flush_freelist); 4507 - } 4508 - 4509 - stat(s, CPUSLAB_FLUSH); 4510 - 4511 - goto retry_load_slab; 4512 - } 4513 - c->slab = slab; 4514 - 4515 - goto load_freelist; 4516 - } 4517 - /* 4518 - * We disallow kprobes in ___slab_alloc() to prevent reentrance 4519 - * 4520 - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of 4521 - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> 4522 - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() 4523 - * manipulating c->freelist without lock. 4524 - * 4525 - * This does not prevent kprobe in functions called from ___slab_alloc() such as 4526 - * local_lock_irqsave() itself, and that is fine, we only need to protect the 4527 - * c->freelist manipulation in ___slab_alloc() itself. 4528 - */ 4529 - NOKPROBE_SYMBOL(___slab_alloc); 4530 - 4531 - /* 4532 - * A wrapper for ___slab_alloc() for contexts where preemption is not yet 4533 - * disabled. Compensates for possible cpu changes by refetching the per cpu area 4534 - * pointer. 4535 - */ 4536 - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4537 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4538 - { 4539 - void *p; 4540 - 4541 - #ifdef CONFIG_PREEMPT_COUNT 4542 - /* 4543 - * We may have been preempted and rescheduled on a different 4544 - * cpu before disabling preemption. Need to reload cpu area 4545 - * pointer. 4546 - */ 4547 - c = slub_get_cpu_ptr(s->cpu_slab); 4548 - #endif 4549 - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 4550 - if (local_lock_is_locked(&s->cpu_slab->lock)) { 4551 - /* 4552 - * EBUSY is an internal signal to kmalloc_nolock() to 4553 - * retry a different bucket. It's not propagated 4554 - * to the caller. 4555 - */ 4556 - p = ERR_PTR(-EBUSY); 4557 - goto out; 4558 - } 4559 - } 4560 - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4561 - out: 4562 - #ifdef CONFIG_PREEMPT_COUNT 4563 - slub_put_cpu_ptr(s->cpu_slab); 4564 - #endif 4565 - return p; 4566 - } 4567 - 4568 - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 4569 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4570 - { 4571 - struct kmem_cache_cpu *c; 4572 - struct slab *slab; 4573 - unsigned long tid; 4574 - void *object; 4575 - 4576 - redo: 4577 - /* 4578 - * Must read kmem_cache cpu data via this cpu ptr. Preemption is 4579 - * enabled. We may switch back and forth between cpus while 4580 - * reading from one cpu area. That does not matter as long 4581 - * as we end up on the original cpu again when doing the cmpxchg. 4582 - * 4583 - * We must guarantee that tid and kmem_cache_cpu are retrieved on the 4584 - * same cpu. We read first the kmem_cache_cpu pointer and use it to read 4585 - * the tid. 
If we are preempted and switched to another cpu between the 4586 - * two reads, it's OK as the two are still associated with the same cpu 4587 - * and cmpxchg later will validate the cpu. 4588 - */ 4589 - c = raw_cpu_ptr(s->cpu_slab); 4590 - tid = READ_ONCE(c->tid); 4591 - 4592 - /* 4593 - * Irqless object alloc/free algorithm used here depends on sequence 4594 - * of fetching cpu_slab's data. tid should be fetched before anything 4595 - * on c to guarantee that object and slab associated with previous tid 4596 - * won't be used with current tid. If we fetch tid first, object and 4597 - * slab could be one associated with next tid and our alloc/free 4598 - * request will be failed. In this case, we will retry. So, no problem. 4599 - */ 4600 - barrier(); 4601 - 4602 - /* 4603 - * The transaction ids are globally unique per cpu and per operation on 4604 - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 4605 - * occurs on the right processor and that there was no operation on the 4606 - * linked list in between. 4607 - */ 4608 - 4609 - object = c->freelist; 4610 - slab = c->slab; 4611 - 4612 - #ifdef CONFIG_NUMA 4613 - if (static_branch_unlikely(&strict_numa) && 4614 - node == NUMA_NO_NODE) { 4615 - 4616 - struct mempolicy *mpol = current->mempolicy; 4617 - 4618 - if (mpol) { 4619 - /* 4620 - * Special BIND rule support. If existing slab 4621 - * is in permitted set then do not redirect 4622 - * to a particular node. 4623 - * Otherwise we apply the memory policy to get 4624 - * the node we need to allocate on. 4625 - */ 4626 - if (mpol->mode != MPOL_BIND || !slab || 4627 - !node_isset(slab_nid(slab), mpol->nodes)) 4628 - 4629 - node = mempolicy_slab_node(); 4630 - } 4631 - } 4632 - #endif 4633 - 4634 - if (!USE_LOCKLESS_FAST_PATH() || 4635 - unlikely(!object || !slab || !node_match(slab, node))) { 4636 - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); 4637 - } else { 4638 - void *next_object = get_freepointer_safe(s, object); 4639 - 4640 - /* 4641 - * The cmpxchg will only match if there was no additional 4642 - * operation and if we are on the right processor. 4643 - * 4644 - * The cmpxchg does the following atomically (without lock 4645 - * semantics!) 4646 - * 1. Relocate first pointer to the current per cpu area. 4647 - * 2. Verify that tid and freelist have not been changed 4648 - * 3. If they were not changed replace tid and freelist 4649 - * 4650 - * Since this is without lock semantics the protection is only 4651 - * against code executing on this cpu *not* from access by 4652 - * other cpus. 4653 - */ 4654 - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { 4655 - note_cmpxchg_failure("slab_alloc", s, tid); 4656 - goto redo; 4657 - } 4658 - prefetch_freepointer(s, next_object); 4659 - stat(s, ALLOC_FASTPATH); 4660 - } 4661 - 4662 - return object; 4663 4721 } 4664 4722 4665 4723 /* ··· 4232 5176 !freeptr_outside_object(s)) 4233 5177 memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 4234 5178 0, sizeof(void *)); 5179 + } 5180 + 5181 + static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, 5182 + void **p, unsigned int count, bool allow_spin) 5183 + { 5184 + unsigned int allocated = 0; 5185 + struct kmem_cache_node *n; 5186 + bool needs_add_partial; 5187 + unsigned long flags; 5188 + void *object; 5189 + 5190 + /* 5191 + * Are we going to put the slab on the partial list? 5192 + * Note slab->inuse is 0 on a new slab. 
5193 + */ 5194 + needs_add_partial = (slab->objects > count); 5195 + 5196 + if (!allow_spin && needs_add_partial) { 5197 + 5198 + n = get_node(s, slab_nid(slab)); 5199 + 5200 + if (!spin_trylock_irqsave(&n->list_lock, flags)) { 5201 + /* Unlucky, discard newly allocated slab */ 5202 + free_new_slab_nolock(s, slab); 5203 + return 0; 5204 + } 5205 + } 5206 + 5207 + object = slab->freelist; 5208 + while (object && allocated < count) { 5209 + p[allocated] = object; 5210 + object = get_freepointer(s, object); 5211 + maybe_wipe_obj_freeptr(s, p[allocated]); 5212 + 5213 + slab->inuse++; 5214 + allocated++; 5215 + } 5216 + slab->freelist = object; 5217 + 5218 + if (needs_add_partial) { 5219 + 5220 + if (allow_spin) { 5221 + n = get_node(s, slab_nid(slab)); 5222 + spin_lock_irqsave(&n->list_lock, flags); 5223 + } 5224 + add_partial(n, slab, ADD_TO_HEAD); 5225 + spin_unlock_irqrestore(&n->list_lock, flags); 5226 + } 5227 + 5228 + inc_slabs_node(s, slab_nid(slab), slab->objects); 5229 + return allocated; 5230 + } 5231 + 5232 + /* 5233 + * Slow path. We failed to allocate via percpu sheaves or they are not available 5234 + * due to bootstrap or debugging enabled or SLUB_TINY. 5235 + * 5236 + * We try to allocate from partial slab lists and fall back to allocating a new 5237 + * slab. 5238 + */ 5239 + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 5240 + unsigned long addr, unsigned int orig_size) 5241 + { 5242 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 5243 + void *object; 5244 + struct slab *slab; 5245 + struct partial_context pc; 5246 + bool try_thisnode = true; 5247 + 5248 + stat(s, ALLOC_SLOWPATH); 5249 + 5250 + new_objects: 5251 + 5252 + pc.flags = gfpflags; 5253 + /* 5254 + * When a preferred node is indicated but no __GFP_THISNODE 5255 + * 5256 + * 1) try to get a partial slab from target node only by having 5257 + * __GFP_THISNODE in pc.flags for get_from_partial() 5258 + * 2) if 1) failed, try to allocate a new slab from target node with 5259 + * GPF_NOWAIT | __GFP_THISNODE opportunistically 5260 + * 3) if 2) failed, retry with original gfpflags which will allow 5261 + * get_from_partial() try partial lists of other nodes before 5262 + * potentially allocating new page from other nodes 5263 + */ 5264 + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5265 + && try_thisnode)) { 5266 + if (unlikely(!allow_spin)) 5267 + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 5268 + pc.flags = gfpflags | __GFP_THISNODE; 5269 + else 5270 + pc.flags = GFP_NOWAIT | __GFP_THISNODE; 5271 + } 5272 + 5273 + pc.orig_size = orig_size; 5274 + object = get_from_partial(s, node, &pc); 5275 + if (object) 5276 + goto success; 5277 + 5278 + slab = new_slab(s, pc.flags, node); 5279 + 5280 + if (unlikely(!slab)) { 5281 + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5282 + && try_thisnode) { 5283 + try_thisnode = false; 5284 + goto new_objects; 5285 + } 5286 + slab_out_of_memory(s, gfpflags, node); 5287 + return NULL; 5288 + } 5289 + 5290 + stat(s, ALLOC_SLAB); 5291 + 5292 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5293 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 5294 + 5295 + if (likely(object)) 5296 + goto success; 5297 + } else { 5298 + alloc_from_new_slab(s, slab, &object, 1, allow_spin); 5299 + 5300 + /* we don't need to check SLAB_STORE_USER here */ 5301 + if (likely(object)) 5302 + return object; 5303 + } 5304 + 5305 + if (allow_spin) 5306 + goto new_objects; 5307 + 5308 + /* This could 
cause an endless loop. Fail instead. */ 5309 + return NULL; 5310 + 5311 + success: 5312 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) 5313 + set_track(s, object, TRACK_ALLOC, addr, gfpflags); 5314 + 5315 + return object; 5316 + } 5317 + 5318 + static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 5319 + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 5320 + { 5321 + void *object; 5322 + 5323 + #ifdef CONFIG_NUMA 5324 + if (static_branch_unlikely(&strict_numa) && 5325 + node == NUMA_NO_NODE) { 5326 + 5327 + struct mempolicy *mpol = current->mempolicy; 5328 + 5329 + if (mpol) { 5330 + /* 5331 + * Special BIND rule support. If the local node 5332 + * is in permitted set then do not redirect 5333 + * to a particular node. 5334 + * Otherwise we apply the memory policy to get 5335 + * the node we need to allocate on. 5336 + */ 5337 + if (mpol->mode != MPOL_BIND || 5338 + !node_isset(numa_mem_id(), mpol->nodes)) 5339 + node = mempolicy_slab_node(); 5340 + } 5341 + } 5342 + #endif 5343 + 5344 + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); 5345 + 5346 + return object; 4235 5347 } 4236 5348 4237 5349 static __fastpath_inline ··· 4488 5264 4489 5265 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4490 5266 5267 + /* Bootstrap or debug cache, back off */ 5268 + if (unlikely(!cache_has_sheaves(s))) { 5269 + local_unlock(&s->cpu_sheaves->lock); 5270 + return NULL; 5271 + } 5272 + 4491 5273 if (pcs->spare && pcs->spare->size > 0) { 4492 5274 swap(pcs->main, pcs->spare); 4493 5275 return pcs; ··· 4505 5275 return NULL; 4506 5276 } 4507 5277 4508 - full = barn_replace_empty_sheaf(barn, pcs->main); 5278 + full = barn_replace_empty_sheaf(barn, pcs->main, 5279 + gfpflags_allow_spinning(gfp)); 4509 5280 4510 5281 if (full) { 4511 5282 stat(s, BARN_GET); ··· 4523 5292 empty = pcs->spare; 4524 5293 pcs->spare = NULL; 4525 5294 } else { 4526 - empty = barn_get_empty_sheaf(barn); 5295 + empty = barn_get_empty_sheaf(barn, true); 4527 5296 } 4528 5297 } 4529 5298 ··· 4565 5334 */ 4566 5335 4567 5336 if (pcs->main->size == 0) { 4568 - barn_put_empty_sheaf(barn, pcs->main); 5337 + if (!pcs->spare) 5338 + pcs->spare = pcs->main; 5339 + else 5340 + barn_put_empty_sheaf(barn, pcs->main); 4569 5341 pcs->main = full; 4570 5342 return pcs; 4571 5343 } ··· 4625 5391 * We assume the percpu sheaves contain only local objects although it's 4626 5392 * not completely guaranteed, so we verify later. 
4627 5393 */ 4628 - if (unlikely(node_requested && node != numa_mem_id())) 5394 + if (unlikely(node_requested && node != numa_mem_id())) { 5395 + stat(s, ALLOC_NODE_MISMATCH); 4629 5396 return NULL; 5397 + } 4630 5398 4631 5399 if (!local_trylock(&s->cpu_sheaves->lock)) 4632 5400 return NULL; ··· 4651 5415 */ 4652 5416 if (page_to_nid(virt_to_page(object)) != node) { 4653 5417 local_unlock(&s->cpu_sheaves->lock); 5418 + stat(s, ALLOC_NODE_MISMATCH); 4654 5419 return NULL; 4655 5420 } 4656 5421 } ··· 4660 5423 4661 5424 local_unlock(&s->cpu_sheaves->lock); 4662 5425 4663 - stat(s, ALLOC_PCS); 5426 + stat(s, ALLOC_FASTPATH); 4664 5427 4665 5428 return object; 4666 5429 } 4667 5430 4668 5431 static __fastpath_inline 4669 - unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 5432 + unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, 5433 + void **p) 4670 5434 { 4671 5435 struct slub_percpu_sheaves *pcs; 4672 5436 struct slab_sheaf *main; ··· 4685 5447 struct slab_sheaf *full; 4686 5448 struct node_barn *barn; 4687 5449 5450 + if (unlikely(!cache_has_sheaves(s))) { 5451 + local_unlock(&s->cpu_sheaves->lock); 5452 + return allocated; 5453 + } 5454 + 4688 5455 if (pcs->spare && pcs->spare->size > 0) { 4689 5456 swap(pcs->main, pcs->spare); 4690 5457 goto do_alloc; ··· 4701 5458 return allocated; 4702 5459 } 4703 5460 4704 - full = barn_replace_empty_sheaf(barn, pcs->main); 5461 + full = barn_replace_empty_sheaf(barn, pcs->main, 5462 + gfpflags_allow_spinning(gfp)); 4705 5463 4706 5464 if (full) { 4707 5465 stat(s, BARN_GET); ··· 4732 5488 4733 5489 local_unlock(&s->cpu_sheaves->lock); 4734 5490 4735 - stat_add(s, ALLOC_PCS, batch); 5491 + stat_add(s, ALLOC_FASTPATH, batch); 4736 5492 4737 5493 allocated += batch; 4738 5494 ··· 4770 5526 if (unlikely(object)) 4771 5527 goto out; 4772 5528 4773 - if (s->cpu_sheaves) 4774 - object = alloc_from_pcs(s, gfpflags, node); 5529 + object = alloc_from_pcs(s, gfpflags, node); 4775 5530 4776 5531 if (!object) 4777 5532 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); ··· 4865 5622 return ret; 4866 5623 } 4867 5624 5625 + static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 5626 + size_t size, void **p); 5627 + 4868 5628 /* 4869 5629 * returns a sheaf that has at least the requested size 4870 5630 * when prefilling is needed, do so with given gfp flags ··· 4881 5635 struct slab_sheaf *sheaf = NULL; 4882 5636 struct node_barn *barn; 4883 5637 4884 - if (unlikely(size > s->sheaf_capacity)) { 5638 + if (unlikely(!size)) 5639 + return NULL; 4885 5640 4886 - /* 4887 - * slab_debug disables cpu sheaves intentionally so all 4888 - * prefilled sheaves become "oversize" and we give up on 4889 - * performance for the debugging. Same with SLUB_TINY. 4890 - * Creating a cache without sheaves and then requesting a 4891 - * prefilled sheaf is however not expected, so warn. 
4892 - */ 4893 - WARN_ON_ONCE(s->sheaf_capacity == 0 && 4894 - !IS_ENABLED(CONFIG_SLUB_TINY) && 4895 - !(s->flags & SLAB_DEBUG_FLAGS)); 5641 + if (unlikely(size > s->sheaf_capacity)) { 4896 5642 4897 5643 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); 4898 5644 if (!sheaf) ··· 5206 5968 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; 5207 5969 struct kmem_cache *s; 5208 5970 bool can_retry = true; 5209 - void *ret = ERR_PTR(-EBUSY); 5971 + void *ret; 5210 5972 5211 5973 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | 5212 5974 __GFP_NO_OBJ_EXT)); ··· 5214 5976 if (unlikely(!size)) 5215 5977 return ZERO_SIZE_PTR; 5216 5978 5217 - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) 5218 - /* 5219 - * kmalloc_nolock() in PREEMPT_RT is not supported from 5220 - * non-preemptible context because local_lock becomes a 5221 - * sleeping lock on RT. 5222 - */ 5979 + /* 5980 + * See the comment for the same check in 5981 + * alloc_frozen_pages_nolock_noprof() 5982 + */ 5983 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 5223 5984 return NULL; 5985 + 5224 5986 retry: 5225 5987 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 5226 5988 return NULL; ··· 5229 5991 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) 5230 5992 /* 5231 5993 * kmalloc_nolock() is not supported on architectures that 5232 - * don't implement cmpxchg16b, but debug caches don't use 5233 - * per-cpu slab and per-cpu partial slabs. They rely on 5234 - * kmem_cache_node->list_lock, so kmalloc_nolock() can 5235 - * attempt to allocate from debug caches by 5994 + * don't implement cmpxchg16b and thus need slab_lock() 5995 + * which could be preempted by a nmi. 5996 + * But debug caches don't use that and only rely on 5997 + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt 5998 + * to allocate from debug caches by 5236 5999 * spin_trylock_irqsave(&n->list_lock, ...) 5237 6000 */ 5238 6001 return NULL; 6002 + 6003 + ret = alloc_from_pcs(s, alloc_gfp, node); 6004 + if (ret) 6005 + goto success; 5239 6006 5240 6007 /* 5241 6008 * Do not call slab_alloc_node(), since trylock mode isn't 5242 6009 * compatible with slab_pre_alloc_hook/should_failslab and 5243 6010 * kfence_alloc. Hence call __slab_alloc_node() (at most twice) 5244 6011 * and slab_post_alloc_hook() directly. 5245 - * 5246 - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair 5247 - * in irq saved region. It assumes that the same cpu will not 5248 - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. 5249 - * Therefore use in_nmi() to check whether particular bucket is in 5250 - * irq protected section. 5251 - * 5252 - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that 5253 - * this cpu was interrupted somewhere inside ___slab_alloc() after 5254 - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5255 - * In this case fast path with __update_cpu_freelist_fast() is not safe. 5256 6012 */ 5257 - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5258 - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 6013 + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5259 6014 5260 - if (PTR_ERR(ret) == -EBUSY) { 5261 - if (can_retry) { 5262 - /* pick the next kmalloc bucket */ 5263 - size = s->object_size + 1; 5264 - /* 5265 - * Another alternative is to 5266 - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5267 - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5268 - * to retry from bucket of the same size. 
5269 - */ 5270 - can_retry = false; 5271 - goto retry; 5272 - } 5273 - ret = NULL; 6015 + /* 6016 + * It's possible we failed due to trylock as we preempted someone with 6017 + * the sheaves locked, and the list_lock is also held by another cpu. 6018 + * But it should be rare that multiple kmalloc buckets would have 6019 + * sheaves locked, so try a larger one. 6020 + */ 6021 + if (!ret && can_retry) { 6022 + /* pick the next kmalloc bucket */ 6023 + size = s->object_size + 1; 6024 + /* 6025 + * Another alternative is to 6026 + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 6027 + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 6028 + * to retry from bucket of the same size. 6029 + */ 6030 + can_retry = false; 6031 + goto retry; 5274 6032 } 5275 6033 6034 + success: 5276 6035 maybe_wipe_obj_freeptr(s, ret); 5277 6036 slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, 5278 6037 slab_want_init_on_alloc(alloc_gfp, s), size); ··· 5351 6116 /* was on full list */ 5352 6117 remove_full(s, n, slab); 5353 6118 if (!slab_free) { 5354 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 6119 + add_partial(n, slab, ADD_TO_TAIL); 5355 6120 stat(s, FREE_ADD_PARTIAL); 5356 6121 } 5357 6122 } else if (slab_free) { ··· 5389 6154 unsigned long addr) 5390 6155 5391 6156 { 5392 - bool was_frozen, was_full; 6157 + bool was_full; 5393 6158 struct freelist_counters old, new; 5394 6159 struct kmem_cache_node *n = NULL; 5395 6160 unsigned long flags; 5396 6161 bool on_node_partial; 5397 6162 5398 - stat(s, FREE_SLOWPATH); 5399 - 5400 6163 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5401 6164 free_to_partial_list(s, slab, head, tail, cnt, addr); 5402 6165 return; 5403 6166 } 5404 - 5405 - /* 5406 - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5407 - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5408 - * is the only other reason it can be false, and it is already handled 5409 - * above. 5410 - */ 5411 6167 5412 6168 do { 5413 6169 if (unlikely(n)) { ··· 5410 6184 old.counters = slab->counters; 5411 6185 5412 6186 was_full = (old.freelist == NULL); 5413 - was_frozen = old.frozen; 5414 6187 5415 6188 set_freepointer(s, tail, old.freelist); 5416 6189 ··· 5422 6197 * to (due to not being full anymore) the partial list. 5423 6198 * Unless it's frozen. 5424 6199 */ 5425 - if ((!new.inuse || was_full) && !was_frozen) { 6200 + if (!new.inuse || was_full) { 6201 + 6202 + n = get_node(s, slab_nid(slab)); 5426 6203 /* 5427 - * If slab becomes non-full and we have cpu partial 5428 - * lists, we put it there unconditionally to avoid 5429 - * taking the list_lock. Otherwise we need it. 6204 + * Speculatively acquire the list_lock. 6205 + * If the cmpxchg does not succeed then we may 6206 + * drop the list_lock without any processing. 6207 + * 6208 + * Otherwise the list_lock will synchronize with 6209 + * other processors updating the list of slabs. 5430 6210 */ 5431 - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 6211 + spin_lock_irqsave(&n->list_lock, flags); 5432 6212 5433 - n = get_node(s, slab_nid(slab)); 5434 - /* 5435 - * Speculatively acquire the list_lock. 5436 - * If the cmpxchg does not succeed then we may 5437 - * drop the list_lock without any processing. 5438 - * 5439 - * Otherwise the list_lock will synchronize with 5440 - * other processors updating the list of slabs. 
5441 - */ 5442 - spin_lock_irqsave(&n->list_lock, flags); 5443 - 5444 - on_node_partial = slab_test_node_partial(slab); 5445 - } 6213 + on_node_partial = slab_test_node_partial(slab); 5446 6214 } 5447 6215 5448 6216 } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5449 6217 5450 6218 if (likely(!n)) { 5451 - 5452 - if (likely(was_frozen)) { 5453 - /* 5454 - * The list lock was not taken therefore no list 5455 - * activity can be necessary. 5456 - */ 5457 - stat(s, FREE_FROZEN); 5458 - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5459 - /* 5460 - * If we started with a full slab then put it onto the 5461 - * per cpu partial list. 5462 - */ 5463 - put_cpu_partial(s, slab, 1); 5464 - stat(s, CPU_PARTIAL_FREE); 5465 - } 5466 - 5467 6219 /* 5468 - * In other cases we didn't take the list_lock because the slab 5469 - * was already on the partial list and will remain there. 6220 + * We didn't take the list_lock because the slab was already on 6221 + * the partial list and will remain there. 5470 6222 */ 5471 - 5472 6223 return; 5473 6224 } 5474 6225 ··· 5466 6265 5467 6266 /* 5468 6267 * Objects left in the slab. If it was not on the partial list before 5469 - * then add it. This can only happen when cache has no per cpu partial 5470 - * list otherwise we would have put it there. 6268 + * then add it. 5471 6269 */ 5472 - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5473 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 6270 + if (unlikely(was_full)) { 6271 + add_partial(n, slab, ADD_TO_TAIL); 5474 6272 stat(s, FREE_ADD_PARTIAL); 5475 6273 } 5476 6274 spin_unlock_irqrestore(&n->list_lock, flags); ··· 5555 6355 * unlocked. 5556 6356 */ 5557 6357 static struct slub_percpu_sheaves * 5558 - __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) 6358 + __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, 6359 + bool allow_spin) 5559 6360 { 5560 6361 struct slab_sheaf *empty; 5561 6362 struct node_barn *barn; ··· 5564 6363 5565 6364 restart: 5566 6365 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 6366 + 6367 + /* Bootstrap or debug cache, back off */ 6368 + if (unlikely(!cache_has_sheaves(s))) { 6369 + local_unlock(&s->cpu_sheaves->lock); 6370 + return NULL; 6371 + } 5567 6372 5568 6373 barn = get_barn(s); 5569 6374 if (!barn) { ··· 5580 6373 put_fail = false; 5581 6374 5582 6375 if (!pcs->spare) { 5583 - empty = barn_get_empty_sheaf(barn); 6376 + empty = barn_get_empty_sheaf(barn, allow_spin); 5584 6377 if (empty) { 5585 6378 pcs->spare = pcs->main; 5586 6379 pcs->main = empty; ··· 5594 6387 return pcs; 5595 6388 } 5596 6389 5597 - empty = barn_replace_full_sheaf(barn, pcs->main); 6390 + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); 5598 6391 5599 6392 if (!IS_ERR(empty)) { 5600 6393 stat(s, BARN_PUT); ··· 5602 6395 return pcs; 5603 6396 } 5604 6397 5605 - if (PTR_ERR(empty) == -E2BIG) { 6398 + /* sheaf_flush_unused() doesn't support !allow_spin */ 6399 + if (PTR_ERR(empty) == -E2BIG && allow_spin) { 5606 6400 /* Since we got here, spare exists and is full */ 5607 6401 struct slab_sheaf *to_flush = pcs->spare; 5608 6402 ··· 5627 6419 5628 6420 alloc_empty: 5629 6421 local_unlock(&s->cpu_sheaves->lock); 6422 + 6423 + /* 6424 + * alloc_empty_sheaf() doesn't support !allow_spin and it's 6425 + * easier to fall back to freeing directly without sheaves 6426 + * than add the support (and to sheaf_flush_unused() above) 6427 + */ 6428 + if (!allow_spin) 6429 + return NULL; 5630 6430 
5631 6431 empty = alloc_empty_sheaf(s, GFP_NOWAIT); 5632 6432 if (empty) ··· 5678 6462 * The object is expected to have passed slab_free_hook() already. 5679 6463 */ 5680 6464 static __fastpath_inline 5681 - bool free_to_pcs(struct kmem_cache *s, void *object) 6465 + bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) 5682 6466 { 5683 6467 struct slub_percpu_sheaves *pcs; 5684 6468 ··· 5689 6473 5690 6474 if (unlikely(pcs->main->size == s->sheaf_capacity)) { 5691 6475 5692 - pcs = __pcs_replace_full_main(s, pcs); 6476 + pcs = __pcs_replace_full_main(s, pcs, allow_spin); 5693 6477 if (unlikely(!pcs)) 5694 6478 return false; 5695 6479 } ··· 5698 6482 5699 6483 local_unlock(&s->cpu_sheaves->lock); 5700 6484 5701 - stat(s, FREE_PCS); 6485 + stat(s, FREE_FASTPATH); 5702 6486 5703 6487 return true; 5704 6488 } ··· 5796 6580 struct slab_sheaf *empty; 5797 6581 struct node_barn *barn; 5798 6582 6583 + /* Bootstrap or debug cache, fall back */ 6584 + if (unlikely(!cache_has_sheaves(s))) { 6585 + local_unlock(&s->cpu_sheaves->lock); 6586 + goto fail; 6587 + } 6588 + 5799 6589 if (pcs->spare && pcs->spare->size == 0) { 5800 6590 pcs->rcu_free = pcs->spare; 5801 6591 pcs->spare = NULL; ··· 5814 6592 goto fail; 5815 6593 } 5816 6594 5817 - empty = barn_get_empty_sheaf(barn); 6595 + empty = barn_get_empty_sheaf(barn, true); 5818 6596 5819 6597 if (empty) { 5820 6598 pcs->rcu_free = empty; ··· 5934 6712 goto no_empty; 5935 6713 5936 6714 if (!pcs->spare) { 5937 - empty = barn_get_empty_sheaf(barn); 6715 + empty = barn_get_empty_sheaf(barn, true); 5938 6716 if (!empty) 5939 6717 goto no_empty; 5940 6718 ··· 5948 6726 goto do_free; 5949 6727 } 5950 6728 5951 - empty = barn_replace_full_sheaf(barn, pcs->main); 6729 + empty = barn_replace_full_sheaf(barn, pcs->main, true); 5952 6730 if (IS_ERR(empty)) { 5953 6731 stat(s, BARN_PUT_FAIL); 5954 6732 goto no_empty; ··· 5966 6744 5967 6745 local_unlock(&s->cpu_sheaves->lock); 5968 6746 5969 - stat_add(s, FREE_PCS, batch); 6747 + stat_add(s, FREE_FASTPATH, batch); 5970 6748 5971 6749 if (batch < size) { 5972 6750 p += batch; ··· 5988 6766 */ 5989 6767 fallback: 5990 6768 __kmem_cache_free_bulk(s, size, p); 6769 + stat_add(s, FREE_SLOWPATH, size); 5991 6770 5992 6771 flush_remote: 5993 6772 if (remote_nr) { 5994 6773 __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 6774 + stat_add(s, FREE_SLOWPATH, remote_nr); 5995 6775 if (i < size) { 5996 6776 remote_nr = 0; 5997 6777 goto next_remote_batch; ··· 6003 6779 6004 6780 struct defer_free { 6005 6781 struct llist_head objects; 6006 - struct llist_head slabs; 6007 6782 struct irq_work work; 6008 6783 }; 6009 6784 ··· 6010 6787 6011 6788 static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { 6012 6789 .objects = LLIST_HEAD_INIT(objects), 6013 - .slabs = LLIST_HEAD_INIT(slabs), 6014 6790 .work = IRQ_WORK_INIT(free_deferred_objects), 6015 6791 }; 6016 6792 6017 6793 /* 6018 6794 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe 6019 - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). 6795 + * to take sleeping spin_locks from __slab_free(). 6020 6796 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 
6021 6797 */ 6022 6798 static void free_deferred_objects(struct irq_work *work) 6023 6799 { 6024 6800 struct defer_free *df = container_of(work, struct defer_free, work); 6025 6801 struct llist_head *objs = &df->objects; 6026 - struct llist_head *slabs = &df->slabs; 6027 6802 struct llist_node *llnode, *pos, *t; 6028 6803 6029 - if (llist_empty(objs) && llist_empty(slabs)) 6804 + if (llist_empty(objs)) 6030 6805 return; 6031 6806 6032 6807 llnode = llist_del_all(objs); ··· 6047 6826 set_freepointer(s, x, NULL); 6048 6827 6049 6828 __slab_free(s, slab, x, x, 1, _THIS_IP_); 6050 - } 6051 - 6052 - llnode = llist_del_all(slabs); 6053 - llist_for_each_safe(pos, t, llnode) { 6054 - struct slab *slab = container_of(pos, struct slab, llnode); 6055 - 6056 - if (slab->frozen) 6057 - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6058 - else 6059 - free_slab(slab->slab_cache, slab); 6829 + stat(s, FREE_SLOWPATH); 6060 6830 } 6061 6831 } 6062 6832 ··· 6064 6852 irq_work_queue(&df->work); 6065 6853 } 6066 6854 6067 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) 6068 - { 6069 - struct defer_free *df; 6070 - 6071 - slab->flush_freelist = flush_freelist; 6072 - 6073 - guard(preempt)(); 6074 - 6075 - df = this_cpu_ptr(&defer_free_objects); 6076 - if (llist_add(&slab->llnode, &df->slabs)) 6077 - irq_work_queue(&df->work); 6078 - } 6079 - 6080 6855 void defer_free_barrier(void) 6081 6856 { 6082 6857 int cpu; 6083 6858 6084 6859 for_each_possible_cpu(cpu) 6085 6860 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6086 - } 6087 - 6088 - /* 6089 - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6090 - * can perform fastpath freeing without additional function calls. 6091 - * 6092 - * The fastpath is only possible if we are freeing to the current cpu slab 6093 - * of this processor. This typically the case if we have just allocated 6094 - * the item before. 6095 - * 6096 - * If fastpath is not possible then fall back to __slab_free where we deal 6097 - * with all sorts of special processing. 6098 - * 6099 - * Bulk free of a freelist with several objects (all pointing to the 6100 - * same slab) possible by specifying head and tail ptr, plus objects 6101 - * count (cnt). Bulk free indicated by tail pointer being set. 6102 - */ 6103 - static __always_inline void do_slab_free(struct kmem_cache *s, 6104 - struct slab *slab, void *head, void *tail, 6105 - int cnt, unsigned long addr) 6106 - { 6107 - /* cnt == 0 signals that it's called from kfree_nolock() */ 6108 - bool allow_spin = cnt; 6109 - struct kmem_cache_cpu *c; 6110 - unsigned long tid; 6111 - void **freelist; 6112 - 6113 - redo: 6114 - /* 6115 - * Determine the currently cpus per cpu slab. 6116 - * The cpu may change afterward. However that does not matter since 6117 - * data is retrieved via this pointer. If we are on the same cpu 6118 - * during the cmpxchg then the free will succeed. 6119 - */ 6120 - c = raw_cpu_ptr(s->cpu_slab); 6121 - tid = READ_ONCE(c->tid); 6122 - 6123 - /* Same with comment on barrier() in __slab_alloc_node() */ 6124 - barrier(); 6125 - 6126 - if (unlikely(slab != c->slab)) { 6127 - if (unlikely(!allow_spin)) { 6128 - /* 6129 - * __slab_free() can locklessly cmpxchg16 into a slab, 6130 - * but then it might need to take spin_lock or local_lock 6131 - * in put_cpu_partial() for further processing. 6132 - * Avoid the complexity and simply add to a deferred list. 
6133 - */ 6134 - defer_free(s, head); 6135 - } else { 6136 - __slab_free(s, slab, head, tail, cnt, addr); 6137 - } 6138 - return; 6139 - } 6140 - 6141 - if (unlikely(!allow_spin)) { 6142 - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && 6143 - local_lock_is_locked(&s->cpu_slab->lock)) { 6144 - defer_free(s, head); 6145 - return; 6146 - } 6147 - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ 6148 - } 6149 - 6150 - if (USE_LOCKLESS_FAST_PATH()) { 6151 - freelist = READ_ONCE(c->freelist); 6152 - 6153 - set_freepointer(s, tail, freelist); 6154 - 6155 - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { 6156 - note_cmpxchg_failure("slab_free", s, tid); 6157 - goto redo; 6158 - } 6159 - } else { 6160 - __maybe_unused unsigned long flags = 0; 6161 - 6162 - /* Update the free list under the local lock */ 6163 - local_lock_cpu_slab(s, flags); 6164 - c = this_cpu_ptr(s->cpu_slab); 6165 - if (unlikely(slab != c->slab)) { 6166 - local_unlock_cpu_slab(s, flags); 6167 - goto redo; 6168 - } 6169 - tid = c->tid; 6170 - freelist = c->freelist; 6171 - 6172 - set_freepointer(s, tail, freelist); 6173 - c->freelist = head; 6174 - c->tid = next_tid(tid); 6175 - 6176 - local_unlock_cpu_slab(s, flags); 6177 - } 6178 - stat_add(s, FREE_FASTPATH, cnt); 6179 6861 } 6180 6862 6181 6863 static __fastpath_inline ··· 6082 6976 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6083 6977 return; 6084 6978 6085 - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6086 - slab_nid(slab) == numa_mem_id()) 6087 - && likely(!slab_test_pfmemalloc(slab))) { 6088 - if (likely(free_to_pcs(s, object))) 6979 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) 6980 + && likely(!slab_test_pfmemalloc(slab))) { 6981 + if (likely(free_to_pcs(s, object, true))) 6089 6982 return; 6090 6983 } 6091 6984 6092 - do_slab_free(s, slab, object, object, 1, addr); 6985 + __slab_free(s, slab, object, object, 1, addr); 6986 + stat(s, FREE_SLOWPATH); 6093 6987 } 6094 6988 6095 6989 #ifdef CONFIG_MEMCG ··· 6098 6992 void memcg_alloc_abort_single(struct kmem_cache *s, void *object) 6099 6993 { 6100 6994 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6101 - do_slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); 6995 + __slab_free(s, virt_to_slab(object), object, object, 1, _RET_IP_); 6102 6996 } 6103 6997 #endif 6104 6998 ··· 6112 7006 * With KASAN enabled slab_free_freelist_hook modifies the freelist 6113 7007 * to remove objects, whose reuse must be delayed. 
6114 7008 */ 6115 - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 6116 - do_slab_free(s, slab, head, tail, cnt, addr); 7009 + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { 7010 + __slab_free(s, slab, head, tail, cnt, addr); 7011 + stat_add(s, FREE_SLOWPATH, cnt); 7012 + } 6117 7013 } 6118 7014 6119 7015 #ifdef CONFIG_SLUB_RCU_DEBUG ··· 6140 7032 return; 6141 7033 6142 7034 /* resume freeing */ 6143 - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) 6144 - do_slab_free(s, slab, object, object, 1, _THIS_IP_); 7035 + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { 7036 + __slab_free(s, slab, object, object, 1, _THIS_IP_); 7037 + stat(s, FREE_SLOWPATH); 7038 + } 6145 7039 } 6146 7040 #endif /* CONFIG_SLUB_RCU_DEBUG */ 6147 7041 6148 7042 #ifdef CONFIG_KASAN_GENERIC 6149 7043 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 6150 7044 { 6151 - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); 7045 + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); 7046 + stat(cache, FREE_SLOWPATH); 6152 7047 } 6153 7048 #endif 6154 7049 ··· 6451 7340 * since kasan quarantine takes locks and not supported from NMI. 6452 7341 */ 6453 7342 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6454 - do_slab_free(s, slab, x, x, 0, _RET_IP_); 7343 + 7344 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { 7345 + if (likely(free_to_pcs(s, x, false))) 7346 + return; 7347 + } 7348 + 7349 + /* 7350 + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might 7351 + * need to take spin_lock for further processing. 7352 + * Avoid the complexity and simply add to a deferred list. 7353 + */ 7354 + defer_free(s, x); 6455 7355 } 6456 7356 EXPORT_SYMBOL_GPL(kfree_nolock); 6457 7357 ··· 6888 7766 if (kfence_free(df.freelist)) 6889 7767 continue; 6890 7768 6891 - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 7769 + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 6892 7770 _RET_IP_); 6893 7771 } while (likely(size)); 6894 7772 } ··· 6903 7781 * freeing to sheaves is so incompatible with the detached freelist so 6904 7782 * once we go that way, we have to do everything differently 6905 7783 */ 6906 - if (s && s->cpu_sheaves) { 7784 + if (s && cache_has_sheaves(s)) { 6907 7785 free_to_pcs_bulk(s, size, p); 6908 7786 return; 6909 7787 } ··· 6921 7799 } 6922 7800 EXPORT_SYMBOL(kmem_cache_free_bulk); 6923 7801 7802 + static unsigned int 7803 + __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7804 + unsigned int max, struct kmem_cache_node *n, 7805 + bool allow_spin) 7806 + { 7807 + struct partial_bulk_context pc; 7808 + struct slab *slab, *slab2; 7809 + unsigned int refilled = 0; 7810 + unsigned long flags; 7811 + void *object; 7812 + 7813 + pc.flags = gfp; 7814 + pc.min_objects = min; 7815 + pc.max_objects = max; 7816 + 7817 + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) 7818 + return 0; 7819 + 7820 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7821 + 7822 + list_del(&slab->slab_list); 7823 + 7824 + object = get_freelist_nofreeze(s, slab); 7825 + 7826 + while (object && refilled < max) { 7827 + p[refilled] = object; 7828 + object = get_freepointer(s, object); 7829 + maybe_wipe_obj_freeptr(s, p[refilled]); 7830 + 7831 + refilled++; 7832 + } 7833 + 7834 + /* 7835 + * Freelist had more objects than we can accommodate, we need to 7836 + * free them back. 
We can treat it like a detached freelist, just 7837 + * need to find the tail object. 7838 + */ 7839 + if (unlikely(object)) { 7840 + void *head = object; 7841 + void *tail; 7842 + int cnt = 0; 7843 + 7844 + do { 7845 + tail = object; 7846 + cnt++; 7847 + object = get_freepointer(s, object); 7848 + } while (object); 7849 + __slab_free(s, slab, head, tail, cnt, _RET_IP_); 7850 + } 7851 + 7852 + if (refilled >= max) 7853 + break; 7854 + } 7855 + 7856 + if (unlikely(!list_empty(&pc.slabs))) { 7857 + spin_lock_irqsave(&n->list_lock, flags); 7858 + 7859 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7860 + 7861 + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) 7862 + continue; 7863 + 7864 + list_del(&slab->slab_list); 7865 + add_partial(n, slab, ADD_TO_HEAD); 7866 + } 7867 + 7868 + spin_unlock_irqrestore(&n->list_lock, flags); 7869 + 7870 + /* any slabs left are completely free and for discard */ 7871 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7872 + 7873 + list_del(&slab->slab_list); 7874 + discard_slab(s, slab); 7875 + } 7876 + } 7877 + 7878 + return refilled; 7879 + } 7880 + 7881 + #ifdef CONFIG_NUMA 7882 + static unsigned int 7883 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7884 + unsigned int max) 7885 + { 7886 + struct zonelist *zonelist; 7887 + struct zoneref *z; 7888 + struct zone *zone; 7889 + enum zone_type highest_zoneidx = gfp_zone(gfp); 7890 + unsigned int cpuset_mems_cookie; 7891 + unsigned int refilled = 0; 7892 + 7893 + /* see get_from_any_partial() for the defrag ratio description */ 7894 + if (!s->remote_node_defrag_ratio || 7895 + get_cycles() % 1024 > s->remote_node_defrag_ratio) 7896 + return 0; 7897 + 7898 + do { 7899 + cpuset_mems_cookie = read_mems_allowed_begin(); 7900 + zonelist = node_zonelist(mempolicy_slab_node(), gfp); 7901 + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { 7902 + struct kmem_cache_node *n; 7903 + unsigned int r; 7904 + 7905 + n = get_node(s, zone_to_nid(zone)); 7906 + 7907 + if (!n || !cpuset_zone_allowed(zone, gfp) || 7908 + n->nr_partial <= s->min_partial) 7909 + continue; 7910 + 7911 + r = __refill_objects_node(s, p, gfp, min, max, n, 7912 + /* allow_spin = */ false); 7913 + refilled += r; 7914 + 7915 + if (r >= min) { 7916 + /* 7917 + * Don't check read_mems_allowed_retry() here - 7918 + * if mems_allowed was updated in parallel, that 7919 + * was a harmless race between allocation and 7920 + * the cpuset update 7921 + */ 7922 + return refilled; 7923 + } 7924 + p += r; 7925 + min -= r; 7926 + max -= r; 7927 + } 7928 + } while (read_mems_allowed_retry(cpuset_mems_cookie)); 7929 + 7930 + return refilled; 7931 + } 7932 + #else 7933 + static inline unsigned int 7934 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7935 + unsigned int max) 7936 + { 7937 + return 0; 7938 + } 7939 + #endif 7940 + 7941 + static unsigned int 7942 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7943 + unsigned int max) 7944 + { 7945 + int local_node = numa_mem_id(); 7946 + unsigned int refilled; 7947 + struct slab *slab; 7948 + 7949 + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) 7950 + return 0; 7951 + 7952 + refilled = __refill_objects_node(s, p, gfp, min, max, 7953 + get_node(s, local_node), 7954 + /* allow_spin = */ true); 7955 + if (refilled >= min) 7956 + return refilled; 7957 + 7958 + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, 7959 + max - refilled); 7960 + if 
(refilled >= min) 7961 + return refilled; 7962 + 7963 + new_slab: 7964 + 7965 + slab = new_slab(s, gfp, local_node); 7966 + if (!slab) 7967 + goto out; 7968 + 7969 + stat(s, ALLOC_SLAB); 7970 + 7971 + /* 7972 + * TODO: possible optimization - if we know we will consume the whole 7973 + * slab we might skip creating the freelist? 7974 + */ 7975 + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, 7976 + /* allow_spin = */ true); 7977 + 7978 + if (refilled < min) 7979 + goto new_slab; 7980 + 7981 + out: 7982 + return refilled; 7983 + } 7984 + 6924 7985 static inline 6925 7986 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 6926 7987 void **p) 6927 7988 { 6928 - struct kmem_cache_cpu *c; 6929 - unsigned long irqflags; 6930 7989 int i; 6931 7990 6932 - /* 6933 - * Drain objects in the per cpu slab, while disabling local 6934 - * IRQs, which protects against PREEMPT and interrupts 6935 - * handlers invoking normal fastpath. 6936 - */ 6937 - c = slub_get_cpu_ptr(s->cpu_slab); 6938 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7991 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 7992 + for (i = 0; i < size; i++) { 6939 7993 6940 - for (i = 0; i < size; i++) { 6941 - void *object = c->freelist; 6942 - 6943 - if (unlikely(!object)) { 6944 - /* 6945 - * We may have removed an object from c->freelist using 6946 - * the fastpath in the previous iteration; in that case, 6947 - * c->tid has not been bumped yet. 6948 - * Since ___slab_alloc() may reenable interrupts while 6949 - * allocating memory, we should bump c->tid now. 6950 - */ 6951 - c->tid = next_tid(c->tid); 6952 - 6953 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6954 - 6955 - /* 6956 - * Invoking slow path likely have side-effect 6957 - * of re-populating per CPU c->freelist 6958 - */ 6959 - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 6960 - _RET_IP_, c, s->object_size); 7994 + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, 7995 + s->object_size); 6961 7996 if (unlikely(!p[i])) 6962 7997 goto error; 6963 7998 6964 - c = this_cpu_ptr(s->cpu_slab); 6965 7999 maybe_wipe_obj_freeptr(s, p[i]); 6966 - 6967 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 6968 - 6969 - continue; /* goto for-loop */ 6970 8000 } 6971 - c->freelist = get_freepointer(s, object); 6972 - p[i] = object; 6973 - maybe_wipe_obj_freeptr(s, p[i]); 6974 - stat(s, ALLOC_FASTPATH); 8001 + } else { 8002 + i = refill_objects(s, p, flags, size, size); 8003 + if (i < size) 8004 + goto error; 8005 + stat_add(s, ALLOC_SLOWPATH, i); 6975 8006 } 6976 - c->tid = next_tid(c->tid); 6977 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6978 - slub_put_cpu_ptr(s->cpu_slab); 6979 8007 6980 8008 return i; 6981 8009 6982 8010 error: 6983 - slub_put_cpu_ptr(s->cpu_slab); 6984 8011 __kmem_cache_free_bulk(s, i, p); 6985 8012 return 0; 6986 8013 6987 8014 } 6988 8015 6989 - /* Note that interrupts must be enabled when calling this function. */ 8016 + /* 8017 + * Note that interrupts must be enabled when calling this function and gfp 8018 + * flags must allow spinning. 
8019 + */ 6990 8020 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 6991 8021 void **p) 6992 8022 { ··· 7166 7892 size--; 7167 7893 } 7168 7894 7169 - if (s->cpu_sheaves) 7170 - i = alloc_from_pcs_bulk(s, size, p); 7895 + i = alloc_from_pcs_bulk(s, flags, size, p); 7171 7896 7172 7897 if (i < size) { 7173 7898 /* ··· 7354 8081 barn_init(barn); 7355 8082 } 7356 8083 7357 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 8084 + #ifdef CONFIG_SLUB_STATS 8085 + static inline int alloc_kmem_cache_stats(struct kmem_cache *s) 7358 8086 { 7359 8087 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 7360 8088 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * 7361 - sizeof(struct kmem_cache_cpu)); 8089 + sizeof(struct kmem_cache_stats)); 7362 8090 7363 - /* 7364 - * Must align to double word boundary for the double cmpxchg 7365 - * instructions to work; see __pcpu_double_call_return_bool(). 7366 - */ 7367 - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 7368 - 2 * sizeof(void *)); 8091 + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); 7369 8092 7370 - if (!s->cpu_slab) 8093 + if (!s->cpu_stats) 7371 8094 return 0; 7372 - 7373 - init_kmem_cache_cpus(s); 7374 8095 7375 8096 return 1; 7376 8097 } 8098 + #endif 7377 8099 7378 8100 static int init_percpu_sheaves(struct kmem_cache *s) 7379 8101 { 8102 + static struct slab_sheaf bootstrap_sheaf = {}; 7380 8103 int cpu; 7381 8104 7382 8105 for_each_possible_cpu(cpu) { ··· 7382 8113 7383 8114 local_trylock_init(&pcs->lock); 7384 8115 7385 - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 8116 + /* 8117 + * Bootstrap sheaf has zero size so fast-path allocation fails. 8118 + * It has also size == s->sheaf_capacity, so fast-path free 8119 + * fails. In the slow paths we recognize the situation by 8120 + * checking s->sheaf_capacity. This allows fast paths to assume 8121 + * s->cpu_sheaves and pcs->main always exists and are valid. 8122 + * It's also safe to share the single static bootstrap_sheaf 8123 + * with zero-sized objects array as it's never modified. 8124 + * 8125 + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we 8126 + * recognize it and not attempt to free it when destroying the 8127 + * cache. 8128 + * 8129 + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, 8130 + * caches with debug enabled, and all caches with SLUB_TINY. 8131 + * For kmalloc caches it's used temporarily during the initial 8132 + * bootstrap. 8133 + */ 8134 + if (!s->sheaf_capacity) 8135 + pcs->main = &bootstrap_sheaf; 8136 + else 8137 + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7386 8138 7387 8139 if (!pcs->main) 7388 8140 return -ENOMEM; ··· 7454 8164 * No locks need to be taken here as it has just been 7455 8165 * initialized and there is no concurrent access. 
···
7454 8164 * No locks need to be taken here as it has just been
7455 8165 * initialized and there is no concurrent access.
7456 8166 */
7457 - __add_partial(n, slab, DEACTIVATE_TO_HEAD);
8167 + __add_partial(n, slab, ADD_TO_HEAD);
7458 8168 }
7459 8169
7460 8170 static void free_kmem_cache_nodes(struct kmem_cache *s)
···
7478 8188 void __kmem_cache_release(struct kmem_cache *s)
7479 8189 {
7480 8190 cache_random_seq_destroy(s);
7481 - if (s->cpu_sheaves)
7482 - pcs_destroy(s);
7483 - #ifdef CONFIG_PREEMPT_RT
7484 - if (s->cpu_slab)
7485 - lockdep_unregister_key(&s->lock_key);
8191 + pcs_destroy(s);
8192 + #ifdef CONFIG_SLUB_STATS
8193 + free_percpu(s->cpu_stats);
7486 8194 #endif
7487 - free_percpu(s->cpu_slab);
7488 8195 free_kmem_cache_nodes(s);
7489 8196 }
···
7498 8211 continue;
7499 8212 }
7500 8213
7501 - if (s->cpu_sheaves) {
8214 + if (cache_has_sheaves(s)) {
7502 8215 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
7503 8216
7504 8217 if (!barn)
···
7519 8232 return 1;
7520 8233 }
7521 8234
7522 - static void set_cpu_partial(struct kmem_cache *s)
8235 + static unsigned int calculate_sheaf_capacity(struct kmem_cache *s,
8236 + struct kmem_cache_args *args)
8237 +
7523 8238 {
7524 - #ifdef CONFIG_SLUB_CPU_PARTIAL
7525 - unsigned int nr_objects;
8239 + unsigned int capacity;
8240 + size_t size;
8241 +
8242 +
8243 + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS)
8244 + return 0;
7526 8245
7527 8246 /*
7528 - * cpu_partial determined the maximum number of objects kept in the
7529 - * per cpu partial lists of a processor.
7530 - *
7531 - * Per cpu partial lists mainly contain slabs that just have one
7532 - * object freed. If they are used for allocation then they can be
7533 - * filled up again with minimal effort. The slab will never hit the
7534 - * per node partial lists and therefore no locking will be required.
7535 - *
7536 - * For backwards compatibility reasons, this is determined as number
7537 - * of objects, even though we now limit maximum number of pages, see
7538 - * slub_set_cpu_partial()
8247 + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT).
8248 + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not
8249 + * have sheaves to avoid recursion when sheaf allocation triggers
8250 + * kmemleak tracking.
7539 8251 */
7540 - if (!kmem_cache_has_cpu_partial(s))
7541 - nr_objects = 0;
7542 - else if (s->size >= PAGE_SIZE)
7543 - nr_objects = 6;
7544 - else if (s->size >= 1024)
7545 - nr_objects = 24;
7546 - else if (s->size >= 256)
7547 - nr_objects = 52;
7548 - else
7549 - nr_objects = 120;
8252 + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE))
8253 + return 0;
7550 8254
7551 - slub_set_cpu_partial(s, nr_objects);
7552 - #endif
8255 + /*
8256 + * For now we use a roughly similar formula (divided by two as there
8257 + * are two percpu sheaves) to what was used for percpu partial slabs,
8258 + * which should result in similar lock contention (barn or list_lock)
8259 + */
8260 + if (s->size >= PAGE_SIZE)
8261 + capacity = 4;
8262 + else if (s->size >= 1024)
8263 + capacity = 12;
8264 + else if (s->size >= 256)
8265 + capacity = 26;
8266 + else
8267 + capacity = 60;
8268 +
8269 + /* Increment capacity to make sheaf exactly a kmalloc size bucket */
8270 + size = struct_size_t(struct slab_sheaf, objects, capacity);
8271 + size = kmalloc_size_roundup(size);
8272 + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *);
8273 +
8274 + /*
8275 + * Respect an explicit request for capacity, typically motivated by the
8276 + * expected maximum size of kmem_cache_prefill_sheaf() requests, so we
8277 + * don't end up using low-performance oversize sheaves
8278 + */
8279 + return max(capacity, args->sheaf_capacity);
7553 8280 }
7554 8281
7555 8282 /*
···
7711 8410 s->allocflags |= __GFP_RECLAIMABLE;
7712 8411
7713 8412 /*
8413 + * For KMALLOC_NORMAL caches we enable sheaves later by
8414 + * bootstrap_kmalloc_sheaves() to avoid recursion
8415 + */
8416 + if (!is_kmalloc_normal(s))
8417 + s->sheaf_capacity = calculate_sheaf_capacity(s, args);
8418 +
8419 + /*
7714 8420 * Determine the number of objects per slab
7715 8421 */
7716 8422 s->oo = oo_make(order, size);
···
7801 8493 flush_all_cpus_locked(s);
7802 8494
7803 8495 /* we might have rcu sheaves in flight */
7804 - if (s->cpu_sheaves)
8496 + if (cache_has_sheaves(s))
7805 8497 rcu_barrier();
7806 8498
7807 8499 /* Attempt to free all objects */
···
8113 8805 if (get_node(s, nid))
8114 8806 continue;
8115 8807
8116 - if (s->cpu_sheaves) {
8808 + if (cache_has_sheaves(s)) {
8117 8809 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid);
8118 8810
8119 8811 if (!barn) {
···
8188 8880
8189 8881 memcpy(s, static_cache, kmem_cache->object_size);
8190 8882
8191 - /*
8192 - * This runs very early, and only the boot processor is supposed to be
8193 - * up. Even if it weren't true, IRQs are not up so we couldn't fire
8194 - * IPIs around.
8195 - */
8196 - __flush_cpu_slab(s, smp_processor_id());
8197 8883 for_each_kmem_cache_node(s, node, n) {
8198 8884 struct slab *p;
8199 8885
···
8201 8899 }
8202 8900 list_add(&s->list, &slab_caches);
8203 8901 return s;
8902 + }
8903 +
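To make the kmalloc-bucket rounding in calculate_sheaf_capacity() above concrete, here is a worked example; the struct slab_sheaf header size is an assumption for illustration (8-byte pointers, 64-bit), only the mechanism is taken from the code:

	/*
	 * Worked example (assumed 48-byte struct slab_sheaf header):
	 *
	 *   s->size < 256            -> capacity = 60
	 *   struct_size(..., 60)     -> 48 + 60 * 8 = 528 bytes
	 *   kmalloc_size_roundup()   -> 1024 (next kmalloc bucket above 512)
	 *   final capacity           -> (1024 - 48) / 8 = 122 objects
	 */

Rounding the capacity up this way means the sheaf fills its kmalloc bucket completely instead of wasting almost half of it.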
8904 + /*
8905 + * Finish the sheaves initialization done normally by init_percpu_sheaves() and
8906 + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it
8907 + * since sheaves and barns are allocated by kmalloc.
8908 + */
8909 + static void __init bootstrap_cache_sheaves(struct kmem_cache *s)
8910 + {
8911 + struct kmem_cache_args empty_args = {};
8912 + unsigned int capacity;
8913 + bool failed = false;
8914 + int node, cpu;
8915 +
8916 + capacity = calculate_sheaf_capacity(s, &empty_args);
8917 +
8918 + /* capacity can be 0 due to debugging or SLUB_TINY */
8919 + if (!capacity)
8920 + return;
8921 +
8922 + for_each_node_mask(node, slab_nodes) {
8923 + struct node_barn *barn;
8924 +
8925 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node);
8926 +
8927 + if (!barn) {
8928 + failed = true;
8929 + goto out;
8930 + }
8931 +
8932 + barn_init(barn);
8933 + get_node(s, node)->barn = barn;
8934 + }
8935 +
8936 + for_each_possible_cpu(cpu) {
8937 + struct slub_percpu_sheaves *pcs;
8938 +
8939 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
8940 +
8941 + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity);
8942 +
8943 + if (!pcs->main) {
8944 + failed = true;
8945 + break;
8946 + }
8947 + }
8948 +
8949 + out:
8950 + /*
8951 + * It's still early in boot, so treat this the same as a failure to
8952 + * create the kmalloc cache in the first place
8953 + */
8954 + if (failed)
8955 + panic("Out of memory when creating kmem_cache %s\n", s->name);
8956 +
8957 + s->sheaf_capacity = capacity;
8958 + }
8959 +
8960 + static void __init bootstrap_kmalloc_sheaves(void)
8961 + {
8962 + enum kmalloc_cache_type type;
8963 +
8964 + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) {
8965 + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) {
8966 + if (kmalloc_caches[type][idx])
8967 + bootstrap_cache_sheaves(kmalloc_caches[type][idx]);
8968 + }
8969 + }
8204 8970 }
8205 8971
8206 8972 void __init kmem_cache_init(void)
···
8313 8943 /* Now we can use the kmem_cache to allocate kmalloc slabs */
8314 8944 setup_kmalloc_cache_index_table();
8315 8945 create_kmalloc_caches();
8946 +
8947 + bootstrap_kmalloc_sheaves();
8316 8948
8317 8949 /* Setup random freelists for each cache */
8318 8950 init_freelist_randomization();
···
8383 9011 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2);
8384 9012 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial);
8385 9013
8386 - set_cpu_partial(s);
8387 -
8388 - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY)
8389 - && !(s->flags & SLAB_DEBUG_FLAGS)) {
8390 - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
8391 - if (!s->cpu_sheaves) {
8392 - err = -ENOMEM;
8393 - goto out;
8394 - }
8395 - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size?
8396 - s->sheaf_capacity = args->sheaf_capacity;
9014 + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves);
9015 + if (!s->cpu_sheaves) {
9016 + err = -ENOMEM;
9017 + goto out;
8397 9018 }
8398 9019
8399 9020 #ifdef CONFIG_NUMA
···
8402 9037 if (!init_kmem_cache_nodes(s))
8403 9038 goto out;
8404 9039
8405 - if (!alloc_kmem_cache_cpus(s))
9040 + #ifdef CONFIG_SLUB_STATS
9041 + if (!alloc_kmem_cache_stats(s))
8406 9042 goto out;
9043 + #endif
8407 9044
8408 - if (s->cpu_sheaves) {
8409 - err = init_percpu_sheaves(s);
8410 - if (err)
8411 - goto out;
8412 - }
9045 + err = init_percpu_sheaves(s);
9046 + if (err)
9047 + goto out;
8413 9048
8414 9049 err = 0;
8415 9050
···
8724 9359 if (!nodes)
8725 9360 return -ENOMEM;
8726 9361
8727 - if (flags & SO_CPU) {
8728 - int cpu;
8729 -
8730 - for_each_possible_cpu(cpu) {
8731 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab,
8732 - cpu);
8733 - int node;
8734 - struct slab *slab;
8735 -
8736 - slab = READ_ONCE(c->slab);
8737 - if (!slab)
8738 - continue;
8739 -
8740 - node = slab_nid(slab);
8741 - if (flags & SO_TOTAL)
8742 - x = slab->objects;
8743 - else if (flags & SO_OBJECTS)
8744 - x = slab->inuse;
8745 - else
8746 - x = 1;
8747 -
8748 - total += x;
8749 - nodes[node] += x;
8750 -
8751 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8752 - slab = slub_percpu_partial_read_once(c);
8753 - if (slab) {
8754 - node = slab_nid(slab);
8755 - if (flags & SO_TOTAL)
8756 - WARN_ON_ONCE(1);
8757 - else if (flags & SO_OBJECTS)
8758 - WARN_ON_ONCE(1);
8759 - else
8760 - x = data_race(slab->slabs);
8761 - total += x;
8762 - nodes[node] += x;
8763 - }
8764 - #endif
8765 - }
8766 - }
8767 -
8768 9362 /*
8769 9363 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
8770 9364 * already held which will conflict with an existing lock order:
···
8855 9531
8856 9532 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
8857 9533 {
8858 - unsigned int nr_partial = 0;
8859 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8860 - nr_partial = s->cpu_partial;
8861 - #endif
8862 -
8863 - return sysfs_emit(buf, "%u\n", nr_partial);
9534 + return sysfs_emit(buf, "0\n");
8864 9535 }
8865 9536
8866 9537 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
···
8867 9548 err = kstrtouint(buf, 10, &objects);
8868 9549 if (err)
8869 9550 return err;
8870 - if (objects && !kmem_cache_has_cpu_partial(s))
9551 + if (objects)
8871 9552 return -EINVAL;
8872 9553
8873 - slub_set_cpu_partial(s, objects);
8874 - flush_all(s);
8875 9554 return length;
8876 9555 }
8877 9556 SLAB_ATTR(cpu_partial);
···
8908 9591
8909 9592 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
8910 9593 {
8911 - int objects = 0;
8912 - int slabs = 0;
8913 - int cpu __maybe_unused;
8914 - int len = 0;
8915 -
8916 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8917 - for_each_online_cpu(cpu) {
8918 - struct slab *slab;
8919 -
8920 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
8921 -
8922 - if (slab)
8923 - slabs += data_race(slab->slabs);
8924 - }
8925 - #endif
8926 -
8927 - /* Approximate half-full slabs, see slub_set_cpu_partial() */
8928 - objects = (slabs * oo_objects(s->oo)) / 2;
8929 - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs);
8930 -
8931 - #ifdef CONFIG_SLUB_CPU_PARTIAL
8932 - for_each_online_cpu(cpu) {
8933 - struct slab *slab;
8934 -
8935 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
8936 - if (slab) {
8937 - slabs = data_race(slab->slabs);
8938 - objects = (slabs * oo_objects(s->oo)) / 2;
8939 - len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
len, " C%d=%d(%d)", 8940 - cpu, objects, slabs); 8941 - } 8942 - } 8943 - #endif 8944 - len += sysfs_emit_at(buf, len, "\n"); 8945 - 8946 - return len; 9594 + return sysfs_emit(buf, "0(0)\n"); 8947 9595 } 8948 9596 SLAB_ATTR_RO(slabs_cpu_partial); 8949 9597 ··· 9094 9812 return -ENOMEM; 9095 9813 9096 9814 for_each_online_cpu(cpu) { 9097 - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 9815 + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; 9098 9816 9099 9817 data[cpu] = x; 9100 9818 sum += x; ··· 9120 9838 int cpu; 9121 9839 9122 9840 for_each_online_cpu(cpu) 9123 - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 9841 + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; 9124 9842 } 9125 9843 9126 9844 #define STAT_ATTR(si, text) \ ··· 9138 9856 } \ 9139 9857 SLAB_ATTR(text); \ 9140 9858 9141 - STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); 9142 9859 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 9143 9860 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 9144 - STAT_ATTR(FREE_PCS, free_cpu_sheaf); 9145 9861 STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 9146 9862 STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 9147 9863 STAT_ATTR(FREE_FASTPATH, free_fastpath); 9148 9864 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 9149 - STAT_ATTR(FREE_FROZEN, free_frozen); 9150 9865 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 9151 9866 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 9152 - STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 9153 9867 STAT_ATTR(ALLOC_SLAB, alloc_slab); 9154 - STAT_ATTR(ALLOC_REFILL, alloc_refill); 9155 9868 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 9156 9869 STAT_ATTR(FREE_SLAB, free_slab); 9157 - STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 9158 - STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 9159 - STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 9160 - STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 9161 - STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 9162 - STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 9163 - STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 9164 9870 STAT_ATTR(ORDER_FALLBACK, order_fallback); 9165 - STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 9166 9871 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 9167 - STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 9168 - STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 9169 - STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 9170 - STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 9171 9872 STAT_ATTR(SHEAF_FLUSH, sheaf_flush); 9172 9873 STAT_ATTR(SHEAF_REFILL, sheaf_refill); 9173 9874 STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); ··· 9226 9961 &remote_node_defrag_ratio_attr.attr, 9227 9962 #endif 9228 9963 #ifdef CONFIG_SLUB_STATS 9229 - &alloc_cpu_sheaf_attr.attr, 9230 9964 &alloc_fastpath_attr.attr, 9231 9965 &alloc_slowpath_attr.attr, 9232 - &free_cpu_sheaf_attr.attr, 9233 9966 &free_rcu_sheaf_attr.attr, 9234 9967 &free_rcu_sheaf_fail_attr.attr, 9235 9968 &free_fastpath_attr.attr, 9236 9969 &free_slowpath_attr.attr, 9237 - &free_frozen_attr.attr, 9238 9970 &free_add_partial_attr.attr, 9239 9971 &free_remove_partial_attr.attr, 9240 - &alloc_from_partial_attr.attr, 9241 9972 &alloc_slab_attr.attr, 9242 - &alloc_refill_attr.attr, 9243 9973 &alloc_node_mismatch_attr.attr, 9244 9974 &free_slab_attr.attr, 9245 - &cpuslab_flush_attr.attr, 9246 - &deactivate_full_attr.attr, 9247 - &deactivate_empty_attr.attr, 9248 - &deactivate_to_head_attr.attr, 9249 - &deactivate_to_tail_attr.attr, 9250 - &deactivate_remote_frees_attr.attr, 9251 - &deactivate_bypass_attr.attr, 9252 9975 
9253 9976 &cmpxchg_double_fail_attr.attr,
9254 - &cmpxchg_double_cpu_fail_attr.attr,
9255 - &cpu_partial_alloc_attr.attr,
9256 - &cpu_partial_free_attr.attr,
9257 - &cpu_partial_node_attr.attr,
9258 - &cpu_partial_drain_attr.attr,
9259 9977 &sheaf_flush_attr.attr,
9260 9978 &sheaf_refill_attr.attr,
9261 9979 &sheaf_alloc_attr.attr,