Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- The percpu sheaves caching layer was introduced as opt-in in 6.18; it
  is now enabled for all caches, and the previous cpu (partial) slab
  caching mechanism is removed.

  Besides the lower locking overhead and a much more likely fastpath
  when freeing, this removes the rather complicated code for the cpu
  slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all
  its complications for PREEMPT_RT or kmalloc_nolock().

  The lockless slab freelist+counters update operation using
  try_cmpxchg128/64 remains; it is crucial for freeing remote NUMA
  objects and for flushing objects from sheaves to slabs mostly
  without taking the node list_lock (Vlastimil Babka)

- Eliminate slabobj_ext metadata overhead when possible. Instead of
  using kmalloc() to allocate the array for memcg and/or allocation
  profiling tag pointers, use leftover space in a slab or per-object
  padding due to alignment, as sketched below this list (Harry Yoo)

- Various followup improvements to the above (Hao Li)
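
A minimal standalone sketch of the space check behind the leftover-space
placement, paraphrasing the obj_exts_offset_in_slab() /
obj_exts_fit_within_slab_leftover() helpers added in the mm/slub.c diff below.
The object sizes and the 16-byte extension size used here are illustrative
assumptions, not values taken from the series:

	/*
	 * Does the slabobj_ext array fit into the slab's leftover space?
	 * Mirrors the kernel helpers; all sizes below are made up.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

	static bool obj_exts_fit_in_leftover(unsigned long slab_size,
					     unsigned long obj_size,
					     unsigned long nr_objects,
					     unsigned long ext_size)
	{
		/* the array starts after the last object, aligned to ext_size */
		unsigned long offset = ALIGN_UP(obj_size * nr_objects, ext_size);
		unsigned long array_size = ext_size * nr_objects;

		return offset + array_size <= slab_size;
	}

	int main(void)
	{
		/* 4 KiB slab, 720 B objects (5 per slab): the array fits */
		printf("720 B objects: fits=%d\n",
		       obj_exts_fit_in_leftover(4096, 720, 5, 16));
		/* 4 KiB slab, 192 B objects (21 per slab): it does not */
		printf("192 B objects: fits=%d\n",
		       obj_exts_fit_in_leftover(4096, 192, 21, 16));
		return 0;
	}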

* tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (39 commits)
slub: let need_slab_obj_exts() return false if SLAB_NO_OBJ_EXT is set
mm/slab: only allow SLAB_OBJ_EXT_IN_OBJ for unmergeable caches
mm/slab: place slabobj_ext metadata in unused space within s->size
mm/slab: move [__]ksize and slab_ksize() to mm/slub.c
mm/slab: save memory by allocating slabobj_ext array from leftover
mm/memcontrol,alloc_tag: handle slabobj_ext access under KASAN poison
mm/slab: use stride to access slabobj_ext
mm/slab: abstract slabobj_ext access via new slab_obj_ext() helper
ext4: specify the free pointer offset for ext4_inode_cache
mm/slab: allow specifying free pointer offset when using constructor
mm/slab: use unsigned long for orig_size to ensure proper metadata align
slub: clarify object field layout comments
mm/slab: avoid allocating slabobj_ext array from its own slab
slub: avoid list_lock contention from __refill_objects_any()
mm/slub: cleanup and repurpose some stat items
mm/slub: remove DEACTIVATE_TO_* stat items
slab: remove frozen slab checks from __slab_free()
slab: update overview comments
slab: refill sheaves from all nodes
slab: remove unused PREEMPT_RT specific macros
...

Total diffstat: +1773 -2079

fs/ext4/super.c (+13 -6)

 static int __init init_inodecache(void)
 {
-	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
-				sizeof(struct ext4_inode_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
-				offsetof(struct ext4_inode_info, i_data),
-				sizeof_field(struct ext4_inode_info, i_data),
-				init_once);
+	struct kmem_cache_args args = {
+		.useroffset = offsetof(struct ext4_inode_info, i_data),
+		.usersize = sizeof_field(struct ext4_inode_info, i_data),
+		.use_freeptr_offset = true,
+		.freeptr_offset = offsetof(struct ext4_inode_info, i_flags),
+		.ctor = init_once,
+	};
+
+	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+				sizeof(struct ext4_inode_info),
+				&args,
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);
+
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
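
The ext4 hunk above is the first user of the combination enabled by this
series: a cache with a constructor may now also place its free pointer inside
the object. A hedged sketch of how another cache might adopt the same
kmem_cache_args fields; everything named foo_* is hypothetical, only the
struct kmem_cache_args / kmem_cache_create() usage mirrors the change above:

	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/list.h>

	/* hypothetical object; only some fields are set by the constructor */
	struct foo {
		spinlock_t lock;	/* initialized by the constructor */
		struct list_head list;	/* initialized by the constructor */
		unsigned long state;	/* never touched by the constructor */
	};

	static void foo_init_once(void *obj)
	{
		struct foo *f = obj;

		spin_lock_init(&f->lock);
		INIT_LIST_HEAD(&f->list);
	}

	static struct kmem_cache *foo_cachep;

	static int __init foo_cache_init(void)
	{
		struct kmem_cache_args args = {
			.ctor = foo_init_once,
			.use_freeptr_offset = true,
			/* overlay a field the constructor does not initialize */
			.freeptr_offset = offsetof(struct foo, state),
		};

		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       &args, SLAB_ACCOUNT);
		return foo_cachep ? 0 : -ENOMEM;
	}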

include/linux/slab.h (+22 -18)

 #endif
 	_SLAB_OBJECT_POISON,
 	_SLAB_CMPXCHG_DOUBLE,
-#ifdef CONFIG_SLAB_OBJ_EXT
 	_SLAB_NO_OBJ_EXT,
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+	_SLAB_OBJ_EXT_IN_OBJ,
 #endif
 	_SLAB_FLAGS_LAST_BIT
 };
···
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
 
 /* Slab created using create_boot_cache */
-#ifdef CONFIG_SLAB_OBJ_EXT
 #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+#define SLAB_OBJ_EXT_IN_OBJ	__SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
 #else
-#define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
+#define SLAB_OBJ_EXT_IN_OBJ	__SLAB_FLAG_UNUSED
 #endif
 
 /*
···
 	unsigned int usersize;
 	/**
 	 * @freeptr_offset: Custom offset for the free pointer
-	 * in &SLAB_TYPESAFE_BY_RCU caches
+	 * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor
 	 *
-	 * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
-	 * outside of the object. This might cause the object to grow in size.
-	 * Cache creators that have a reason to avoid this can specify a custom
-	 * free pointer offset in their struct where the free pointer will be
-	 * placed.
+	 * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free
+	 * pointer outside of the object. This might cause the object to grow
+	 * in size. Cache creators that have a reason to avoid this can specify
+	 * a custom free pointer offset in their data structure where the free
+	 * pointer will be placed.
 	 *
-	 * Note that placing the free pointer inside the object requires the
-	 * caller to ensure that no fields are invalidated that are required to
-	 * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
-	 * details).
+	 * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that
+	 * the free pointer does not overlay fields required to guard against
+	 * object recycling (See &SLAB_TYPESAFE_BY_RCU for details).
+	 *
+	 * For caches with @ctor, the caller must ensure that the free pointer
+	 * does not overlay fields initialized by the constructor.
+	 *
+	 * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor
+	 * may specify @freeptr_offset.
 	 *
 	 * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
-	 * is specified, %use_freeptr_offset must be set %true.
-	 *
-	 * Note that @ctor currently isn't supported with custom free pointers
-	 * as a @ctor requires an external free pointer.
+	 * is specified, @use_freeptr_offset must be set %true.
 	 */
 	unsigned int freeptr_offset;
 	/**
···
 void kfree(const void *objp);
 void kfree_nolock(const void *objp);
 void kfree_sensitive(const void *objp);
-size_t __ksize(const void *objp);
 
 DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
 DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))

mm/Kconfig (-11)

 	  out which slabs are relevant to a particular load.
 	  Try running: slabinfo -DA
 
-config SLUB_CPU_PARTIAL
-	default y
-	depends on SMP && !SLUB_TINY
-	bool "Enable per cpu partial caches"
-	help
-	  Per cpu partial caches accelerate objects allocation and freeing
-	  that is local to a processor at the price of more indeterminism
-	  in the latency of the free. On overflow these caches will be cleared
-	  which requires the taking of locks that may cause latency spikes.
-	  Typically one would choose no for a realtime system.
-
 config RANDOM_KMALLOC_CACHES
 	default n
 	depends on !SLUB_TINY

mm/internal.h (+1)

 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
 #define alloc_frozen_pages_nolock(...) \
 	alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+void free_frozen_pages_nolock(struct page *page, unsigned int order);
 
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);

mm/memcontrol.c (+24 -7)

 	 * Memcg membership data for each individual object is saved in
 	 * slab->obj_exts.
 	 */
-	struct slabobj_ext *obj_exts;
+	unsigned long obj_exts;
+	struct slabobj_ext *obj_ext;
 	unsigned int off;
 
 	obj_exts = slab_obj_exts(slab);
 	if (!obj_exts)
 		return NULL;
 
+	get_slab_obj_exts(obj_exts);
 	off = obj_to_index(slab->slab_cache, slab, p);
-	if (obj_exts[off].objcg)
-		return obj_cgroup_memcg(obj_exts[off].objcg);
+	obj_ext = slab_obj_ext(slab, obj_exts, off);
+	if (obj_ext->objcg) {
+		struct obj_cgroup *objcg = obj_ext->objcg;
+
+		put_slab_obj_exts(obj_exts);
+		return obj_cgroup_memcg(objcg);
+	}
+	put_slab_obj_exts(obj_exts);
 
 	return NULL;
 }
···
 	}
 
 	for (i = 0; i < size; i++) {
+		unsigned long obj_exts;
+		struct slabobj_ext *obj_ext;
+
 		slab = virt_to_slab(p[i]);
 
 		if (!slab_obj_exts(slab) &&
···
 				slab_pgdat(slab), cache_vmstat_idx(s)))
 			return false;
 
+		obj_exts = slab_obj_exts(slab);
+		get_slab_obj_exts(obj_exts);
 		off = obj_to_index(s, slab, p[i]);
+		obj_ext = slab_obj_ext(slab, obj_exts, off);
 		obj_cgroup_get(objcg);
-		slab_obj_exts(slab)[off].objcg = objcg;
+		obj_ext->objcg = objcg;
+		put_slab_obj_exts(obj_exts);
 	}
 
 	return true;
 }
 
 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
-			    void **p, int objects, struct slabobj_ext *obj_exts)
+			    void **p, int objects, unsigned long obj_exts)
 {
 	size_t obj_size = obj_full_size(s);
 
 	for (int i = 0; i < objects; i++) {
 		struct obj_cgroup *objcg;
+		struct slabobj_ext *obj_ext;
 		unsigned int off;
 
 		off = obj_to_index(s, slab, p[i]);
-		objcg = obj_exts[off].objcg;
+		obj_ext = slab_obj_ext(slab, obj_exts, off);
+		objcg = obj_ext->objcg;
 		if (!objcg)
 			continue;
 
-		obj_exts[off].objcg = NULL;
+		obj_ext->objcg = NULL;
 		refill_obj_stock(objcg, obj_size, true, -obj_size,
 				 slab_pgdat(slab), cache_vmstat_idx(s));
 		obj_cgroup_put(objcg);

mm/page_alloc.c (+5)

 	__free_frozen_pages(page, order, FPI_NONE);
 }
 
+void free_frozen_pages_nolock(struct page *page, unsigned int order)
+{
+	__free_frozen_pages(page, order, FPI_TRYLOCK);
+}
+
 /*
  * Free a batch of folios
  */

mm/slab.h (+135 -78)

 # define system_has_freelist_aba()	system_has_cmpxchg128()
 # define try_cmpxchg_freelist		try_cmpxchg128
 # endif
-#define this_cpu_try_cmpxchg_freelist	this_cpu_try_cmpxchg128
 typedef u128 freelist_full_t;
 #else /* CONFIG_64BIT */
 # ifdef system_has_cmpxchg64
 # define system_has_freelist_aba()	system_has_cmpxchg64()
 # define try_cmpxchg_freelist		try_cmpxchg64
 # endif
-#define this_cpu_try_cmpxchg_freelist	this_cpu_try_cmpxchg64
 typedef u64 freelist_full_t;
 #endif /* CONFIG_64BIT */
···
 	 * that the slab was corrupted
 	 */
 	unsigned frozen:1;
+#ifdef CONFIG_64BIT
+	/*
+	 * Some optimizations use free bits in 'counters' field
+	 * to save memory. In case ->stride field is not available,
+	 * such optimizations are disabled.
+	 */
+	unsigned short stride;
+#endif
 		};
 	};
 };
···
 	struct kmem_cache *slab_cache;
 	union {
 		struct {
-			union {
-				struct list_head slab_list;
-				struct { /* For deferred deactivate_slab() */
-					struct llist_node llnode;
-					void *flush_freelist;
-				};
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-				struct {
-					struct slab *next;
-					int slabs;	/* Nr of slabs left */
-				};
-#endif
-			};
+			struct list_head slab_list;
 			/* Double-word boundary */
 			struct freelist_counters;
 		};
···
 	return PAGE_SIZE << slab_order(slab);
 }
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_percpu_partial(c)			((c)->partial)
-
-#define slub_set_percpu_partial(c, p)		\
-({						\
-	slub_percpu_partial(c) = (p)->next;	\
-})
-
-#define slub_percpu_partial_read_once(c)	READ_ONCE(slub_percpu_partial(c))
-#else
-#define slub_percpu_partial(c)			NULL
-
-#define slub_set_percpu_partial(c, p)
-
-#define slub_percpu_partial_read_once(c)	NULL
-#endif // CONFIG_SLUB_CPU_PARTIAL
-
 /*
  * Word size structure that can be atomically updated or read and that
  * contains both the order and the number of objects that a slab of the
···
  * Slab cache management.
  */
 struct kmem_cache {
-	struct kmem_cache_cpu __percpu *cpu_slab;
-	struct lock_class_key lock_key;
 	struct slub_percpu_sheaves __percpu *cpu_sheaves;
 	/* Used for retrieving partial slabs, etc. */
 	slab_flags_t flags;
···
 	unsigned int object_size;	/* Object size without metadata */
 	struct reciprocal_value reciprocal_size;
 	unsigned int offset;		/* Free pointer offset */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	/* Number of per cpu partial objects to keep around */
-	unsigned int cpu_partial;
-	/* Number of per cpu partial slabs to keep around */
-	unsigned int cpu_partial_slabs;
-#endif
 	unsigned int sheaf_capacity;
 	struct kmem_cache_order_objects oo;
···
 	unsigned int usersize;		/* Usercopy region size */
 #endif
 
+#ifdef CONFIG_SLUB_STATS
+	struct kmem_cache_stats __percpu *cpu_stats;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
+
+/*
+ * Every cache has !NULL s->cpu_sheaves but they may point to the
+ * bootstrap_sheaf temporarily during init, or permanently for the boot caches
+ * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This
+ * helper distinguishes whether cache has real non-bootstrap sheaves.
+ */
+static inline bool cache_has_sheaves(struct kmem_cache *s)
+{
+	/* Test CONFIG_SLUB_TINY for code elimination purposes */
+	return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity;
+}
 
 #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
 #define SLAB_SUPPORTS_SYSFS 1
 void sysfs_slab_unlink(struct kmem_cache *s);
 void sysfs_slab_release(struct kmem_cache *s);
+int sysfs_slab_alias(struct kmem_cache *s, const char *name);
 #else
 static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
 static inline void sysfs_slab_release(struct kmem_cache *s) { }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{ return 0; }
 #endif
 
 void *fixup_red_left(struct kmem_cache *s, void *p);
···
 		   unsigned int useroffset, unsigned int usersize);
 
 int slab_unmergeable(struct kmem_cache *s);
-struct kmem_cache *find_mergeable(unsigned size, unsigned align,
-		slab_flags_t flags, const char *name, void (*ctor)(void *));
-struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
-		   slab_flags_t flags, void (*ctor)(void *));
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags);
 
 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
···
 static inline bool slab_in_kunit_test(void) { return false; }
 #endif
 
+/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+	kasan_disable_current();
+	kmsan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+	kmsan_enable_current();
+	kasan_enable_current();
+}
+
 #ifdef CONFIG_SLAB_OBJ_EXT
 
 /*
···
  * associated with a slab.
  * @slab: a pointer to the slab struct
  *
- * Returns a pointer to the object extension vector associated with the slab,
- * or NULL if no such vector has been associated yet.
+ * Returns the address of the object extension vector associated with the slab,
+ * or zero if no such vector has been associated yet.
+ * Do not dereference the return value directly; use get/put_slab_obj_exts()
+ * pair and slab_obj_ext() to access individual elements.
+ *
+ * Example usage:
+ *
+ *   obj_exts = slab_obj_exts(slab);
+ *   if (obj_exts) {
+ *	get_slab_obj_exts(obj_exts);
+ *	obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
+ *	// do something with obj_ext
+ *	put_slab_obj_exts(obj_exts);
+ *   }
+ *
+ * Note that the get/put semantics does not involve reference counting.
+ * Instead, it updates kasan/kmsan depth so that accesses to slabobj_ext
+ * won't be reported as access violations.
  */
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
 {
 	unsigned long obj_exts = READ_ONCE(slab->obj_exts);
 
···
 		     obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
 	VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
 #endif
-	return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
+
+	return obj_exts & ~OBJEXTS_FLAGS_MASK;
+}
+
+static inline void get_slab_obj_exts(unsigned long obj_exts)
+{
+	VM_WARN_ON_ONCE(!obj_exts);
+	metadata_access_enable();
+}
+
+static inline void put_slab_obj_exts(unsigned long obj_exts)
+{
+	metadata_access_disable();
+}
+
+#ifdef CONFIG_64BIT
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+	slab->stride = stride;
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+	return slab->stride;
+}
+#else
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+	VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext));
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+	return sizeof(struct slabobj_ext);
+}
+#endif
+
+/*
+ * slab_obj_ext - get the pointer to the slab object extension metadata
+ * associated with an object in a slab.
+ * @slab: a pointer to the slab struct
+ * @obj_exts: a pointer to the object extension vector
+ * @index: an index of the object
+ *
+ * Returns a pointer to the object extension associated with the object.
+ * Must be called within a section covered by get/put_slab_obj_exts().
+ */
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+					       unsigned long obj_exts,
+					       unsigned int index)
+{
+	struct slabobj_ext *obj_ext;
+
+	VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab));
+
+	obj_ext = (struct slabobj_ext *)(obj_exts +
+					 slab_get_stride(slab) * index);
+	return kasan_reset_tag(obj_ext);
 }
 
 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
···
 
 #else /* CONFIG_SLAB_OBJ_EXT */
 
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
+{
+	return 0;
+}
+
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+					       unsigned long obj_exts,
+					       unsigned int index)
 {
 	return NULL;
 }
+
+static inline void slab_set_stride(struct slab *slab, unsigned int stride) { }
+static inline unsigned int slab_get_stride(struct slab *slab) { return 0; }
+
 
 #endif /* CONFIG_SLAB_OBJ_EXT */
···
 bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 				  gfp_t flags, size_t size, void **p);
 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
-			    void **p, int objects, struct slabobj_ext *obj_exts);
+			    void **p, int objects, unsigned long obj_exts);
 #endif
 
 void kvfree_rcu_cb(struct rcu_head *head);
-
-size_t __ksize(const void *objp);
-
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
-	/*
-	 * Debugging requires use of the padding between object
-	 * and whatever may come after it.
-	 */
-	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
-		return s->object_size;
-#endif
-	if (s->flags & SLAB_KASAN)
-		return s->object_size;
-	/*
-	 * If we have the need to store the freelist pointer
-	 * back there or track user information then we can
-	 * only use the space before that information.
-	 */
-	if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
-		return s->inuse;
-	/*
-	 * Else we can use all the padding etc for the allocation
-	 */
-	return s->size;
-}
 
 static inline unsigned int large_kmalloc_order(const struct page *page)
 {
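
A hedged sketch of a caller using the accessors declared above; the
walk_slab_obj_exts() helper itself is hypothetical, but the access pattern
(opaque vector address, get/put bracketing, stride-aware indexing) follows the
documented example in the hunk:

	/* iterate over all object extensions of a slab (hypothetical helper) */
	static void walk_slab_obj_exts(struct kmem_cache *s, struct slab *slab,
				       void (*fn)(struct slabobj_ext *ext))
	{
		unsigned long obj_exts = slab_obj_exts(slab);
		unsigned int i;

		if (!obj_exts)	/* no extension vector attached yet */
			return;

		/* only adjusts kasan/kmsan depth, not a refcount */
		get_slab_obj_exts(obj_exts);
		for (i = 0; i < slab->objects; i++)
			fn(slab_obj_ext(slab, obj_exts, i));
		put_slab_obj_exts(obj_exts);
	}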

mm/slab_common.c (+66 -87)

 struct kmem_cache *kmem_cache;
 
 /*
- * Set of flags that will prevent slab merging
+ * Set of flags that will prevent slab merging.
+ * Any flag that adds per-object metadata should be included,
+ * since slab merging can update s->inuse that affects the metadata layout.
  */
-#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB | SLAB_NO_MERGE)
+#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
+		SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
+		SLAB_OBJ_EXT_IN_OBJ)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
···
 		return 1;
 #endif
 
-	if (s->cpu_sheaves)
-		return 1;
-
 	/*
 	 * We may have set a slab to be unmergeable during bootstrap.
 	 */
···
 		return 0;
 }
 
-struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
-		slab_flags_t flags, const char *name, void (*ctor)(void *))
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
 {
-	struct kmem_cache *s;
-
 	if (slab_nomerge)
-		return NULL;
+		return true;
 
-	if (ctor)
-		return NULL;
+	if (args->ctor)
+		return true;
 
-	flags = kmem_cache_flags(flags, name);
+	if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
+		return true;
 
 	if (flags & SLAB_NEVER_MERGE)
+		return true;
+
+	return false;
+}
+
+static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
+		const char *name, struct kmem_cache_args *args)
+{
+	struct kmem_cache *s;
+	unsigned int align;
+
+	flags = kmem_cache_flags(flags, name);
+	if (slab_args_unmergeable(args, flags))
 		return NULL;
 
 	size = ALIGN(size, sizeof(void *));
-	align = calculate_alignment(flags, align, size);
+	align = calculate_alignment(flags, args->align, size);
 	size = ALIGN(size, align);
 
 	list_for_each_entry_reverse(s, &slab_caches, list) {
···
 	err = -EINVAL;
 	if (args->use_freeptr_offset &&
 	    (args->freeptr_offset >= object_size ||
-	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
+	     (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
 	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
 		goto out;
···
 	kmem_cache_free(kmem_cache, s);
 out:
 	return ERR_PTR(err);
+}
+
+static struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
+		   struct kmem_cache_args *args)
+{
+	struct kmem_cache *s;
+
+	s = find_mergeable(size, flags, name, args);
+	if (s) {
+		if (sysfs_slab_alias(s, name))
+			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
+			       name);
+
+		s->refcount++;
+
+		/*
+		 * Adjust the object sizes so that we clear
+		 * the complete object on kzalloc.
+		 */
+		s->object_size = max(s->object_size, size);
+		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
+	}
+
+	return s;
 }
 
 /**
···
 	flags &= ~SLAB_DEBUG_FLAGS;
 #endif
 
+	/*
+	 * Caches with specific capacity are special enough. It's simpler to
+	 * make them unmergeable.
+	 */
+	if (args->sheaf_capacity)
+		flags |= SLAB_NO_MERGE;
+
 	mutex_lock(&slab_mutex);
 
 	err = kmem_cache_sanity_check(name, object_size);
···
 		    object_size - args->usersize < args->useroffset))
 		args->usersize = args->useroffset = 0;
 
-	if (!args->usersize && !args->sheaf_capacity)
-		s = __kmem_cache_alias(name, object_size, args->align, flags,
-				       args->ctor);
+	s = __kmem_cache_alias(name, object_size, flags, args);
 	if (s)
 		goto out_unlock;
···
 					      0, SLAB_NO_MERGE, NULL);
 }
 
-/**
- * __ksize -- Report full size of underlying allocation
- * @object: pointer to the object
- *
- * This should only be used internally to query the true size of allocations.
- * It is not meant to be a way to discover the usable size of an allocation
- * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
- * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
- * and/or FORTIFY_SOURCE.
- *
- * Return: size of the actual memory used by @object in bytes
- */
-size_t __ksize(const void *object)
-{
-	const struct page *page;
-	const struct slab *slab;
-
-	if (unlikely(object == ZERO_SIZE_PTR))
-		return 0;
-
-	page = virt_to_page(object);
-
-	if (unlikely(PageLargeKmalloc(page)))
-		return large_kmalloc_size(page);
-
-	slab = page_slab(page);
-	/* Delete this after we're sure there are no users */
-	if (WARN_ON(!slab))
-		return page_size(page);
-
-#ifdef CONFIG_SLUB_DEBUG
-	skip_orig_size_check(slab->slab_cache, object);
-#endif
-
-	return slab_ksize(slab->slab_cache);
-}
-
 gfp_t kmalloc_fix_flags(gfp_t flags)
 {
 	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
···
 	kfree(mem);
 }
 EXPORT_SYMBOL(kfree_sensitive);
-
-size_t ksize(const void *objp)
-{
-	/*
-	 * We need to first check that the pointer to the object is valid.
-	 * The KASAN report printed from ksize() is more useful, then when
-	 * it's printed later when the behaviour could be undefined due to
-	 * a potential use-after-free or double-free.
-	 *
-	 * We use kasan_check_byte(), which is supported for the hardware
-	 * tag-based KASAN mode, unlike kasan_check_read/write().
-	 *
-	 * If the pointed to memory is invalid, we return 0 to avoid users of
-	 * ksize() writing to and potentially corrupting the memory region.
-	 *
-	 * We want to perform the check before __ksize(), to avoid potentially
-	 * crashing in __ksize() due to accessing invalid metadata.
-	 */
-	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
-		return 0;
-
-	return kfence_ksize(objp) ?: __ksize(objp);
-}
-EXPORT_SYMBOL(ksize);
 
 #ifdef CONFIG_BPF_SYSCALL
 #include <linux/btf.h>
···
 		return false;
 
 	s = slab->slab_cache;
-	if (s->cpu_sheaves) {
-		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
-			   slab_nid(slab) == numa_mem_id()))
-			return __kfree_rcu_sheaf(s, obj);
-	}
+	if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
+		return __kfree_rcu_sheaf(s, obj);
 
 	return false;
 }
···
  */
 void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
 {
-	if (s->cpu_sheaves)
+	if (cache_has_sheaves(s)) {
 		flush_rcu_sheaves_on_cache(s);
+		rcu_barrier();
+	}
+
 	/*
 	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
 	 * on a specific slab cache.

mm/slub.c (+1507 -1872)

 // SPDX-License-Identifier: GPL-2.0
 /*
- * SLUB: A slab allocator that limits cache line use instead of queuing
- * objects in per cpu and per node lists.
+ * SLUB: A slab allocator with low overhead percpu array caches and mostly
+ * lockless freeing of objects to slabs in the slowpath.
  *
- * The allocator synchronizes using per slab locks or atomic operations
- * and only uses a centralized lock to manage a pool of partial slabs.
+ * The allocator synchronizes using spin_trylock for percpu arrays in the
+ * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing.
+ * Uses a centralized lock to manage a pool of partial slabs.
  *
  * (C) 2007 SGI, Christoph Lameter
  * (C) 2011 Linux Foundation, Christoph Lameter
+ * (C) 2025 SUSE, Vlastimil Babka
  */
 
 #include <linux/mm.h>
···
 
 /*
  * Lock order:
- *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock (Spinlock)
- *   3. kmem_cache->cpu_slab->lock (Local lock)
- *   4. slab_lock(slab) (Only on some arches)
- *   5. object_map_lock (Only for debugging)
+ *   0. cpu_hotplug_lock
+ *   1. slab_mutex (Global Mutex)
+ *   2a. kmem_cache->cpu_sheaves->lock (Local trylock)
+ *   2b. node->barn->lock (Spinlock)
+ *   2c. node->list_lock (Spinlock)
+ *   3. slab_lock(slab) (Only on some arches)
+ *   4. object_map_lock (Only for debugging)
  *
  *   slab_mutex
  *
···
  *   C. slab->objects	-> Number of objects in slab
  *   D. slab->frozen	-> frozen state
  *
+ *   SL_partial slabs
+ *
+ *   Slabs on node partial list have at least one free object. A limited number
+ *   of slabs on the list can be fully free (slab->inuse == 0), until we start
+ *   discarding them. These slabs are marked with SL_partial, and the flag is
+ *   cleared while removing them, usually to grab their freelist afterwards.
+ *   This clearing also exempts them from list management. Please see
+ *   __slab_free() for more details.
+ *
+ *   Full slabs
+ *
+ *   For caches without debugging enabled, full slabs (slab->inuse ==
+ *   slab->objects and slab->freelist == NULL) are not placed on any list.
+ *   The __slab_free() freeing the first object from such a slab will place
+ *   it on the partial list. Caches with debugging enabled place such slab
+ *   on the full list and use different allocation and freeing paths.
+ *
  *   Frozen slabs
  *
- *   If a slab is frozen then it is exempt from list management. It is
- *   the cpu slab which is actively allocated from by the processor that
- *   froze it and it is not on any list. The processor that froze the
- *   slab is the one who can perform list operations on the slab. Other
- *   processors may put objects onto the freelist but the processor that
- *   froze the slab is the only one that can retrieve the objects from the
- *   slab's freelist.
- *
- *   CPU partial slabs
- *
- *   The partially empty slabs cached on the CPU partial list are used
- *   for performance reasons, which speeds up the allocation process.
- *   These slabs are not frozen, but are also exempt from list management,
- *   by clearing the SL_partial flag when moving out of the node
- *   partial list. Please see __slab_free() for more details.
+ *   If a slab is frozen then it is exempt from list management. It is used to
+ *   indicate a slab that has failed consistency checks and thus cannot be
+ *   allocated from anymore - it is also marked as full. Any previously
+ *   allocated objects will be simply leaked upon freeing instead of attempting
+ *   to modify the potentially corrupted freelist and metadata.
  *
  *   To sum up, the current scheme is:
- *   - node partial slab: SL_partial && !frozen
- *   - cpu partial slab: !SL_partial && !frozen
- *   - cpu slab: !SL_partial && frozen
- *   - full slab: !SL_partial && !frozen
+ *   - node partial slab: SL_partial && !full && !frozen
+ *   - taken off partial list: !SL_partial && !full && !frozen
+ *   - full slab, not on any list: !SL_partial && full && !frozen
+ *   - frozen due to inconsistency: !SL_partial && full && frozen
  *
- *   list_lock
+ *   node->list_lock (spinlock)
  *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
···
  *
  *   The list_lock is a centralized lock and thus we avoid taking it as
  *   much as possible. As long as SLUB does not have to handle partial
- *   slabs, operations can continue without any centralized lock. F.e.
- *   allocating a long series of objects that fill up slabs does not require
- *   the list lock.
+ *   slabs, operations can continue without any centralized lock.
  *
  *   For debug caches, all allocations are forced to go through a list_lock
  *   protected region to serialize against concurrent validation.
  *
- *   cpu_slab->lock local lock
+ *   cpu_sheaves->lock (local_trylock)
  *
- *   This locks protect slowpath manipulation of all kmem_cache_cpu fields
- *   except the stat counters. This is a percpu structure manipulated only by
- *   the local cpu, so the lock protects against being preempted or interrupted
- *   by an irq. Fast path operations rely on lockless operations instead.
+ *   This lock protects fastpath operations on the percpu sheaves. On !RT it
+ *   only disables preemption and does no atomic operations. As long as the main
+ *   or spare sheaf can handle the allocation or free, there is no other
+ *   overhead.
  *
- *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
- *   which means the lockless fastpath cannot be used as it might interfere with
- *   an in-progress slow path operations. In this case the local lock is always
- *   taken but it still utilizes the freelist for the common operations.
+ *   node->barn->lock (spinlock)
  *
- *   lockless fastpaths
+ *   This lock protects the operations on per-NUMA-node barn. It can quickly
+ *   serve an empty or full sheaf if available, and avoid more expensive refill
+ *   or flush operation.
  *
- *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
- *   are fully lockless when satisfied from the percpu slab (and when
- *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
- *   They also don't disable preemption or migration or irqs. They rely on
- *   the transaction id (tid) field to detect being preempted or moved to
- *   another cpu.
+ *   Lockless freeing
+ *
+ *   Objects may have to be freed to their slabs when they are from a remote
+ *   node (where we want to avoid filling local sheaves with remote objects)
+ *   or when there are too many full sheaves. On architectures supporting
+ *   cmpxchg_double this is done by a lockless update of slab's freelist and
+ *   counters, otherwise slab_lock is taken. This only needs to take the
+ *   list_lock if it's a first free to a full slab, or when a slab becomes empty
+ *   after the free.
  *
  *   irq, preemption, migration considerations
  *
- *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   Interrupts are disabled as part of list_lock or barn lock operations, or
  *   around the slab_lock operation, in order to make the slab allocator safe
  *   to use in the context of an irq.
+ *   Preemption is disabled as part of local_trylock operations.
+ *   kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
+ *   their limitations.
  *
- *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
- *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
- *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
- *   doesn't have to be revalidated in each section protected by the local lock.
- *
- * SLUB assigns one slab for allocation to each processor.
- * Allocations only occur from these slabs called cpu slabs.
+ * SLUB assigns two object arrays called sheaves for caching allocations and
+ * frees on each cpu, with a NUMA node shared barn for balancing between cpus.
+ * Allocations and frees are primarily served from these sheaves.
  *
  * Slabs with free elements are kept on a partial list and during regular
  * operations no list for full slabs is used. If an object in a full slab is
···
  * We track full slabs for debugging purposes though because otherwise we
  * cannot scan all objects.
  *
- * Slabs are freed when they become empty. Teardown and setup is
- * minimal so we rely on the page allocators per cpu caches for
- * fast frees and allocs.
- *
- * slab->frozen		The slab is frozen and exempt from list processing.
- * 			This means that the slab is dedicated to a purpose
- * 			such as satisfying allocations for a specific
- * 			processor. Objects may be freed in the slab while
- * 			it is frozen but slab_free will then skip the usual
- * 			list operations. It is up to the processor holding
- * 			the slab to integrate the slab into the slab lists
- * 			when the slab is no longer needed.
- *
- * 			One use of this flag is to mark slabs that are
- * 			used for allocations. Then such a slab becomes a cpu
- * 			slab. The cpu slab may be equipped with an additional
- * 			freelist that allows lockless access to
- * 			free objects in addition to the regular freelist
- * 			that requires the slab lock.
+ * Slabs are freed when they become empty. Teardown and setup is minimal so we
+ * rely on the page allocators per cpu caches for fast frees and allocs.
  *
  * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
  * 			options set. This moves slab handling out of
···
 	SL_pfmemalloc = PG_active,	/* Historical reasons for this bit */
 };
 
-/*
- * We could simply use migrate_disable()/enable() but as long as it's a
- * function call even on !PREEMPT_RT, use inline preempt_disable() there.
- */
-#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
-#define USE_LOCKLESS_FAST_PATH()	(true)
-#else
-#define slub_get_cpu_ptr(var)		\
-({					\
-	migrate_disable();		\
-	this_cpu_ptr(var);		\
-})
-#define slub_put_cpu_ptr(var)		\
-do {					\
-	(void)(var);			\
-	migrate_enable();		\
-} while (0)
-#define USE_LOCKLESS_FAST_PATH()	(false)
-#endif
-
 #ifndef CONFIG_SLUB_TINY
 #define __fastpath_inline __always_inline
 #else
···
 static DEFINE_STATIC_KEY_FALSE(strict_numa);
 #endif
 
-/* Structure holding parameters for get_partial() call chain */
+/* Structure holding parameters for get_from_partial() call chain */
 struct partial_context {
 	gfp_t flags;
 	unsigned int orig_size;
-	void *object;
+};
+
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+	gfp_t flags;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
 };
 
 static inline bool kmem_cache_debug(struct kmem_cache *s)
···
 		p += s->red_left_pad;
 
 	return p;
-}
-
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	return !kmem_cache_debug(s);
-#else
-	return false;
-#endif
 }
 
 /*
···
 
 #ifdef SLAB_SUPPORTS_SYSFS
 static int sysfs_slab_add(struct kmem_cache *);
-static int sysfs_slab_alias(struct kmem_cache *, const char *);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
-							{ return 0; }
 #endif
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
···
 static inline void debugfs_slab_add(struct kmem_cache *s) { }
 #endif
 
+enum add_mode {
+	ADD_TO_HEAD,
+	ADD_TO_TAIL,
+};
+
 enum stat_item {
-	ALLOC_PCS,		/* Allocation from percpu sheaf */
-	ALLOC_FASTPATH,		/* Allocation from cpu slab */
-	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
-	FREE_PCS,		/* Free to percpu sheaf */
+	ALLOC_FASTPATH,		/* Allocation from percpu sheaves */
+	ALLOC_SLOWPATH,		/* Allocation from partial or new slab */
 	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
 	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
-	FREE_FASTPATH,		/* Free to cpu slab */
-	FREE_SLOWPATH,		/* Freeing not to cpu slab */
-	FREE_FROZEN,		/* Freeing to frozen slab */
+	FREE_FASTPATH,		/* Free to percpu sheaves */
+	FREE_SLOWPATH,		/* Free to a slab */
 	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
 	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
-	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
-	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
-	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
-	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
+	ALLOC_SLAB,		/* New slab acquired from page allocator */
+	ALLOC_NODE_MISMATCH,	/* Requested node different from cpu sheaf */
 	FREE_SLAB,		/* Slab freed to the page allocator */
-	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
-	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
-	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
-	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
-	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
-	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
-	DEACTIVATE_BYPASS,	/* Implicit deactivation */
 	ORDER_FALLBACK,		/* Number of times fallback was necessary */
-	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
 	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
-	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
-	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
-	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
-	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
 	SHEAF_FLUSH,		/* Objects flushed from a sheaf */
 	SHEAF_REFILL,		/* Objects refilled to a sheaf */
 	SHEAF_ALLOC,		/* Allocation of an empty sheaf */
···
 	NR_SLUB_STAT_ITEMS
 };
 
-struct freelist_tid {
-	union {
-		struct {
-			void *freelist;		/* Pointer to next available object */
-			unsigned long tid;	/* Globally unique transaction id */
-		};
-		freelist_full_t freelist_tid;
-	};
-};
-
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
-	struct freelist_tid;
-	struct slab *slab;	/* The slab from which we are allocating */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	struct slab *partial;	/* Partially allocated slabs */
-#endif
-	local_trylock_t lock;	/* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
+struct kmem_cache_stats {
 	unsigned int stat[NR_SLUB_STAT_ITEMS];
-#endif
 };
+#endif
 
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
···
 	 * The rmw is racy on a preemptible kernel but this is acceptable, so
 	 * avoid this_cpu_add()'s irq-disable overhead.
 	 */
-	raw_cpu_inc(s->cpu_slab->stat[si]);
+	raw_cpu_inc(s->cpu_stats->stat[si]);
 #endif
 }
···
 void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
 {
 #ifdef CONFIG_SLUB_STATS
-	raw_cpu_add(s->cpu_slab->stat[si], v);
+	raw_cpu_add(s->cpu_stats->stat[si], v);
 #endif
 }
···
 static nodemask_t slab_nodes;
 
 /*
- * Workqueue used for flush_cpu_slab().
+ * Workqueue used for flushing cpu and kfree_rcu sheaves.
  */
 static struct workqueue_struct *flushwq;
···
 	ptr_addr = (unsigned long)object + s->offset;
 	p = *(freeptr_t *)(ptr_addr);
 	return freelist_ptr_decode(s, p, ptr_addr);
-}
-
-static void prefetch_freepointer(const struct kmem_cache *s, void *object)
-{
-	prefetchw(object + s->offset);
-}
-
-/*
- * When running under KMSAN, get_freepointer_safe() may return an uninitialized
- * pointer value in the case the current thread loses the race for the next
- * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
- * slab_alloc_node() will fail, so the uninitialized value won't be used, but
- * KMSAN will still check all arguments of cmpxchg because of imperfect
- * handling of inline assembly.
- * To work around this problem, we apply __no_kmsan_checks to ensure that
- * get_freepointer_safe() returns initialized memory.
- */
-__no_kmsan_checks
-static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
-{
-	unsigned long freepointer_addr;
-	freeptr_t p;
-
-	if (!debug_pagealloc_enabled_static())
-		return get_freepointer(s, object);
-
-	object = kasan_reset_tag(object);
-	freepointer_addr = (unsigned long)object + s->offset;
-	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
-	return freelist_ptr_decode(s, p, freepointer_addr);
 }
 
 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
···
 	return x.x & OO_MASK;
 }
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
-	unsigned int nr_slabs;
-
-	s->cpu_partial = nr_objects;
-
-	/*
-	 * We take the number of objects but actually limit the number of
-	 * slabs on the per cpu partial list, in order to limit excessive
-	 * growth of the list. For simplicity we assume that the slabs will
-	 * be half-full.
-	 */
-	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
-	s->cpu_partial_slabs = nr_slabs;
-}
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
-	return s->cpu_partial_slabs;
-}
-#else
-#ifdef SLAB_SUPPORTS_SYSFS
-static inline void
-slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
-}
-#endif
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
-	return 0;
-}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
 /*
  * If network-based swap is enabled, slub must keep track of whether memory
  * were allocated from pfmemalloc reserves.
···
 	if (slab->freelist == old->freelist &&
 	    slab->counters == old->counters) {
 		slab->freelist = new->freelist;
-		slab->counters = new->counters;
+		/* prevent tearing for the read in get_partial_node_bulk() */
+		WRITE_ONCE(slab->counters, new->counters);
 		ret = true;
 	}
 	slab_unlock(slab);
···
 {
 	bool ret;
 
-	if (USE_LOCKLESS_FAST_PATH())
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		lockdep_assert_irqs_disabled();
 
 	if (s->flags & __CMPXCHG_DOUBLE)
···
  * request size in the meta data area, for better debug and sanity check.
  */
 static inline void set_orig_size(struct kmem_cache *s,
-				 void *object, unsigned int orig_size)
+				 void *object, unsigned long orig_size)
 {
 	void *p = kasan_reset_tag(object);
···
 	p += get_info_end(s);
 	p += sizeof(struct track) * 2;
 
-	*(unsigned int *)p = orig_size;
+	*(unsigned long *)p = orig_size;
 }
 
-static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+static inline unsigned long get_orig_size(struct kmem_cache *s, void *object)
 {
 	void *p = kasan_reset_tag(object);
···
 	p += get_info_end(s);
 	p += sizeof(struct track) * 2;
 
-	return *(unsigned int *)p;
+	return *(unsigned long *)p;
 }
+
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+/*
+ * Check if memory cgroup or memory allocation profiling is enabled.
+ * If enabled, SLUB tries to reduce memory overhead of accounting
+ * slab objects. If neither is enabled when this function is called,
+ * the optimization is simply skipped to avoid affecting caches that do not
+ * need slabobj_ext metadata.
+ *
+ * However, this may disable optimization when memory cgroup or memory
+ * allocation profiling is used, but slabs are created too early
+ * even before those subsystems are initialized.
+ */
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+	if (s->flags & SLAB_NO_OBJ_EXT)
+		return false;
+
+	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+		return true;
+
+	if (mem_alloc_profiling_enabled())
+		return true;
+
+	return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+	return sizeof(struct slabobj_ext) * slab->objects;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+						    struct slab *slab)
+{
+	unsigned long objext_offset;
+
+	objext_offset = s->size * slab->objects;
+	objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext));
+	return objext_offset;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+						     struct slab *slab)
+{
+	unsigned long objext_offset = obj_exts_offset_in_slab(s, slab);
+	unsigned long objext_size = obj_exts_size_in_slab(slab);
+
+	return objext_offset + objext_size <= slab_size(slab);
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+	unsigned long obj_exts;
+	unsigned long start;
+	unsigned long end;
+
+	obj_exts = slab_obj_exts(slab);
+	if (!obj_exts)
+		return false;
+
+	start = (unsigned long)slab_address(slab);
+	end = start + slab_size(slab);
+	return (obj_exts >= start) && (obj_exts < end);
+}
+#else
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+	return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+	return 0;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+						    struct slab *slab)
+{
+	return 0;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+						     struct slab *slab)
+{
+	return false;
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+	return false;
+}
+
+#endif
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+	/*
+	 * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to
+	 * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but
+	 * allocations within_slab_leftover are preferred. And those may be
+	 * possible or not depending on the particular slab's size.
+	 */
+	return obj_exts_in_slab(s, slab) &&
+	       (slab_get_stride(slab) == s->size);
+}
+
+static unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+	unsigned int offset = get_info_end(s);
+
+	if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+		offset += sizeof(struct track) * 2;
+
+	if (slub_debug_orig_size(s))
+		offset += sizeof(unsigned long);
+
+	offset += kasan_metadata_size(s, false);
+
+	return offset;
+}
+#else
+static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+	return false;
+}
+
+static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+	return 0;
+}
+#endif
 
 #ifdef CONFIG_SLUB_DEBUG
···
 static int disable_higher_order_debug;
 
 /*
- * slub is about to manipulate internal object metadata. This memory lies
- * outside the range of the allocated object, so accessing it would normally
- * be reported by kasan as a bounds error. metadata_access_enable() is used
- * to tell kasan that these accesses are OK.
- */
-static inline void metadata_access_enable(void)
-{
-	kasan_disable_current();
-	kmsan_disable_current();
-}
-
-static inline void metadata_access_disable(void)
-{
-	kmsan_enable_current();
-	kasan_enable_current();
-}
-
-/*
  * Object debugging
  */
···
 	p->handle = handle;
 #endif
 	p->addr = addr;
-	p->cpu = smp_processor_id();
+	p->cpu = raw_smp_processor_id();
 	p->pid = current->pid;
 	p->when = jiffies;
 }
···
 		off += 2 * sizeof(struct track);
 
 	if (slub_debug_orig_size(s))
-		off += sizeof(unsigned int);
+		off += sizeof(unsigned long);
 
 	off += kasan_metadata_size(s, false);
+
+	if (obj_exts_in_object(s, slab))
+		off += sizeof(struct slabobj_ext);
 
 	if (off != size_from_object(s))
 		/* Beginning of the filler is the free pointer */
···
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 
 	WARN_ON(1);
-}
-
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
-			       void **freelist, void *nextfree)
-{
-	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
-	    !check_valid_pointer(s, slab, nextfree) && freelist) {
-		object_err(s, slab, *freelist, "Freechain corrupt");
-		*freelist = NULL;
-		slab_fix(s, "Isolate corrupted freechain");
-		return true;
-	}
-
-	return false;
 }
 
 static void __slab_err(struct slab *slab)
···
 }
 
 /*
- * Object layout:
+ * Object field layout:
  *
- * object address
- * 	Bytes of the object to be managed.
- * 	If the freepointer may overlay the object then the free
- * 	pointer is at the middle of the object.
+ * [Left redzone padding] (if SLAB_RED_ZONE)
+ * - Field size: s->red_left_pad
+ * - Immediately precedes each object when SLAB_RED_ZONE is set.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ *   0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  *
- * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
- * 	0xa5 (POISON_END)
+ * [Object bytes] (object address starts here)
+ * - Field size: s->object_size
+ * - Object payload bytes.
+ * - If the freepointer may overlap the object, it is stored inside
+ *   the object (typically near the middle).
+ * - Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ *   0xa5 (POISON_END) when __OBJECT_POISON is enabled.
  *
- * object + s->object_size
- * 	Padding to reach word boundary. This is also used for Redzoning.
- * 	Padding is extended by another word if Redzoning is enabled and
- * 	object_size == inuse.
+ * [Word-align padding] (right redzone when SLAB_RED_ZONE is set)
+ * - Field size: s->inuse - s->object_size
+ * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no
+ *   padding, explicitly extend by one word so the right redzone is
+ *   non-empty.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ *   0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  *
- * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
- * 0xcc (SLUB_RED_ACTIVE) for objects in use.
+ * [Metadata starts at object + s->inuse]
+ * - A. freelist pointer (if freeptr_outside_object)
+ * - B. alloc tracking (SLAB_STORE_USER)
+ * - C. free tracking (SLAB_STORE_USER)
+ * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER)
+ * - E. KASAN metadata (if enabled)
  *
- * object + s->inuse
- * 	Meta data starts here.
+ * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE)
+ * - One mandatory debug word to guarantee a minimum poisoned gap
+ *   between metadata and the next object, independent of alignment.
+ * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set.
+ * [Final alignment padding]
+ * - Bytes added by ALIGN(size, s->align) to reach s->size.
+ * - When the padding is large enough, it can be used to store
+ *   struct slabobj_ext for accounting metadata (obj_exts_in_object()).
+ * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE)
+ *   when SLAB_POISON is set.
  *
- * 	A. Free pointer (if we cannot overwrite object on free)
- * 	B. Tracking data for SLAB_STORE_USER
- * 	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
- * 	D. Padding to reach required alignment boundary or at minimum
- * 		one word if debugging is on to be able to detect writes
- * 		before the word boundary.
+ * Notes:
+ * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE.
+ * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON.
+ * - The trailing padding is pre-filled with POISON_INUSE by
+ *   setup_slab_debug() when SLAB_POISON is set, and is validated by
+ *   check_pad_bytes().
+ * - The first object pointer is slab_address(slab) +
+ *   (s->red_left_pad if redzoning); subsequent objects are reached by
+ *   adding s->size each time.
  *
- * 	Padding is done using 0x5a (POISON_INUSE)
- *
- * object + s->size
- * 	Nothing is used beyond s->size.
- *
- * If slabcaches are merged then the object_size and inuse boundaries are mostly
- * ignored. And therefore no slab options that rely on these boundaries
- * may be used with merged slabcaches.
+ * If a slab cache flag relies on specific metadata to exist at a fixed
+ * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging.
+ * Otherwise, the cache would misbehave as s->object_size and s->inuse are
+ * adjusted during cache merging (see __kmem_cache_alias()).
  */
-
 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 {
 	unsigned long off = get_info_end(s);	/* The end of info */
···
 		off += 2 * sizeof(struct track);
 
 		if (s->flags & SLAB_KMALLOC)
-			off += sizeof(unsigned int);
+			off += sizeof(unsigned long);
 	}
 
 	off += kasan_metadata_size(s, false);
+
+	if (obj_exts_in_object(s, slab))
+		off += sizeof(struct slabobj_ext);
 
 	if (size_from_object(s) == off)
 		return 1;
···
 	start = slab_address(slab);
 	length = slab_size(slab);
 	end = start + length;
-	remainder = length % s->size;
+
+	if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) {
+		remainder = length;
+		remainder -= obj_exts_offset_in_slab(s, slab);
+		remainder -= obj_exts_size_in_slab(slab);
+	} else {
+		remainder = length % s->size;
+	}
+
 	if (!remainder)
 		return;
···
 					   int objects) {}
 static inline void dec_slabs_node(struct kmem_cache *s, int node,
 				  int objects) {}
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
-			       void **freelist, void *nextfree)
-{
-	return false;
-}
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
···
 
 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
 {
-	struct slabobj_ext *slab_exts;
 	struct slab *obj_exts_slab;
+	unsigned long slab_exts;
 
 	obj_exts_slab = virt_to_slab(obj_exts);
 	slab_exts = slab_obj_exts(obj_exts_slab);
 	if (slab_exts) {
+		get_slab_obj_exts(slab_exts);
 		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
 						 obj_exts_slab, obj_exts);
+		struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab,
+						       slab_exts, offs);
 
-		if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+		if (unlikely(is_codetag_empty(&ext->ref))) {
+			put_slab_obj_exts(slab_exts);
 			return;
+		}
 
 		/* codetag should be NULL here */
-		WARN_ON(slab_exts[offs].ref.ct);
-		set_codetag_empty(&slab_exts[offs].ref);
+		WARN_ON(ext->ref.ct);
+		set_codetag_empty(&ext->ref);
+		put_slab_obj_exts(slab_exts);
 	}
 }
···
 	slab->obj_exts = 0;
 }
 
+/*
+ * Calculate the allocation size for slabobj_ext array.
+ *
+ * When memory allocation profiling is enabled, the obj_exts array
+ * could be allocated from the same slab cache it's being allocated for.
+ * This would prevent the slab from ever being freed because it would
+ * always contain at least one allocated object (its own obj_exts array).
+ *
+ * To avoid this, increase the allocation size when we detect the array
+ * may come from the same cache, forcing it to use a different cache.
+ */
+static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
+					 struct slab *slab, gfp_t gfp)
+{
+	size_t sz = sizeof(struct slabobj_ext) * slab->objects;
+	struct kmem_cache *obj_exts_cache;
+
+	/*
+	 * slabobj_ext array for KMALLOC_CGROUP allocations
+	 * are served from KMALLOC_NORMAL caches.
+	 */
+	if (!mem_alloc_profiling_enabled())
+		return sz;
+
+	if (sz > KMALLOC_MAX_CACHE_SIZE)
+		return sz;
+
+	if (!is_kmalloc_normal(s))
+		return sz;
+
+	obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0);
+	/*
+	 * We can't simply compare s with obj_exts_cache, because random kmalloc
+	 * caches have multiple caches per size, selected by caller address.
+	 * Since caller address may differ between kmalloc_slab() and actual
+	 * allocation, bump size when sizes are equal.
+	 */
+	if (s->object_size == obj_exts_cache->object_size)
+		return obj_exts_cache->object_size + 1;
+
+	return sz;
+}
+
 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 		        gfp_t gfp, bool new_slab)
 {
···
 	unsigned long new_exts;
 	unsigned long old_exts;
 	struct slabobj_ext *vec;
+	size_t sz;
 
 	gfp &= ~OBJCGS_CLEAR_MASK;
 	/* Prevent recursive extension vector allocation */
 	gfp |= __GFP_NO_OBJ_EXT;
+
+	sz = obj_exts_alloc_size(s, slab, gfp);
 
 	/*
 	 * Note that allow_spin may be false during early boot and its
···
 	 * architectures with cmpxchg16b, early obj_exts will be missing for
 	 * very early allocations on those.
 	 */
-	if (unlikely(!allow_spin)) {
-		size_t sz = objects * sizeof(struct slabobj_ext);
-
+	if (unlikely(!allow_spin))
 		vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
 				     slab_nid(slab));
-	} else {
-		vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
-				   slab_nid(slab));
-	}
+	else
+		vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab));
+
 	if (!vec) {
 		/*
 		 * Try to mark vectors which failed to allocate.
···
 		return -ENOMEM;
 	}
 
+	VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL &&
+			virt_to_slab(vec)->slab_cache == s);
+
 	new_exts = (unsigned long)vec;
 	if (unlikely(!allow_spin))
 		new_exts |= OBJEXTS_NOSPIN_ALLOC;
···
 retry:
 	old_exts = READ_ONCE(slab->obj_exts);
 	handle_failed_objexts_alloc(old_exts, vec, objects);
+	slab_set_stride(slab, sizeof(struct slabobj_ext));
+
 	if (new_slab) {
 		/*
 		 * If the slab is brand new and nobody can yet access its
···
 {
 	struct slabobj_ext *obj_exts;
 
-	obj_exts = slab_obj_exts(slab);
+	obj_exts = (struct slabobj_ext *)slab_obj_exts(slab);
 	if (!obj_exts) {
 		/*
 		 * If obj_exts allocation failed, slab->obj_exts is set to
 		 * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
 		 * clear the flag.
 		 */
+		slab->obj_exts = 0;
+		return;
+	}
+
+	if (obj_exts_in_slab(slab->slab_cache, slab)) {
 		slab->obj_exts = 0;
 		return;
 	}
···
 	slab->obj_exts = 0;
 }
 
+/*
+ * Try to allocate slabobj_ext array from unused space.
2209 + * This function must be called on a freshly allocated slab to prevent 2210 + * concurrency problems. 2211 + */ 2212 + static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) 2213 + { 2214 + void *addr; 2215 + unsigned long obj_exts; 2216 + 2217 + if (!need_slab_obj_exts(s)) 2218 + return; 2219 + 2220 + if (obj_exts_fit_within_slab_leftover(s, slab)) { 2221 + addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab); 2222 + addr = kasan_reset_tag(addr); 2223 + obj_exts = (unsigned long)addr; 2224 + 2225 + get_slab_obj_exts(obj_exts); 2226 + memset(addr, 0, obj_exts_size_in_slab(slab)); 2227 + put_slab_obj_exts(obj_exts); 2228 + 2229 + #ifdef CONFIG_MEMCG 2230 + obj_exts |= MEMCG_DATA_OBJEXTS; 2231 + #endif 2232 + slab->obj_exts = obj_exts; 2233 + slab_set_stride(slab, sizeof(struct slabobj_ext)); 2234 + } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) { 2235 + unsigned int offset = obj_exts_offset_in_object(s); 2236 + 2237 + obj_exts = (unsigned long)slab_address(slab); 2238 + obj_exts += s->red_left_pad; 2239 + obj_exts += offset; 2240 + 2241 + get_slab_obj_exts(obj_exts); 2242 + for_each_object(addr, s, slab_address(slab), slab->objects) 2243 + memset(kasan_reset_tag(addr) + offset, 0, 2244 + sizeof(struct slabobj_ext)); 2245 + put_slab_obj_exts(obj_exts); 2246 + 2247 + #ifdef CONFIG_MEMCG 2248 + obj_exts |= MEMCG_DATA_OBJEXTS; 2249 + #endif 2250 + slab->obj_exts = obj_exts; 2251 + slab_set_stride(slab, s->size); 2252 + } 2253 + } 2254 + 2266 2255 #else /* CONFIG_SLAB_OBJ_EXT */ 2267 2256 2268 2257 static inline void init_slab_obj_exts(struct slab *slab) ··· 2327 2220 { 2328 2221 } 2329 2222 2223 + static inline void alloc_slab_obj_exts_early(struct kmem_cache *s, 2224 + struct slab *slab) 2225 + { 2226 + } 2227 + 2330 2228 #endif /* CONFIG_SLAB_OBJ_EXT */ 2331 2229 2332 2230 #ifdef CONFIG_MEM_ALLOC_PROFILING 2333 2231 2334 - static inline struct slabobj_ext * 2335 - prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) 2232 + static inline unsigned long 2233 + prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab, 2234 + gfp_t flags, void *p) 2336 2235 { 2337 - struct slab *slab; 2338 - 2339 - slab = virt_to_slab(p); 2340 2236 if (!slab_obj_exts(slab) && 2341 2237 alloc_slab_obj_exts(slab, s, flags, false)) { 2342 2238 pr_warn_once("%s, %s: Failed to create slab extension vector!\n", 2343 2239 __func__, s->name); 2344 - return NULL; 2240 + return 0; 2345 2241 } 2346 2242 2347 - return slab_obj_exts(slab) + obj_to_index(s, slab, p); 2243 + return slab_obj_exts(slab); 2348 2244 } 2245 + 2349 2246 2350 2247 /* Should be called only if mem_alloc_profiling_enabled() */ 2351 2248 static noinline void 2352 2249 __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) 2353 2250 { 2354 - struct slabobj_ext *obj_exts; 2251 + unsigned long obj_exts; 2252 + struct slabobj_ext *obj_ext; 2253 + struct slab *slab; 2355 2254 2356 2255 if (!object) 2357 2256 return; ··· 2368 2255 if (flags & __GFP_NO_OBJ_EXT) 2369 2256 return; 2370 2257 2371 - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); 2258 + slab = virt_to_slab(object); 2259 + obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object); 2372 2260 /* 2373 2261 * Currently obj_exts is used only for allocation profiling. 2374 2262 * If other users appear then mem_alloc_profiling_enabled() 2375 2263 * check should be added before alloc_tag_add(). 
2376 2264 */ 2377 - if (likely(obj_exts)) 2378 - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); 2379 - else 2265 + if (obj_exts) { 2266 + unsigned int obj_idx = obj_to_index(s, slab, object); 2267 + 2268 + get_slab_obj_exts(obj_exts); 2269 + obj_ext = slab_obj_ext(slab, obj_exts, obj_idx); 2270 + alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size); 2271 + put_slab_obj_exts(obj_exts); 2272 + } else { 2380 2273 alloc_tag_set_inaccurate(current->alloc_tag); 2274 + } 2381 2275 } 2382 2276 2383 2277 static inline void ··· 2399 2279 __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, 2400 2280 int objects) 2401 2281 { 2402 - struct slabobj_ext *obj_exts; 2403 2282 int i; 2283 + unsigned long obj_exts; 2404 2284 2405 2285 /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ 2406 2286 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) ··· 2410 2290 if (!obj_exts) 2411 2291 return; 2412 2292 2293 + get_slab_obj_exts(obj_exts); 2413 2294 for (i = 0; i < objects; i++) { 2414 2295 unsigned int off = obj_to_index(s, slab, p[i]); 2415 2296 2416 - alloc_tag_sub(&obj_exts[off].ref, s->size); 2297 + alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size); 2417 2298 } 2299 + put_slab_obj_exts(obj_exts); 2418 2300 } 2419 2301 2420 2302 static inline void ··· 2474 2352 void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, 2475 2353 int objects) 2476 2354 { 2477 - struct slabobj_ext *obj_exts; 2355 + unsigned long obj_exts; 2478 2356 2479 2357 if (!memcg_kmem_online()) 2480 2358 return; ··· 2483 2361 if (likely(!obj_exts)) 2484 2362 return; 2485 2363 2364 + get_slab_obj_exts(obj_exts); 2486 2365 __memcg_slab_free_hook(s, slab, p, objects, obj_exts); 2366 + put_slab_obj_exts(obj_exts); 2487 2367 } 2488 2368 2489 2369 static __fastpath_inline 2490 2370 bool memcg_slab_post_charge(void *p, gfp_t flags) 2491 2371 { 2492 - struct slabobj_ext *slab_exts; 2372 + unsigned long obj_exts; 2373 + struct slabobj_ext *obj_ext; 2493 2374 struct kmem_cache *s; 2494 2375 struct page *page; 2495 2376 struct slab *slab; ··· 2533 2408 return true; 2534 2409 2535 2410 /* Ignore already charged objects. 
*/ 2536 - slab_exts = slab_obj_exts(slab); 2537 - if (slab_exts) { 2411 + obj_exts = slab_obj_exts(slab); 2412 + if (obj_exts) { 2413 + get_slab_obj_exts(obj_exts); 2538 2414 off = obj_to_index(s, slab, p); 2539 - if (unlikely(slab_exts[off].objcg)) 2415 + obj_ext = slab_obj_ext(slab, obj_exts, off); 2416 + if (unlikely(obj_ext->objcg)) { 2417 + put_slab_obj_exts(obj_exts); 2540 2418 return true; 2419 + } 2420 + put_slab_obj_exts(obj_exts); 2541 2421 } 2542 2422 2543 2423 return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); ··· 2726 2596 return object; 2727 2597 } 2728 2598 2729 - static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2599 + static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, 2600 + unsigned int capacity) 2730 2601 { 2731 2602 struct slab_sheaf *sheaf; 2732 2603 size_t sheaf_size; ··· 2745 2614 if (s->flags & SLAB_KMALLOC) 2746 2615 gfp |= __GFP_NO_OBJ_EXT; 2747 2616 2748 - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2617 + sheaf_size = struct_size(sheaf, objects, capacity); 2749 2618 sheaf = kzalloc(sheaf_size, gfp); 2750 2619 2751 2620 if (unlikely(!sheaf)) ··· 2758 2627 return sheaf; 2759 2628 } 2760 2629 2630 + static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, 2631 + gfp_t gfp) 2632 + { 2633 + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); 2634 + } 2635 + 2761 2636 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) 2762 2637 { 2763 2638 kfree(sheaf); ··· 2771 2634 stat(s, SHEAF_FREE); 2772 2635 } 2773 2636 2774 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 2775 - size_t size, void **p); 2776 - 2637 + static unsigned int 2638 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 2639 + unsigned int max); 2777 2640 2778 2641 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, 2779 2642 gfp_t gfp) ··· 2784 2647 if (!to_fill) 2785 2648 return 0; 2786 2649 2787 - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, 2788 - &sheaf->objects[sheaf->size]); 2650 + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, 2651 + to_fill); 2789 2652 2790 2653 sheaf->size += filled; 2791 2654 ··· 2986 2849 { 2987 2850 int cpu; 2988 2851 2852 + /* 2853 + * We may be unwinding cache creation that failed before or during the 2854 + * allocation of this. 2855 + */ 2856 + if (!s->cpu_sheaves) 2857 + return; 2858 + 2859 + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ 2860 + if (!cache_has_sheaves(s)) 2861 + goto free_pcs; 2862 + 2989 2863 for_each_possible_cpu(cpu) { 2990 2864 struct slub_percpu_sheaves *pcs; 2991 2865 2992 2866 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2993 2867 2994 - /* can happen when unwinding failed create */ 2868 + /* This can happen when unwinding failed cache creation. 
*/ 2995 2869 if (!pcs->main) 2996 2870 continue; 2997 2871 ··· 3024 2876 } 3025 2877 } 3026 2878 2879 + free_pcs: 3027 2880 free_percpu(s->cpu_sheaves); 3028 2881 s->cpu_sheaves = NULL; 3029 2882 } 3030 2883 3031 - static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) 2884 + static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, 2885 + bool allow_spin) 3032 2886 { 3033 2887 struct slab_sheaf *empty = NULL; 3034 2888 unsigned long flags; ··· 3038 2888 if (!data_race(barn->nr_empty)) 3039 2889 return NULL; 3040 2890 3041 - spin_lock_irqsave(&barn->lock, flags); 2891 + if (likely(allow_spin)) 2892 + spin_lock_irqsave(&barn->lock, flags); 2893 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 2894 + return NULL; 3042 2895 3043 2896 if (likely(barn->nr_empty)) { 3044 2897 empty = list_first_entry(&barn->sheaves_empty, ··· 3118 2965 * change. 3119 2966 */ 3120 2967 static struct slab_sheaf * 3121 - barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) 2968 + barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, 2969 + bool allow_spin) 3122 2970 { 3123 2971 struct slab_sheaf *full = NULL; 3124 2972 unsigned long flags; ··· 3127 2973 if (!data_race(barn->nr_full)) 3128 2974 return NULL; 3129 2975 3130 - spin_lock_irqsave(&barn->lock, flags); 2976 + if (likely(allow_spin)) 2977 + spin_lock_irqsave(&barn->lock, flags); 2978 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 2979 + return NULL; 3131 2980 3132 2981 if (likely(barn->nr_full)) { 3133 2982 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, ··· 3151 2994 * barn. But if there are too many full sheaves, reject this with -E2BIG. 3152 2995 */ 3153 2996 static struct slab_sheaf * 3154 - barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) 2997 + barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, 2998 + bool allow_spin) 3155 2999 { 3156 3000 struct slab_sheaf *empty; 3157 3001 unsigned long flags; ··· 3163 3005 if (!data_race(barn->nr_empty)) 3164 3006 return ERR_PTR(-ENOMEM); 3165 3007 3166 - spin_lock_irqsave(&barn->lock, flags); 3008 + if (likely(allow_spin)) 3009 + spin_lock_irqsave(&barn->lock, flags); 3010 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3011 + return ERR_PTR(-EBUSY); 3167 3012 3168 3013 if (likely(barn->nr_empty)) { 3169 3014 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, ··· 3359 3198 static __always_inline void account_slab(struct slab *slab, int order, 3360 3199 struct kmem_cache *s, gfp_t gfp) 3361 3200 { 3362 - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) 3201 + if (memcg_kmem_online() && 3202 + (s->flags & SLAB_ACCOUNT) && 3203 + !slab_obj_exts(slab)) 3363 3204 alloc_slab_obj_exts(slab, s, gfp, true); 3364 3205 3365 3206 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), ··· 3425 3262 slab->objects = oo_objects(oo); 3426 3263 slab->inuse = 0; 3427 3264 slab->frozen = 0; 3428 - init_slab_obj_exts(slab); 3429 - 3430 - account_slab(slab, oo_order(oo), s, flags); 3431 3265 3432 3266 slab->slab_cache = s; 3433 3267 ··· 3433 3273 start = slab_address(slab); 3434 3274 3435 3275 setup_slab_debug(s, slab, start); 3276 + init_slab_obj_exts(slab); 3277 + /* 3278 + * Poison the slab before initializing the slabobj_ext array 3279 + * to prevent the array from being overwritten. 
3280 + */ 3281 + alloc_slab_obj_exts_early(s, slab); 3282 + account_slab(slab, oo_order(oo), s, flags); 3436 3283 3437 3284 shuffle = shuffle_freelist(s, slab); 3438 3285 ··· 3470 3303 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 3471 3304 } 3472 3305 3473 - static void __free_slab(struct kmem_cache *s, struct slab *slab) 3306 + static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) 3474 3307 { 3475 3308 struct page *page = slab_page(slab); 3476 3309 int order = compound_order(page); ··· 3481 3314 __ClearPageSlab(page); 3482 3315 mm_account_reclaimed_pages(pages); 3483 3316 unaccount_slab(slab, order, s); 3484 - free_frozen_pages(page, order); 3317 + if (allow_spin) 3318 + free_frozen_pages(page, order); 3319 + else 3320 + free_frozen_pages_nolock(page, order); 3321 + } 3322 + 3323 + static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) 3324 + { 3325 + /* 3326 + * Since it was just allocated, we can skip the actions in 3327 + * discard_slab() and free_slab(). 3328 + */ 3329 + __free_slab(s, slab, false); 3485 3330 } 3486 3331 3487 3332 static void rcu_free_slab(struct rcu_head *h) 3488 3333 { 3489 3334 struct slab *slab = container_of(h, struct slab, rcu_head); 3490 3335 3491 - __free_slab(slab->slab_cache, slab); 3336 + __free_slab(slab->slab_cache, slab, true); 3492 3337 } 3493 3338 3494 3339 static void free_slab(struct kmem_cache *s, struct slab *slab) ··· 3516 3337 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) 3517 3338 call_rcu(&slab->rcu_head, rcu_free_slab); 3518 3339 else 3519 - __free_slab(s, slab); 3340 + __free_slab(s, slab, true); 3520 3341 } 3521 3342 3522 3343 static void discard_slab(struct kmem_cache *s, struct slab *slab) ··· 3544 3365 * Management of partially allocated slabs. 3545 3366 */ 3546 3367 static inline void 3547 - __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) 3368 + __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) 3548 3369 { 3549 3370 n->nr_partial++; 3550 - if (tail == DEACTIVATE_TO_TAIL) 3371 + if (mode == ADD_TO_TAIL) 3551 3372 list_add_tail(&slab->slab_list, &n->partial); 3552 3373 else 3553 3374 list_add(&slab->slab_list, &n->partial); ··· 3555 3376 } 3556 3377 3557 3378 static inline void add_partial(struct kmem_cache_node *n, 3558 - struct slab *slab, int tail) 3379 + struct slab *slab, enum add_mode mode) 3559 3380 { 3560 3381 lockdep_assert_held(&n->list_lock); 3561 - __add_partial(n, slab, tail); 3382 + __add_partial(n, slab, mode); 3562 3383 } 3563 3384 3564 3385 static inline void remove_partial(struct kmem_cache_node *n, ··· 3609 3430 return object; 3610 3431 } 3611 3432 3612 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); 3613 - 3614 3433 /* 3615 3434 * Called only for kmem_cache_debug() caches to allocate from a freshly 3616 3435 * allocated slab. Allocate a single object instead of whole freelist ··· 3624 3447 void *object; 3625 3448 3626 3449 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { 3627 - /* Unlucky, discard newly allocated slab */ 3628 - defer_deactivate_slab(slab, NULL); 3450 + /* Unlucky, discard newly allocated slab. 
*/ 3451 + free_new_slab_nolock(s, slab); 3629 3452 return NULL; 3630 3453 } 3631 3454 ··· 3651 3474 if (slab->inuse == slab->objects) 3652 3475 add_full(s, n, slab); 3653 3476 else 3654 - add_partial(n, slab, DEACTIVATE_TO_HEAD); 3477 + add_partial(n, slab, ADD_TO_HEAD); 3655 3478 3656 3479 inc_slabs_node(s, nid, slab->objects); 3657 3480 spin_unlock_irqrestore(&n->list_lock, flags); ··· 3659 3482 return object; 3660 3483 } 3661 3484 3662 - #ifdef CONFIG_SLUB_CPU_PARTIAL 3663 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); 3664 - #else 3665 - static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, 3666 - int drain) { } 3667 - #endif 3668 3485 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); 3669 3486 3670 - /* 3671 - * Try to allocate a partial slab from a specific node. 3672 - */ 3673 - static struct slab *get_partial_node(struct kmem_cache *s, 3674 - struct kmem_cache_node *n, 3675 - struct partial_context *pc) 3487 + static bool get_partial_node_bulk(struct kmem_cache *s, 3488 + struct kmem_cache_node *n, 3489 + struct partial_bulk_context *pc, 3490 + bool allow_spin) 3676 3491 { 3677 - struct slab *slab, *slab2, *partial = NULL; 3492 + struct slab *slab, *slab2; 3493 + unsigned int total_free = 0; 3678 3494 unsigned long flags; 3679 - unsigned int partial_slabs = 0; 3495 + 3496 + /* Racy check to avoid taking the lock unnecessarily. */ 3497 + if (!n || data_race(!n->nr_partial)) 3498 + return false; 3499 + 3500 + INIT_LIST_HEAD(&pc->slabs); 3501 + 3502 + if (allow_spin) 3503 + spin_lock_irqsave(&n->list_lock, flags); 3504 + else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3505 + return false; 3506 + 3507 + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3508 + struct freelist_counters flc; 3509 + unsigned int slab_free; 3510 + 3511 + if (!pfmemalloc_match(slab, pc->flags)) 3512 + continue; 3513 + 3514 + /* 3515 + * determine the number of free objects in the slab racily 3516 + * 3517 + * slab_free is a lower bound due to possible subsequent 3518 + * concurrent freeing, so the caller may get more objects than 3519 + * requested and must handle that 3520 + */ 3521 + flc.counters = data_race(READ_ONCE(slab->counters)); 3522 + slab_free = flc.objects - flc.inuse; 3523 + 3524 + /* we have already min and this would get us over the max */ 3525 + if (total_free >= pc->min_objects 3526 + && total_free + slab_free > pc->max_objects) 3527 + break; 3528 + 3529 + remove_partial(n, slab); 3530 + 3531 + list_add(&slab->slab_list, &pc->slabs); 3532 + 3533 + total_free += slab_free; 3534 + if (total_free >= pc->max_objects) 3535 + break; 3536 + } 3537 + 3538 + spin_unlock_irqrestore(&n->list_lock, flags); 3539 + return total_free > 0; 3540 + } 3541 + 3542 + /* 3543 + * Try to allocate object from a partial slab on a specific node. 3544 + */ 3545 + static void *get_from_partial_node(struct kmem_cache *s, 3546 + struct kmem_cache_node *n, 3547 + struct partial_context *pc) 3548 + { 3549 + struct slab *slab, *slab2; 3550 + unsigned long flags; 3551 + void *object = NULL; 3680 3552 3681 3553 /* 3682 3554 * Racy check. If we mistakenly see no partial slabs then we 3683 3555 * just allocate an empty slab. If we mistakenly try to get a 3684 - * partial slab and there is none available then get_partial() 3556 + * partial slab and there is none available then get_from_partial() 3685 3557 * will return NULL. 
3686 3558 */ 3687 3559 if (!n || !n->nr_partial) ··· 3741 3515 else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3742 3516 return NULL; 3743 3517 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3518 + 3519 + struct freelist_counters old, new; 3520 + 3744 3521 if (!pfmemalloc_match(slab, pc->flags)) 3745 3522 continue; 3746 3523 3747 3524 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 3748 - void *object = alloc_single_from_partial(s, n, slab, 3525 + object = alloc_single_from_partial(s, n, slab, 3749 3526 pc->orig_size); 3750 - if (object) { 3751 - partial = slab; 3752 - pc->object = object; 3527 + if (object) 3753 3528 break; 3754 - } 3755 3529 continue; 3756 3530 } 3757 3531 3758 - remove_partial(n, slab); 3532 + /* 3533 + * get a single object from the slab. This might race against 3534 + * __slab_free(), which however has to take the list_lock if 3535 + * it's about to make the slab fully free. 3536 + */ 3537 + do { 3538 + old.freelist = slab->freelist; 3539 + old.counters = slab->counters; 3759 3540 3760 - if (!partial) { 3761 - partial = slab; 3762 - stat(s, ALLOC_FROM_PARTIAL); 3541 + new.freelist = get_freepointer(s, old.freelist); 3542 + new.counters = old.counters; 3543 + new.inuse++; 3763 3544 3764 - if ((slub_get_cpu_partial(s) == 0)) { 3765 - break; 3766 - } 3767 - } else { 3768 - put_cpu_partial(s, slab, 0); 3769 - stat(s, CPU_PARTIAL_NODE); 3545 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); 3770 3546 3771 - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { 3772 - break; 3773 - } 3774 - } 3547 + object = old.freelist; 3548 + if (!new.freelist) 3549 + remove_partial(n, slab); 3550 + 3551 + break; 3775 3552 } 3776 3553 spin_unlock_irqrestore(&n->list_lock, flags); 3777 - return partial; 3554 + return object; 3778 3555 } 3779 3556 3780 3557 /* 3781 - * Get a slab from somewhere. Search in increasing NUMA distances. 3558 + * Get an object from somewhere. Search in increasing NUMA distances. 3782 3559 */ 3783 - static struct slab *get_any_partial(struct kmem_cache *s, 3784 - struct partial_context *pc) 3560 + static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) 3785 3561 { 3786 3562 #ifdef CONFIG_NUMA 3787 3563 struct zonelist *zonelist; 3788 3564 struct zoneref *z; 3789 3565 struct zone *zone; 3790 3566 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 3791 - struct slab *slab; 3792 3567 unsigned int cpuset_mems_cookie; 3793 3568 3794 3569 /* ··· 3824 3597 3825 3598 if (n && cpuset_zone_allowed(zone, pc->flags) && 3826 3599 n->nr_partial > s->min_partial) { 3827 - slab = get_partial_node(s, n, pc); 3828 - if (slab) { 3600 + 3601 + void *object = get_from_partial_node(s, n, pc); 3602 + 3603 + if (object) { 3829 3604 /* 3830 3605 * Don't check read_mems_allowed_retry() 3831 3606 * here - if mems_allowed was updated in ··· 3835 3606 * between allocation and the cpuset 3836 3607 * update 3837 3608 */ 3838 - return slab; 3609 + return object; 3839 3610 } 3840 3611 } 3841 3612 } ··· 3845 3616 } 3846 3617 3847 3618 /* 3848 - * Get a partial slab, lock it and return it. 
3619 + * Get an object from a partial slab 3849 3620 */ 3850 - static struct slab *get_partial(struct kmem_cache *s, int node, 3851 - struct partial_context *pc) 3621 + static void *get_from_partial(struct kmem_cache *s, int node, 3622 + struct partial_context *pc) 3852 3623 { 3853 - struct slab *slab; 3854 3624 int searchnode = node; 3625 + void *object; 3855 3626 3856 3627 if (node == NUMA_NO_NODE) 3857 3628 searchnode = numa_mem_id(); 3858 3629 3859 - slab = get_partial_node(s, get_node(s, searchnode), pc); 3860 - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3861 - return slab; 3630 + object = get_from_partial_node(s, get_node(s, searchnode), pc); 3631 + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3632 + return object; 3862 3633 3863 - return get_any_partial(s, pc); 3864 - } 3865 - 3866 - #ifdef CONFIG_PREEMPTION 3867 - /* 3868 - * Calculate the next globally unique transaction for disambiguation 3869 - * during cmpxchg. The transactions start with the cpu number and are then 3870 - * incremented by CONFIG_NR_CPUS. 3871 - */ 3872 - #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 3873 - #else 3874 - /* 3875 - * No preemption supported therefore also no need to check for 3876 - * different cpus. 3877 - */ 3878 - #define TID_STEP 1 3879 - #endif /* CONFIG_PREEMPTION */ 3880 - 3881 - static inline unsigned long next_tid(unsigned long tid) 3882 - { 3883 - return tid + TID_STEP; 3884 - } 3885 - 3886 - #ifdef SLUB_DEBUG_CMPXCHG 3887 - static inline unsigned int tid_to_cpu(unsigned long tid) 3888 - { 3889 - return tid % TID_STEP; 3890 - } 3891 - 3892 - static inline unsigned long tid_to_event(unsigned long tid) 3893 - { 3894 - return tid / TID_STEP; 3895 - } 3896 - #endif 3897 - 3898 - static inline unsigned int init_tid(int cpu) 3899 - { 3900 - return cpu; 3901 - } 3902 - 3903 - static inline void note_cmpxchg_failure(const char *n, 3904 - const struct kmem_cache *s, unsigned long tid) 3905 - { 3906 - #ifdef SLUB_DEBUG_CMPXCHG 3907 - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 3908 - 3909 - pr_info("%s %s: cmpxchg redo ", n, s->name); 3910 - 3911 - if (IS_ENABLED(CONFIG_PREEMPTION) && 3912 - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { 3913 - pr_warn("due to cpu change %d -> %d\n", 3914 - tid_to_cpu(tid), tid_to_cpu(actual_tid)); 3915 - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { 3916 - pr_warn("due to cpu running other code. Event %ld->%ld\n", 3917 - tid_to_event(tid), tid_to_event(actual_tid)); 3918 - } else { 3919 - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 3920 - actual_tid, tid, next_tid(tid)); 3921 - } 3922 - #endif 3923 - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 3924 - } 3925 - 3926 - static void init_kmem_cache_cpus(struct kmem_cache *s) 3927 - { 3928 - #ifdef CONFIG_PREEMPT_RT 3929 - /* 3930 - * Register lockdep key for non-boot kmem caches to avoid 3931 - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() 3932 - */ 3933 - bool finegrain_lockdep = !init_section_contains(s, 1); 3934 - #else 3935 - /* 3936 - * Don't bother with different lockdep classes for each 3937 - * kmem_cache, since we only use local_trylock_irqsave(). 
3938 - */ 3939 - bool finegrain_lockdep = false; 3940 - #endif 3941 - int cpu; 3942 - struct kmem_cache_cpu *c; 3943 - 3944 - if (finegrain_lockdep) 3945 - lockdep_register_key(&s->lock_key); 3946 - for_each_possible_cpu(cpu) { 3947 - c = per_cpu_ptr(s->cpu_slab, cpu); 3948 - local_trylock_init(&c->lock); 3949 - if (finegrain_lockdep) 3950 - lockdep_set_class(&c->lock, &s->lock_key); 3951 - c->tid = init_tid(cpu); 3952 - } 3953 - } 3954 - 3955 - /* 3956 - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, 3957 - * unfreezes the slabs and puts it on the proper list. 3958 - * Assumes the slab has been already safely taken away from kmem_cache_cpu 3959 - * by the caller. 3960 - */ 3961 - static void deactivate_slab(struct kmem_cache *s, struct slab *slab, 3962 - void *freelist) 3963 - { 3964 - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 3965 - int free_delta = 0; 3966 - void *nextfree, *freelist_iter, *freelist_tail; 3967 - int tail = DEACTIVATE_TO_HEAD; 3968 - unsigned long flags = 0; 3969 - struct freelist_counters old, new; 3970 - 3971 - if (READ_ONCE(slab->freelist)) { 3972 - stat(s, DEACTIVATE_REMOTE_FREES); 3973 - tail = DEACTIVATE_TO_TAIL; 3974 - } 3975 - 3976 - /* 3977 - * Stage one: Count the objects on cpu's freelist as free_delta and 3978 - * remember the last object in freelist_tail for later splicing. 3979 - */ 3980 - freelist_tail = NULL; 3981 - freelist_iter = freelist; 3982 - while (freelist_iter) { 3983 - nextfree = get_freepointer(s, freelist_iter); 3984 - 3985 - /* 3986 - * If 'nextfree' is invalid, it is possible that the object at 3987 - * 'freelist_iter' is already corrupted. So isolate all objects 3988 - * starting at 'freelist_iter' by skipping them. 3989 - */ 3990 - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) 3991 - break; 3992 - 3993 - freelist_tail = freelist_iter; 3994 - free_delta++; 3995 - 3996 - freelist_iter = nextfree; 3997 - } 3998 - 3999 - /* 4000 - * Stage two: Unfreeze the slab while splicing the per-cpu 4001 - * freelist to the head of slab's freelist. 4002 - */ 4003 - do { 4004 - old.freelist = READ_ONCE(slab->freelist); 4005 - old.counters = READ_ONCE(slab->counters); 4006 - VM_BUG_ON(!old.frozen); 4007 - 4008 - /* Determine target state of the slab */ 4009 - new.counters = old.counters; 4010 - new.frozen = 0; 4011 - if (freelist_tail) { 4012 - new.inuse -= free_delta; 4013 - set_freepointer(s, freelist_tail, old.freelist); 4014 - new.freelist = freelist; 4015 - } else { 4016 - new.freelist = old.freelist; 4017 - } 4018 - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 4019 - 4020 - /* 4021 - * Stage three: Manipulate the slab list based on the updated state. 4022 - */ 4023 - if (!new.inuse && n->nr_partial >= s->min_partial) { 4024 - stat(s, DEACTIVATE_EMPTY); 4025 - discard_slab(s, slab); 4026 - stat(s, FREE_SLAB); 4027 - } else if (new.freelist) { 4028 - spin_lock_irqsave(&n->list_lock, flags); 4029 - add_partial(n, slab, tail); 4030 - spin_unlock_irqrestore(&n->list_lock, flags); 4031 - stat(s, tail); 4032 - } else { 4033 - stat(s, DEACTIVATE_FULL); 4034 - } 4035 - } 4036 - 4037 - /* 4038 - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock 4039 - * can be acquired without a deadlock before invoking the function. 4040 - * 4041 - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is 4042 - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), 4043 - * and kmalloc() is not used in an unsupported context. 4044 - * 4045 - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). 4046 - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but 4047 - * lockdep_assert() will catch a bug in case: 4048 - * #1 4049 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() 4050 - * or 4051 - * #2 4052 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() 4053 - * 4054 - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt 4055 - * disabled context. The lock will always be acquired and if needed it 4056 - * block and sleep until the lock is available. 4057 - * #1 is possible in !PREEMPT_RT only. 4058 - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: 4059 - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> 4060 - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) 4061 - * 4062 - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B 4063 - */ 4064 - #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) 4065 - #define local_lock_cpu_slab(s, flags) \ 4066 - local_lock_irqsave(&(s)->cpu_slab->lock, flags) 4067 - #else 4068 - #define local_lock_cpu_slab(s, flags) \ 4069 - do { \ 4070 - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ 4071 - lockdep_assert(__l); \ 4072 - } while (0) 4073 - #endif 4074 - 4075 - #define local_unlock_cpu_slab(s, flags) \ 4076 - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) 4077 - 4078 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4079 - static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 4080 - { 4081 - struct kmem_cache_node *n = NULL, *n2 = NULL; 4082 - struct slab *slab, *slab_to_discard = NULL; 4083 - unsigned long flags = 0; 4084 - 4085 - while (partial_slab) { 4086 - slab = partial_slab; 4087 - partial_slab = slab->next; 4088 - 4089 - n2 = get_node(s, slab_nid(slab)); 4090 - if (n != n2) { 4091 - if (n) 4092 - spin_unlock_irqrestore(&n->list_lock, flags); 4093 - 4094 - n = n2; 4095 - spin_lock_irqsave(&n->list_lock, flags); 4096 - } 4097 - 4098 - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { 4099 - slab->next = slab_to_discard; 4100 - slab_to_discard = slab; 4101 - } else { 4102 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 4103 - stat(s, FREE_ADD_PARTIAL); 4104 - } 4105 - } 4106 - 4107 - if (n) 4108 - spin_unlock_irqrestore(&n->list_lock, flags); 4109 - 4110 - while (slab_to_discard) { 4111 - slab = slab_to_discard; 4112 - slab_to_discard = slab_to_discard->next; 4113 - 4114 - stat(s, DEACTIVATE_EMPTY); 4115 - discard_slab(s, slab); 4116 - stat(s, FREE_SLAB); 4117 - } 4118 - } 4119 - 4120 - /* 4121 - * Put all the cpu partial slabs to the node partial list. 
4122 - */ 4123 - static void put_partials(struct kmem_cache *s) 4124 - { 4125 - struct slab *partial_slab; 4126 - unsigned long flags; 4127 - 4128 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4129 - partial_slab = this_cpu_read(s->cpu_slab->partial); 4130 - this_cpu_write(s->cpu_slab->partial, NULL); 4131 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4132 - 4133 - if (partial_slab) 4134 - __put_partials(s, partial_slab); 4135 - } 4136 - 4137 - static void put_partials_cpu(struct kmem_cache *s, 4138 - struct kmem_cache_cpu *c) 4139 - { 4140 - struct slab *partial_slab; 4141 - 4142 - partial_slab = slub_percpu_partial(c); 4143 - c->partial = NULL; 4144 - 4145 - if (partial_slab) 4146 - __put_partials(s, partial_slab); 4147 - } 4148 - 4149 - /* 4150 - * Put a slab into a partial slab slot if available. 4151 - * 4152 - * If we did not find a slot then simply move all the partials to the 4153 - * per node partial list. 4154 - */ 4155 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 4156 - { 4157 - struct slab *oldslab; 4158 - struct slab *slab_to_put = NULL; 4159 - unsigned long flags; 4160 - int slabs = 0; 4161 - 4162 - local_lock_cpu_slab(s, flags); 4163 - 4164 - oldslab = this_cpu_read(s->cpu_slab->partial); 4165 - 4166 - if (oldslab) { 4167 - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { 4168 - /* 4169 - * Partial array is full. Move the existing set to the 4170 - * per node partial list. Postpone the actual unfreezing 4171 - * outside of the critical section. 4172 - */ 4173 - slab_to_put = oldslab; 4174 - oldslab = NULL; 4175 - } else { 4176 - slabs = oldslab->slabs; 4177 - } 4178 - } 4179 - 4180 - slabs++; 4181 - 4182 - slab->slabs = slabs; 4183 - slab->next = oldslab; 4184 - 4185 - this_cpu_write(s->cpu_slab->partial, slab); 4186 - 4187 - local_unlock_cpu_slab(s, flags); 4188 - 4189 - if (slab_to_put) { 4190 - __put_partials(s, slab_to_put); 4191 - stat(s, CPU_PARTIAL_DRAIN); 4192 - } 4193 - } 4194 - 4195 - #else /* CONFIG_SLUB_CPU_PARTIAL */ 4196 - 4197 - static inline void put_partials(struct kmem_cache *s) { } 4198 - static inline void put_partials_cpu(struct kmem_cache *s, 4199 - struct kmem_cache_cpu *c) { } 4200 - 4201 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 4202 - 4203 - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 4204 - { 4205 - unsigned long flags; 4206 - struct slab *slab; 4207 - void *freelist; 4208 - 4209 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4210 - 4211 - slab = c->slab; 4212 - freelist = c->freelist; 4213 - 4214 - c->slab = NULL; 4215 - c->freelist = NULL; 4216 - c->tid = next_tid(c->tid); 4217 - 4218 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4219 - 4220 - if (slab) { 4221 - deactivate_slab(s, slab, freelist); 4222 - stat(s, CPUSLAB_FLUSH); 4223 - } 4224 - } 4225 - 4226 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 4227 - { 4228 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4229 - void *freelist = c->freelist; 4230 - struct slab *slab = c->slab; 4231 - 4232 - c->slab = NULL; 4233 - c->freelist = NULL; 4234 - c->tid = next_tid(c->tid); 4235 - 4236 - if (slab) { 4237 - deactivate_slab(s, slab, freelist); 4238 - stat(s, CPUSLAB_FLUSH); 4239 - } 4240 - 4241 - put_partials_cpu(s, c); 4242 - } 4243 - 4244 - static inline void flush_this_cpu_slab(struct kmem_cache *s) 4245 - { 4246 - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 4247 - 4248 - if (c->slab) 4249 - flush_slab(s, c); 4250 - 4251 - put_partials(s); 4252 - } 4253 - 
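The removed helpers above (and has_cpu_slab() just below) belong to the per-cpu slab and cpu-partial caching layer that percpu sheaves replace; what the series keeps is the lockless freelist+counters update that __slab_update_freelist()/slab_update_freelist() perform. Below is a minimal user-space sketch of that retry pattern, with simplified, hypothetical "toy_" types and a GCC/Clang __int128 compare-exchange standing in for the kernel's try_cmpxchg128 (x86-64 typically needs -mcx16 or libatomic); it is an illustration under those assumptions, not the kernel implementation.

/*
 * Not kernel code: a self-contained illustration. The real
 * struct freelist_counters, slab_update_freelist() and try_cmpxchg128()
 * live in mm/slub.c; the names below are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

/* freelist head and packed counters, updated as one 128-bit unit */
union toy_freelist_counters {
	struct {
		void *freelist;		/* first free object in the slab */
		unsigned long counters;	/* stands in for inuse/objects/frozen */
	};
	unsigned __int128 full;
};

struct toy_slab {
	union toy_freelist_counters fc;
	unsigned int objects;
};

/* replace {freelist, counters} only if both halves still match *old */
static bool toy_update_freelist(struct toy_slab *slab,
				union toy_freelist_counters *old,
				union toy_freelist_counters *new)
{
	return __atomic_compare_exchange_n(&slab->fc.full, &old->full,
					   new->full, false,
					   __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
}

/* pop one object off the freelist, retrying if another CPU raced with us */
static void *toy_get_object(struct toy_slab *slab)
{
	union toy_freelist_counters old, new;

	do {
		old = slab->fc;				/* racy snapshot */
		if (!old.freelist)
			return NULL;
		new.freelist = *(void **)old.freelist;	/* next free object */
		new.counters = old.counters + 1;	/* pretend this is inuse++ */
	} while (!toy_update_freelist(slab, &old, &new));

	return old.freelist;
}

int main(void)
{
	/* three "objects" whose first word links them: 0 -> 1 -> 2 -> NULL */
	void *objs[3] = { &objs[1], &objs[2], NULL };
	struct toy_slab slab = { .objects = 3 };

	slab.fc.freelist = &objs[0];
	slab.fc.counters = 0;

	while (toy_get_object(&slab))
		;

	printf("allocated all objects, counters=%lu\n", slab.fc.counters);
	return 0;
}

The same shape appears in the new get_from_partial_node() and get_freelist_nofreeze() hunks above: snapshot freelist and counters, compute the replacement pair, and retry if another CPU changed either half in the meantime.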
4254 - static bool has_cpu_slab(int cpu, struct kmem_cache *s) 4255 - { 4256 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4257 - 4258 - return c->slab || slub_percpu_partial(c); 3634 + return get_from_any_partial(s, pc); 4259 3635 } 4260 3636 4261 3637 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4262 3638 { 4263 3639 struct slub_percpu_sheaves *pcs; 4264 3640 4265 - if (!s->cpu_sheaves) 3641 + if (!cache_has_sheaves(s)) 4266 3642 return false; 4267 3643 4268 3644 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); ··· 3876 4042 } 3877 4043 3878 4044 /* 3879 - * Flush cpu slab. 4045 + * Flush percpu sheaves 3880 4046 * 3881 4047 * Called from CPU work handler with migration disabled. 3882 4048 */ 3883 - static void flush_cpu_slab(struct work_struct *w) 4049 + static void flush_cpu_sheaves(struct work_struct *w) 3884 4050 { 3885 4051 struct kmem_cache *s; 3886 4052 struct slub_flush_work *sfw; ··· 3889 4055 3890 4056 s = sfw->s; 3891 4057 3892 - if (s->cpu_sheaves) 4058 + if (cache_has_sheaves(s)) 3893 4059 pcs_flush_all(s); 3894 - 3895 - flush_this_cpu_slab(s); 3896 4060 } 3897 4061 3898 4062 static void flush_all_cpus_locked(struct kmem_cache *s) ··· 3903 4071 3904 4072 for_each_online_cpu(cpu) { 3905 4073 sfw = &per_cpu(slub_flush, cpu); 3906 - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { 4074 + if (!has_pcs_used(cpu, s)) { 3907 4075 sfw->skip = true; 3908 4076 continue; 3909 4077 } 3910 - INIT_WORK(&sfw->work, flush_cpu_slab); 4078 + INIT_WORK(&sfw->work, flush_cpu_sheaves); 3911 4079 sfw->skip = false; 3912 4080 sfw->s = s; 3913 4081 queue_work_on(cpu, flushwq, &sfw->work); ··· 3992 4160 mutex_lock(&slab_mutex); 3993 4161 3994 4162 list_for_each_entry(s, &slab_caches, list) { 3995 - if (!s->cpu_sheaves) 4163 + if (!cache_has_sheaves(s)) 3996 4164 continue; 3997 4165 flush_rcu_sheaves_on_cache(s); 3998 4166 } ··· 4013 4181 4014 4182 mutex_lock(&slab_mutex); 4015 4183 list_for_each_entry(s, &slab_caches, list) { 4016 - __flush_cpu_slab(s, cpu); 4017 - if (s->cpu_sheaves) 4184 + if (cache_has_sheaves(s)) 4018 4185 __pcs_flush_all_cpu(s, cpu); 4019 4186 } 4020 4187 mutex_unlock(&slab_mutex); 4021 4188 return 0; 4022 - } 4023 - 4024 - /* 4025 - * Check if the objects in a per cpu structure fit numa 4026 - * locality expectations. 4027 - */ 4028 - static inline int node_match(struct slab *slab, int node) 4029 - { 4030 - #ifdef CONFIG_NUMA 4031 - if (node != NUMA_NO_NODE && slab_nid(slab) != node) 4032 - return 0; 4033 - #endif 4034 - return 1; 4035 4189 } 4036 4190 4037 4191 #ifdef CONFIG_SLUB_DEBUG ··· 4192 4374 return true; 4193 4375 } 4194 4376 4195 - static inline bool 4196 - __update_cpu_freelist_fast(struct kmem_cache *s, 4197 - void *freelist_old, void *freelist_new, 4198 - unsigned long tid) 4199 - { 4200 - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4201 - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4202 - 4203 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4204 - &old.freelist_tid, new.freelist_tid); 4205 - } 4206 - 4207 4377 /* 4208 - * Check the slab->freelist and either transfer the freelist to the 4209 - * per cpu freelist or deactivate the slab. 4378 + * Get the slab's freelist and do not freeze it. 4210 4379 * 4211 - * The slab is still frozen if the return value is not NULL. 4380 + * Assumes the slab is isolated from node partial list and not frozen. 4212 4381 * 4213 - * If this function returns NULL then the slab has been unfrozen. 
4382 + * Assumes this is performed only for caches without debugging so we 4383 + * don't need to worry about adding the slab to the full list. 4214 4384 */ 4215 - static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4216 - { 4217 - struct freelist_counters old, new; 4218 - 4219 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4220 - 4221 - do { 4222 - old.freelist = slab->freelist; 4223 - old.counters = slab->counters; 4224 - 4225 - new.freelist = NULL; 4226 - new.counters = old.counters; 4227 - 4228 - new.inuse = old.objects; 4229 - new.frozen = old.freelist != NULL; 4230 - 4231 - 4232 - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4233 - 4234 - return old.freelist; 4235 - } 4236 - 4237 - /* 4238 - * Freeze the partial slab and return the pointer to the freelist. 4239 - */ 4240 - static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4385 + static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) 4241 4386 { 4242 4387 struct freelist_counters old, new; 4243 4388 ··· 4210 4429 4211 4430 new.freelist = NULL; 4212 4431 new.counters = old.counters; 4213 - VM_BUG_ON(new.frozen); 4432 + VM_WARN_ON_ONCE(new.frozen); 4214 4433 4215 4434 new.inuse = old.objects; 4216 - new.frozen = 1; 4217 4435 4218 - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4436 + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); 4219 4437 4220 4438 return old.freelist; 4221 - } 4222 - 4223 - /* 4224 - * Slow path. The lockless freelist is empty or we need to perform 4225 - * debugging duties. 4226 - * 4227 - * Processing is still very fast if new objects have been freed to the 4228 - * regular freelist. In that case we simply take over the regular freelist 4229 - * as the lockless freelist and zap the regular freelist. 4230 - * 4231 - * If that is not working then we fall back to the partial lists. We take the 4232 - * first element of the freelist as the object to allocate now and move the 4233 - * rest of the freelist to the lockless freelist. 4234 - * 4235 - * And if we were unable to get a new slab from the partial slab lists then 4236 - * we need to allocate a new slab. This is the slowest path since it involves 4237 - * a call to the page allocator and the setup of a new slab. 4238 - * 4239 - * Version of __slab_alloc to use when we know that preemption is 4240 - * already disabled (which is the case for bulk allocation). 4241 - */ 4242 - static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4243 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4244 - { 4245 - bool allow_spin = gfpflags_allow_spinning(gfpflags); 4246 - void *freelist; 4247 - struct slab *slab; 4248 - unsigned long flags; 4249 - struct partial_context pc; 4250 - bool try_thisnode = true; 4251 - 4252 - stat(s, ALLOC_SLOWPATH); 4253 - 4254 - reread_slab: 4255 - 4256 - slab = READ_ONCE(c->slab); 4257 - if (!slab) { 4258 - /* 4259 - * if the node is not online or has no normal memory, just 4260 - * ignore the node constraint 4261 - */ 4262 - if (unlikely(node != NUMA_NO_NODE && 4263 - !node_isset(node, slab_nodes))) 4264 - node = NUMA_NO_NODE; 4265 - goto new_slab; 4266 - } 4267 - 4268 - if (unlikely(!node_match(slab, node))) { 4269 - /* 4270 - * same as above but node_match() being false already 4271 - * implies node != NUMA_NO_NODE. 4272 - * 4273 - * We don't strictly honor pfmemalloc and NUMA preferences 4274 - * when !allow_spin because: 4275 - * 4276 - * 1. 
Most kmalloc() users allocate objects on the local node, 4277 - * so kmalloc_nolock() tries not to interfere with them by 4278 - * deactivating the cpu slab. 4279 - * 4280 - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 4281 - * unnecessary slab allocations even when n->partial list 4282 - * is not empty. 4283 - */ 4284 - if (!node_isset(node, slab_nodes) || 4285 - !allow_spin) { 4286 - node = NUMA_NO_NODE; 4287 - } else { 4288 - stat(s, ALLOC_NODE_MISMATCH); 4289 - goto deactivate_slab; 4290 - } 4291 - } 4292 - 4293 - /* 4294 - * By rights, we should be searching for a slab page that was 4295 - * PFMEMALLOC but right now, we are losing the pfmemalloc 4296 - * information when the page leaves the per-cpu allocator 4297 - */ 4298 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4299 - goto deactivate_slab; 4300 - 4301 - /* must check again c->slab in case we got preempted and it changed */ 4302 - local_lock_cpu_slab(s, flags); 4303 - 4304 - if (unlikely(slab != c->slab)) { 4305 - local_unlock_cpu_slab(s, flags); 4306 - goto reread_slab; 4307 - } 4308 - freelist = c->freelist; 4309 - if (freelist) 4310 - goto load_freelist; 4311 - 4312 - freelist = get_freelist(s, slab); 4313 - 4314 - if (!freelist) { 4315 - c->slab = NULL; 4316 - c->tid = next_tid(c->tid); 4317 - local_unlock_cpu_slab(s, flags); 4318 - stat(s, DEACTIVATE_BYPASS); 4319 - goto new_slab; 4320 - } 4321 - 4322 - stat(s, ALLOC_REFILL); 4323 - 4324 - load_freelist: 4325 - 4326 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4327 - 4328 - /* 4329 - * freelist is pointing to the list of objects to be used. 4330 - * slab is pointing to the slab from which the objects are obtained. 4331 - * That slab must be frozen for per cpu allocations to work. 4332 - */ 4333 - VM_BUG_ON(!c->slab->frozen); 4334 - c->freelist = get_freepointer(s, freelist); 4335 - c->tid = next_tid(c->tid); 4336 - local_unlock_cpu_slab(s, flags); 4337 - return freelist; 4338 - 4339 - deactivate_slab: 4340 - 4341 - local_lock_cpu_slab(s, flags); 4342 - if (slab != c->slab) { 4343 - local_unlock_cpu_slab(s, flags); 4344 - goto reread_slab; 4345 - } 4346 - freelist = c->freelist; 4347 - c->slab = NULL; 4348 - c->freelist = NULL; 4349 - c->tid = next_tid(c->tid); 4350 - local_unlock_cpu_slab(s, flags); 4351 - deactivate_slab(s, slab, freelist); 4352 - 4353 - new_slab: 4354 - 4355 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4356 - while (slub_percpu_partial(c)) { 4357 - local_lock_cpu_slab(s, flags); 4358 - if (unlikely(c->slab)) { 4359 - local_unlock_cpu_slab(s, flags); 4360 - goto reread_slab; 4361 - } 4362 - if (unlikely(!slub_percpu_partial(c))) { 4363 - local_unlock_cpu_slab(s, flags); 4364 - /* we were preempted and partial list got empty */ 4365 - goto new_objects; 4366 - } 4367 - 4368 - slab = slub_percpu_partial(c); 4369 - slub_set_percpu_partial(c, slab); 4370 - 4371 - if (likely(node_match(slab, node) && 4372 - pfmemalloc_match(slab, gfpflags)) || 4373 - !allow_spin) { 4374 - c->slab = slab; 4375 - freelist = get_freelist(s, slab); 4376 - VM_BUG_ON(!freelist); 4377 - stat(s, CPU_PARTIAL_ALLOC); 4378 - goto load_freelist; 4379 - } 4380 - 4381 - local_unlock_cpu_slab(s, flags); 4382 - 4383 - slab->next = NULL; 4384 - __put_partials(s, slab); 4385 - } 4386 - #endif 4387 - 4388 - new_objects: 4389 - 4390 - pc.flags = gfpflags; 4391 - /* 4392 - * When a preferred node is indicated but no __GFP_THISNODE 4393 - * 4394 - * 1) try to get a partial slab from target node only by having 4395 - * __GFP_THISNODE in pc.flags for get_partial() 
4396 - * 2) if 1) failed, try to allocate a new slab from target node with 4397 - * GPF_NOWAIT | __GFP_THISNODE opportunistically 4398 - * 3) if 2) failed, retry with original gfpflags which will allow 4399 - * get_partial() try partial lists of other nodes before potentially 4400 - * allocating new page from other nodes 4401 - */ 4402 - if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4403 - && try_thisnode)) { 4404 - if (unlikely(!allow_spin)) 4405 - /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4406 - pc.flags = gfpflags | __GFP_THISNODE; 4407 - else 4408 - pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4409 - } 4410 - 4411 - pc.orig_size = orig_size; 4412 - slab = get_partial(s, node, &pc); 4413 - if (slab) { 4414 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4415 - freelist = pc.object; 4416 - /* 4417 - * For debug caches here we had to go through 4418 - * alloc_single_from_partial() so just store the 4419 - * tracking info and return the object. 4420 - * 4421 - * Due to disabled preemption we need to disallow 4422 - * blocking. The flags are further adjusted by 4423 - * gfp_nested_mask() in stack_depot itself. 4424 - */ 4425 - if (s->flags & SLAB_STORE_USER) 4426 - set_track(s, freelist, TRACK_ALLOC, addr, 4427 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4428 - 4429 - return freelist; 4430 - } 4431 - 4432 - freelist = freeze_slab(s, slab); 4433 - goto retry_load_slab; 4434 - } 4435 - 4436 - slub_put_cpu_ptr(s->cpu_slab); 4437 - slab = new_slab(s, pc.flags, node); 4438 - c = slub_get_cpu_ptr(s->cpu_slab); 4439 - 4440 - if (unlikely(!slab)) { 4441 - if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4442 - && try_thisnode) { 4443 - try_thisnode = false; 4444 - goto new_objects; 4445 - } 4446 - slab_out_of_memory(s, gfpflags, node); 4447 - return NULL; 4448 - } 4449 - 4450 - stat(s, ALLOC_SLAB); 4451 - 4452 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4453 - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4454 - 4455 - if (unlikely(!freelist)) { 4456 - /* This could cause an endless loop. Fail instead. */ 4457 - if (!allow_spin) 4458 - return NULL; 4459 - goto new_objects; 4460 - } 4461 - 4462 - if (s->flags & SLAB_STORE_USER) 4463 - set_track(s, freelist, TRACK_ALLOC, addr, 4464 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4465 - 4466 - return freelist; 4467 - } 4468 - 4469 - /* 4470 - * No other reference to the slab yet so we can 4471 - * muck around with it freely without cmpxchg 4472 - */ 4473 - freelist = slab->freelist; 4474 - slab->freelist = NULL; 4475 - slab->inuse = slab->objects; 4476 - slab->frozen = 1; 4477 - 4478 - inc_slabs_node(s, slab_nid(slab), slab->objects); 4479 - 4480 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4481 - /* 4482 - * For !pfmemalloc_match() case we don't load freelist so that 4483 - * we don't make further mismatched allocations easier. 
4484 - */ 4485 - deactivate_slab(s, slab, get_freepointer(s, freelist)); 4486 - return freelist; 4487 - } 4488 - 4489 - retry_load_slab: 4490 - 4491 - local_lock_cpu_slab(s, flags); 4492 - if (unlikely(c->slab)) { 4493 - void *flush_freelist = c->freelist; 4494 - struct slab *flush_slab = c->slab; 4495 - 4496 - c->slab = NULL; 4497 - c->freelist = NULL; 4498 - c->tid = next_tid(c->tid); 4499 - 4500 - local_unlock_cpu_slab(s, flags); 4501 - 4502 - if (unlikely(!allow_spin)) { 4503 - /* Reentrant slub cannot take locks, defer */ 4504 - defer_deactivate_slab(flush_slab, flush_freelist); 4505 - } else { 4506 - deactivate_slab(s, flush_slab, flush_freelist); 4507 - } 4508 - 4509 - stat(s, CPUSLAB_FLUSH); 4510 - 4511 - goto retry_load_slab; 4512 - } 4513 - c->slab = slab; 4514 - 4515 - goto load_freelist; 4516 - } 4517 - /* 4518 - * We disallow kprobes in ___slab_alloc() to prevent reentrance 4519 - * 4520 - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of 4521 - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> 4522 - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() 4523 - * manipulating c->freelist without lock. 4524 - * 4525 - * This does not prevent kprobe in functions called from ___slab_alloc() such as 4526 - * local_lock_irqsave() itself, and that is fine, we only need to protect the 4527 - * c->freelist manipulation in ___slab_alloc() itself. 4528 - */ 4529 - NOKPROBE_SYMBOL(___slab_alloc); 4530 - 4531 - /* 4532 - * A wrapper for ___slab_alloc() for contexts where preemption is not yet 4533 - * disabled. Compensates for possible cpu changes by refetching the per cpu area 4534 - * pointer. 4535 - */ 4536 - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4537 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4538 - { 4539 - void *p; 4540 - 4541 - #ifdef CONFIG_PREEMPT_COUNT 4542 - /* 4543 - * We may have been preempted and rescheduled on a different 4544 - * cpu before disabling preemption. Need to reload cpu area 4545 - * pointer. 4546 - */ 4547 - c = slub_get_cpu_ptr(s->cpu_slab); 4548 - #endif 4549 - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 4550 - if (local_lock_is_locked(&s->cpu_slab->lock)) { 4551 - /* 4552 - * EBUSY is an internal signal to kmalloc_nolock() to 4553 - * retry a different bucket. It's not propagated 4554 - * to the caller. 4555 - */ 4556 - p = ERR_PTR(-EBUSY); 4557 - goto out; 4558 - } 4559 - } 4560 - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4561 - out: 4562 - #ifdef CONFIG_PREEMPT_COUNT 4563 - slub_put_cpu_ptr(s->cpu_slab); 4564 - #endif 4565 - return p; 4566 - } 4567 - 4568 - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 4569 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4570 - { 4571 - struct kmem_cache_cpu *c; 4572 - struct slab *slab; 4573 - unsigned long tid; 4574 - void *object; 4575 - 4576 - redo: 4577 - /* 4578 - * Must read kmem_cache cpu data via this cpu ptr. Preemption is 4579 - * enabled. We may switch back and forth between cpus while 4580 - * reading from one cpu area. That does not matter as long 4581 - * as we end up on the original cpu again when doing the cmpxchg. 4582 - * 4583 - * We must guarantee that tid and kmem_cache_cpu are retrieved on the 4584 - * same cpu. We read first the kmem_cache_cpu pointer and use it to read 4585 - * the tid. 
If we are preempted and switched to another cpu between the 4586 - * two reads, it's OK as the two are still associated with the same cpu 4587 - * and cmpxchg later will validate the cpu. 4588 - */ 4589 - c = raw_cpu_ptr(s->cpu_slab); 4590 - tid = READ_ONCE(c->tid); 4591 - 4592 - /* 4593 - * Irqless object alloc/free algorithm used here depends on sequence 4594 - * of fetching cpu_slab's data. tid should be fetched before anything 4595 - * on c to guarantee that object and slab associated with previous tid 4596 - * won't be used with current tid. If we fetch tid first, object and 4597 - * slab could be one associated with next tid and our alloc/free 4598 - * request will be failed. In this case, we will retry. So, no problem. 4599 - */ 4600 - barrier(); 4601 - 4602 - /* 4603 - * The transaction ids are globally unique per cpu and per operation on 4604 - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 4605 - * occurs on the right processor and that there was no operation on the 4606 - * linked list in between. 4607 - */ 4608 - 4609 - object = c->freelist; 4610 - slab = c->slab; 4611 - 4612 - #ifdef CONFIG_NUMA 4613 - if (static_branch_unlikely(&strict_numa) && 4614 - node == NUMA_NO_NODE) { 4615 - 4616 - struct mempolicy *mpol = current->mempolicy; 4617 - 4618 - if (mpol) { 4619 - /* 4620 - * Special BIND rule support. If existing slab 4621 - * is in permitted set then do not redirect 4622 - * to a particular node. 4623 - * Otherwise we apply the memory policy to get 4624 - * the node we need to allocate on. 4625 - */ 4626 - if (mpol->mode != MPOL_BIND || !slab || 4627 - !node_isset(slab_nid(slab), mpol->nodes)) 4628 - 4629 - node = mempolicy_slab_node(); 4630 - } 4631 - } 4632 - #endif 4633 - 4634 - if (!USE_LOCKLESS_FAST_PATH() || 4635 - unlikely(!object || !slab || !node_match(slab, node))) { 4636 - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); 4637 - } else { 4638 - void *next_object = get_freepointer_safe(s, object); 4639 - 4640 - /* 4641 - * The cmpxchg will only match if there was no additional 4642 - * operation and if we are on the right processor. 4643 - * 4644 - * The cmpxchg does the following atomically (without lock 4645 - * semantics!) 4646 - * 1. Relocate first pointer to the current per cpu area. 4647 - * 2. Verify that tid and freelist have not been changed 4648 - * 3. If they were not changed replace tid and freelist 4649 - * 4650 - * Since this is without lock semantics the protection is only 4651 - * against code executing on this cpu *not* from access by 4652 - * other cpus. 4653 - */ 4654 - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { 4655 - note_cmpxchg_failure("slab_alloc", s, tid); 4656 - goto redo; 4657 - } 4658 - prefetch_freepointer(s, next_object); 4659 - stat(s, ALLOC_FASTPATH); 4660 - } 4661 - 4662 - return object; 4663 4439 } 4664 4440 4665 4441 /* ··· 4232 4894 !freeptr_outside_object(s)) 4233 4895 memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 4234 4896 0, sizeof(void *)); 4897 + } 4898 + 4899 + static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, 4900 + void **p, unsigned int count, bool allow_spin) 4901 + { 4902 + unsigned int allocated = 0; 4903 + struct kmem_cache_node *n; 4904 + bool needs_add_partial; 4905 + unsigned long flags; 4906 + void *object; 4907 + 4908 + /* 4909 + * Are we going to put the slab on the partial list? 4910 + * Note slab->inuse is 0 on a new slab. 
4911 + */ 4912 + needs_add_partial = (slab->objects > count); 4913 + 4914 + if (!allow_spin && needs_add_partial) { 4915 + 4916 + n = get_node(s, slab_nid(slab)); 4917 + 4918 + if (!spin_trylock_irqsave(&n->list_lock, flags)) { 4919 + /* Unlucky, discard newly allocated slab */ 4920 + free_new_slab_nolock(s, slab); 4921 + return 0; 4922 + } 4923 + } 4924 + 4925 + object = slab->freelist; 4926 + while (object && allocated < count) { 4927 + p[allocated] = object; 4928 + object = get_freepointer(s, object); 4929 + maybe_wipe_obj_freeptr(s, p[allocated]); 4930 + 4931 + slab->inuse++; 4932 + allocated++; 4933 + } 4934 + slab->freelist = object; 4935 + 4936 + if (needs_add_partial) { 4937 + 4938 + if (allow_spin) { 4939 + n = get_node(s, slab_nid(slab)); 4940 + spin_lock_irqsave(&n->list_lock, flags); 4941 + } 4942 + add_partial(n, slab, ADD_TO_HEAD); 4943 + spin_unlock_irqrestore(&n->list_lock, flags); 4944 + } 4945 + 4946 + inc_slabs_node(s, slab_nid(slab), slab->objects); 4947 + return allocated; 4948 + } 4949 + 4950 + /* 4951 + * Slow path. We failed to allocate via percpu sheaves or they are not available 4952 + * due to bootstrap or debugging enabled or SLUB_TINY. 4953 + * 4954 + * We try to allocate from partial slab lists and fall back to allocating a new 4955 + * slab. 4956 + */ 4957 + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4958 + unsigned long addr, unsigned int orig_size) 4959 + { 4960 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 4961 + void *object; 4962 + struct slab *slab; 4963 + struct partial_context pc; 4964 + bool try_thisnode = true; 4965 + 4966 + stat(s, ALLOC_SLOWPATH); 4967 + 4968 + new_objects: 4969 + 4970 + pc.flags = gfpflags; 4971 + /* 4972 + * When a preferred node is indicated but no __GFP_THISNODE 4973 + * 4974 + * 1) try to get a partial slab from target node only by having 4975 + * __GFP_THISNODE in pc.flags for get_from_partial() 4976 + * 2) if 1) failed, try to allocate a new slab from target node with 4977 + * GPF_NOWAIT | __GFP_THISNODE opportunistically 4978 + * 3) if 2) failed, retry with original gfpflags which will allow 4979 + * get_from_partial() try partial lists of other nodes before 4980 + * potentially allocating new page from other nodes 4981 + */ 4982 + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4983 + && try_thisnode)) { 4984 + if (unlikely(!allow_spin)) 4985 + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4986 + pc.flags = gfpflags | __GFP_THISNODE; 4987 + else 4988 + pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4989 + } 4990 + 4991 + pc.orig_size = orig_size; 4992 + object = get_from_partial(s, node, &pc); 4993 + if (object) 4994 + goto success; 4995 + 4996 + slab = new_slab(s, pc.flags, node); 4997 + 4998 + if (unlikely(!slab)) { 4999 + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5000 + && try_thisnode) { 5001 + try_thisnode = false; 5002 + goto new_objects; 5003 + } 5004 + slab_out_of_memory(s, gfpflags, node); 5005 + return NULL; 5006 + } 5007 + 5008 + stat(s, ALLOC_SLAB); 5009 + 5010 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5011 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 5012 + 5013 + if (likely(object)) 5014 + goto success; 5015 + } else { 5016 + alloc_from_new_slab(s, slab, &object, 1, allow_spin); 5017 + 5018 + /* we don't need to check SLAB_STORE_USER here */ 5019 + if (likely(object)) 5020 + return object; 5021 + } 5022 + 5023 + if (allow_spin) 5024 + goto new_objects; 5025 + 5026 + /* This could 
cause an endless loop. Fail instead. */ 5027 + return NULL; 5028 + 5029 + success: 5030 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) 5031 + set_track(s, object, TRACK_ALLOC, addr, gfpflags); 5032 + 5033 + return object; 5034 + } 5035 + 5036 + static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 5037 + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 5038 + { 5039 + void *object; 5040 + 5041 + #ifdef CONFIG_NUMA 5042 + if (static_branch_unlikely(&strict_numa) && 5043 + node == NUMA_NO_NODE) { 5044 + 5045 + struct mempolicy *mpol = current->mempolicy; 5046 + 5047 + if (mpol) { 5048 + /* 5049 + * Special BIND rule support. If the local node 5050 + * is in permitted set then do not redirect 5051 + * to a particular node. 5052 + * Otherwise we apply the memory policy to get 5053 + * the node we need to allocate on. 5054 + */ 5055 + if (mpol->mode != MPOL_BIND || 5056 + !node_isset(numa_mem_id(), mpol->nodes)) 5057 + node = mempolicy_slab_node(); 5058 + } 5059 + } 5060 + #endif 5061 + 5062 + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); 5063 + 5064 + return object; 4235 5065 } 4236 5066 4237 5067 static __fastpath_inline ··· 4488 4982 4489 4983 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4490 4984 4985 + /* Bootstrap or debug cache, back off */ 4986 + if (unlikely(!cache_has_sheaves(s))) { 4987 + local_unlock(&s->cpu_sheaves->lock); 4988 + return NULL; 4989 + } 4990 + 4491 4991 if (pcs->spare && pcs->spare->size > 0) { 4492 4992 swap(pcs->main, pcs->spare); 4493 4993 return pcs; ··· 4505 4993 return NULL; 4506 4994 } 4507 4995 4508 - full = barn_replace_empty_sheaf(barn, pcs->main); 4996 + full = barn_replace_empty_sheaf(barn, pcs->main, 4997 + gfpflags_allow_spinning(gfp)); 4509 4998 4510 4999 if (full) { 4511 5000 stat(s, BARN_GET); ··· 4523 5010 empty = pcs->spare; 4524 5011 pcs->spare = NULL; 4525 5012 } else { 4526 - empty = barn_get_empty_sheaf(barn); 5013 + empty = barn_get_empty_sheaf(barn, true); 4527 5014 } 4528 5015 } 4529 5016 ··· 4565 5052 */ 4566 5053 4567 5054 if (pcs->main->size == 0) { 4568 - barn_put_empty_sheaf(barn, pcs->main); 5055 + if (!pcs->spare) 5056 + pcs->spare = pcs->main; 5057 + else 5058 + barn_put_empty_sheaf(barn, pcs->main); 4569 5059 pcs->main = full; 4570 5060 return pcs; 4571 5061 } ··· 4625 5109 * We assume the percpu sheaves contain only local objects although it's 4626 5110 * not completely guaranteed, so we verify later. 
4627 5111 */ 4628 - if (unlikely(node_requested && node != numa_mem_id())) 5112 + if (unlikely(node_requested && node != numa_mem_id())) { 5113 + stat(s, ALLOC_NODE_MISMATCH); 4629 5114 return NULL; 5115 + } 4630 5116 4631 5117 if (!local_trylock(&s->cpu_sheaves->lock)) 4632 5118 return NULL; ··· 4651 5133 */ 4652 5134 if (page_to_nid(virt_to_page(object)) != node) { 4653 5135 local_unlock(&s->cpu_sheaves->lock); 5136 + stat(s, ALLOC_NODE_MISMATCH); 4654 5137 return NULL; 4655 5138 } 4656 5139 } ··· 4660 5141 4661 5142 local_unlock(&s->cpu_sheaves->lock); 4662 5143 4663 - stat(s, ALLOC_PCS); 5144 + stat(s, ALLOC_FASTPATH); 4664 5145 4665 5146 return object; 4666 5147 } 4667 5148 4668 5149 static __fastpath_inline 4669 - unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 5150 + unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, 5151 + void **p) 4670 5152 { 4671 5153 struct slub_percpu_sheaves *pcs; 4672 5154 struct slab_sheaf *main; ··· 4685 5165 struct slab_sheaf *full; 4686 5166 struct node_barn *barn; 4687 5167 5168 + if (unlikely(!cache_has_sheaves(s))) { 5169 + local_unlock(&s->cpu_sheaves->lock); 5170 + return allocated; 5171 + } 5172 + 4688 5173 if (pcs->spare && pcs->spare->size > 0) { 4689 5174 swap(pcs->main, pcs->spare); 4690 5175 goto do_alloc; ··· 4701 5176 return allocated; 4702 5177 } 4703 5178 4704 - full = barn_replace_empty_sheaf(barn, pcs->main); 5179 + full = barn_replace_empty_sheaf(barn, pcs->main, 5180 + gfpflags_allow_spinning(gfp)); 4705 5181 4706 5182 if (full) { 4707 5183 stat(s, BARN_GET); ··· 4732 5206 4733 5207 local_unlock(&s->cpu_sheaves->lock); 4734 5208 4735 - stat_add(s, ALLOC_PCS, batch); 5209 + stat_add(s, ALLOC_FASTPATH, batch); 4736 5210 4737 5211 allocated += batch; 4738 5212 ··· 4770 5244 if (unlikely(object)) 4771 5245 goto out; 4772 5246 4773 - if (s->cpu_sheaves) 4774 - object = alloc_from_pcs(s, gfpflags, node); 5247 + object = alloc_from_pcs(s, gfpflags, node); 4775 5248 4776 5249 if (!object) 4777 5250 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); ··· 4865 5340 return ret; 4866 5341 } 4867 5342 5343 + static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 5344 + size_t size, void **p); 5345 + 4868 5346 /* 4869 5347 * returns a sheaf that has at least the requested size 4870 5348 * when prefilling is needed, do so with given gfp flags ··· 4881 5353 struct slab_sheaf *sheaf = NULL; 4882 5354 struct node_barn *barn; 4883 5355 4884 - if (unlikely(size > s->sheaf_capacity)) { 5356 + if (unlikely(!size)) 5357 + return NULL; 4885 5358 4886 - /* 4887 - * slab_debug disables cpu sheaves intentionally so all 4888 - * prefilled sheaves become "oversize" and we give up on 4889 - * performance for the debugging. Same with SLUB_TINY. 4890 - * Creating a cache without sheaves and then requesting a 4891 - * prefilled sheaf is however not expected, so warn. 
4892 - */ 4893 - WARN_ON_ONCE(s->sheaf_capacity == 0 && 4894 - !IS_ENABLED(CONFIG_SLUB_TINY) && 4895 - !(s->flags & SLAB_DEBUG_FLAGS)); 5359 + if (unlikely(size > s->sheaf_capacity)) { 4896 5360 4897 5361 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); 4898 5362 if (!sheaf) ··· 5206 5686 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; 5207 5687 struct kmem_cache *s; 5208 5688 bool can_retry = true; 5209 - void *ret = ERR_PTR(-EBUSY); 5689 + void *ret; 5210 5690 5211 5691 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | 5212 5692 __GFP_NO_OBJ_EXT)); ··· 5214 5694 if (unlikely(!size)) 5215 5695 return ZERO_SIZE_PTR; 5216 5696 5217 - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) 5218 - /* 5219 - * kmalloc_nolock() in PREEMPT_RT is not supported from 5220 - * non-preemptible context because local_lock becomes a 5221 - * sleeping lock on RT. 5222 - */ 5697 + /* 5698 + * See the comment for the same check in 5699 + * alloc_frozen_pages_nolock_noprof() 5700 + */ 5701 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 5223 5702 return NULL; 5703 + 5224 5704 retry: 5225 5705 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 5226 5706 return NULL; ··· 5229 5709 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) 5230 5710 /* 5231 5711 * kmalloc_nolock() is not supported on architectures that 5232 - * don't implement cmpxchg16b, but debug caches don't use 5233 - * per-cpu slab and per-cpu partial slabs. They rely on 5234 - * kmem_cache_node->list_lock, so kmalloc_nolock() can 5235 - * attempt to allocate from debug caches by 5712 + * don't implement cmpxchg16b and thus need slab_lock() 5713 + * which could be preempted by a nmi. 5714 + * But debug caches don't use that and only rely on 5715 + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt 5716 + * to allocate from debug caches by 5236 5717 * spin_trylock_irqsave(&n->list_lock, ...) 5237 5718 */ 5238 5719 return NULL; 5720 + 5721 + ret = alloc_from_pcs(s, alloc_gfp, node); 5722 + if (ret) 5723 + goto success; 5239 5724 5240 5725 /* 5241 5726 * Do not call slab_alloc_node(), since trylock mode isn't 5242 5727 * compatible with slab_pre_alloc_hook/should_failslab and 5243 5728 * kfence_alloc. Hence call __slab_alloc_node() (at most twice) 5244 5729 * and slab_post_alloc_hook() directly. 5245 - * 5246 - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair 5247 - * in irq saved region. It assumes that the same cpu will not 5248 - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. 5249 - * Therefore use in_nmi() to check whether particular bucket is in 5250 - * irq protected section. 5251 - * 5252 - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that 5253 - * this cpu was interrupted somewhere inside ___slab_alloc() after 5254 - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5255 - * In this case fast path with __update_cpu_freelist_fast() is not safe. 5256 5730 */ 5257 - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5258 - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5731 + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5259 5732 5260 - if (PTR_ERR(ret) == -EBUSY) { 5261 - if (can_retry) { 5262 - /* pick the next kmalloc bucket */ 5263 - size = s->object_size + 1; 5264 - /* 5265 - * Another alternative is to 5266 - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5267 - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5268 - * to retry from bucket of the same size. 
5269 - */ 5270 - can_retry = false; 5271 - goto retry; 5272 - } 5273 - ret = NULL; 5733 + /* 5734 + * It's possible we failed due to trylock as we preempted someone with 5735 + * the sheaves locked, and the list_lock is also held by another cpu. 5736 + * But it should be rare that multiple kmalloc buckets would have 5737 + * sheaves locked, so try a larger one. 5738 + */ 5739 + if (!ret && can_retry) { 5740 + /* pick the next kmalloc bucket */ 5741 + size = s->object_size + 1; 5742 + /* 5743 + * Another alternative is to 5744 + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5745 + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5746 + * to retry from bucket of the same size. 5747 + */ 5748 + can_retry = false; 5749 + goto retry; 5274 5750 } 5275 5751 5752 + success: 5276 5753 maybe_wipe_obj_freeptr(s, ret); 5277 5754 slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, 5278 5755 slab_want_init_on_alloc(alloc_gfp, s), size); ··· 5351 5834 /* was on full list */ 5352 5835 remove_full(s, n, slab); 5353 5836 if (!slab_free) { 5354 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 5837 + add_partial(n, slab, ADD_TO_TAIL); 5355 5838 stat(s, FREE_ADD_PARTIAL); 5356 5839 } 5357 5840 } else if (slab_free) { ··· 5389 5872 unsigned long addr) 5390 5873 5391 5874 { 5392 - bool was_frozen, was_full; 5875 + bool was_full; 5393 5876 struct freelist_counters old, new; 5394 5877 struct kmem_cache_node *n = NULL; 5395 5878 unsigned long flags; 5396 5879 bool on_node_partial; 5397 5880 5398 - stat(s, FREE_SLOWPATH); 5399 - 5400 5881 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5401 5882 free_to_partial_list(s, slab, head, tail, cnt, addr); 5402 5883 return; 5403 5884 } 5404 - 5405 - /* 5406 - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5407 - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5408 - * is the only other reason it can be false, and it is already handled 5409 - * above. 5410 - */ 5411 5885 5412 5886 do { 5413 5887 if (unlikely(n)) { ··· 5410 5902 old.counters = slab->counters; 5411 5903 5412 5904 was_full = (old.freelist == NULL); 5413 - was_frozen = old.frozen; 5414 5905 5415 5906 set_freepointer(s, tail, old.freelist); 5416 5907 ··· 5422 5915 * to (due to not being full anymore) the partial list. 5423 5916 * Unless it's frozen. 5424 5917 */ 5425 - if ((!new.inuse || was_full) && !was_frozen) { 5918 + if (!new.inuse || was_full) { 5919 + 5920 + n = get_node(s, slab_nid(slab)); 5426 5921 /* 5427 - * If slab becomes non-full and we have cpu partial 5428 - * lists, we put it there unconditionally to avoid 5429 - * taking the list_lock. Otherwise we need it. 5922 + * Speculatively acquire the list_lock. 5923 + * If the cmpxchg does not succeed then we may 5924 + * drop the list_lock without any processing. 5925 + * 5926 + * Otherwise the list_lock will synchronize with 5927 + * other processors updating the list of slabs. 5430 5928 */ 5431 - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 5929 + spin_lock_irqsave(&n->list_lock, flags); 5432 5930 5433 - n = get_node(s, slab_nid(slab)); 5434 - /* 5435 - * Speculatively acquire the list_lock. 5436 - * If the cmpxchg does not succeed then we may 5437 - * drop the list_lock without any processing. 5438 - * 5439 - * Otherwise the list_lock will synchronize with 5440 - * other processors updating the list of slabs. 
5441 - */ 5442 - spin_lock_irqsave(&n->list_lock, flags); 5443 - 5444 - on_node_partial = slab_test_node_partial(slab); 5445 - } 5931 + on_node_partial = slab_test_node_partial(slab); 5446 5932 } 5447 5933 5448 5934 } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5449 5935 5450 5936 if (likely(!n)) { 5451 - 5452 - if (likely(was_frozen)) { 5453 - /* 5454 - * The list lock was not taken therefore no list 5455 - * activity can be necessary. 5456 - */ 5457 - stat(s, FREE_FROZEN); 5458 - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5459 - /* 5460 - * If we started with a full slab then put it onto the 5461 - * per cpu partial list. 5462 - */ 5463 - put_cpu_partial(s, slab, 1); 5464 - stat(s, CPU_PARTIAL_FREE); 5465 - } 5466 - 5467 5937 /* 5468 - * In other cases we didn't take the list_lock because the slab 5469 - * was already on the partial list and will remain there. 5938 + * We didn't take the list_lock because the slab was already on 5939 + * the partial list and will remain there. 5470 5940 */ 5471 - 5472 5941 return; 5473 5942 } 5474 5943 ··· 5466 5983 5467 5984 /* 5468 5985 * Objects left in the slab. If it was not on the partial list before 5469 - * then add it. This can only happen when cache has no per cpu partial 5470 - * list otherwise we would have put it there. 5986 + * then add it. 5471 5987 */ 5472 - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5473 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 5988 + if (unlikely(was_full)) { 5989 + add_partial(n, slab, ADD_TO_TAIL); 5474 5990 stat(s, FREE_ADD_PARTIAL); 5475 5991 } 5476 5992 spin_unlock_irqrestore(&n->list_lock, flags); ··· 5555 6073 * unlocked. 5556 6074 */ 5557 6075 static struct slub_percpu_sheaves * 5558 - __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) 6076 + __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, 6077 + bool allow_spin) 5559 6078 { 5560 6079 struct slab_sheaf *empty; 5561 6080 struct node_barn *barn; ··· 5564 6081 5565 6082 restart: 5566 6083 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 6084 + 6085 + /* Bootstrap or debug cache, back off */ 6086 + if (unlikely(!cache_has_sheaves(s))) { 6087 + local_unlock(&s->cpu_sheaves->lock); 6088 + return NULL; 6089 + } 5567 6090 5568 6091 barn = get_barn(s); 5569 6092 if (!barn) { ··· 5580 6091 put_fail = false; 5581 6092 5582 6093 if (!pcs->spare) { 5583 - empty = barn_get_empty_sheaf(barn); 6094 + empty = barn_get_empty_sheaf(barn, allow_spin); 5584 6095 if (empty) { 5585 6096 pcs->spare = pcs->main; 5586 6097 pcs->main = empty; ··· 5594 6105 return pcs; 5595 6106 } 5596 6107 5597 - empty = barn_replace_full_sheaf(barn, pcs->main); 6108 + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); 5598 6109 5599 6110 if (!IS_ERR(empty)) { 5600 6111 stat(s, BARN_PUT); ··· 5602 6113 return pcs; 5603 6114 } 5604 6115 5605 - if (PTR_ERR(empty) == -E2BIG) { 6116 + /* sheaf_flush_unused() doesn't support !allow_spin */ 6117 + if (PTR_ERR(empty) == -E2BIG && allow_spin) { 5606 6118 /* Since we got here, spare exists and is full */ 5607 6119 struct slab_sheaf *to_flush = pcs->spare; 5608 6120 ··· 5627 6137 5628 6138 alloc_empty: 5629 6139 local_unlock(&s->cpu_sheaves->lock); 6140 + 6141 + /* 6142 + * alloc_empty_sheaf() doesn't support !allow_spin and it's 6143 + * easier to fall back to freeing directly without sheaves 6144 + * than add the support (and to sheaf_flush_unused() above) 6145 + */ 6146 + if (!allow_spin) 6147 + return NULL; 5630 6148 
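To make the sheaf-swapping logic in the surrounding hunks easier to follow, here is a rough sketch of the caching hierarchy they manipulate. It is illustrative only; the struct and field names approximate the real definitions in mm/slub.c (struct slab_sheaf, struct slub_percpu_sheaves) rather than reproduce them.

/*
 * Simplified sketch. Each CPU keeps a "main" and a "spare" sheaf of free
 * object pointers; a per-node "barn" stocks full and empty sheaves so that
 * main/spare can usually be exchanged without the node list_lock. Only when
 * the barn cannot help do we fall back to the partial slab lists (and their
 * list_lock) or to allocating a new slab.
 */
struct example_sheaf {                    /* cf. struct slab_sheaf */
	unsigned int size;                /* objects currently stored */
	void *objects[];                  /* up to s->sheaf_capacity pointers */
};

struct example_percpu_sheaves {           /* cf. struct slub_percpu_sheaves */
	local_trylock_t lock;             /* fast paths use local_trylock() and bail out on contention */
	struct example_sheaf *main;       /* allocation/free fast path */
	struct example_sheaf *spare;      /* swapped with main when it runs empty or full */
	struct example_sheaf *rcu_free;   /* batches objects freed via kfree_rcu() */
};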
5631 6149 empty = alloc_empty_sheaf(s, GFP_NOWAIT); 5632 6150 if (empty) ··· 5678 6180 * The object is expected to have passed slab_free_hook() already. 5679 6181 */ 5680 6182 static __fastpath_inline 5681 - bool free_to_pcs(struct kmem_cache *s, void *object) 6183 + bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) 5682 6184 { 5683 6185 struct slub_percpu_sheaves *pcs; 5684 6186 ··· 5689 6191 5690 6192 if (unlikely(pcs->main->size == s->sheaf_capacity)) { 5691 6193 5692 - pcs = __pcs_replace_full_main(s, pcs); 6194 + pcs = __pcs_replace_full_main(s, pcs, allow_spin); 5693 6195 if (unlikely(!pcs)) 5694 6196 return false; 5695 6197 } ··· 5698 6200 5699 6201 local_unlock(&s->cpu_sheaves->lock); 5700 6202 5701 - stat(s, FREE_PCS); 6203 + stat(s, FREE_FASTPATH); 5702 6204 5703 6205 return true; 5704 6206 } ··· 5763 6265 free_empty_sheaf(s, sheaf); 5764 6266 } 5765 6267 6268 + /* 6269 + * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since 6270 + * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT), 6271 + * this would violate lock nesting rules. Therefore, kvfree_call_rcu() avoids 6272 + * this problem by bypassing the sheaves layer entirely on PREEMPT_RT. 6273 + * 6274 + * However, lockdep still complains that it is invalid to acquire spinlock_t 6275 + * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a 6276 + * spinning lock. Tell lockdep that acquiring spinlock_t is valid here 6277 + * by temporarily raising the wait-type to LD_WAIT_CONFIG. 6278 + */ 6279 + static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG); 6280 + 5766 6281 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) 5767 6282 { 5768 6283 struct slub_percpu_sheaves *pcs; 5769 6284 struct slab_sheaf *rcu_sheaf; 6285 + 6286 + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) 6287 + return false; 6288 + 6289 + lock_map_acquire_try(&kfree_rcu_sheaf_map); 5770 6290 5771 6291 if (!local_trylock(&s->cpu_sheaves->lock)) 5772 6292 goto fail; ··· 5795 6279 5796 6280 struct slab_sheaf *empty; 5797 6281 struct node_barn *barn; 6282 + 6283 + /* Bootstrap or debug cache, fall back */ 6284 + if (unlikely(!cache_has_sheaves(s))) { 6285 + local_unlock(&s->cpu_sheaves->lock); 6286 + goto fail; 6287 + } 5798 6288 5799 6289 if (pcs->spare && pcs->spare->size == 0) { 5800 6290 pcs->rcu_free = pcs->spare; ··· 5814 6292 goto fail; 5815 6293 } 5816 6294 5817 - empty = barn_get_empty_sheaf(barn); 6295 + empty = barn_get_empty_sheaf(barn, true); 5818 6296 5819 6297 if (empty) { 5820 6298 pcs->rcu_free = empty; ··· 5868 6346 local_unlock(&s->cpu_sheaves->lock); 5869 6347 5870 6348 stat(s, FREE_RCU_SHEAF); 6349 + lock_map_release(&kfree_rcu_sheaf_map); 5871 6350 return true; 5872 6351 5873 6352 fail: 5874 6353 stat(s, FREE_RCU_SHEAF_FAIL); 6354 + lock_map_release(&kfree_rcu_sheaf_map); 5875 6355 return false; 5876 6356 } 5877 6357 ··· 5934 6410 goto no_empty; 5935 6411 5936 6412 if (!pcs->spare) { 5937 - empty = barn_get_empty_sheaf(barn); 6413 + empty = barn_get_empty_sheaf(barn, true); 5938 6414 if (!empty) 5939 6415 goto no_empty; 5940 6416 ··· 5948 6424 goto do_free; 5949 6425 } 5950 6426 5951 - empty = barn_replace_full_sheaf(barn, pcs->main); 6427 + empty = barn_replace_full_sheaf(barn, pcs->main, true); 5952 6428 if (IS_ERR(empty)) { 5953 6429 stat(s, BARN_PUT_FAIL); 5954 6430 goto no_empty; ··· 5966 6442 5967 6443 local_unlock(&s->cpu_sheaves->lock); 5968 6444 5969 - stat_add(s, FREE_PCS, batch); 6445 + stat_add(s, FREE_FASTPATH, batch); 5970 6446 
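For context, the percpu-sheaf bulk paths above serve the regular bulk API that drivers call directly. A minimal caller-side sketch follows; the cache pointer and batch size are hypothetical, and kmem_cache_alloc_bulk() is all-or-nothing, returning either the full requested count or 0.

#include <linux/kernel.h>
#include <linux/slab.h>

/* Illustrative only: allocate a batch of objects, use them, free the batch. */
static int example_bulk_cycle(struct kmem_cache *example_cache)
{
	void *objs[16];

	if (!kmem_cache_alloc_bulk(example_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return -ENOMEM;

	/* ... work with the objects; all 16 slots are valid here ... */

	kmem_cache_free_bulk(example_cache, ARRAY_SIZE(objs), objs);
	return 0;
}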
5971 6447 if (batch < size) { 5972 6448 p += batch; ··· 5988 6464 */ 5989 6465 fallback: 5990 6466 __kmem_cache_free_bulk(s, size, p); 6467 + stat_add(s, FREE_SLOWPATH, size); 5991 6468 5992 6469 flush_remote: 5993 6470 if (remote_nr) { 5994 6471 __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 6472 + stat_add(s, FREE_SLOWPATH, remote_nr); 5995 6473 if (i < size) { 5996 6474 remote_nr = 0; 5997 6475 goto next_remote_batch; ··· 6003 6477 6004 6478 struct defer_free { 6005 6479 struct llist_head objects; 6006 - struct llist_head slabs; 6007 6480 struct irq_work work; 6008 6481 }; 6009 6482 ··· 6010 6485 6011 6486 static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { 6012 6487 .objects = LLIST_HEAD_INIT(objects), 6013 - .slabs = LLIST_HEAD_INIT(slabs), 6014 6488 .work = IRQ_WORK_INIT(free_deferred_objects), 6015 6489 }; 6016 6490 6017 6491 /* 6018 6492 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe 6019 - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). 6493 + * to take sleeping spin_locks from __slab_free(). 6020 6494 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 6021 6495 */ 6022 6496 static void free_deferred_objects(struct irq_work *work) 6023 6497 { 6024 6498 struct defer_free *df = container_of(work, struct defer_free, work); 6025 6499 struct llist_head *objs = &df->objects; 6026 - struct llist_head *slabs = &df->slabs; 6027 6500 struct llist_node *llnode, *pos, *t; 6028 6501 6029 - if (llist_empty(objs) && llist_empty(slabs)) 6502 + if (llist_empty(objs)) 6030 6503 return; 6031 6504 6032 6505 llnode = llist_del_all(objs); ··· 6047 6524 set_freepointer(s, x, NULL); 6048 6525 6049 6526 __slab_free(s, slab, x, x, 1, _THIS_IP_); 6050 - } 6051 - 6052 - llnode = llist_del_all(slabs); 6053 - llist_for_each_safe(pos, t, llnode) { 6054 - struct slab *slab = container_of(pos, struct slab, llnode); 6055 - 6056 - if (slab->frozen) 6057 - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6058 - else 6059 - free_slab(slab->slab_cache, slab); 6527 + stat(s, FREE_SLOWPATH); 6060 6528 } 6061 6529 } 6062 6530 ··· 6064 6550 irq_work_queue(&df->work); 6065 6551 } 6066 6552 6067 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) 6068 - { 6069 - struct defer_free *df; 6070 - 6071 - slab->flush_freelist = flush_freelist; 6072 - 6073 - guard(preempt)(); 6074 - 6075 - df = this_cpu_ptr(&defer_free_objects); 6076 - if (llist_add(&slab->llnode, &df->slabs)) 6077 - irq_work_queue(&df->work); 6078 - } 6079 - 6080 6553 void defer_free_barrier(void) 6081 6554 { 6082 6555 int cpu; 6083 6556 6084 6557 for_each_possible_cpu(cpu) 6085 6558 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6086 - } 6087 - 6088 - /* 6089 - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6090 - * can perform fastpath freeing without additional function calls. 6091 - * 6092 - * The fastpath is only possible if we are freeing to the current cpu slab 6093 - * of this processor. This typically the case if we have just allocated 6094 - * the item before. 6095 - * 6096 - * If fastpath is not possible then fall back to __slab_free where we deal 6097 - * with all sorts of special processing. 6098 - * 6099 - * Bulk free of a freelist with several objects (all pointing to the 6100 - * same slab) possible by specifying head and tail ptr, plus objects 6101 - * count (cnt). Bulk free indicated by tail pointer being set. 
6102 - */ 6103 - static __always_inline void do_slab_free(struct kmem_cache *s, 6104 - struct slab *slab, void *head, void *tail, 6105 - int cnt, unsigned long addr) 6106 - { 6107 - /* cnt == 0 signals that it's called from kfree_nolock() */ 6108 - bool allow_spin = cnt; 6109 - struct kmem_cache_cpu *c; 6110 - unsigned long tid; 6111 - void **freelist; 6112 - 6113 - redo: 6114 - /* 6115 - * Determine the currently cpus per cpu slab. 6116 - * The cpu may change afterward. However that does not matter since 6117 - * data is retrieved via this pointer. If we are on the same cpu 6118 - * during the cmpxchg then the free will succeed. 6119 - */ 6120 - c = raw_cpu_ptr(s->cpu_slab); 6121 - tid = READ_ONCE(c->tid); 6122 - 6123 - /* Same with comment on barrier() in __slab_alloc_node() */ 6124 - barrier(); 6125 - 6126 - if (unlikely(slab != c->slab)) { 6127 - if (unlikely(!allow_spin)) { 6128 - /* 6129 - * __slab_free() can locklessly cmpxchg16 into a slab, 6130 - * but then it might need to take spin_lock or local_lock 6131 - * in put_cpu_partial() for further processing. 6132 - * Avoid the complexity and simply add to a deferred list. 6133 - */ 6134 - defer_free(s, head); 6135 - } else { 6136 - __slab_free(s, slab, head, tail, cnt, addr); 6137 - } 6138 - return; 6139 - } 6140 - 6141 - if (unlikely(!allow_spin)) { 6142 - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && 6143 - local_lock_is_locked(&s->cpu_slab->lock)) { 6144 - defer_free(s, head); 6145 - return; 6146 - } 6147 - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ 6148 - } 6149 - 6150 - if (USE_LOCKLESS_FAST_PATH()) { 6151 - freelist = READ_ONCE(c->freelist); 6152 - 6153 - set_freepointer(s, tail, freelist); 6154 - 6155 - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { 6156 - note_cmpxchg_failure("slab_free", s, tid); 6157 - goto redo; 6158 - } 6159 - } else { 6160 - __maybe_unused unsigned long flags = 0; 6161 - 6162 - /* Update the free list under the local lock */ 6163 - local_lock_cpu_slab(s, flags); 6164 - c = this_cpu_ptr(s->cpu_slab); 6165 - if (unlikely(slab != c->slab)) { 6166 - local_unlock_cpu_slab(s, flags); 6167 - goto redo; 6168 - } 6169 - tid = c->tid; 6170 - freelist = c->freelist; 6171 - 6172 - set_freepointer(s, tail, freelist); 6173 - c->freelist = head; 6174 - c->tid = next_tid(tid); 6175 - 6176 - local_unlock_cpu_slab(s, flags); 6177 - } 6178 - stat_add(s, FREE_FASTPATH, cnt); 6179 6559 } 6180 6560 6181 6561 static __fastpath_inline ··· 6082 6674 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6083 6675 return; 6084 6676 6085 - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6086 - slab_nid(slab) == numa_mem_id()) 6087 - && likely(!slab_test_pfmemalloc(slab))) { 6088 - if (likely(free_to_pcs(s, object))) 6677 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) 6678 + && likely(!slab_test_pfmemalloc(slab))) { 6679 + if (likely(free_to_pcs(s, object, true))) 6089 6680 return; 6090 6681 } 6091 6682 6092 - do_slab_free(s, slab, object, object, 1, addr); 6683 + __slab_free(s, slab, object, object, 1, addr); 6684 + stat(s, FREE_SLOWPATH); 6093 6685 } 6094 6686 6095 6687 #ifdef CONFIG_MEMCG ··· 6102 6694 alloc_tagging_slab_free_hook(s, slab, &object, 1); 6103 6695 6104 6696 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6105 - do_slab_free(s, slab, object, object, 1, _RET_IP_); 6697 + __slab_free(s, slab, object, object, 1, _RET_IP_); 6106 6698 } 6107 6699 #endif 6108 6700 ··· 6116 6708 * With KASAN 
enabled slab_free_freelist_hook modifies the freelist 6117 6709 * to remove objects, whose reuse must be delayed. 6118 6710 */ 6119 - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 6120 - do_slab_free(s, slab, head, tail, cnt, addr); 6711 + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { 6712 + __slab_free(s, slab, head, tail, cnt, addr); 6713 + stat_add(s, FREE_SLOWPATH, cnt); 6714 + } 6121 6715 } 6122 6716 6123 6717 #ifdef CONFIG_SLUB_RCU_DEBUG ··· 6144 6734 return; 6145 6735 6146 6736 /* resume freeing */ 6147 - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) 6148 - do_slab_free(s, slab, object, object, 1, _THIS_IP_); 6737 + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { 6738 + __slab_free(s, slab, object, object, 1, _THIS_IP_); 6739 + stat(s, FREE_SLOWPATH); 6740 + } 6149 6741 } 6150 6742 #endif /* CONFIG_SLUB_RCU_DEBUG */ 6151 6743 6152 6744 #ifdef CONFIG_KASAN_GENERIC 6153 6745 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 6154 6746 { 6155 - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); 6747 + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); 6748 + stat(cache, FREE_SLOWPATH); 6156 6749 } 6157 6750 #endif 6158 6751 6159 - static inline struct kmem_cache *virt_to_cache(const void *obj) 6752 + static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj) 6160 6753 { 6754 + struct kmem_cache *cachep; 6161 6755 struct slab *slab; 6162 6756 6163 6757 slab = virt_to_slab(obj); 6164 - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) 6165 - return NULL; 6166 - return slab->slab_cache; 6167 - } 6758 + if (WARN_ONCE(!slab, 6759 + "kmem_cache_free(%s, %p): object is not in a slab page\n", 6760 + s->name, obj)) 6761 + return; 6168 6762 6169 - static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 6170 - { 6171 - struct kmem_cache *cachep; 6763 + cachep = slab->slab_cache; 6172 6764 6173 - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && 6174 - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) 6175 - return s; 6176 - 6177 - cachep = virt_to_cache(x); 6178 - if (WARN(cachep && cachep != s, 6179 - "%s: Wrong slab cache. %s but object is from %s\n", 6180 - __func__, s->name, cachep->name)) 6181 - print_tracking(cachep, x); 6182 - return cachep; 6765 + if (WARN_ONCE(cachep != s, 6766 + "kmem_cache_free(%s, %p): object belongs to different cache %s\n", 6767 + s->name, obj, cachep ? cachep->name : "(NULL)")) { 6768 + if (cachep) 6769 + print_tracking(cachep, obj); 6770 + return; 6771 + } 6183 6772 } 6184 6773 6185 6774 /** ··· 6191 6782 */ 6192 6783 void kmem_cache_free(struct kmem_cache *s, void *x) 6193 6784 { 6194 - s = cache_from_obj(s, x); 6195 - if (!s) 6196 - return; 6785 + struct slab *slab; 6786 + 6787 + slab = virt_to_slab(x); 6788 + 6789 + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) || 6790 + kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { 6791 + 6792 + /* 6793 + * Intentionally leak the object in these cases, because it 6794 + * would be too dangerous to continue. 
6795 + */ 6796 + if (unlikely(!slab || (slab->slab_cache != s))) { 6797 + warn_free_bad_obj(s, x); 6798 + return; 6799 + } 6800 + } 6801 + 6197 6802 trace_kmem_cache_free(_RET_IP_, x, s); 6198 - slab_free(s, virt_to_slab(x), x, _RET_IP_); 6803 + slab_free(s, slab, x, _RET_IP_); 6199 6804 } 6200 6805 EXPORT_SYMBOL(kmem_cache_free); 6806 + 6807 + static inline size_t slab_ksize(struct slab *slab) 6808 + { 6809 + struct kmem_cache *s = slab->slab_cache; 6810 + 6811 + #ifdef CONFIG_SLUB_DEBUG 6812 + /* 6813 + * Debugging requires use of the padding between object 6814 + * and whatever may come after it. 6815 + */ 6816 + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 6817 + return s->object_size; 6818 + #endif 6819 + if (s->flags & SLAB_KASAN) 6820 + return s->object_size; 6821 + /* 6822 + * If we have the need to store the freelist pointer 6823 + * or any other metadata back there then we can 6824 + * only use the space before that information. 6825 + */ 6826 + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) 6827 + return s->inuse; 6828 + else if (obj_exts_in_object(s, slab)) 6829 + return s->inuse; 6830 + /* 6831 + * Else we can use all the padding etc for the allocation 6832 + */ 6833 + return s->size; 6834 + } 6835 + 6836 + static size_t __ksize(const void *object) 6837 + { 6838 + struct page *page; 6839 + struct slab *slab; 6840 + 6841 + if (unlikely(object == ZERO_SIZE_PTR)) 6842 + return 0; 6843 + 6844 + page = virt_to_page(object); 6845 + 6846 + if (unlikely(PageLargeKmalloc(page))) 6847 + return large_kmalloc_size(page); 6848 + 6849 + slab = page_slab(page); 6850 + /* Delete this after we're sure there are no users */ 6851 + if (WARN_ON(!slab)) 6852 + return page_size(page); 6853 + 6854 + #ifdef CONFIG_SLUB_DEBUG 6855 + skip_orig_size_check(slab->slab_cache, object); 6856 + #endif 6857 + 6858 + return slab_ksize(slab); 6859 + } 6860 + 6861 + /** 6862 + * ksize -- Report full size of underlying allocation 6863 + * @objp: pointer to the object 6864 + * 6865 + * This should only be used internally to query the true size of allocations. 6866 + * It is not meant to be a way to discover the usable size of an allocation 6867 + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond 6868 + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, 6869 + * and/or FORTIFY_SOURCE. 6870 + * 6871 + * Return: size of the actual memory used by @objp in bytes 6872 + */ 6873 + size_t ksize(const void *objp) 6874 + { 6875 + /* 6876 + * We need to first check that the pointer to the object is valid. 6877 + * The KASAN report printed from ksize() is more useful, then when 6878 + * it's printed later when the behaviour could be undefined due to 6879 + * a potential use-after-free or double-free. 6880 + * 6881 + * We use kasan_check_byte(), which is supported for the hardware 6882 + * tag-based KASAN mode, unlike kasan_check_read/write(). 6883 + * 6884 + * If the pointed to memory is invalid, we return 0 to avoid users of 6885 + * ksize() writing to and potentially corrupting the memory region. 6886 + * 6887 + * We want to perform the check before __ksize(), to avoid potentially 6888 + * crashing in __ksize() due to accessing invalid metadata. 
6889 + */ 6890 + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) 6891 + return 0; 6892 + 6893 + return kfence_ksize(objp) ?: __ksize(objp); 6894 + } 6895 + EXPORT_SYMBOL(ksize); 6201 6896 6202 6897 static void free_large_kmalloc(struct page *page, void *object) 6203 6898 { ··· 6455 6942 * since kasan quarantine takes locks and not supported from NMI. 6456 6943 */ 6457 6944 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6458 - do_slab_free(s, slab, x, x, 0, _RET_IP_); 6945 + 6946 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { 6947 + if (likely(free_to_pcs(s, x, false))) 6948 + return; 6949 + } 6950 + 6951 + /* 6952 + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might 6953 + * need to take spin_lock for further processing. 6954 + * Avoid the complexity and simply add to a deferred list. 6955 + */ 6956 + defer_free(s, x); 6459 6957 } 6460 6958 EXPORT_SYMBOL_GPL(kfree_nolock); 6461 6959 ··· 6837 7313 df->s = slab->slab_cache; 6838 7314 } else { 6839 7315 df->slab = slab; 6840 - df->s = cache_from_obj(s, object); /* Support for memcg */ 7316 + df->s = s; 6841 7317 } 6842 7318 6843 7319 /* Start new detached freelist */ ··· 6892 7368 if (kfence_free(df.freelist)) 6893 7369 continue; 6894 7370 6895 - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 7371 + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 6896 7372 _RET_IP_); 6897 7373 } while (likely(size)); 6898 7374 } ··· 6907 7383 * freeing to sheaves is so incompatible with the detached freelist so 6908 7384 * once we go that way, we have to do everything differently 6909 7385 */ 6910 - if (s && s->cpu_sheaves) { 7386 + if (s && cache_has_sheaves(s)) { 6911 7387 free_to_pcs_bulk(s, size, p); 6912 7388 return; 6913 7389 } ··· 6925 7401 } 6926 7402 EXPORT_SYMBOL(kmem_cache_free_bulk); 6927 7403 7404 + static unsigned int 7405 + __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7406 + unsigned int max, struct kmem_cache_node *n, 7407 + bool allow_spin) 7408 + { 7409 + struct partial_bulk_context pc; 7410 + struct slab *slab, *slab2; 7411 + unsigned int refilled = 0; 7412 + unsigned long flags; 7413 + void *object; 7414 + 7415 + pc.flags = gfp; 7416 + pc.min_objects = min; 7417 + pc.max_objects = max; 7418 + 7419 + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) 7420 + return 0; 7421 + 7422 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7423 + 7424 + list_del(&slab->slab_list); 7425 + 7426 + object = get_freelist_nofreeze(s, slab); 7427 + 7428 + while (object && refilled < max) { 7429 + p[refilled] = object; 7430 + object = get_freepointer(s, object); 7431 + maybe_wipe_obj_freeptr(s, p[refilled]); 7432 + 7433 + refilled++; 7434 + } 7435 + 7436 + /* 7437 + * Freelist had more objects than we can accommodate, we need to 7438 + * free them back. We can treat it like a detached freelist, just 7439 + * need to find the tail object. 
7440 + */ 7441 + if (unlikely(object)) { 7442 + void *head = object; 7443 + void *tail; 7444 + int cnt = 0; 7445 + 7446 + do { 7447 + tail = object; 7448 + cnt++; 7449 + object = get_freepointer(s, object); 7450 + } while (object); 7451 + __slab_free(s, slab, head, tail, cnt, _RET_IP_); 7452 + } 7453 + 7454 + if (refilled >= max) 7455 + break; 7456 + } 7457 + 7458 + if (unlikely(!list_empty(&pc.slabs))) { 7459 + spin_lock_irqsave(&n->list_lock, flags); 7460 + 7461 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7462 + 7463 + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) 7464 + continue; 7465 + 7466 + list_del(&slab->slab_list); 7467 + add_partial(n, slab, ADD_TO_HEAD); 7468 + } 7469 + 7470 + spin_unlock_irqrestore(&n->list_lock, flags); 7471 + 7472 + /* any slabs left are completely free and for discard */ 7473 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7474 + 7475 + list_del(&slab->slab_list); 7476 + discard_slab(s, slab); 7477 + } 7478 + } 7479 + 7480 + return refilled; 7481 + } 7482 + 7483 + #ifdef CONFIG_NUMA 7484 + static unsigned int 7485 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7486 + unsigned int max) 7487 + { 7488 + struct zonelist *zonelist; 7489 + struct zoneref *z; 7490 + struct zone *zone; 7491 + enum zone_type highest_zoneidx = gfp_zone(gfp); 7492 + unsigned int cpuset_mems_cookie; 7493 + unsigned int refilled = 0; 7494 + 7495 + /* see get_from_any_partial() for the defrag ratio description */ 7496 + if (!s->remote_node_defrag_ratio || 7497 + get_cycles() % 1024 > s->remote_node_defrag_ratio) 7498 + return 0; 7499 + 7500 + do { 7501 + cpuset_mems_cookie = read_mems_allowed_begin(); 7502 + zonelist = node_zonelist(mempolicy_slab_node(), gfp); 7503 + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { 7504 + struct kmem_cache_node *n; 7505 + unsigned int r; 7506 + 7507 + n = get_node(s, zone_to_nid(zone)); 7508 + 7509 + if (!n || !cpuset_zone_allowed(zone, gfp) || 7510 + n->nr_partial <= s->min_partial) 7511 + continue; 7512 + 7513 + r = __refill_objects_node(s, p, gfp, min, max, n, 7514 + /* allow_spin = */ false); 7515 + refilled += r; 7516 + 7517 + if (r >= min) { 7518 + /* 7519 + * Don't check read_mems_allowed_retry() here - 7520 + * if mems_allowed was updated in parallel, that 7521 + * was a harmless race between allocation and 7522 + * the cpuset update 7523 + */ 7524 + return refilled; 7525 + } 7526 + p += r; 7527 + min -= r; 7528 + max -= r; 7529 + } 7530 + } while (read_mems_allowed_retry(cpuset_mems_cookie)); 7531 + 7532 + return refilled; 7533 + } 7534 + #else 7535 + static inline unsigned int 7536 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7537 + unsigned int max) 7538 + { 7539 + return 0; 7540 + } 7541 + #endif 7542 + 7543 + static unsigned int 7544 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7545 + unsigned int max) 7546 + { 7547 + int local_node = numa_mem_id(); 7548 + unsigned int refilled; 7549 + struct slab *slab; 7550 + 7551 + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) 7552 + return 0; 7553 + 7554 + refilled = __refill_objects_node(s, p, gfp, min, max, 7555 + get_node(s, local_node), 7556 + /* allow_spin = */ true); 7557 + if (refilled >= min) 7558 + return refilled; 7559 + 7560 + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, 7561 + max - refilled); 7562 + if (refilled >= min) 7563 + return refilled; 7564 + 7565 + new_slab: 7566 + 7567 + slab = 
new_slab(s, gfp, local_node); 7568 + if (!slab) 7569 + goto out; 7570 + 7571 + stat(s, ALLOC_SLAB); 7572 + 7573 + /* 7574 + * TODO: possible optimization - if we know we will consume the whole 7575 + * slab we might skip creating the freelist? 7576 + */ 7577 + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, 7578 + /* allow_spin = */ true); 7579 + 7580 + if (refilled < min) 7581 + goto new_slab; 7582 + 7583 + out: 7584 + return refilled; 7585 + } 7586 + 6928 7587 static inline 6929 7588 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 6930 7589 void **p) 6931 7590 { 6932 - struct kmem_cache_cpu *c; 6933 - unsigned long irqflags; 6934 7591 int i; 6935 7592 6936 - /* 6937 - * Drain objects in the per cpu slab, while disabling local 6938 - * IRQs, which protects against PREEMPT and interrupts 6939 - * handlers invoking normal fastpath. 6940 - */ 6941 - c = slub_get_cpu_ptr(s->cpu_slab); 6942 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7593 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 7594 + for (i = 0; i < size; i++) { 6943 7595 6944 - for (i = 0; i < size; i++) { 6945 - void *object = c->freelist; 6946 - 6947 - if (unlikely(!object)) { 6948 - /* 6949 - * We may have removed an object from c->freelist using 6950 - * the fastpath in the previous iteration; in that case, 6951 - * c->tid has not been bumped yet. 6952 - * Since ___slab_alloc() may reenable interrupts while 6953 - * allocating memory, we should bump c->tid now. 6954 - */ 6955 - c->tid = next_tid(c->tid); 6956 - 6957 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6958 - 6959 - /* 6960 - * Invoking slow path likely have side-effect 6961 - * of re-populating per CPU c->freelist 6962 - */ 6963 - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 6964 - _RET_IP_, c, s->object_size); 7596 + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, 7597 + s->object_size); 6965 7598 if (unlikely(!p[i])) 6966 7599 goto error; 6967 7600 6968 - c = this_cpu_ptr(s->cpu_slab); 6969 7601 maybe_wipe_obj_freeptr(s, p[i]); 6970 - 6971 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 6972 - 6973 - continue; /* goto for-loop */ 6974 7602 } 6975 - c->freelist = get_freepointer(s, object); 6976 - p[i] = object; 6977 - maybe_wipe_obj_freeptr(s, p[i]); 6978 - stat(s, ALLOC_FASTPATH); 7603 + } else { 7604 + i = refill_objects(s, p, flags, size, size); 7605 + if (i < size) 7606 + goto error; 7607 + stat_add(s, ALLOC_SLOWPATH, i); 6979 7608 } 6980 - c->tid = next_tid(c->tid); 6981 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6982 - slub_put_cpu_ptr(s->cpu_slab); 6983 7609 6984 7610 return i; 6985 7611 6986 7612 error: 6987 - slub_put_cpu_ptr(s->cpu_slab); 6988 7613 __kmem_cache_free_bulk(s, i, p); 6989 7614 return 0; 6990 7615 6991 7616 } 6992 7617 6993 - /* Note that interrupts must be enabled when calling this function. */ 7618 + /* 7619 + * Note that interrupts must be enabled when calling this function and gfp 7620 + * flags must allow spinning. 
7621 + */ 6994 7622 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 6995 7623 void **p) 6996 7624 { ··· 7170 7494 size--; 7171 7495 } 7172 7496 7173 - if (s->cpu_sheaves) 7174 - i = alloc_from_pcs_bulk(s, size, p); 7497 + i = alloc_from_pcs_bulk(s, flags, size, p); 7175 7498 7176 7499 if (i < size) { 7177 7500 /* ··· 7358 7683 barn_init(barn); 7359 7684 } 7360 7685 7361 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7686 + #ifdef CONFIG_SLUB_STATS 7687 + static inline int alloc_kmem_cache_stats(struct kmem_cache *s) 7362 7688 { 7363 7689 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 7364 7690 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * 7365 - sizeof(struct kmem_cache_cpu)); 7691 + sizeof(struct kmem_cache_stats)); 7366 7692 7367 - /* 7368 - * Must align to double word boundary for the double cmpxchg 7369 - * instructions to work; see __pcpu_double_call_return_bool(). 7370 - */ 7371 - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 7372 - 2 * sizeof(void *)); 7693 + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); 7373 7694 7374 - if (!s->cpu_slab) 7695 + if (!s->cpu_stats) 7375 7696 return 0; 7376 - 7377 - init_kmem_cache_cpus(s); 7378 7697 7379 7698 return 1; 7380 7699 } 7700 + #endif 7381 7701 7382 7702 static int init_percpu_sheaves(struct kmem_cache *s) 7383 7703 { 7704 + static struct slab_sheaf bootstrap_sheaf = {}; 7384 7705 int cpu; 7385 7706 7386 7707 for_each_possible_cpu(cpu) { ··· 7386 7715 7387 7716 local_trylock_init(&pcs->lock); 7388 7717 7389 - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7718 + /* 7719 + * Bootstrap sheaf has zero size so fast-path allocation fails. 7720 + * It has also size == s->sheaf_capacity, so fast-path free 7721 + * fails. In the slow paths we recognize the situation by 7722 + * checking s->sheaf_capacity. This allows fast paths to assume 7723 + * s->cpu_sheaves and pcs->main always exists and are valid. 7724 + * It's also safe to share the single static bootstrap_sheaf 7725 + * with zero-sized objects array as it's never modified. 7726 + * 7727 + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we 7728 + * recognize it and not attempt to free it when destroying the 7729 + * cache. 7730 + * 7731 + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, 7732 + * caches with debug enabled, and all caches with SLUB_TINY. 7733 + * For kmalloc caches it's used temporarily during the initial 7734 + * bootstrap. 7735 + */ 7736 + if (!s->sheaf_capacity) 7737 + pcs->main = &bootstrap_sheaf; 7738 + else 7739 + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7390 7740 7391 7741 if (!pcs->main) 7392 7742 return -ENOMEM; ··· 7458 7766 * No locks need to be taken here as it has just been 7459 7767 * initialized and there is no concurrent access. 
7460 7768 */ 7461 - __add_partial(n, slab, DEACTIVATE_TO_HEAD); 7769 + __add_partial(n, slab, ADD_TO_HEAD); 7462 7770 } 7463 7771 7464 7772 static void free_kmem_cache_nodes(struct kmem_cache *s) ··· 7482 7790 void __kmem_cache_release(struct kmem_cache *s) 7483 7791 { 7484 7792 cache_random_seq_destroy(s); 7485 - if (s->cpu_sheaves) 7486 - pcs_destroy(s); 7487 - #ifdef CONFIG_PREEMPT_RT 7488 - if (s->cpu_slab) 7489 - lockdep_unregister_key(&s->lock_key); 7793 + pcs_destroy(s); 7794 + #ifdef CONFIG_SLUB_STATS 7795 + free_percpu(s->cpu_stats); 7490 7796 #endif 7491 - free_percpu(s->cpu_slab); 7492 7797 free_kmem_cache_nodes(s); 7493 7798 } 7494 7799 ··· 7502 7813 continue; 7503 7814 } 7504 7815 7505 - if (s->cpu_sheaves) { 7816 + if (cache_has_sheaves(s)) { 7506 7817 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); 7507 7818 7508 7819 if (!barn) ··· 7523 7834 return 1; 7524 7835 } 7525 7836 7526 - static void set_cpu_partial(struct kmem_cache *s) 7837 + static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, 7838 + struct kmem_cache_args *args) 7839 + 7527 7840 { 7528 - #ifdef CONFIG_SLUB_CPU_PARTIAL 7529 - unsigned int nr_objects; 7841 + unsigned int capacity; 7842 + size_t size; 7843 + 7844 + 7845 + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS) 7846 + return 0; 7530 7847 7531 7848 /* 7532 - * cpu_partial determined the maximum number of objects kept in the 7533 - * per cpu partial lists of a processor. 7534 - * 7535 - * Per cpu partial lists mainly contain slabs that just have one 7536 - * object freed. If they are used for allocation then they can be 7537 - * filled up again with minimal effort. The slab will never hit the 7538 - * per node partial lists and therefore no locking will be required. 7539 - * 7540 - * For backwards compatibility reasons, this is determined as number 7541 - * of objects, even though we now limit maximum number of pages, see 7542 - * slub_set_cpu_partial() 7849 + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT). 7850 + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not 7851 + * have sheaves to avoid recursion when sheaf allocation triggers 7852 + * kmemleak tracking. 
7543 7853 */ 7544 - if (!kmem_cache_has_cpu_partial(s)) 7545 - nr_objects = 0; 7546 - else if (s->size >= PAGE_SIZE) 7547 - nr_objects = 6; 7548 - else if (s->size >= 1024) 7549 - nr_objects = 24; 7550 - else if (s->size >= 256) 7551 - nr_objects = 52; 7552 - else 7553 - nr_objects = 120; 7854 + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) 7855 + return 0; 7554 7856 7555 - slub_set_cpu_partial(s, nr_objects); 7556 - #endif 7857 + /* 7858 + * For now we use roughly similar formula (divided by two as there are 7859 + * two percpu sheaves) as what was used for percpu partial slabs, which 7860 + * should result in similar lock contention (barn or list_lock) 7861 + */ 7862 + if (s->size >= PAGE_SIZE) 7863 + capacity = 4; 7864 + else if (s->size >= 1024) 7865 + capacity = 12; 7866 + else if (s->size >= 256) 7867 + capacity = 26; 7868 + else 7869 + capacity = 60; 7870 + 7871 + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ 7872 + size = struct_size_t(struct slab_sheaf, objects, capacity); 7873 + size = kmalloc_size_roundup(size); 7874 + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); 7875 + 7876 + /* 7877 + * Respect an explicit request for capacity that's typically motivated by 7878 + * expected maximum size of kmem_cache_prefill_sheaf() to not end up 7879 + * using low-performance oversize sheaves 7880 + */ 7881 + return max(capacity, args->sheaf_capacity); 7557 7882 } 7558 7883 7559 7884 /* ··· 7578 7875 { 7579 7876 slab_flags_t flags = s->flags; 7580 7877 unsigned int size = s->object_size; 7878 + unsigned int aligned_size; 7581 7879 unsigned int order; 7582 7880 7583 7881 /* ··· 7602 7898 7603 7899 7604 7900 /* 7605 - * If we are Redzoning then check if there is some space between the 7606 - * end of the object and the free pointer. If not then add an 7607 - * additional word to have some bytes to store Redzone information. 7901 + * If we are Redzoning and there is no space between the end of the 7902 + * object and the following fields, add one word so the right Redzone 7903 + * is non-empty. 7608 7904 */ 7609 7905 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 7610 7906 size += sizeof(void *); ··· 7617 7913 s->inuse = size; 7618 7914 7619 7915 if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || 7620 - (flags & SLAB_POISON) || s->ctor || 7916 + (flags & SLAB_POISON) || 7917 + (s->ctor && !args->use_freeptr_offset) || 7621 7918 ((flags & SLAB_RED_ZONE) && 7622 7919 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { 7623 7920 /* ··· 7639 7934 */ 7640 7935 s->offset = size; 7641 7936 size += sizeof(void *); 7642 - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { 7937 + } else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) && 7938 + args->use_freeptr_offset) { 7643 7939 s->offset = args->freeptr_offset; 7644 7940 } else { 7645 7941 /* ··· 7661 7955 7662 7956 /* Save the original kmalloc request size */ 7663 7957 if (flags & SLAB_KMALLOC) 7664 - size += sizeof(unsigned int); 7958 + size += sizeof(unsigned long); 7665 7959 } 7666 7960 #endif 7667 7961 ··· 7688 7982 * offset 0. In order to align the objects we have to simply size 7689 7983 * each object to conform to the alignment. 
7690 7984 */ 7691 - size = ALIGN(size, s->align); 7985 + aligned_size = ALIGN(size, s->align); 7986 + #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 7987 + if (slab_args_unmergeable(args, s->flags) && 7988 + (aligned_size - size >= sizeof(struct slabobj_ext))) 7989 + s->flags |= SLAB_OBJ_EXT_IN_OBJ; 7990 + #endif 7991 + size = aligned_size; 7992 + 7692 7993 s->size = size; 7693 7994 s->reciprocal_size = reciprocal_value(size); 7694 7995 order = calculate_order(size); ··· 7713 8000 7714 8001 if (s->flags & SLAB_RECLAIM_ACCOUNT) 7715 8002 s->allocflags |= __GFP_RECLAIMABLE; 8003 + 8004 + /* 8005 + * For KMALLOC_NORMAL caches we enable sheaves later by 8006 + * bootstrap_kmalloc_sheaves() to avoid recursion 8007 + */ 8008 + if (!is_kmalloc_normal(s)) 8009 + s->sheaf_capacity = calculate_sheaf_capacity(s, args); 7716 8010 7717 8011 /* 7718 8012 * Determine the number of objects per slab ··· 7805 8085 flush_all_cpus_locked(s); 7806 8086 7807 8087 /* we might have rcu sheaves in flight */ 7808 - if (s->cpu_sheaves) 8088 + if (cache_has_sheaves(s)) 7809 8089 rcu_barrier(); 7810 8090 7811 8091 /* Attempt to free all objects */ ··· 8117 8397 if (get_node(s, nid)) 8118 8398 continue; 8119 8399 8120 - if (s->cpu_sheaves) { 8400 + if (cache_has_sheaves(s)) { 8121 8401 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); 8122 8402 8123 8403 if (!barn) { ··· 8192 8472 8193 8473 memcpy(s, static_cache, kmem_cache->object_size); 8194 8474 8195 - /* 8196 - * This runs very early, and only the boot processor is supposed to be 8197 - * up. Even if it weren't true, IRQs are not up so we couldn't fire 8198 - * IPIs around. 8199 - */ 8200 - __flush_cpu_slab(s, smp_processor_id()); 8201 8475 for_each_kmem_cache_node(s, node, n) { 8202 8476 struct slab *p; 8203 8477 ··· 8205 8491 } 8206 8492 list_add(&s->list, &slab_caches); 8207 8493 return s; 8494 + } 8495 + 8496 + /* 8497 + * Finish the sheaves initialization done normally by init_percpu_sheaves() and 8498 + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it 8499 + * since sheaves and barns are allocated by kmalloc. 
8500 + */ 8501 + static void __init bootstrap_cache_sheaves(struct kmem_cache *s) 8502 + { 8503 + struct kmem_cache_args empty_args = {}; 8504 + unsigned int capacity; 8505 + bool failed = false; 8506 + int node, cpu; 8507 + 8508 + capacity = calculate_sheaf_capacity(s, &empty_args); 8509 + 8510 + /* capacity can be 0 due to debugging or SLUB_TINY */ 8511 + if (!capacity) 8512 + return; 8513 + 8514 + for_each_node_mask(node, slab_nodes) { 8515 + struct node_barn *barn; 8516 + 8517 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); 8518 + 8519 + if (!barn) { 8520 + failed = true; 8521 + goto out; 8522 + } 8523 + 8524 + barn_init(barn); 8525 + get_node(s, node)->barn = barn; 8526 + } 8527 + 8528 + for_each_possible_cpu(cpu) { 8529 + struct slub_percpu_sheaves *pcs; 8530 + 8531 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 8532 + 8533 + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity); 8534 + 8535 + if (!pcs->main) { 8536 + failed = true; 8537 + break; 8538 + } 8539 + } 8540 + 8541 + out: 8542 + /* 8543 + * It's still early in boot so treat this like same as a failure to 8544 + * create the kmalloc cache in the first place 8545 + */ 8546 + if (failed) 8547 + panic("Out of memory when creating kmem_cache %s\n", s->name); 8548 + 8549 + s->sheaf_capacity = capacity; 8550 + } 8551 + 8552 + static void __init bootstrap_kmalloc_sheaves(void) 8553 + { 8554 + enum kmalloc_cache_type type; 8555 + 8556 + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { 8557 + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { 8558 + if (kmalloc_caches[type][idx]) 8559 + bootstrap_cache_sheaves(kmalloc_caches[type][idx]); 8560 + } 8561 + } 8208 8562 } 8209 8563 8210 8564 void __init kmem_cache_init(void) ··· 8318 8536 setup_kmalloc_cache_index_table(); 8319 8537 create_kmalloc_caches(); 8320 8538 8539 + bootstrap_kmalloc_sheaves(); 8540 + 8321 8541 /* Setup random freelists for each cache */ 8322 8542 init_freelist_randomization(); 8323 8543 ··· 8336 8552 { 8337 8553 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); 8338 8554 WARN_ON(!flushwq); 8339 - } 8340 - 8341 - struct kmem_cache * 8342 - __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, 8343 - slab_flags_t flags, void (*ctor)(void *)) 8344 - { 8345 - struct kmem_cache *s; 8346 - 8347 - s = find_mergeable(size, align, flags, name, ctor); 8348 - if (s) { 8349 - if (sysfs_slab_alias(s, name)) 8350 - pr_err("SLUB: Unable to add cache alias %s to sysfs\n", 8351 - name); 8352 - 8353 - s->refcount++; 8354 - 8355 - /* 8356 - * Adjust the object sizes so that we clear 8357 - * the complete object on kzalloc. 8358 - */ 8359 - s->object_size = max(s->object_size, size); 8360 - s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); 8361 - } 8362 - 8363 - return s; 8364 8555 } 8365 8556 8366 8557 int do_kmem_cache_create(struct kmem_cache *s, const char *name, ··· 8387 8628 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); 8388 8629 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); 8389 8630 8390 - set_cpu_partial(s); 8391 - 8392 - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) 8393 - && !(s->flags & SLAB_DEBUG_FLAGS)) { 8394 - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); 8395 - if (!s->cpu_sheaves) { 8396 - err = -ENOMEM; 8397 - goto out; 8398 - } 8399 - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? 
8400 - s->sheaf_capacity = args->sheaf_capacity; 8631 + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); 8632 + if (!s->cpu_sheaves) { 8633 + err = -ENOMEM; 8634 + goto out; 8401 8635 } 8402 8636 8403 8637 #ifdef CONFIG_NUMA ··· 8406 8654 if (!init_kmem_cache_nodes(s)) 8407 8655 goto out; 8408 8656 8409 - if (!alloc_kmem_cache_cpus(s)) 8657 + #ifdef CONFIG_SLUB_STATS 8658 + if (!alloc_kmem_cache_stats(s)) 8410 8659 goto out; 8660 + #endif 8411 8661 8412 - if (s->cpu_sheaves) { 8413 - err = init_percpu_sheaves(s); 8414 - if (err) 8415 - goto out; 8416 - } 8662 + err = init_percpu_sheaves(s); 8663 + if (err) 8664 + goto out; 8417 8665 8418 8666 err = 0; 8419 8667 ··· 8728 8976 if (!nodes) 8729 8977 return -ENOMEM; 8730 8978 8731 - if (flags & SO_CPU) { 8732 - int cpu; 8733 - 8734 - for_each_possible_cpu(cpu) { 8735 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 8736 - cpu); 8737 - int node; 8738 - struct slab *slab; 8739 - 8740 - slab = READ_ONCE(c->slab); 8741 - if (!slab) 8742 - continue; 8743 - 8744 - node = slab_nid(slab); 8745 - if (flags & SO_TOTAL) 8746 - x = slab->objects; 8747 - else if (flags & SO_OBJECTS) 8748 - x = slab->inuse; 8749 - else 8750 - x = 1; 8751 - 8752 - total += x; 8753 - nodes[node] += x; 8754 - 8755 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8756 - slab = slub_percpu_partial_read_once(c); 8757 - if (slab) { 8758 - node = slab_nid(slab); 8759 - if (flags & SO_TOTAL) 8760 - WARN_ON_ONCE(1); 8761 - else if (flags & SO_OBJECTS) 8762 - WARN_ON_ONCE(1); 8763 - else 8764 - x = data_race(slab->slabs); 8765 - total += x; 8766 - nodes[node] += x; 8767 - } 8768 - #endif 8769 - } 8770 - } 8771 - 8772 8979 /* 8773 8980 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" 8774 8981 * already held which will conflict with an existing lock order: ··· 8859 9148 8860 9149 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 8861 9150 { 8862 - unsigned int nr_partial = 0; 8863 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8864 - nr_partial = s->cpu_partial; 8865 - #endif 8866 - 8867 - return sysfs_emit(buf, "%u\n", nr_partial); 9151 + return sysfs_emit(buf, "0\n"); 8868 9152 } 8869 9153 8870 9154 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, ··· 8871 9165 err = kstrtouint(buf, 10, &objects); 8872 9166 if (err) 8873 9167 return err; 8874 - if (objects && !kmem_cache_has_cpu_partial(s)) 9168 + if (objects) 8875 9169 return -EINVAL; 8876 9170 8877 - slub_set_cpu_partial(s, objects); 8878 - flush_all(s); 8879 9171 return length; 8880 9172 } 8881 9173 SLAB_ATTR(cpu_partial); ··· 8912 9208 8913 9209 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 8914 9210 { 8915 - int objects = 0; 8916 - int slabs = 0; 8917 - int cpu __maybe_unused; 8918 - int len = 0; 8919 - 8920 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8921 - for_each_online_cpu(cpu) { 8922 - struct slab *slab; 8923 - 8924 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 8925 - 8926 - if (slab) 8927 - slabs += data_race(slab->slabs); 8928 - } 8929 - #endif 8930 - 8931 - /* Approximate half-full slabs, see slub_set_cpu_partial() */ 8932 - objects = (slabs * oo_objects(s->oo)) / 2; 8933 - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); 8934 - 8935 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8936 - for_each_online_cpu(cpu) { 8937 - struct slab *slab; 8938 - 8939 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 8940 - if (slab) { 8941 - slabs = data_race(slab->slabs); 8942 - objects = (slabs * oo_objects(s->oo)) / 2; 8943 - len += sysfs_emit_at(buf, 
len, " C%d=%d(%d)", 8944 - cpu, objects, slabs); 8945 - } 8946 - } 8947 - #endif 8948 - len += sysfs_emit_at(buf, len, "\n"); 8949 - 8950 - return len; 9211 + return sysfs_emit(buf, "0(0)\n"); 8951 9212 } 8952 9213 SLAB_ATTR_RO(slabs_cpu_partial); 8953 9214 ··· 9098 9429 return -ENOMEM; 9099 9430 9100 9431 for_each_online_cpu(cpu) { 9101 - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 9432 + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; 9102 9433 9103 9434 data[cpu] = x; 9104 9435 sum += x; ··· 9124 9455 int cpu; 9125 9456 9126 9457 for_each_online_cpu(cpu) 9127 - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 9458 + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; 9128 9459 } 9129 9460 9130 9461 #define STAT_ATTR(si, text) \ ··· 9142 9473 } \ 9143 9474 SLAB_ATTR(text); \ 9144 9475 9145 - STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); 9146 9476 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 9147 9477 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 9148 - STAT_ATTR(FREE_PCS, free_cpu_sheaf); 9149 9478 STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 9150 9479 STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 9151 9480 STAT_ATTR(FREE_FASTPATH, free_fastpath); 9152 9481 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 9153 - STAT_ATTR(FREE_FROZEN, free_frozen); 9154 9482 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 9155 9483 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 9156 - STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 9157 9484 STAT_ATTR(ALLOC_SLAB, alloc_slab); 9158 - STAT_ATTR(ALLOC_REFILL, alloc_refill); 9159 9485 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 9160 9486 STAT_ATTR(FREE_SLAB, free_slab); 9161 - STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 9162 - STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 9163 - STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 9164 - STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 9165 - STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 9166 - STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 9167 - STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 9168 9487 STAT_ATTR(ORDER_FALLBACK, order_fallback); 9169 - STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 9170 9488 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 9171 - STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 9172 - STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 9173 - STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 9174 - STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 9175 9489 STAT_ATTR(SHEAF_FLUSH, sheaf_flush); 9176 9490 STAT_ATTR(SHEAF_REFILL, sheaf_refill); 9177 9491 STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); ··· 9230 9578 &remote_node_defrag_ratio_attr.attr, 9231 9579 #endif 9232 9580 #ifdef CONFIG_SLUB_STATS 9233 - &alloc_cpu_sheaf_attr.attr, 9234 9581 &alloc_fastpath_attr.attr, 9235 9582 &alloc_slowpath_attr.attr, 9236 - &free_cpu_sheaf_attr.attr, 9237 9583 &free_rcu_sheaf_attr.attr, 9238 9584 &free_rcu_sheaf_fail_attr.attr, 9239 9585 &free_fastpath_attr.attr, 9240 9586 &free_slowpath_attr.attr, 9241 - &free_frozen_attr.attr, 9242 9587 &free_add_partial_attr.attr, 9243 9588 &free_remove_partial_attr.attr, 9244 - &alloc_from_partial_attr.attr, 9245 9589 &alloc_slab_attr.attr, 9246 - &alloc_refill_attr.attr, 9247 9590 &alloc_node_mismatch_attr.attr, 9248 9591 &free_slab_attr.attr, 9249 - &cpuslab_flush_attr.attr, 9250 - &deactivate_full_attr.attr, 9251 - &deactivate_empty_attr.attr, 9252 - &deactivate_to_head_attr.attr, 9253 - &deactivate_to_tail_attr.attr, 9254 - &deactivate_remote_frees_attr.attr, 9255 - &deactivate_bypass_attr.attr, 9256 9592 
&order_fallback_attr.attr, 9257 9593 &cmpxchg_double_fail_attr.attr, 9258 - &cmpxchg_double_cpu_fail_attr.attr, 9259 - &cpu_partial_alloc_attr.attr, 9260 - &cpu_partial_free_attr.attr, 9261 - &cpu_partial_node_attr.attr, 9262 - &cpu_partial_drain_attr.attr, 9263 9594 &sheaf_flush_attr.attr, 9264 9595 &sheaf_refill_attr.attr, 9265 9596 &sheaf_alloc_attr.attr, ··· 9446 9811 9447 9812 static struct saved_alias *alias_list; 9448 9813 9449 - static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 9814 + int sysfs_slab_alias(struct kmem_cache *s, const char *name) 9450 9815 { 9451 9816 struct saved_alias *al; 9452 9817
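
The capacity rounding in calculate_sheaf_capacity() above ("Increment capacity to make sheaf exactly a kmalloc size bucket") can be sanity-checked with a small stand-alone model. The bucket progression and the 32-byte sheaf header below are assumptions for illustration only; the kernel uses kmalloc_size_roundup() and the real struct slab_sheaf layout. The arithmetic, though, mirrors the hunk: round the whole sheaf allocation up to a kmalloc bucket, then convert the leftover bytes back into extra object slots.

/*
 * Stand-alone model of the sheaf capacity rounding. The power-of-two
 * bucket progression and the 32-byte header are assumptions, not the
 * kernel's kmalloc_size_roundup() or struct slab_sheaf layout.
 */
#include <stdio.h>
#include <stddef.h>

static size_t bucket_roundup(size_t size)
{
	/* assumed power-of-two buckets starting at 64 bytes */
	size_t bucket = 64;

	while (bucket < size)
		bucket <<= 1;
	return bucket;
}

int main(void)
{
	const size_t header = 32;	/* assumed bytes before objects[] */
	unsigned int capacity = 26;	/* base value for a 256..1023 byte cache */
	size_t size;

	/* Round the sheaf allocation up to a whole bucket ... */
	size = bucket_roundup(header + capacity * sizeof(void *));
	/* ... and turn the leftover space into additional object slots. */
	capacity = (size - header) / sizeof(void *);

	printf("sheaf allocation %zu bytes -> capacity %u\n", size, capacity);
	return 0;
}

With these assumed numbers, a base capacity of 26 needs 240 bytes, rounds up to a 256-byte bucket, and ends up holding 28 object pointers, so no allocated space is wasted.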
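
Similarly, the SLAB_OBJ_EXT_IN_OBJ branch in calculate_sizes() only fires for unmergeable caches whose per-object alignment padding is at least sizeof(struct slabobj_ext). A minimal model of that check, using an assumed 16-byte extension record and made-up object/alignment values, looks like this:

/*
 * Model of the in-object slabobj_ext placement check. The 16-byte
 * extension size and the example object size and alignment are
 * assumptions for illustration, not values from the kernel headers.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define OBJ_EXT_SIZE 16		/* assumed sizeof(struct slabobj_ext) on 64-bit */

static size_t align_up(size_t size, size_t align)
{
	return (size + align - 1) & ~(align - 1);
}

int main(void)
{
	size_t size = 200;	/* object size including slub metadata */
	size_t align = 64;	/* cache alignment, e.g. cacheline aligned */
	size_t aligned_size = align_up(size, align);
	bool in_obj = (aligned_size - size) >= OBJ_EXT_SIZE;

	printf("padding %zu bytes -> obj_ext %s\n", aligned_size - size,
	       in_obj ? "placed in per-object padding" : "needs a separate array");
	return 0;
}

Here a 200-byte object aligned to 64 bytes grows to 256 bytes, leaving 56 bytes of padding, which is enough to hold the assumed extension record in-object instead of allocating a separate slabobj_ext array.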