Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab

Pull slab updates from Vlastimil Babka:

- The percpu sheaves caching layer was introduced as opt-in in 6.18; it
  is now enabled for all caches, and the previous cpu (partial) slab
  caching mechanism is removed.

  Besides the lower locking overhead and a much more likely fastpath
  when freeing, this removes the rather complicated code for the cpu
  slab lockless fastpaths (using this_cpu_try_cmpxchg128/64) and all
  its complications for PREEMPT_RT or kmalloc_nolock().

  The lockless slab freelist+counters update operation using
  try_cmpxchg128/64 remains; it is crucial for freeing remote NUMA
  objects and for flushing objects from sheaves to slabs mostly
  without taking the node list_lock (Vlastimil Babka)

- Eliminate slabobj_ext metadata overhead when possible. Instead of
  using kmalloc() to allocate the array for memcg and/or allocation
  profiling tag pointers, use leftover space in a slab or per-object
  padding due to alignment, as sketched below this list (Harry Yoo)

- Various followup improvements to the above (Hao Li)
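
A minimal standalone sketch of the space check behind the leftover-space
placement, paraphrasing the obj_exts_offset_in_slab() /
obj_exts_fit_within_slab_leftover() helpers added in the mm/slub.c diff below.
The object sizes and the 16-byte extension size used here are illustrative
assumptions, not values taken from the series:

	/*
	 * Does the slabobj_ext array fit into the slab's leftover space?
	 * Mirrors the kernel helpers; all sizes below are made up.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

	static bool obj_exts_fit_in_leftover(unsigned long slab_size,
					     unsigned long obj_size,
					     unsigned long nr_objects,
					     unsigned long ext_size)
	{
		/* the array starts after the last object, aligned to ext_size */
		unsigned long offset = ALIGN_UP(obj_size * nr_objects, ext_size);
		unsigned long array_size = ext_size * nr_objects;

		return offset + array_size <= slab_size;
	}

	int main(void)
	{
		/* 4 KiB slab, 720 B objects (5 per slab): the array fits */
		printf("720 B objects: fits=%d\n",
		       obj_exts_fit_in_leftover(4096, 720, 5, 16));
		/* 4 KiB slab, 192 B objects (21 per slab): it does not */
		printf("192 B objects: fits=%d\n",
		       obj_exts_fit_in_leftover(4096, 192, 21, 16));
		return 0;
	}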

* tag 'slab-for-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab: (39 commits)
slub: let need_slab_obj_exts() return false if SLAB_NO_OBJ_EXT is set
mm/slab: only allow SLAB_OBJ_EXT_IN_OBJ for unmergeable caches
mm/slab: place slabobj_ext metadata in unused space within s->size
mm/slab: move [__]ksize and slab_ksize() to mm/slub.c
mm/slab: save memory by allocating slabobj_ext array from leftover
mm/memcontrol,alloc_tag: handle slabobj_ext access under KASAN poison
mm/slab: use stride to access slabobj_ext
mm/slab: abstract slabobj_ext access via new slab_obj_ext() helper
ext4: specify the free pointer offset for ext4_inode_cache
mm/slab: allow specifying free pointer offset when using constructor
mm/slab: use unsigned long for orig_size to ensure proper metadata align
slub: clarify object field layout comments
mm/slab: avoid allocating slabobj_ext array from its own slab
slub: avoid list_lock contention from __refill_objects_any()
mm/slub: cleanup and repurpose some stat items
mm/slub: remove DEACTIVATE_TO_* stat items
slab: remove frozen slab checks from __slab_free()
slab: update overview comments
slab: refill sheaves from all nodes
slab: remove unused PREEMPT_RT specific macros
...

Total diffstat: +1773 -2079

fs/ext4/super.c (+13 -6)

 static int __init init_inodecache(void)
 {
-	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
-				sizeof(struct ext4_inode_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
-				offsetof(struct ext4_inode_info, i_data),
-				sizeof_field(struct ext4_inode_info, i_data),
-				init_once);
+	struct kmem_cache_args args = {
+		.useroffset = offsetof(struct ext4_inode_info, i_data),
+		.usersize = sizeof_field(struct ext4_inode_info, i_data),
+		.use_freeptr_offset = true,
+		.freeptr_offset = offsetof(struct ext4_inode_info, i_flags),
+		.ctor = init_once,
+	};
+
+	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
+				sizeof(struct ext4_inode_info),
+				&args,
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT);
+
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
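
The ext4 hunk above is the first user of the combination enabled by this
series: a cache with a constructor may now also place its free pointer inside
the object. A hedged sketch of how another cache might adopt the same
kmem_cache_args fields; everything named foo_* is hypothetical, only the
struct kmem_cache_args / kmem_cache_create() usage mirrors the change above:

	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/list.h>

	/* hypothetical object; only some fields are set by the constructor */
	struct foo {
		spinlock_t lock;	/* initialized by the constructor */
		struct list_head list;	/* initialized by the constructor */
		unsigned long state;	/* never touched by the constructor */
	};

	static void foo_init_once(void *obj)
	{
		struct foo *f = obj;

		spin_lock_init(&f->lock);
		INIT_LIST_HEAD(&f->list);
	}

	static struct kmem_cache *foo_cachep;

	static int __init foo_cache_init(void)
	{
		struct kmem_cache_args args = {
			.ctor = foo_init_once,
			.use_freeptr_offset = true,
			/* overlay a field the constructor does not initialize */
			.freeptr_offset = offsetof(struct foo, state),
		};

		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       &args, SLAB_ACCOUNT);
		return foo_cachep ? 0 : -ENOMEM;
	}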

include/linux/slab.h (+22 -18)

 #endif
 	_SLAB_OBJECT_POISON,
 	_SLAB_CMPXCHG_DOUBLE,
-#ifdef CONFIG_SLAB_OBJ_EXT
 	_SLAB_NO_OBJ_EXT,
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+	_SLAB_OBJ_EXT_IN_OBJ,
 #endif
 	_SLAB_FLAGS_LAST_BIT
 };
···
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
 
 /* Slab created using create_boot_cache */
-#ifdef CONFIG_SLAB_OBJ_EXT
 #define SLAB_NO_OBJ_EXT		__SLAB_FLAG_BIT(_SLAB_NO_OBJ_EXT)
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+#define SLAB_OBJ_EXT_IN_OBJ	__SLAB_FLAG_BIT(_SLAB_OBJ_EXT_IN_OBJ)
 #else
-#define SLAB_NO_OBJ_EXT		__SLAB_FLAG_UNUSED
+#define SLAB_OBJ_EXT_IN_OBJ	__SLAB_FLAG_UNUSED
 #endif
 
 /*
···
 	unsigned int usersize;
 	/**
 	 * @freeptr_offset: Custom offset for the free pointer
-	 * in &SLAB_TYPESAFE_BY_RCU caches
+	 * in caches with &SLAB_TYPESAFE_BY_RCU or @ctor
 	 *
-	 * By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
-	 * outside of the object. This might cause the object to grow in size.
-	 * Cache creators that have a reason to avoid this can specify a custom
-	 * free pointer offset in their struct where the free pointer will be
-	 * placed.
+	 * By default, &SLAB_TYPESAFE_BY_RCU and @ctor caches place the free
+	 * pointer outside of the object. This might cause the object to grow
+	 * in size. Cache creators that have a reason to avoid this can specify
+	 * a custom free pointer offset in their data structure where the free
+	 * pointer will be placed.
 	 *
-	 * Note that placing the free pointer inside the object requires the
-	 * caller to ensure that no fields are invalidated that are required to
-	 * guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
-	 * details).
+	 * For caches with &SLAB_TYPESAFE_BY_RCU, the caller must ensure that
+	 * the free pointer does not overlay fields required to guard against
+	 * object recycling (See &SLAB_TYPESAFE_BY_RCU for details).
+	 *
+	 * For caches with @ctor, the caller must ensure that the free pointer
+	 * does not overlay fields initialized by the constructor.
+	 *
+	 * Currently, only caches with &SLAB_TYPESAFE_BY_RCU or @ctor
+	 * may specify @freeptr_offset.
 	 *
 	 * Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
-	 * is specified, %use_freeptr_offset must be set %true.
-	 *
-	 * Note that @ctor currently isn't supported with custom free pointers
-	 * as a @ctor requires an external free pointer.
+	 * is specified, @use_freeptr_offset must be set %true.
 	 */
 	unsigned int freeptr_offset;
 	/**
···
 void kfree(const void *objp);
 void kfree_nolock(const void *objp);
 void kfree_sensitive(const void *objp);
-size_t __ksize(const void *objp);
 
 DEFINE_FREE(kfree, void *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))
 DEFINE_FREE(kfree_sensitive, void *, if (_T) kfree_sensitive(_T))

mm/Kconfig (-11)

 	  out which slabs are relevant to a particular load.
 	  Try running: slabinfo -DA
 
-config SLUB_CPU_PARTIAL
-	default y
-	depends on SMP && !SLUB_TINY
-	bool "Enable per cpu partial caches"
-	help
-	  Per cpu partial caches accelerate objects allocation and freeing
-	  that is local to a processor at the price of more indeterminism
-	  in the latency of the free. On overflow these caches will be cleared
-	  which requires the taking of locks that may cause latency spikes.
-	  Typically one would choose no for a realtime system.
-
 config RANDOM_KMALLOC_CACHES
 	default n
 	depends on !SLUB_TINY

mm/internal.h (+1)

 struct page *alloc_frozen_pages_nolock_noprof(gfp_t gfp_flags, int nid, unsigned int order);
 #define alloc_frozen_pages_nolock(...) \
 	alloc_hooks(alloc_frozen_pages_nolock_noprof(__VA_ARGS__))
+void free_frozen_pages_nolock(struct page *page, unsigned int order);
 
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);

mm/memcontrol.c (+24 -7)

 	 * Memcg membership data for each individual object is saved in
 	 * slab->obj_exts.
 	 */
-	struct slabobj_ext *obj_exts;
+	unsigned long obj_exts;
+	struct slabobj_ext *obj_ext;
 	unsigned int off;
 
 	obj_exts = slab_obj_exts(slab);
 	if (!obj_exts)
 		return NULL;
 
+	get_slab_obj_exts(obj_exts);
 	off = obj_to_index(slab->slab_cache, slab, p);
-	if (obj_exts[off].objcg)
-		return obj_cgroup_memcg(obj_exts[off].objcg);
+	obj_ext = slab_obj_ext(slab, obj_exts, off);
+	if (obj_ext->objcg) {
+		struct obj_cgroup *objcg = obj_ext->objcg;
+
+		put_slab_obj_exts(obj_exts);
+		return obj_cgroup_memcg(objcg);
+	}
+	put_slab_obj_exts(obj_exts);
 
 	return NULL;
 }
···
 	}
 
 	for (i = 0; i < size; i++) {
+		unsigned long obj_exts;
+		struct slabobj_ext *obj_ext;
+
 		slab = virt_to_slab(p[i]);
 
 		if (!slab_obj_exts(slab) &&
···
 				slab_pgdat(slab), cache_vmstat_idx(s)))
 			return false;
 
+		obj_exts = slab_obj_exts(slab);
+		get_slab_obj_exts(obj_exts);
 		off = obj_to_index(s, slab, p[i]);
+		obj_ext = slab_obj_ext(slab, obj_exts, off);
 		obj_cgroup_get(objcg);
-		slab_obj_exts(slab)[off].objcg = objcg;
+		obj_ext->objcg = objcg;
+		put_slab_obj_exts(obj_exts);
 	}
 
 	return true;
 }
 
 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
-			    void **p, int objects, struct slabobj_ext *obj_exts)
+			    void **p, int objects, unsigned long obj_exts)
 {
 	size_t obj_size = obj_full_size(s);
 
 	for (int i = 0; i < objects; i++) {
 		struct obj_cgroup *objcg;
+		struct slabobj_ext *obj_ext;
 		unsigned int off;
 
 		off = obj_to_index(s, slab, p[i]);
-		objcg = obj_exts[off].objcg;
+		obj_ext = slab_obj_ext(slab, obj_exts, off);
+		objcg = obj_ext->objcg;
 		if (!objcg)
 			continue;
 
-		obj_exts[off].objcg = NULL;
+		obj_ext->objcg = NULL;
 		refill_obj_stock(objcg, obj_size, true, -obj_size,
 				 slab_pgdat(slab), cache_vmstat_idx(s));
 		obj_cgroup_put(objcg);

mm/page_alloc.c (+5)

 	__free_frozen_pages(page, order, FPI_NONE);
 }
 
+void free_frozen_pages_nolock(struct page *page, unsigned int order)
+{
+	__free_frozen_pages(page, order, FPI_TRYLOCK);
+}
+
 /*
  * Free a batch of folios
  */

mm/slab.h (+135 -78)

 # define system_has_freelist_aba()	system_has_cmpxchg128()
 # define try_cmpxchg_freelist		try_cmpxchg128
 # endif
-#define this_cpu_try_cmpxchg_freelist	this_cpu_try_cmpxchg128
 typedef u128 freelist_full_t;
 #else /* CONFIG_64BIT */
 # ifdef system_has_cmpxchg64
 # define system_has_freelist_aba()	system_has_cmpxchg64()
 # define try_cmpxchg_freelist		try_cmpxchg64
 # endif
-#define this_cpu_try_cmpxchg_freelist	this_cpu_try_cmpxchg64
 typedef u64 freelist_full_t;
 #endif /* CONFIG_64BIT */
···
 	 * that the slab was corrupted
 	 */
 	unsigned frozen:1;
+#ifdef CONFIG_64BIT
+	/*
+	 * Some optimizations use free bits in 'counters' field
+	 * to save memory. In case ->stride field is not available,
+	 * such optimizations are disabled.
+	 */
+	unsigned short stride;
+#endif
 		};
 	};
 };
···
 	struct kmem_cache *slab_cache;
 	union {
 		struct {
-			union {
-				struct list_head slab_list;
-				struct { /* For deferred deactivate_slab() */
-					struct llist_node llnode;
-					void *flush_freelist;
-				};
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-				struct {
-					struct slab *next;
-					int slabs;	/* Nr of slabs left */
-				};
-#endif
-			};
+			struct list_head slab_list;
 			/* Double-word boundary */
 			struct freelist_counters;
 		};
···
 	return PAGE_SIZE << slab_order(slab);
 }
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-#define slub_percpu_partial(c)			((c)->partial)
-
-#define slub_set_percpu_partial(c, p)		\
-({						\
-	slub_percpu_partial(c) = (p)->next;	\
-})
-
-#define slub_percpu_partial_read_once(c)	READ_ONCE(slub_percpu_partial(c))
-#else
-#define slub_percpu_partial(c)			NULL
-
-#define slub_set_percpu_partial(c, p)
-
-#define slub_percpu_partial_read_once(c)	NULL
-#endif // CONFIG_SLUB_CPU_PARTIAL
-
 /*
  * Word size structure that can be atomically updated or read and that
  * contains both the order and the number of objects that a slab of the
···
  * Slab cache management.
  */
 struct kmem_cache {
-	struct kmem_cache_cpu __percpu *cpu_slab;
-	struct lock_class_key lock_key;
 	struct slub_percpu_sheaves __percpu *cpu_sheaves;
 	/* Used for retrieving partial slabs, etc. */
 	slab_flags_t flags;
···
 	unsigned int object_size;	/* Object size without metadata */
 	struct reciprocal_value reciprocal_size;
 	unsigned int offset;		/* Free pointer offset */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	/* Number of per cpu partial objects to keep around */
-	unsigned int cpu_partial;
-	/* Number of per cpu partial slabs to keep around */
-	unsigned int cpu_partial_slabs;
-#endif
 	unsigned int sheaf_capacity;
 	struct kmem_cache_order_objects oo;
···
 	unsigned int usersize;		/* Usercopy region size */
 #endif
 
+#ifdef CONFIG_SLUB_STATS
+	struct kmem_cache_stats __percpu *cpu_stats;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
+
+/*
+ * Every cache has !NULL s->cpu_sheaves but they may point to the
+ * bootstrap_sheaf temporarily during init, or permanently for the boot caches
+ * and caches with debugging enabled, or all caches with CONFIG_SLUB_TINY. This
+ * helper distinguishes whether cache has real non-bootstrap sheaves.
+ */
+static inline bool cache_has_sheaves(struct kmem_cache *s)
+{
+	/* Test CONFIG_SLUB_TINY for code elimination purposes */
+	return !IS_ENABLED(CONFIG_SLUB_TINY) && s->sheaf_capacity;
+}
 
 #if defined(CONFIG_SYSFS) && !defined(CONFIG_SLUB_TINY)
 #define SLAB_SUPPORTS_SYSFS 1
 void sysfs_slab_unlink(struct kmem_cache *s);
 void sysfs_slab_release(struct kmem_cache *s);
+int sysfs_slab_alias(struct kmem_cache *s, const char *name);
 #else
 static inline void sysfs_slab_unlink(struct kmem_cache *s) { }
 static inline void sysfs_slab_release(struct kmem_cache *s) { }
+static inline int sysfs_slab_alias(struct kmem_cache *s, const char *name)
+{ return 0; }
 #endif
 
 void *fixup_red_left(struct kmem_cache *s, void *p);
···
 		   unsigned int useroffset, unsigned int usersize);
 
 int slab_unmergeable(struct kmem_cache *s);
-struct kmem_cache *find_mergeable(unsigned size, unsigned align,
-		slab_flags_t flags, const char *name, void (*ctor)(void *));
-struct kmem_cache *
-__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
-		   slab_flags_t flags, void (*ctor)(void *));
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags);
 
 slab_flags_t kmem_cache_flags(slab_flags_t flags, const char *name);
···
 static inline bool slab_in_kunit_test(void) { return false; }
 #endif
 
+/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+	kasan_disable_current();
+	kmsan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+	kmsan_enable_current();
+	kasan_enable_current();
+}
+
 #ifdef CONFIG_SLAB_OBJ_EXT
 
 /*
···
  * associated with a slab.
  * @slab: a pointer to the slab struct
  *
- * Returns a pointer to the object extension vector associated with the slab,
- * or NULL if no such vector has been associated yet.
+ * Returns the address of the object extension vector associated with the slab,
+ * or zero if no such vector has been associated yet.
+ * Do not dereference the return value directly; use get/put_slab_obj_exts()
+ * pair and slab_obj_ext() to access individual elements.
+ *
+ * Example usage:
+ *
+ *   obj_exts = slab_obj_exts(slab);
+ *   if (obj_exts) {
+ *	get_slab_obj_exts(obj_exts);
+ *	obj_ext = slab_obj_ext(slab, obj_exts, obj_to_index(s, slab, obj));
+ *	// do something with obj_ext
+ *	put_slab_obj_exts(obj_exts);
+ *   }
+ *
+ * Note that the get/put semantics does not involve reference counting.
+ * Instead, it updates kasan/kmsan depth so that accesses to slabobj_ext
+ * won't be reported as access violations.
  */
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
 {
 	unsigned long obj_exts = READ_ONCE(slab->obj_exts);
 
···
 		     obj_exts != OBJEXTS_ALLOC_FAIL, slab_page(slab));
 	VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab));
 #endif
-	return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK);
+
+	return obj_exts & ~OBJEXTS_FLAGS_MASK;
+}
+
+static inline void get_slab_obj_exts(unsigned long obj_exts)
+{
+	VM_WARN_ON_ONCE(!obj_exts);
+	metadata_access_enable();
+}
+
+static inline void put_slab_obj_exts(unsigned long obj_exts)
+{
+	metadata_access_disable();
+}
+
+#ifdef CONFIG_64BIT
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+	slab->stride = stride;
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+	return slab->stride;
+}
+#else
+static inline void slab_set_stride(struct slab *slab, unsigned short stride)
+{
+	VM_WARN_ON_ONCE(stride != sizeof(struct slabobj_ext));
+}
+static inline unsigned short slab_get_stride(struct slab *slab)
+{
+	return sizeof(struct slabobj_ext);
+}
+#endif
+
+/*
+ * slab_obj_ext - get the pointer to the slab object extension metadata
+ * associated with an object in a slab.
+ * @slab: a pointer to the slab struct
+ * @obj_exts: a pointer to the object extension vector
+ * @index: an index of the object
+ *
+ * Returns a pointer to the object extension associated with the object.
+ * Must be called within a section covered by get/put_slab_obj_exts().
+ */
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+					       unsigned long obj_exts,
+					       unsigned int index)
+{
+	struct slabobj_ext *obj_ext;
+
+	VM_WARN_ON_ONCE(obj_exts != slab_obj_exts(slab));
+
+	obj_ext = (struct slabobj_ext *)(obj_exts +
+					 slab_get_stride(slab) * index);
+	return kasan_reset_tag(obj_ext);
 }
 
 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
···
 
 #else /* CONFIG_SLAB_OBJ_EXT */
 
-static inline struct slabobj_ext *slab_obj_exts(struct slab *slab)
+static inline unsigned long slab_obj_exts(struct slab *slab)
+{
+	return 0;
+}
+
+static inline struct slabobj_ext *slab_obj_ext(struct slab *slab,
+					       unsigned long obj_exts,
+					       unsigned int index)
 {
 	return NULL;
 }
+
+static inline void slab_set_stride(struct slab *slab, unsigned int stride) { }
+static inline unsigned int slab_get_stride(struct slab *slab) { return 0; }
+
 
 #endif /* CONFIG_SLAB_OBJ_EXT */
···
 bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru,
 				  gfp_t flags, size_t size, void **p);
 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
-			    void **p, int objects, struct slabobj_ext *obj_exts);
+			    void **p, int objects, unsigned long obj_exts);
 #endif
 
 void kvfree_rcu_cb(struct rcu_head *head);
-
-size_t __ksize(const void *objp);
-
-static inline size_t slab_ksize(const struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_DEBUG
-	/*
-	 * Debugging requires use of the padding between object
-	 * and whatever may come after it.
-	 */
-	if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
-		return s->object_size;
-#endif
-	if (s->flags & SLAB_KASAN)
-		return s->object_size;
-	/*
-	 * If we have the need to store the freelist pointer
-	 * back there or track user information then we can
-	 * only use the space before that information.
-	 */
-	if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER))
-		return s->inuse;
-	/*
-	 * Else we can use all the padding etc for the allocation
-	 */
-	return s->size;
-}
 
 static inline unsigned int large_kmalloc_order(const struct page *page)
 {
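
A hedged sketch of a caller using the accessors declared above; the
walk_slab_obj_exts() helper itself is hypothetical, but the access pattern
(opaque vector address, get/put bracketing, stride-aware indexing) follows the
documented example in the hunk:

	/* iterate over all object extensions of a slab (hypothetical helper) */
	static void walk_slab_obj_exts(struct kmem_cache *s, struct slab *slab,
				       void (*fn)(struct slabobj_ext *ext))
	{
		unsigned long obj_exts = slab_obj_exts(slab);
		unsigned int i;

		if (!obj_exts)	/* no extension vector attached yet */
			return;

		/* only adjusts kasan/kmsan depth, not a refcount */
		get_slab_obj_exts(obj_exts);
		for (i = 0; i < slab->objects; i++)
			fn(slab_obj_ext(slab, obj_exts, i));
		put_slab_obj_exts(obj_exts);
	}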

mm/slab_common.c (+66 -87)

 struct kmem_cache *kmem_cache;
 
 /*
- * Set of flags that will prevent slab merging
+ * Set of flags that will prevent slab merging.
+ * Any flag that adds per-object metadata should be included,
+ * since slab merging can update s->inuse that affects the metadata layout.
  */
-#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-		SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
-		SLAB_FAILSLAB | SLAB_NO_MERGE)
+#define SLAB_NEVER_MERGE (SLAB_DEBUG_FLAGS | SLAB_TYPESAFE_BY_RCU | \
+		SLAB_NOLEAKTRACE | SLAB_FAILSLAB | SLAB_NO_MERGE | \
+		SLAB_OBJ_EXT_IN_OBJ)
 
 #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
 			 SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
···
 		return 1;
 #endif
 
-	if (s->cpu_sheaves)
-		return 1;
-
 	/*
 	 * We may have set a slab to be unmergeable during bootstrap.
 	 */
···
 		return 0;
 }
 
-struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
-		slab_flags_t flags, const char *name, void (*ctor)(void *))
+bool slab_args_unmergeable(struct kmem_cache_args *args, slab_flags_t flags)
 {
-	struct kmem_cache *s;
-
 	if (slab_nomerge)
-		return NULL;
+		return true;
 
-	if (ctor)
-		return NULL;
+	if (args->ctor)
+		return true;
 
-	flags = kmem_cache_flags(flags, name);
+	if (IS_ENABLED(CONFIG_HARDENED_USERCOPY) && args->usersize)
+		return true;
 
 	if (flags & SLAB_NEVER_MERGE)
+		return true;
+
+	return false;
+}
+
+static struct kmem_cache *find_mergeable(unsigned int size, slab_flags_t flags,
+		const char *name, struct kmem_cache_args *args)
+{
+	struct kmem_cache *s;
+	unsigned int align;
+
+	flags = kmem_cache_flags(flags, name);
+	if (slab_args_unmergeable(args, flags))
 		return NULL;
 
 	size = ALIGN(size, sizeof(void *));
-	align = calculate_alignment(flags, align, size);
+	align = calculate_alignment(flags, args->align, size);
 	size = ALIGN(size, align);
 
 	list_for_each_entry_reverse(s, &slab_caches, list) {
···
 	err = -EINVAL;
 	if (args->use_freeptr_offset &&
 	    (args->freeptr_offset >= object_size ||
-	     !(flags & SLAB_TYPESAFE_BY_RCU) ||
+	     (!(flags & SLAB_TYPESAFE_BY_RCU) && !args->ctor) ||
 	     !IS_ALIGNED(args->freeptr_offset, __alignof__(freeptr_t))))
 		goto out;
···
 	kmem_cache_free(kmem_cache, s);
 out:
 	return ERR_PTR(err);
+}
+
+static struct kmem_cache *
+__kmem_cache_alias(const char *name, unsigned int size, slab_flags_t flags,
+		   struct kmem_cache_args *args)
+{
+	struct kmem_cache *s;
+
+	s = find_mergeable(size, flags, name, args);
+	if (s) {
+		if (sysfs_slab_alias(s, name))
+			pr_err("SLUB: Unable to add cache alias %s to sysfs\n",
+			       name);
+
+		s->refcount++;
+
+		/*
+		 * Adjust the object sizes so that we clear
+		 * the complete object on kzalloc.
+		 */
+		s->object_size = max(s->object_size, size);
+		s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
+	}
+
+	return s;
 }
 
 /**
···
 	flags &= ~SLAB_DEBUG_FLAGS;
 #endif
 
+	/*
+	 * Caches with specific capacity are special enough. It's simpler to
+	 * make them unmergeable.
+	 */
+	if (args->sheaf_capacity)
+		flags |= SLAB_NO_MERGE;
+
 	mutex_lock(&slab_mutex);
 
 	err = kmem_cache_sanity_check(name, object_size);
···
 		    object_size - args->usersize < args->useroffset))
 		args->usersize = args->useroffset = 0;
 
-	if (!args->usersize && !args->sheaf_capacity)
-		s = __kmem_cache_alias(name, object_size, args->align, flags,
-				       args->ctor);
+	s = __kmem_cache_alias(name, object_size, flags, args);
 	if (s)
 		goto out_unlock;
···
 					      0, SLAB_NO_MERGE, NULL);
 }
 
-/**
- * __ksize -- Report full size of underlying allocation
- * @object: pointer to the object
- *
- * This should only be used internally to query the true size of allocations.
- * It is not meant to be a way to discover the usable size of an allocation
- * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond
- * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS,
- * and/or FORTIFY_SOURCE.
- *
- * Return: size of the actual memory used by @object in bytes
- */
-size_t __ksize(const void *object)
-{
-	const struct page *page;
-	const struct slab *slab;
-
-	if (unlikely(object == ZERO_SIZE_PTR))
-		return 0;
-
-	page = virt_to_page(object);
-
-	if (unlikely(PageLargeKmalloc(page)))
-		return large_kmalloc_size(page);
-
-	slab = page_slab(page);
-	/* Delete this after we're sure there are no users */
-	if (WARN_ON(!slab))
-		return page_size(page);
-
-#ifdef CONFIG_SLUB_DEBUG
-	skip_orig_size_check(slab->slab_cache, object);
-#endif
-
-	return slab_ksize(slab->slab_cache);
-}
-
 gfp_t kmalloc_fix_flags(gfp_t flags)
 {
 	gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
···
 	kfree(mem);
 }
 EXPORT_SYMBOL(kfree_sensitive);
-
-size_t ksize(const void *objp)
-{
-	/*
-	 * We need to first check that the pointer to the object is valid.
-	 * The KASAN report printed from ksize() is more useful, then when
-	 * it's printed later when the behaviour could be undefined due to
-	 * a potential use-after-free or double-free.
-	 *
-	 * We use kasan_check_byte(), which is supported for the hardware
-	 * tag-based KASAN mode, unlike kasan_check_read/write().
-	 *
-	 * If the pointed to memory is invalid, we return 0 to avoid users of
-	 * ksize() writing to and potentially corrupting the memory region.
-	 *
-	 * We want to perform the check before __ksize(), to avoid potentially
-	 * crashing in __ksize() due to accessing invalid metadata.
-	 */
-	if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp))
-		return 0;
-
-	return kfence_ksize(objp) ?: __ksize(objp);
-}
-EXPORT_SYMBOL(ksize);
 
 #ifdef CONFIG_BPF_SYSCALL
 #include <linux/btf.h>
···
 		return false;
 
 	s = slab->slab_cache;
-	if (s->cpu_sheaves) {
-		if (likely(!IS_ENABLED(CONFIG_NUMA) ||
-			   slab_nid(slab) == numa_mem_id()))
-			return __kfree_rcu_sheaf(s, obj);
-	}
+	if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()))
+		return __kfree_rcu_sheaf(s, obj);
 
 	return false;
 }
···
  */
 void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
 {
-	if (s->cpu_sheaves)
+	if (cache_has_sheaves(s)) {
 		flush_rcu_sheaves_on_cache(s);
+		rcu_barrier();
+	}
+
 	/*
 	 * TODO: Introduce a version of __kvfree_rcu_barrier() that works
 	 * on a specific slab cache.

mm/slub.c (+1507 -1872)

 // SPDX-License-Identifier: GPL-2.0
 /*
- * SLUB: A slab allocator that limits cache line use instead of queuing
- * objects in per cpu and per node lists.
+ * SLUB: A slab allocator with low overhead percpu array caches and mostly
+ * lockless freeing of objects to slabs in the slowpath.
  *
- * The allocator synchronizes using per slab locks or atomic operations
- * and only uses a centralized lock to manage a pool of partial slabs.
+ * The allocator synchronizes using spin_trylock for percpu arrays in the
+ * fastpath, and cmpxchg_double (or bit spinlock) for slowpath freeing.
+ * Uses a centralized lock to manage a pool of partial slabs.
  *
  * (C) 2007 SGI, Christoph Lameter
  * (C) 2011 Linux Foundation, Christoph Lameter
+ * (C) 2025 SUSE, Vlastimil Babka
  */
 
 #include <linux/mm.h>
···
 
 /*
  * Lock order:
- *   1. slab_mutex (Global Mutex)
- *   2. node->list_lock (Spinlock)
- *   3. kmem_cache->cpu_slab->lock (Local lock)
- *   4. slab_lock(slab) (Only on some arches)
- *   5. object_map_lock (Only for debugging)
+ *   0. cpu_hotplug_lock
+ *   1. slab_mutex (Global Mutex)
+ *   2a. kmem_cache->cpu_sheaves->lock (Local trylock)
+ *   2b. node->barn->lock (Spinlock)
+ *   2c. node->list_lock (Spinlock)
+ *   3. slab_lock(slab) (Only on some arches)
+ *   4. object_map_lock (Only for debugging)
  *
  *   slab_mutex
  *
···
  *   C. slab->objects	-> Number of objects in slab
  *   D. slab->frozen	-> frozen state
  *
+ *   SL_partial slabs
+ *
+ *   Slabs on node partial list have at least one free object. A limited number
+ *   of slabs on the list can be fully free (slab->inuse == 0), until we start
+ *   discarding them. These slabs are marked with SL_partial, and the flag is
+ *   cleared while removing them, usually to grab their freelist afterwards.
+ *   This clearing also exempts them from list management. Please see
+ *   __slab_free() for more details.
+ *
+ *   Full slabs
+ *
+ *   For caches without debugging enabled, full slabs (slab->inuse ==
+ *   slab->objects and slab->freelist == NULL) are not placed on any list.
+ *   The __slab_free() freeing the first object from such a slab will place
+ *   it on the partial list. Caches with debugging enabled place such slab
+ *   on the full list and use different allocation and freeing paths.
+ *
  *   Frozen slabs
  *
- *   If a slab is frozen then it is exempt from list management. It is
- *   the cpu slab which is actively allocated from by the processor that
- *   froze it and it is not on any list. The processor that froze the
- *   slab is the one who can perform list operations on the slab. Other
- *   processors may put objects onto the freelist but the processor that
- *   froze the slab is the only one that can retrieve the objects from the
- *   slab's freelist.
- *
- *   CPU partial slabs
- *
- *   The partially empty slabs cached on the CPU partial list are used
- *   for performance reasons, which speeds up the allocation process.
- *   These slabs are not frozen, but are also exempt from list management,
- *   by clearing the SL_partial flag when moving out of the node
- *   partial list. Please see __slab_free() for more details.
+ *   If a slab is frozen then it is exempt from list management. It is used to
+ *   indicate a slab that has failed consistency checks and thus cannot be
+ *   allocated from anymore - it is also marked as full. Any previously
+ *   allocated objects will be simply leaked upon freeing instead of attempting
+ *   to modify the potentially corrupted freelist and metadata.
  *
  *   To sum up, the current scheme is:
- *   - node partial slab: SL_partial && !frozen
- *   - cpu partial slab: !SL_partial && !frozen
- *   - cpu slab: !SL_partial && frozen
- *   - full slab: !SL_partial && !frozen
+ *   - node partial slab: SL_partial && !full && !frozen
+ *   - taken off partial list: !SL_partial && !full && !frozen
+ *   - full slab, not on any list: !SL_partial && full && !frozen
+ *   - frozen due to inconsistency: !SL_partial && full && frozen
  *
- *   list_lock
+ *   node->list_lock (spinlock)
  *
  *   The list_lock protects the partial and full list on each node and
  *   the partial slab counter. If taken then no new slabs may be added or
···
  *
  *   The list_lock is a centralized lock and thus we avoid taking it as
  *   much as possible. As long as SLUB does not have to handle partial
- *   slabs, operations can continue without any centralized lock. F.e.
- *   allocating a long series of objects that fill up slabs does not require
- *   the list lock.
+ *   slabs, operations can continue without any centralized lock.
  *
  *   For debug caches, all allocations are forced to go through a list_lock
  *   protected region to serialize against concurrent validation.
  *
- *   cpu_slab->lock local lock
+ *   cpu_sheaves->lock (local_trylock)
  *
- *   This locks protect slowpath manipulation of all kmem_cache_cpu fields
- *   except the stat counters. This is a percpu structure manipulated only by
- *   the local cpu, so the lock protects against being preempted or interrupted
- *   by an irq. Fast path operations rely on lockless operations instead.
+ *   This lock protects fastpath operations on the percpu sheaves. On !RT it
+ *   only disables preemption and does no atomic operations. As long as the main
+ *   or spare sheaf can handle the allocation or free, there is no other
+ *   overhead.
  *
- *   On PREEMPT_RT, the local lock neither disables interrupts nor preemption
- *   which means the lockless fastpath cannot be used as it might interfere with
- *   an in-progress slow path operations. In this case the local lock is always
- *   taken but it still utilizes the freelist for the common operations.
+ *   node->barn->lock (spinlock)
  *
- *   lockless fastpaths
+ *   This lock protects the operations on per-NUMA-node barn. It can quickly
+ *   serve an empty or full sheaf if available, and avoid more expensive refill
+ *   or flush operation.
  *
- *   The fast path allocation (slab_alloc_node()) and freeing (do_slab_free())
- *   are fully lockless when satisfied from the percpu slab (and when
- *   cmpxchg_double is possible to use, otherwise slab_lock is taken).
- *   They also don't disable preemption or migration or irqs. They rely on
- *   the transaction id (tid) field to detect being preempted or moved to
- *   another cpu.
+ *   Lockless freeing
+ *
+ *   Objects may have to be freed to their slabs when they are from a remote
+ *   node (where we want to avoid filling local sheaves with remote objects)
+ *   or when there are too many full sheaves. On architectures supporting
+ *   cmpxchg_double this is done by a lockless update of slab's freelist and
+ *   counters, otherwise slab_lock is taken. This only needs to take the
+ *   list_lock if it's a first free to a full slab, or when a slab becomes empty
+ *   after the free.
  *
  *   irq, preemption, migration considerations
  *
- *   Interrupts are disabled as part of list_lock or local_lock operations, or
+ *   Interrupts are disabled as part of list_lock or barn lock operations, or
  *   around the slab_lock operation, in order to make the slab allocator safe
  *   to use in the context of an irq.
+ *   Preemption is disabled as part of local_trylock operations.
+ *   kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
+ *   their limitations.
  *
- *   In addition, preemption (or migration on PREEMPT_RT) is disabled in the
- *   allocation slowpath, bulk allocation, and put_cpu_partial(), so that the
- *   local cpu doesn't change in the process and e.g. the kmem_cache_cpu pointer
- *   doesn't have to be revalidated in each section protected by the local lock.
- *
- * SLUB assigns one slab for allocation to each processor.
- * Allocations only occur from these slabs called cpu slabs.
+ * SLUB assigns two object arrays called sheaves for caching allocations and
+ * frees on each cpu, with a NUMA node shared barn for balancing between cpus.
+ * Allocations and frees are primarily served from these sheaves.
  *
  * Slabs with free elements are kept on a partial list and during regular
  * operations no list for full slabs is used. If an object in a full slab is
···
  * We track full slabs for debugging purposes though because otherwise we
  * cannot scan all objects.
  *
- * Slabs are freed when they become empty. Teardown and setup is
- * minimal so we rely on the page allocators per cpu caches for
- * fast frees and allocs.
- *
- * slab->frozen		The slab is frozen and exempt from list processing.
- * 			This means that the slab is dedicated to a purpose
- * 			such as satisfying allocations for a specific
- * 			processor. Objects may be freed in the slab while
- * 			it is frozen but slab_free will then skip the usual
- * 			list operations. It is up to the processor holding
- * 			the slab to integrate the slab into the slab lists
- * 			when the slab is no longer needed.
- *
- * 			One use of this flag is to mark slabs that are
- * 			used for allocations. Then such a slab becomes a cpu
- * 			slab. The cpu slab may be equipped with an additional
- * 			freelist that allows lockless access to
- * 			free objects in addition to the regular freelist
- * 			that requires the slab lock.
+ * Slabs are freed when they become empty. Teardown and setup is minimal so we
+ * rely on the page allocators per cpu caches for fast frees and allocs.
  *
  * SLAB_DEBUG_FLAGS	Slab requires special handling due to debug
  * 			options set. This moves slab handling out of
···
 	SL_pfmemalloc = PG_active,	/* Historical reasons for this bit */
 };
 
-/*
- * We could simply use migrate_disable()/enable() but as long as it's a
- * function call even on !PREEMPT_RT, use inline preempt_disable() there.
- */
-#ifndef CONFIG_PREEMPT_RT
-#define slub_get_cpu_ptr(var)		get_cpu_ptr(var)
-#define slub_put_cpu_ptr(var)		put_cpu_ptr(var)
-#define USE_LOCKLESS_FAST_PATH()	(true)
-#else
-#define slub_get_cpu_ptr(var)		\
-({					\
-	migrate_disable();		\
-	this_cpu_ptr(var);		\
-})
-#define slub_put_cpu_ptr(var)		\
-do {					\
-	(void)(var);			\
-	migrate_enable();		\
-} while (0)
-#define USE_LOCKLESS_FAST_PATH()	(false)
-#endif
-
 #ifndef CONFIG_SLUB_TINY
 #define __fastpath_inline __always_inline
 #else
···
 static DEFINE_STATIC_KEY_FALSE(strict_numa);
 #endif
 
-/* Structure holding parameters for get_partial() call chain */
+/* Structure holding parameters for get_from_partial() call chain */
 struct partial_context {
 	gfp_t flags;
 	unsigned int orig_size;
-	void *object;
+};
+
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+	gfp_t flags;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
 };
 
 static inline bool kmem_cache_debug(struct kmem_cache *s)
···
 		p += s->red_left_pad;
 
 	return p;
-}
-
-static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
-{
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	return !kmem_cache_debug(s);
-#else
-	return false;
-#endif
 }
 
 /*
···
 
 #ifdef SLAB_SUPPORTS_SYSFS
 static int sysfs_slab_add(struct kmem_cache *);
-static int sysfs_slab_alias(struct kmem_cache *, const char *);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
-static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
-							{ return 0; }
 #endif
 
 #if defined(CONFIG_DEBUG_FS) && defined(CONFIG_SLUB_DEBUG)
···
 static inline void debugfs_slab_add(struct kmem_cache *s) { }
 #endif
 
+enum add_mode {
+	ADD_TO_HEAD,
+	ADD_TO_TAIL,
+};
+
 enum stat_item {
-	ALLOC_PCS,		/* Allocation from percpu sheaf */
-	ALLOC_FASTPATH,		/* Allocation from cpu slab */
-	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
-	FREE_PCS,		/* Free to percpu sheaf */
+	ALLOC_FASTPATH,		/* Allocation from percpu sheaves */
+	ALLOC_SLOWPATH,		/* Allocation from partial or new slab */
 	FREE_RCU_SHEAF,		/* Free to rcu_free sheaf */
 	FREE_RCU_SHEAF_FAIL,	/* Failed to free to a rcu_free sheaf */
-	FREE_FASTPATH,		/* Free to cpu slab */
-	FREE_SLOWPATH,		/* Freeing not to cpu slab */
-	FREE_FROZEN,		/* Freeing to frozen slab */
+	FREE_FASTPATH,		/* Free to percpu sheaves */
+	FREE_SLOWPATH,		/* Free to a slab */
 	FREE_ADD_PARTIAL,	/* Freeing moves slab to partial list */
 	FREE_REMOVE_PARTIAL,	/* Freeing removes last object */
-	ALLOC_FROM_PARTIAL,	/* Cpu slab acquired from node partial list */
-	ALLOC_SLAB,		/* Cpu slab acquired from page allocator */
-	ALLOC_REFILL,		/* Refill cpu slab from slab freelist */
-	ALLOC_NODE_MISMATCH,	/* Switching cpu slab */
+	ALLOC_SLAB,		/* New slab acquired from page allocator */
+	ALLOC_NODE_MISMATCH,	/* Requested node different from cpu sheaf */
 	FREE_SLAB,		/* Slab freed to the page allocator */
-	CPUSLAB_FLUSH,		/* Abandoning of the cpu slab */
-	DEACTIVATE_FULL,	/* Cpu slab was full when deactivated */
-	DEACTIVATE_EMPTY,	/* Cpu slab was empty when deactivated */
-	DEACTIVATE_TO_HEAD,	/* Cpu slab was moved to the head of partials */
-	DEACTIVATE_TO_TAIL,	/* Cpu slab was moved to the tail of partials */
-	DEACTIVATE_REMOTE_FREES,/* Slab contained remotely freed objects */
-	DEACTIVATE_BYPASS,	/* Implicit deactivation */
 	ORDER_FALLBACK,		/* Number of times fallback was necessary */
-	CMPXCHG_DOUBLE_CPU_FAIL,/* Failures of this_cpu_cmpxchg_double */
 	CMPXCHG_DOUBLE_FAIL,	/* Failures of slab freelist update */
-	CPU_PARTIAL_ALLOC,	/* Used cpu partial on alloc */
-	CPU_PARTIAL_FREE,	/* Refill cpu partial on free */
-	CPU_PARTIAL_NODE,	/* Refill cpu partial from node partial */
-	CPU_PARTIAL_DRAIN,	/* Drain cpu partial to node partial */
 	SHEAF_FLUSH,		/* Objects flushed from a sheaf */
 	SHEAF_REFILL,		/* Objects refilled to a sheaf */
 	SHEAF_ALLOC,		/* Allocation of an empty sheaf */
···
 	NR_SLUB_STAT_ITEMS
 };
 
-struct freelist_tid {
-	union {
-		struct {
-			void *freelist;		/* Pointer to next available object */
-			unsigned long tid;	/* Globally unique transaction id */
-		};
-		freelist_full_t freelist_tid;
-	};
-};
-
-/*
- * When changing the layout, make sure freelist and tid are still compatible
- * with this_cpu_cmpxchg_double() alignment requirements.
- */
-struct kmem_cache_cpu {
-	struct freelist_tid;
-	struct slab *slab;	/* The slab from which we are allocating */
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-	struct slab *partial;	/* Partially allocated slabs */
-#endif
-	local_trylock_t lock;	/* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
+struct kmem_cache_stats {
 	unsigned int stat[NR_SLUB_STAT_ITEMS];
-#endif
 };
+#endif
 
 static inline void stat(const struct kmem_cache *s, enum stat_item si)
 {
···
 	 * The rmw is racy on a preemptible kernel but this is acceptable, so
 	 * avoid this_cpu_add()'s irq-disable overhead.
 	 */
-	raw_cpu_inc(s->cpu_slab->stat[si]);
+	raw_cpu_inc(s->cpu_stats->stat[si]);
 #endif
 }
···
 void stat_add(const struct kmem_cache *s, enum stat_item si, int v)
 {
 #ifdef CONFIG_SLUB_STATS
-	raw_cpu_add(s->cpu_slab->stat[si], v);
+	raw_cpu_add(s->cpu_stats->stat[si], v);
 #endif
 }
···
 static nodemask_t slab_nodes;
 
 /*
- * Workqueue used for flush_cpu_slab().
+ * Workqueue used for flushing cpu and kfree_rcu sheaves.
  */
 static struct workqueue_struct *flushwq;
···
 	ptr_addr = (unsigned long)object + s->offset;
 	p = *(freeptr_t *)(ptr_addr);
 	return freelist_ptr_decode(s, p, ptr_addr);
-}
-
-static void prefetch_freepointer(const struct kmem_cache *s, void *object)
-{
-	prefetchw(object + s->offset);
-}
-
-/*
- * When running under KMSAN, get_freepointer_safe() may return an uninitialized
- * pointer value in the case the current thread loses the race for the next
- * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
- * slab_alloc_node() will fail, so the uninitialized value won't be used, but
- * KMSAN will still check all arguments of cmpxchg because of imperfect
- * handling of inline assembly.
- * To work around this problem, we apply __no_kmsan_checks to ensure that
- * get_freepointer_safe() returns initialized memory.
- */
-__no_kmsan_checks
-static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
-{
-	unsigned long freepointer_addr;
-	freeptr_t p;
-
-	if (!debug_pagealloc_enabled_static())
-		return get_freepointer(s, object);
-
-	object = kasan_reset_tag(object);
-	freepointer_addr = (unsigned long)object + s->offset;
-	copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
-	return freelist_ptr_decode(s, p, freepointer_addr);
 }
 
 static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
···
 	return x.x & OO_MASK;
 }
 
-#ifdef CONFIG_SLUB_CPU_PARTIAL
-static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
-	unsigned int nr_slabs;
-
-	s->cpu_partial = nr_objects;
-
-	/*
-	 * We take the number of objects but actually limit the number of
-	 * slabs on the per cpu partial list, in order to limit excessive
-	 * growth of the list. For simplicity we assume that the slabs will
-	 * be half-full.
-	 */
-	nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
-	s->cpu_partial_slabs = nr_slabs;
-}
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
-	return s->cpu_partial_slabs;
-}
-#else
-#ifdef SLAB_SUPPORTS_SYSFS
-static inline void
-slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
-{
-}
-#endif
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
-	return 0;
-}
-#endif /* CONFIG_SLUB_CPU_PARTIAL */
-
 /*
  * If network-based swap is enabled, slub must keep track of whether memory
  * were allocated from pfmemalloc reserves.
···
 	if (slab->freelist == old->freelist &&
 	    slab->counters == old->counters) {
 		slab->freelist = new->freelist;
-		slab->counters = new->counters;
+		/* prevent tearing for the read in get_partial_node_bulk() */
+		WRITE_ONCE(slab->counters, new->counters);
 		ret = true;
 	}
 	slab_unlock(slab);
···
 {
 	bool ret;
 
-	if (USE_LOCKLESS_FAST_PATH())
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
 		lockdep_assert_irqs_disabled();
 
 	if (s->flags & __CMPXCHG_DOUBLE)
···
  * request size in the meta data area, for better debug and sanity check.
  */
 static inline void set_orig_size(struct kmem_cache *s,
-				 void *object, unsigned int orig_size)
+				 void *object, unsigned long orig_size)
 {
 	void *p = kasan_reset_tag(object);
···
 	p += get_info_end(s);
 	p += sizeof(struct track) * 2;
 
-	*(unsigned int *)p = orig_size;
+	*(unsigned long *)p = orig_size;
 }
 
-static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
+static inline unsigned long get_orig_size(struct kmem_cache *s, void *object)
 {
 	void *p = kasan_reset_tag(object);
···
 	p += get_info_end(s);
 	p += sizeof(struct track) * 2;
 
-	return *(unsigned int *)p;
+	return *(unsigned long *)p;
 }
+
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+/*
+ * Check if memory cgroup or memory allocation profiling is enabled.
+ * If enabled, SLUB tries to reduce memory overhead of accounting
+ * slab objects. If neither is enabled when this function is called,
+ * the optimization is simply skipped to avoid affecting caches that do not
+ * need slabobj_ext metadata.
+ *
+ * However, this may disable optimization when memory cgroup or memory
+ * allocation profiling is used, but slabs are created too early
+ * even before those subsystems are initialized.
+ */
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+	if (s->flags & SLAB_NO_OBJ_EXT)
+		return false;
+
+	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+		return true;
+
+	if (mem_alloc_profiling_enabled())
+		return true;
+
+	return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+	return sizeof(struct slabobj_ext) * slab->objects;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+						    struct slab *slab)
+{
+	unsigned long objext_offset;
+
+	objext_offset = s->size * slab->objects;
+	objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext));
+	return objext_offset;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+						     struct slab *slab)
+{
+	unsigned long objext_offset = obj_exts_offset_in_slab(s, slab);
+	unsigned long objext_size = obj_exts_size_in_slab(slab);
+
+	return objext_offset + objext_size <= slab_size(slab);
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+	unsigned long obj_exts;
+	unsigned long start;
+	unsigned long end;
+
+	obj_exts = slab_obj_exts(slab);
+	if (!obj_exts)
+		return false;
+
+	start = (unsigned long)slab_address(slab);
+	end = start + slab_size(slab);
+	return (obj_exts >= start) && (obj_exts < end);
+}
+#else
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+	return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+	return 0;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+						    struct slab *slab)
+{
+	return 0;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+						     struct slab *slab)
+{
+	return false;
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+	return false;
+}
+
+#endif
+
+#if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT)
+static bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+	/*
+	 * Note we cannot rely on the SLAB_OBJ_EXT_IN_OBJ flag here and need to
+	 * check the stride. A cache can have SLAB_OBJ_EXT_IN_OBJ set, but
+	 * allocations within_slab_leftover are preferred. And those may be
+	 * possible or not depending on the particular slab's size.
+	 */
+	return obj_exts_in_slab(s, slab) &&
+	       (slab_get_stride(slab) == s->size);
+}
+
+static unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+	unsigned int offset = get_info_end(s);
+
+	if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+		offset += sizeof(struct track) * 2;
+
+	if (slub_debug_orig_size(s))
+		offset += sizeof(unsigned long);
+
+	offset += kasan_metadata_size(s, false);
+
+	return offset;
+}
+#else
+static inline bool obj_exts_in_object(struct kmem_cache *s, struct slab *slab)
+{
+	return false;
+}
+
+static inline unsigned int obj_exts_offset_in_object(struct kmem_cache *s)
+{
+	return 0;
+}
+#endif
 
 #ifdef CONFIG_SLUB_DEBUG
···
 static int disable_higher_order_debug;
 
 /*
- * slub is about to manipulate internal object metadata. This memory lies
- * outside the range of the allocated object, so accessing it would normally
- * be reported by kasan as a bounds error. metadata_access_enable() is used
- * to tell kasan that these accesses are OK.
- */
-static inline void metadata_access_enable(void)
-{
-	kasan_disable_current();
-	kmsan_disable_current();
-}
-
-static inline void metadata_access_disable(void)
-{
-	kmsan_enable_current();
-	kasan_enable_current();
-}
-
-/*
  * Object debugging
  */
···
 	p->handle = handle;
 #endif
 	p->addr = addr;
-	p->cpu = smp_processor_id();
+	p->cpu = raw_smp_processor_id();
 	p->pid = current->pid;
 	p->when = jiffies;
 }
···
 		off += 2 * sizeof(struct track);
 
 	if (slub_debug_orig_size(s))
-		off += sizeof(unsigned int);
+		off += sizeof(unsigned long);
 
 	off += kasan_metadata_size(s, false);
+
+	if (obj_exts_in_object(s, slab))
+		off += sizeof(struct slabobj_ext);
 
 	if (off != size_from_object(s))
 		/* Beginning of the filler is the free pointer */
···
 	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 
 	WARN_ON(1);
-}
-
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
-			       void **freelist, void *nextfree)
-{
-	if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
-	    !check_valid_pointer(s, slab, nextfree) && freelist) {
-		object_err(s, slab, *freelist, "Freechain corrupt");
-		*freelist = NULL;
-		slab_fix(s, "Isolate corrupted freechain");
-		return true;
-	}
-
-	return false;
 }
 
 static void __slab_err(struct slab *slab)
···
 }
 
 /*
- * Object layout:
+ * Object field layout:
  *
- * object address
- * 	Bytes of the object to be managed.
- * 	If the freepointer may overlay the object then the free
- * 	pointer is at the middle of the object.
+ * [Left redzone padding] (if SLAB_RED_ZONE)
+ * - Field size: s->red_left_pad
+ * - Immediately precedes each object when SLAB_RED_ZONE is set.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ *   0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  *
- * 	Poisoning uses 0x6b (POISON_FREE) and the last byte is
- * 	0xa5 (POISON_END)
+ * [Object bytes] (object address starts here)
+ * - Field size: s->object_size
+ * - Object payload bytes.
+ * - If the freepointer may overlap the object, it is stored inside
+ *   the object (typically near the middle).
+ * - Poisoning uses 0x6b (POISON_FREE) and the last byte is
+ *   0xa5 (POISON_END) when __OBJECT_POISON is enabled.
  *
- * object + s->object_size
- * 	Padding to reach word boundary. This is also used for Redzoning.
- * 	Padding is extended by another word if Redzoning is enabled and
- * 	object_size == inuse.
+ * [Word-align padding] (right redzone when SLAB_RED_ZONE is set)
+ * - Field size: s->inuse - s->object_size
+ * - If redzoning is enabled and ALIGN(size, sizeof(void *)) adds no
+ *   padding, explicitly extend by one word so the right redzone is
+ *   non-empty.
+ * - Filled with 0xbb (SLUB_RED_INACTIVE) for inactive objects and
+ *   0xcc (SLUB_RED_ACTIVE) for objects in use when SLAB_RED_ZONE.
  *
- * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with
- * 0xcc (SLUB_RED_ACTIVE) for objects in use.
+ * [Metadata starts at object + s->inuse]
+ * - A. freelist pointer (if freeptr_outside_object)
+ * - B. alloc tracking (SLAB_STORE_USER)
+ * - C. free tracking (SLAB_STORE_USER)
+ * - D. original request size (SLAB_KMALLOC && SLAB_STORE_USER)
+ * - E. KASAN metadata (if enabled)
  *
- * object + s->inuse
- * 	Meta data starts here.
+ * [Mandatory padding] (if CONFIG_SLUB_DEBUG && SLAB_RED_ZONE)
+ * - One mandatory debug word to guarantee a minimum poisoned gap
+ *   between metadata and the next object, independent of alignment.
+ * - Filled with 0x5a (POISON_INUSE) when SLAB_POISON is set.
+ * [Final alignment padding]
+ * - Bytes added by ALIGN(size, s->align) to reach s->size.
+ * - When the padding is large enough, it can be used to store
+ *   struct slabobj_ext for accounting metadata (obj_exts_in_object()).
+ * - The remaining bytes (if any) are filled with 0x5a (POISON_INUSE)
+ *   when SLAB_POISON is set.
  *
- * 	A. Free pointer (if we cannot overwrite object on free)
- * 	B. Tracking data for SLAB_STORE_USER
- * 	C. Original request size for kmalloc object (SLAB_STORE_USER enabled)
- * 	D. Padding to reach required alignment boundary or at minimum
- * 		one word if debugging is on to be able to detect writes
- * 		before the word boundary.
+ * Notes:
+ * - Redzones are filled by init_object() with SLUB_RED_ACTIVE/INACTIVE.
+ * - Object contents are poisoned with POISON_FREE/END when __OBJECT_POISON.
+ * - The trailing padding is pre-filled with POISON_INUSE by
+ *   setup_slab_debug() when SLAB_POISON is set, and is validated by
+ *   check_pad_bytes().
+ * - The first object pointer is slab_address(slab) +
+ *   (s->red_left_pad if redzoning); subsequent objects are reached by
+ *   adding s->size each time.
  *
- * 	Padding is done using 0x5a (POISON_INUSE)
- *
- * object + s->size
- * 	Nothing is used beyond s->size.
- *
- * If slabcaches are merged then the object_size and inuse boundaries are mostly
- * ignored. And therefore no slab options that rely on these boundaries
- * may be used with merged slabcaches.
+ * If a slab cache flag relies on specific metadata to exist at a fixed
+ * offset, the flag must be included in SLAB_NEVER_MERGE to prevent merging.
+ * Otherwise, the cache would misbehave as s->object_size and s->inuse are
+ * adjusted during cache merging (see __kmem_cache_alias()).
  */
-
 static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
 {
 	unsigned long off = get_info_end(s);	/* The end of info */
···
 		off += 2 * sizeof(struct track);
 
 		if (s->flags & SLAB_KMALLOC)
-			off += sizeof(unsigned int);
+			off += sizeof(unsigned long);
 	}
 
 	off += kasan_metadata_size(s, false);
+
+	if (obj_exts_in_object(s, slab))
+		off += sizeof(struct slabobj_ext);
 
 	if (size_from_object(s) == off)
 		return 1;
···
 	start = slab_address(slab);
 	length = slab_size(slab);
 	end = start + length;
-	remainder = length % s->size;
+
+	if (obj_exts_in_slab(s, slab) && !obj_exts_in_object(s, slab)) {
+		remainder = length;
+		remainder -= obj_exts_offset_in_slab(s, slab);
+		remainder -= obj_exts_size_in_slab(slab);
+	} else {
+		remainder = length % s->size;
+	}
+
 	if (!remainder)
 		return;
···
 					   int objects) {}
 static inline void dec_slabs_node(struct kmem_cache *s, int node,
 				  int objects) {}
-static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab,
-			       void **freelist, void *nextfree)
-{
-	return false;
-}
 #endif /* CONFIG_SLUB_DEBUG */
 
 /*
···
 
 static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
 {
-	struct slabobj_ext *slab_exts;
 	struct slab *obj_exts_slab;
+	unsigned long slab_exts;
 
 	obj_exts_slab = virt_to_slab(obj_exts);
 	slab_exts = slab_obj_exts(obj_exts_slab);
 	if (slab_exts) {
+		get_slab_obj_exts(slab_exts);
 		unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
 						 obj_exts_slab, obj_exts);
+		struct slabobj_ext *ext = slab_obj_ext(obj_exts_slab,
+						       slab_exts, offs);
 
-		if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+		if (unlikely(is_codetag_empty(&ext->ref))) {
+			put_slab_obj_exts(slab_exts);
 			return;
+		}
 
 		/* codetag should be NULL here */
-		WARN_ON(slab_exts[offs].ref.ct);
-		set_codetag_empty(&slab_exts[offs].ref);
+		WARN_ON(ext->ref.ct);
+		set_codetag_empty(&ext->ref);
+		put_slab_obj_exts(slab_exts);
 	}
 }
···
 	slab->obj_exts = 0;
 }
 
+/*
+ * Calculate the allocation size for slabobj_ext array.
+ *
+ * When memory allocation profiling is enabled, the obj_exts array
+ * could be allocated from the same slab cache it's being allocated for.
+ * This would prevent the slab from ever being freed because it would
+ * always contain at least one allocated object (its own obj_exts array).
+ *
+ * To avoid this, increase the allocation size when we detect the array
+ * may come from the same cache, forcing it to use a different cache.
+ */
+static inline size_t obj_exts_alloc_size(struct kmem_cache *s,
+					 struct slab *slab, gfp_t gfp)
+{
+	size_t sz = sizeof(struct slabobj_ext) * slab->objects;
+	struct kmem_cache *obj_exts_cache;
+
+	/*
+	 * slabobj_ext array for KMALLOC_CGROUP allocations
+	 * are served from KMALLOC_NORMAL caches.
+	 */
+	if (!mem_alloc_profiling_enabled())
+		return sz;
+
+	if (sz > KMALLOC_MAX_CACHE_SIZE)
+		return sz;
+
+	if (!is_kmalloc_normal(s))
+		return sz;
+
+	obj_exts_cache = kmalloc_slab(sz, NULL, gfp, 0);
+	/*
+	 * We can't simply compare s with obj_exts_cache, because random kmalloc
+	 * caches have multiple caches per size, selected by caller address.
+	 * Since caller address may differ between kmalloc_slab() and actual
+	 * allocation, bump size when sizes are equal.
+	 */
+	if (s->object_size == obj_exts_cache->object_size)
+		return obj_exts_cache->object_size + 1;
+
+	return sz;
+}
+
 int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
 		        gfp_t gfp, bool new_slab)
 {
···
 	unsigned long new_exts;
 	unsigned long old_exts;
 	struct slabobj_ext *vec;
+	size_t sz;
 
 	gfp &= ~OBJCGS_CLEAR_MASK;
 	/* Prevent recursive extension vector allocation */
 	gfp |= __GFP_NO_OBJ_EXT;
+
+	sz = obj_exts_alloc_size(s, slab, gfp);
 
 	/*
 	 * Note that allow_spin may be false during early boot and its
···
 	 * architectures with cmpxchg16b, early obj_exts will be missing for
 	 * very early allocations on those.
 	 */
-	if (unlikely(!allow_spin)) {
-		size_t sz = objects * sizeof(struct slabobj_ext);
-
+	if (unlikely(!allow_spin))
 		vec = kmalloc_nolock(sz, __GFP_ZERO | __GFP_NO_OBJ_EXT,
 				     slab_nid(slab));
-	} else {
-		vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
-				   slab_nid(slab));
-	}
+	else
+		vec = kmalloc_node(sz, gfp | __GFP_ZERO, slab_nid(slab));
+
 	if (!vec) {
 		/*
 		 * Try to mark vectors which failed to allocate.
···
 		return -ENOMEM;
 	}
 
+	VM_WARN_ON_ONCE(virt_to_slab(vec) != NULL &&
+			virt_to_slab(vec)->slab_cache == s);
+
 	new_exts = (unsigned long)vec;
 	if (unlikely(!allow_spin))
 		new_exts |= OBJEXTS_NOSPIN_ALLOC;
···
 retry:
 	old_exts = READ_ONCE(slab->obj_exts);
 	handle_failed_objexts_alloc(old_exts, vec, objects);
+	slab_set_stride(slab, sizeof(struct slabobj_ext));
+
 	if (new_slab) {
 		/*
 		 * If the slab is brand new and nobody can yet access its
···
 {
 	struct slabobj_ext *obj_exts;
 
-	obj_exts = slab_obj_exts(slab);
+	obj_exts = (struct slabobj_ext *)slab_obj_exts(slab);
 	if (!obj_exts) {
 		/*
 		 * If obj_exts allocation failed, slab->obj_exts is set to
 		 * OBJEXTS_ALLOC_FAIL. In this case, we end up here and should
 		 * clear the flag.
 		 */
+		slab->obj_exts = 0;
+		return;
+	}
+
+	if (obj_exts_in_slab(slab->slab_cache, slab)) {
 		slab->obj_exts = 0;
 		return;
 	}
···
 	slab->obj_exts = 0;
 }
 
+/*
+ * Try to allocate slabobj_ext array from unused space.
2209 + * This function must be called on a freshly allocated slab to prevent 2210 + * concurrency problems. 2211 + */ 2212 + static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab) 2213 + { 2214 + void *addr; 2215 + unsigned long obj_exts; 2216 + 2217 + if (!need_slab_obj_exts(s)) 2218 + return; 2219 + 2220 + if (obj_exts_fit_within_slab_leftover(s, slab)) { 2221 + addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab); 2222 + addr = kasan_reset_tag(addr); 2223 + obj_exts = (unsigned long)addr; 2224 + 2225 + get_slab_obj_exts(obj_exts); 2226 + memset(addr, 0, obj_exts_size_in_slab(slab)); 2227 + put_slab_obj_exts(obj_exts); 2228 + 2229 + #ifdef CONFIG_MEMCG 2230 + obj_exts |= MEMCG_DATA_OBJEXTS; 2231 + #endif 2232 + slab->obj_exts = obj_exts; 2233 + slab_set_stride(slab, sizeof(struct slabobj_ext)); 2234 + } else if (s->flags & SLAB_OBJ_EXT_IN_OBJ) { 2235 + unsigned int offset = obj_exts_offset_in_object(s); 2236 + 2237 + obj_exts = (unsigned long)slab_address(slab); 2238 + obj_exts += s->red_left_pad; 2239 + obj_exts += offset; 2240 + 2241 + get_slab_obj_exts(obj_exts); 2242 + for_each_object(addr, s, slab_address(slab), slab->objects) 2243 + memset(kasan_reset_tag(addr) + offset, 0, 2244 + sizeof(struct slabobj_ext)); 2245 + put_slab_obj_exts(obj_exts); 2246 + 2247 + #ifdef CONFIG_MEMCG 2248 + obj_exts |= MEMCG_DATA_OBJEXTS; 2249 + #endif 2250 + slab->obj_exts = obj_exts; 2251 + slab_set_stride(slab, s->size); 2252 + } 2253 + } 2254 + 2266 2255 #else /* CONFIG_SLAB_OBJ_EXT */ 2267 2256 2268 2257 static inline void init_slab_obj_exts(struct slab *slab) ··· 2327 2220 { 2328 2221 } 2329 2222 2223 + static inline void alloc_slab_obj_exts_early(struct kmem_cache *s, 2224 + struct slab *slab) 2225 + { 2226 + } 2227 + 2330 2228 #endif /* CONFIG_SLAB_OBJ_EXT */ 2331 2229 2332 2230 #ifdef CONFIG_MEM_ALLOC_PROFILING 2333 2231 2334 - static inline struct slabobj_ext * 2335 - prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) 2232 + static inline unsigned long 2233 + prepare_slab_obj_exts_hook(struct kmem_cache *s, struct slab *slab, 2234 + gfp_t flags, void *p) 2336 2235 { 2337 - struct slab *slab; 2338 - 2339 - slab = virt_to_slab(p); 2340 2236 if (!slab_obj_exts(slab) && 2341 2237 alloc_slab_obj_exts(slab, s, flags, false)) { 2342 2238 pr_warn_once("%s, %s: Failed to create slab extension vector!\n", 2343 2239 __func__, s->name); 2344 - return NULL; 2240 + return 0; 2345 2241 } 2346 2242 2347 - return slab_obj_exts(slab) + obj_to_index(s, slab, p); 2243 + return slab_obj_exts(slab); 2348 2244 } 2245 + 2349 2246 2350 2247 /* Should be called only if mem_alloc_profiling_enabled() */ 2351 2248 static noinline void 2352 2249 __alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) 2353 2250 { 2354 - struct slabobj_ext *obj_exts; 2251 + unsigned long obj_exts; 2252 + struct slabobj_ext *obj_ext; 2253 + struct slab *slab; 2355 2254 2356 2255 if (!object) 2357 2256 return; ··· 2368 2255 if (flags & __GFP_NO_OBJ_EXT) 2369 2256 return; 2370 2257 2371 - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); 2258 + slab = virt_to_slab(object); 2259 + obj_exts = prepare_slab_obj_exts_hook(s, slab, flags, object); 2372 2260 /* 2373 2261 * Currently obj_exts is used only for allocation profiling. 2374 2262 * If other users appear then mem_alloc_profiling_enabled() 2375 2263 * check should be added before alloc_tag_add(). 
2376 2264 */ 2377 - if (likely(obj_exts)) 2378 - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); 2379 - else 2265 + if (obj_exts) { 2266 + unsigned int obj_idx = obj_to_index(s, slab, object); 2267 + 2268 + get_slab_obj_exts(obj_exts); 2269 + obj_ext = slab_obj_ext(slab, obj_exts, obj_idx); 2270 + alloc_tag_add(&obj_ext->ref, current->alloc_tag, s->size); 2271 + put_slab_obj_exts(obj_exts); 2272 + } else { 2380 2273 alloc_tag_set_inaccurate(current->alloc_tag); 2274 + } 2381 2275 } 2382 2276 2383 2277 static inline void ··· 2399 2279 __alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, 2400 2280 int objects) 2401 2281 { 2402 - struct slabobj_ext *obj_exts; 2403 2282 int i; 2283 + unsigned long obj_exts; 2404 2284 2405 2285 /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ 2406 2286 if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) ··· 2410 2290 if (!obj_exts) 2411 2291 return; 2412 2292 2293 + get_slab_obj_exts(obj_exts); 2413 2294 for (i = 0; i < objects; i++) { 2414 2295 unsigned int off = obj_to_index(s, slab, p[i]); 2415 2296 2416 - alloc_tag_sub(&obj_exts[off].ref, s->size); 2297 + alloc_tag_sub(&slab_obj_ext(slab, obj_exts, off)->ref, s->size); 2417 2298 } 2299 + put_slab_obj_exts(obj_exts); 2418 2300 } 2419 2301 2420 2302 static inline void ··· 2474 2352 void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, 2475 2353 int objects) 2476 2354 { 2477 - struct slabobj_ext *obj_exts; 2355 + unsigned long obj_exts; 2478 2356 2479 2357 if (!memcg_kmem_online()) 2480 2358 return; ··· 2483 2361 if (likely(!obj_exts)) 2484 2362 return; 2485 2363 2364 + get_slab_obj_exts(obj_exts); 2486 2365 __memcg_slab_free_hook(s, slab, p, objects, obj_exts); 2366 + put_slab_obj_exts(obj_exts); 2487 2367 } 2488 2368 2489 2369 static __fastpath_inline 2490 2370 bool memcg_slab_post_charge(void *p, gfp_t flags) 2491 2371 { 2492 - struct slabobj_ext *slab_exts; 2372 + unsigned long obj_exts; 2373 + struct slabobj_ext *obj_ext; 2493 2374 struct kmem_cache *s; 2494 2375 struct page *page; 2495 2376 struct slab *slab; ··· 2533 2408 return true; 2534 2409 2535 2410 /* Ignore already charged objects. 
*/ 2536 - slab_exts = slab_obj_exts(slab); 2537 - if (slab_exts) { 2411 + obj_exts = slab_obj_exts(slab); 2412 + if (obj_exts) { 2413 + get_slab_obj_exts(obj_exts); 2538 2414 off = obj_to_index(s, slab, p); 2539 - if (unlikely(slab_exts[off].objcg)) 2415 + obj_ext = slab_obj_ext(slab, obj_exts, off); 2416 + if (unlikely(obj_ext->objcg)) { 2417 + put_slab_obj_exts(obj_exts); 2540 2418 return true; 2419 + } 2420 + put_slab_obj_exts(obj_exts); 2541 2421 } 2542 2422 2543 2423 return __memcg_slab_post_alloc_hook(s, NULL, flags, 1, &p); ··· 2726 2596 return object; 2727 2597 } 2728 2598 2729 - static struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp) 2599 + static struct slab_sheaf *__alloc_empty_sheaf(struct kmem_cache *s, gfp_t gfp, 2600 + unsigned int capacity) 2730 2601 { 2731 2602 struct slab_sheaf *sheaf; 2732 2603 size_t sheaf_size; ··· 2745 2614 if (s->flags & SLAB_KMALLOC) 2746 2615 gfp |= __GFP_NO_OBJ_EXT; 2747 2616 2748 - sheaf_size = struct_size(sheaf, objects, s->sheaf_capacity); 2617 + sheaf_size = struct_size(sheaf, objects, capacity); 2749 2618 sheaf = kzalloc(sheaf_size, gfp); 2750 2619 2751 2620 if (unlikely(!sheaf)) ··· 2758 2627 return sheaf; 2759 2628 } 2760 2629 2630 + static inline struct slab_sheaf *alloc_empty_sheaf(struct kmem_cache *s, 2631 + gfp_t gfp) 2632 + { 2633 + return __alloc_empty_sheaf(s, gfp, s->sheaf_capacity); 2634 + } 2635 + 2761 2636 static void free_empty_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf) 2762 2637 { 2763 2638 kfree(sheaf); ··· 2771 2634 stat(s, SHEAF_FREE); 2772 2635 } 2773 2636 2774 - static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 2775 - size_t size, void **p); 2776 - 2637 + static unsigned int 2638 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 2639 + unsigned int max); 2777 2640 2778 2641 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf, 2779 2642 gfp_t gfp) ··· 2784 2647 if (!to_fill) 2785 2648 return 0; 2786 2649 2787 - filled = __kmem_cache_alloc_bulk(s, gfp, to_fill, 2788 - &sheaf->objects[sheaf->size]); 2650 + filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill, 2651 + to_fill); 2789 2652 2790 2653 sheaf->size += filled; 2791 2654 ··· 2986 2849 { 2987 2850 int cpu; 2988 2851 2852 + /* 2853 + * We may be unwinding cache creation that failed before or during the 2854 + * allocation of this. 2855 + */ 2856 + if (!s->cpu_sheaves) 2857 + return; 2858 + 2859 + /* pcs->main can only point to the bootstrap sheaf, nothing to free */ 2860 + if (!cache_has_sheaves(s)) 2861 + goto free_pcs; 2862 + 2989 2863 for_each_possible_cpu(cpu) { 2990 2864 struct slub_percpu_sheaves *pcs; 2991 2865 2992 2866 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 2993 2867 2994 - /* can happen when unwinding failed create */ 2868 + /* This can happen when unwinding failed cache creation. 
*/ 2995 2869 if (!pcs->main) 2996 2870 continue; 2997 2871 ··· 3024 2876 } 3025 2877 } 3026 2878 2879 + free_pcs: 3027 2880 free_percpu(s->cpu_sheaves); 3028 2881 s->cpu_sheaves = NULL; 3029 2882 } 3030 2883 3031 - static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn) 2884 + static struct slab_sheaf *barn_get_empty_sheaf(struct node_barn *barn, 2885 + bool allow_spin) 3032 2886 { 3033 2887 struct slab_sheaf *empty = NULL; 3034 2888 unsigned long flags; ··· 3038 2888 if (!data_race(barn->nr_empty)) 3039 2889 return NULL; 3040 2890 3041 - spin_lock_irqsave(&barn->lock, flags); 2891 + if (likely(allow_spin)) 2892 + spin_lock_irqsave(&barn->lock, flags); 2893 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 2894 + return NULL; 3042 2895 3043 2896 if (likely(barn->nr_empty)) { 3044 2897 empty = list_first_entry(&barn->sheaves_empty, ··· 3118 2965 * change. 3119 2966 */ 3120 2967 static struct slab_sheaf * 3121 - barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty) 2968 + barn_replace_empty_sheaf(struct node_barn *barn, struct slab_sheaf *empty, 2969 + bool allow_spin) 3122 2970 { 3123 2971 struct slab_sheaf *full = NULL; 3124 2972 unsigned long flags; ··· 3127 2973 if (!data_race(barn->nr_full)) 3128 2974 return NULL; 3129 2975 3130 - spin_lock_irqsave(&barn->lock, flags); 2976 + if (likely(allow_spin)) 2977 + spin_lock_irqsave(&barn->lock, flags); 2978 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 2979 + return NULL; 3131 2980 3132 2981 if (likely(barn->nr_full)) { 3133 2982 full = list_first_entry(&barn->sheaves_full, struct slab_sheaf, ··· 3151 2994 * barn. But if there are too many full sheaves, reject this with -E2BIG. 3152 2995 */ 3153 2996 static struct slab_sheaf * 3154 - barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full) 2997 + barn_replace_full_sheaf(struct node_barn *barn, struct slab_sheaf *full, 2998 + bool allow_spin) 3155 2999 { 3156 3000 struct slab_sheaf *empty; 3157 3001 unsigned long flags; ··· 3163 3005 if (!data_race(barn->nr_empty)) 3164 3006 return ERR_PTR(-ENOMEM); 3165 3007 3166 - spin_lock_irqsave(&barn->lock, flags); 3008 + if (likely(allow_spin)) 3009 + spin_lock_irqsave(&barn->lock, flags); 3010 + else if (!spin_trylock_irqsave(&barn->lock, flags)) 3011 + return ERR_PTR(-EBUSY); 3167 3012 3168 3013 if (likely(barn->nr_empty)) { 3169 3014 empty = list_first_entry(&barn->sheaves_empty, struct slab_sheaf, ··· 3359 3198 static __always_inline void account_slab(struct slab *slab, int order, 3360 3199 struct kmem_cache *s, gfp_t gfp) 3361 3200 { 3362 - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) 3201 + if (memcg_kmem_online() && 3202 + (s->flags & SLAB_ACCOUNT) && 3203 + !slab_obj_exts(slab)) 3363 3204 alloc_slab_obj_exts(slab, s, gfp, true); 3364 3205 3365 3206 mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), ··· 3425 3262 slab->objects = oo_objects(oo); 3426 3263 slab->inuse = 0; 3427 3264 slab->frozen = 0; 3428 - init_slab_obj_exts(slab); 3429 - 3430 - account_slab(slab, oo_order(oo), s, flags); 3431 3265 3432 3266 slab->slab_cache = s; 3433 3267 ··· 3433 3273 start = slab_address(slab); 3434 3274 3435 3275 setup_slab_debug(s, slab, start); 3276 + init_slab_obj_exts(slab); 3277 + /* 3278 + * Poison the slab before initializing the slabobj_ext array 3279 + * to prevent the array from being overwritten. 
3280 + */ 3281 + alloc_slab_obj_exts_early(s, slab); 3282 + account_slab(slab, oo_order(oo), s, flags); 3436 3283 3437 3284 shuffle = shuffle_freelist(s, slab); 3438 3285 ··· 3470 3303 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); 3471 3304 } 3472 3305 3473 - static void __free_slab(struct kmem_cache *s, struct slab *slab) 3306 + static void __free_slab(struct kmem_cache *s, struct slab *slab, bool allow_spin) 3474 3307 { 3475 3308 struct page *page = slab_page(slab); 3476 3309 int order = compound_order(page); ··· 3481 3314 __ClearPageSlab(page); 3482 3315 mm_account_reclaimed_pages(pages); 3483 3316 unaccount_slab(slab, order, s); 3484 - free_frozen_pages(page, order); 3317 + if (allow_spin) 3318 + free_frozen_pages(page, order); 3319 + else 3320 + free_frozen_pages_nolock(page, order); 3321 + } 3322 + 3323 + static void free_new_slab_nolock(struct kmem_cache *s, struct slab *slab) 3324 + { 3325 + /* 3326 + * Since it was just allocated, we can skip the actions in 3327 + * discard_slab() and free_slab(). 3328 + */ 3329 + __free_slab(s, slab, false); 3485 3330 } 3486 3331 3487 3332 static void rcu_free_slab(struct rcu_head *h) 3488 3333 { 3489 3334 struct slab *slab = container_of(h, struct slab, rcu_head); 3490 3335 3491 - __free_slab(slab->slab_cache, slab); 3336 + __free_slab(slab->slab_cache, slab, true); 3492 3337 } 3493 3338 3494 3339 static void free_slab(struct kmem_cache *s, struct slab *slab) ··· 3516 3337 if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) 3517 3338 call_rcu(&slab->rcu_head, rcu_free_slab); 3518 3339 else 3519 - __free_slab(s, slab); 3340 + __free_slab(s, slab, true); 3520 3341 } 3521 3342 3522 3343 static void discard_slab(struct kmem_cache *s, struct slab *slab) ··· 3544 3365 * Management of partially allocated slabs. 3545 3366 */ 3546 3367 static inline void 3547 - __add_partial(struct kmem_cache_node *n, struct slab *slab, int tail) 3368 + __add_partial(struct kmem_cache_node *n, struct slab *slab, enum add_mode mode) 3548 3369 { 3549 3370 n->nr_partial++; 3550 - if (tail == DEACTIVATE_TO_TAIL) 3371 + if (mode == ADD_TO_TAIL) 3551 3372 list_add_tail(&slab->slab_list, &n->partial); 3552 3373 else 3553 3374 list_add(&slab->slab_list, &n->partial); ··· 3555 3376 } 3556 3377 3557 3378 static inline void add_partial(struct kmem_cache_node *n, 3558 - struct slab *slab, int tail) 3379 + struct slab *slab, enum add_mode mode) 3559 3380 { 3560 3381 lockdep_assert_held(&n->list_lock); 3561 - __add_partial(n, slab, tail); 3382 + __add_partial(n, slab, mode); 3562 3383 } 3563 3384 3564 3385 static inline void remove_partial(struct kmem_cache_node *n, ··· 3609 3430 return object; 3610 3431 } 3611 3432 3612 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist); 3613 - 3614 3433 /* 3615 3434 * Called only for kmem_cache_debug() caches to allocate from a freshly 3616 3435 * allocated slab. Allocate a single object instead of whole freelist ··· 3624 3447 void *object; 3625 3448 3626 3449 if (!allow_spin && !spin_trylock_irqsave(&n->list_lock, flags)) { 3627 - /* Unlucky, discard newly allocated slab */ 3628 - defer_deactivate_slab(slab, NULL); 3450 + /* Unlucky, discard newly allocated slab. 
*/ 3451 + free_new_slab_nolock(s, slab); 3629 3452 return NULL; 3630 3453 } 3631 3454 ··· 3651 3474 if (slab->inuse == slab->objects) 3652 3475 add_full(s, n, slab); 3653 3476 else 3654 - add_partial(n, slab, DEACTIVATE_TO_HEAD); 3477 + add_partial(n, slab, ADD_TO_HEAD); 3655 3478 3656 3479 inc_slabs_node(s, nid, slab->objects); 3657 3480 spin_unlock_irqrestore(&n->list_lock, flags); ··· 3659 3482 return object; 3660 3483 } 3661 3484 3662 - #ifdef CONFIG_SLUB_CPU_PARTIAL 3663 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain); 3664 - #else 3665 - static inline void put_cpu_partial(struct kmem_cache *s, struct slab *slab, 3666 - int drain) { } 3667 - #endif 3668 3485 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags); 3669 3486 3670 - /* 3671 - * Try to allocate a partial slab from a specific node. 3672 - */ 3673 - static struct slab *get_partial_node(struct kmem_cache *s, 3674 - struct kmem_cache_node *n, 3675 - struct partial_context *pc) 3487 + static bool get_partial_node_bulk(struct kmem_cache *s, 3488 + struct kmem_cache_node *n, 3489 + struct partial_bulk_context *pc, 3490 + bool allow_spin) 3676 3491 { 3677 - struct slab *slab, *slab2, *partial = NULL; 3492 + struct slab *slab, *slab2; 3493 + unsigned int total_free = 0; 3678 3494 unsigned long flags; 3679 - unsigned int partial_slabs = 0; 3495 + 3496 + /* Racy check to avoid taking the lock unnecessarily. */ 3497 + if (!n || data_race(!n->nr_partial)) 3498 + return false; 3499 + 3500 + INIT_LIST_HEAD(&pc->slabs); 3501 + 3502 + if (allow_spin) 3503 + spin_lock_irqsave(&n->list_lock, flags); 3504 + else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3505 + return false; 3506 + 3507 + list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3508 + struct freelist_counters flc; 3509 + unsigned int slab_free; 3510 + 3511 + if (!pfmemalloc_match(slab, pc->flags)) 3512 + continue; 3513 + 3514 + /* 3515 + * determine the number of free objects in the slab racily 3516 + * 3517 + * slab_free is a lower bound due to possible subsequent 3518 + * concurrent freeing, so the caller may get more objects than 3519 + * requested and must handle that 3520 + */ 3521 + flc.counters = data_race(READ_ONCE(slab->counters)); 3522 + slab_free = flc.objects - flc.inuse; 3523 + 3524 + /* we have already min and this would get us over the max */ 3525 + if (total_free >= pc->min_objects 3526 + && total_free + slab_free > pc->max_objects) 3527 + break; 3528 + 3529 + remove_partial(n, slab); 3530 + 3531 + list_add(&slab->slab_list, &pc->slabs); 3532 + 3533 + total_free += slab_free; 3534 + if (total_free >= pc->max_objects) 3535 + break; 3536 + } 3537 + 3538 + spin_unlock_irqrestore(&n->list_lock, flags); 3539 + return total_free > 0; 3540 + } 3541 + 3542 + /* 3543 + * Try to allocate object from a partial slab on a specific node. 3544 + */ 3545 + static void *get_from_partial_node(struct kmem_cache *s, 3546 + struct kmem_cache_node *n, 3547 + struct partial_context *pc) 3548 + { 3549 + struct slab *slab, *slab2; 3550 + unsigned long flags; 3551 + void *object = NULL; 3680 3552 3681 3553 /* 3682 3554 * Racy check. If we mistakenly see no partial slabs then we 3683 3555 * just allocate an empty slab. If we mistakenly try to get a 3684 - * partial slab and there is none available then get_partial() 3556 + * partial slab and there is none available then get_from_partial() 3685 3557 * will return NULL. 
3686 3558 */ 3687 3559 if (!n || !n->nr_partial) ··· 3741 3515 else if (!spin_trylock_irqsave(&n->list_lock, flags)) 3742 3516 return NULL; 3743 3517 list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) { 3518 + 3519 + struct freelist_counters old, new; 3520 + 3744 3521 if (!pfmemalloc_match(slab, pc->flags)) 3745 3522 continue; 3746 3523 3747 3524 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 3748 - void *object = alloc_single_from_partial(s, n, slab, 3525 + object = alloc_single_from_partial(s, n, slab, 3749 3526 pc->orig_size); 3750 - if (object) { 3751 - partial = slab; 3752 - pc->object = object; 3527 + if (object) 3753 3528 break; 3754 - } 3755 3529 continue; 3756 3530 } 3757 3531 3758 - remove_partial(n, slab); 3532 + /* 3533 + * get a single object from the slab. This might race against 3534 + * __slab_free(), which however has to take the list_lock if 3535 + * it's about to make the slab fully free. 3536 + */ 3537 + do { 3538 + old.freelist = slab->freelist; 3539 + old.counters = slab->counters; 3759 3540 3760 - if (!partial) { 3761 - partial = slab; 3762 - stat(s, ALLOC_FROM_PARTIAL); 3541 + new.freelist = get_freepointer(s, old.freelist); 3542 + new.counters = old.counters; 3543 + new.inuse++; 3763 3544 3764 - if ((slub_get_cpu_partial(s) == 0)) { 3765 - break; 3766 - } 3767 - } else { 3768 - put_cpu_partial(s, slab, 0); 3769 - stat(s, CPU_PARTIAL_NODE); 3545 + } while (!__slab_update_freelist(s, slab, &old, &new, "get_from_partial_node")); 3770 3546 3771 - if (++partial_slabs > slub_get_cpu_partial(s) / 2) { 3772 - break; 3773 - } 3774 - } 3547 + object = old.freelist; 3548 + if (!new.freelist) 3549 + remove_partial(n, slab); 3550 + 3551 + break; 3775 3552 } 3776 3553 spin_unlock_irqrestore(&n->list_lock, flags); 3777 - return partial; 3554 + return object; 3778 3555 } 3779 3556 3780 3557 /* 3781 - * Get a slab from somewhere. Search in increasing NUMA distances. 3558 + * Get an object from somewhere. Search in increasing NUMA distances. 3782 3559 */ 3783 - static struct slab *get_any_partial(struct kmem_cache *s, 3784 - struct partial_context *pc) 3560 + static void *get_from_any_partial(struct kmem_cache *s, struct partial_context *pc) 3785 3561 { 3786 3562 #ifdef CONFIG_NUMA 3787 3563 struct zonelist *zonelist; 3788 3564 struct zoneref *z; 3789 3565 struct zone *zone; 3790 3566 enum zone_type highest_zoneidx = gfp_zone(pc->flags); 3791 - struct slab *slab; 3792 3567 unsigned int cpuset_mems_cookie; 3793 3568 3794 3569 /* ··· 3824 3597 3825 3598 if (n && cpuset_zone_allowed(zone, pc->flags) && 3826 3599 n->nr_partial > s->min_partial) { 3827 - slab = get_partial_node(s, n, pc); 3828 - if (slab) { 3600 + 3601 + void *object = get_from_partial_node(s, n, pc); 3602 + 3603 + if (object) { 3829 3604 /* 3830 3605 * Don't check read_mems_allowed_retry() 3831 3606 * here - if mems_allowed was updated in ··· 3835 3606 * between allocation and the cpuset 3836 3607 * update 3837 3608 */ 3838 - return slab; 3609 + return object; 3839 3610 } 3840 3611 } 3841 3612 } ··· 3845 3616 } 3846 3617 3847 3618 /* 3848 - * Get a partial slab, lock it and return it. 
3619 + * Get an object from a partial slab 3849 3620 */ 3850 - static struct slab *get_partial(struct kmem_cache *s, int node, 3851 - struct partial_context *pc) 3621 + static void *get_from_partial(struct kmem_cache *s, int node, 3622 + struct partial_context *pc) 3852 3623 { 3853 - struct slab *slab; 3854 3624 int searchnode = node; 3625 + void *object; 3855 3626 3856 3627 if (node == NUMA_NO_NODE) 3857 3628 searchnode = numa_mem_id(); 3858 3629 3859 - slab = get_partial_node(s, get_node(s, searchnode), pc); 3860 - if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3861 - return slab; 3630 + object = get_from_partial_node(s, get_node(s, searchnode), pc); 3631 + if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE))) 3632 + return object; 3862 3633 3863 - return get_any_partial(s, pc); 3864 - } 3865 - 3866 - #ifdef CONFIG_PREEMPTION 3867 - /* 3868 - * Calculate the next globally unique transaction for disambiguation 3869 - * during cmpxchg. The transactions start with the cpu number and are then 3870 - * incremented by CONFIG_NR_CPUS. 3871 - */ 3872 - #define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS) 3873 - #else 3874 - /* 3875 - * No preemption supported therefore also no need to check for 3876 - * different cpus. 3877 - */ 3878 - #define TID_STEP 1 3879 - #endif /* CONFIG_PREEMPTION */ 3880 - 3881 - static inline unsigned long next_tid(unsigned long tid) 3882 - { 3883 - return tid + TID_STEP; 3884 - } 3885 - 3886 - #ifdef SLUB_DEBUG_CMPXCHG 3887 - static inline unsigned int tid_to_cpu(unsigned long tid) 3888 - { 3889 - return tid % TID_STEP; 3890 - } 3891 - 3892 - static inline unsigned long tid_to_event(unsigned long tid) 3893 - { 3894 - return tid / TID_STEP; 3895 - } 3896 - #endif 3897 - 3898 - static inline unsigned int init_tid(int cpu) 3899 - { 3900 - return cpu; 3901 - } 3902 - 3903 - static inline void note_cmpxchg_failure(const char *n, 3904 - const struct kmem_cache *s, unsigned long tid) 3905 - { 3906 - #ifdef SLUB_DEBUG_CMPXCHG 3907 - unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); 3908 - 3909 - pr_info("%s %s: cmpxchg redo ", n, s->name); 3910 - 3911 - if (IS_ENABLED(CONFIG_PREEMPTION) && 3912 - tid_to_cpu(tid) != tid_to_cpu(actual_tid)) { 3913 - pr_warn("due to cpu change %d -> %d\n", 3914 - tid_to_cpu(tid), tid_to_cpu(actual_tid)); 3915 - } else if (tid_to_event(tid) != tid_to_event(actual_tid)) { 3916 - pr_warn("due to cpu running other code. Event %ld->%ld\n", 3917 - tid_to_event(tid), tid_to_event(actual_tid)); 3918 - } else { 3919 - pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", 3920 - actual_tid, tid, next_tid(tid)); 3921 - } 3922 - #endif 3923 - stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 3924 - } 3925 - 3926 - static void init_kmem_cache_cpus(struct kmem_cache *s) 3927 - { 3928 - #ifdef CONFIG_PREEMPT_RT 3929 - /* 3930 - * Register lockdep key for non-boot kmem caches to avoid 3931 - * WARN_ON_ONCE(static_obj(key))) in lockdep_register_key() 3932 - */ 3933 - bool finegrain_lockdep = !init_section_contains(s, 1); 3934 - #else 3935 - /* 3936 - * Don't bother with different lockdep classes for each 3937 - * kmem_cache, since we only use local_trylock_irqsave(). 
3938 - */ 3939 - bool finegrain_lockdep = false; 3940 - #endif 3941 - int cpu; 3942 - struct kmem_cache_cpu *c; 3943 - 3944 - if (finegrain_lockdep) 3945 - lockdep_register_key(&s->lock_key); 3946 - for_each_possible_cpu(cpu) { 3947 - c = per_cpu_ptr(s->cpu_slab, cpu); 3948 - local_trylock_init(&c->lock); 3949 - if (finegrain_lockdep) 3950 - lockdep_set_class(&c->lock, &s->lock_key); 3951 - c->tid = init_tid(cpu); 3952 - } 3953 - } 3954 - 3955 - /* 3956 - * Finishes removing the cpu slab. Merges cpu's freelist with slab's freelist, 3957 - * unfreezes the slabs and puts it on the proper list. 3958 - * Assumes the slab has been already safely taken away from kmem_cache_cpu 3959 - * by the caller. 3960 - */ 3961 - static void deactivate_slab(struct kmem_cache *s, struct slab *slab, 3962 - void *freelist) 3963 - { 3964 - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); 3965 - int free_delta = 0; 3966 - void *nextfree, *freelist_iter, *freelist_tail; 3967 - int tail = DEACTIVATE_TO_HEAD; 3968 - unsigned long flags = 0; 3969 - struct freelist_counters old, new; 3970 - 3971 - if (READ_ONCE(slab->freelist)) { 3972 - stat(s, DEACTIVATE_REMOTE_FREES); 3973 - tail = DEACTIVATE_TO_TAIL; 3974 - } 3975 - 3976 - /* 3977 - * Stage one: Count the objects on cpu's freelist as free_delta and 3978 - * remember the last object in freelist_tail for later splicing. 3979 - */ 3980 - freelist_tail = NULL; 3981 - freelist_iter = freelist; 3982 - while (freelist_iter) { 3983 - nextfree = get_freepointer(s, freelist_iter); 3984 - 3985 - /* 3986 - * If 'nextfree' is invalid, it is possible that the object at 3987 - * 'freelist_iter' is already corrupted. So isolate all objects 3988 - * starting at 'freelist_iter' by skipping them. 3989 - */ 3990 - if (freelist_corrupted(s, slab, &freelist_iter, nextfree)) 3991 - break; 3992 - 3993 - freelist_tail = freelist_iter; 3994 - free_delta++; 3995 - 3996 - freelist_iter = nextfree; 3997 - } 3998 - 3999 - /* 4000 - * Stage two: Unfreeze the slab while splicing the per-cpu 4001 - * freelist to the head of slab's freelist. 4002 - */ 4003 - do { 4004 - old.freelist = READ_ONCE(slab->freelist); 4005 - old.counters = READ_ONCE(slab->counters); 4006 - VM_BUG_ON(!old.frozen); 4007 - 4008 - /* Determine target state of the slab */ 4009 - new.counters = old.counters; 4010 - new.frozen = 0; 4011 - if (freelist_tail) { 4012 - new.inuse -= free_delta; 4013 - set_freepointer(s, freelist_tail, old.freelist); 4014 - new.freelist = freelist; 4015 - } else { 4016 - new.freelist = old.freelist; 4017 - } 4018 - } while (!slab_update_freelist(s, slab, &old, &new, "unfreezing slab")); 4019 - 4020 - /* 4021 - * Stage three: Manipulate the slab list based on the updated state. 4022 - */ 4023 - if (!new.inuse && n->nr_partial >= s->min_partial) { 4024 - stat(s, DEACTIVATE_EMPTY); 4025 - discard_slab(s, slab); 4026 - stat(s, FREE_SLAB); 4027 - } else if (new.freelist) { 4028 - spin_lock_irqsave(&n->list_lock, flags); 4029 - add_partial(n, slab, tail); 4030 - spin_unlock_irqrestore(&n->list_lock, flags); 4031 - stat(s, tail); 4032 - } else { 4033 - stat(s, DEACTIVATE_FULL); 4034 - } 4035 - } 4036 - 4037 - /* 4038 - * ___slab_alloc()'s caller is supposed to check if kmem_cache::kmem_cache_cpu::lock 4039 - * can be acquired without a deadlock before invoking the function. 4040 - * 4041 - * Without LOCKDEP we trust the code to be correct. 
kmalloc_nolock() is 4042 - * using local_lock_is_locked() properly before calling local_lock_cpu_slab(), 4043 - * and kmalloc() is not used in an unsupported context. 4044 - * 4045 - * With LOCKDEP, on PREEMPT_RT lockdep does its checking in local_lock_irqsave(). 4046 - * On !PREEMPT_RT we use trylock to avoid false positives in NMI, but 4047 - * lockdep_assert() will catch a bug in case: 4048 - * #1 4049 - * kmalloc() -> ___slab_alloc() -> irqsave -> NMI -> bpf -> kmalloc_nolock() 4050 - * or 4051 - * #2 4052 - * kmalloc() -> ___slab_alloc() -> irqsave -> tracepoint/kprobe -> bpf -> kmalloc_nolock() 4053 - * 4054 - * On PREEMPT_RT an invocation is not possible from IRQ-off or preempt 4055 - * disabled context. The lock will always be acquired and if needed it 4056 - * block and sleep until the lock is available. 4057 - * #1 is possible in !PREEMPT_RT only. 4058 - * #2 is possible in both with a twist that irqsave is replaced with rt_spinlock: 4059 - * kmalloc() -> ___slab_alloc() -> rt_spin_lock(kmem_cache_A) -> 4060 - * tracepoint/kprobe -> bpf -> kmalloc_nolock() -> rt_spin_lock(kmem_cache_B) 4061 - * 4062 - * local_lock_is_locked() prevents the case kmem_cache_A == kmem_cache_B 4063 - */ 4064 - #if defined(CONFIG_PREEMPT_RT) || !defined(CONFIG_LOCKDEP) 4065 - #define local_lock_cpu_slab(s, flags) \ 4066 - local_lock_irqsave(&(s)->cpu_slab->lock, flags) 4067 - #else 4068 - #define local_lock_cpu_slab(s, flags) \ 4069 - do { \ 4070 - bool __l = local_trylock_irqsave(&(s)->cpu_slab->lock, flags); \ 4071 - lockdep_assert(__l); \ 4072 - } while (0) 4073 - #endif 4074 - 4075 - #define local_unlock_cpu_slab(s, flags) \ 4076 - local_unlock_irqrestore(&(s)->cpu_slab->lock, flags) 4077 - 4078 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4079 - static void __put_partials(struct kmem_cache *s, struct slab *partial_slab) 4080 - { 4081 - struct kmem_cache_node *n = NULL, *n2 = NULL; 4082 - struct slab *slab, *slab_to_discard = NULL; 4083 - unsigned long flags = 0; 4084 - 4085 - while (partial_slab) { 4086 - slab = partial_slab; 4087 - partial_slab = slab->next; 4088 - 4089 - n2 = get_node(s, slab_nid(slab)); 4090 - if (n != n2) { 4091 - if (n) 4092 - spin_unlock_irqrestore(&n->list_lock, flags); 4093 - 4094 - n = n2; 4095 - spin_lock_irqsave(&n->list_lock, flags); 4096 - } 4097 - 4098 - if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) { 4099 - slab->next = slab_to_discard; 4100 - slab_to_discard = slab; 4101 - } else { 4102 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 4103 - stat(s, FREE_ADD_PARTIAL); 4104 - } 4105 - } 4106 - 4107 - if (n) 4108 - spin_unlock_irqrestore(&n->list_lock, flags); 4109 - 4110 - while (slab_to_discard) { 4111 - slab = slab_to_discard; 4112 - slab_to_discard = slab_to_discard->next; 4113 - 4114 - stat(s, DEACTIVATE_EMPTY); 4115 - discard_slab(s, slab); 4116 - stat(s, FREE_SLAB); 4117 - } 4118 - } 4119 - 4120 - /* 4121 - * Put all the cpu partial slabs to the node partial list. 
4122 - */ 4123 - static void put_partials(struct kmem_cache *s) 4124 - { 4125 - struct slab *partial_slab; 4126 - unsigned long flags; 4127 - 4128 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4129 - partial_slab = this_cpu_read(s->cpu_slab->partial); 4130 - this_cpu_write(s->cpu_slab->partial, NULL); 4131 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4132 - 4133 - if (partial_slab) 4134 - __put_partials(s, partial_slab); 4135 - } 4136 - 4137 - static void put_partials_cpu(struct kmem_cache *s, 4138 - struct kmem_cache_cpu *c) 4139 - { 4140 - struct slab *partial_slab; 4141 - 4142 - partial_slab = slub_percpu_partial(c); 4143 - c->partial = NULL; 4144 - 4145 - if (partial_slab) 4146 - __put_partials(s, partial_slab); 4147 - } 4148 - 4149 - /* 4150 - * Put a slab into a partial slab slot if available. 4151 - * 4152 - * If we did not find a slot then simply move all the partials to the 4153 - * per node partial list. 4154 - */ 4155 - static void put_cpu_partial(struct kmem_cache *s, struct slab *slab, int drain) 4156 - { 4157 - struct slab *oldslab; 4158 - struct slab *slab_to_put = NULL; 4159 - unsigned long flags; 4160 - int slabs = 0; 4161 - 4162 - local_lock_cpu_slab(s, flags); 4163 - 4164 - oldslab = this_cpu_read(s->cpu_slab->partial); 4165 - 4166 - if (oldslab) { 4167 - if (drain && oldslab->slabs >= s->cpu_partial_slabs) { 4168 - /* 4169 - * Partial array is full. Move the existing set to the 4170 - * per node partial list. Postpone the actual unfreezing 4171 - * outside of the critical section. 4172 - */ 4173 - slab_to_put = oldslab; 4174 - oldslab = NULL; 4175 - } else { 4176 - slabs = oldslab->slabs; 4177 - } 4178 - } 4179 - 4180 - slabs++; 4181 - 4182 - slab->slabs = slabs; 4183 - slab->next = oldslab; 4184 - 4185 - this_cpu_write(s->cpu_slab->partial, slab); 4186 - 4187 - local_unlock_cpu_slab(s, flags); 4188 - 4189 - if (slab_to_put) { 4190 - __put_partials(s, slab_to_put); 4191 - stat(s, CPU_PARTIAL_DRAIN); 4192 - } 4193 - } 4194 - 4195 - #else /* CONFIG_SLUB_CPU_PARTIAL */ 4196 - 4197 - static inline void put_partials(struct kmem_cache *s) { } 4198 - static inline void put_partials_cpu(struct kmem_cache *s, 4199 - struct kmem_cache_cpu *c) { } 4200 - 4201 - #endif /* CONFIG_SLUB_CPU_PARTIAL */ 4202 - 4203 - static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 4204 - { 4205 - unsigned long flags; 4206 - struct slab *slab; 4207 - void *freelist; 4208 - 4209 - local_lock_irqsave(&s->cpu_slab->lock, flags); 4210 - 4211 - slab = c->slab; 4212 - freelist = c->freelist; 4213 - 4214 - c->slab = NULL; 4215 - c->freelist = NULL; 4216 - c->tid = next_tid(c->tid); 4217 - 4218 - local_unlock_irqrestore(&s->cpu_slab->lock, flags); 4219 - 4220 - if (slab) { 4221 - deactivate_slab(s, slab, freelist); 4222 - stat(s, CPUSLAB_FLUSH); 4223 - } 4224 - } 4225 - 4226 - static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 4227 - { 4228 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4229 - void *freelist = c->freelist; 4230 - struct slab *slab = c->slab; 4231 - 4232 - c->slab = NULL; 4233 - c->freelist = NULL; 4234 - c->tid = next_tid(c->tid); 4235 - 4236 - if (slab) { 4237 - deactivate_slab(s, slab, freelist); 4238 - stat(s, CPUSLAB_FLUSH); 4239 - } 4240 - 4241 - put_partials_cpu(s, c); 4242 - } 4243 - 4244 - static inline void flush_this_cpu_slab(struct kmem_cache *s) 4245 - { 4246 - struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 4247 - 4248 - if (c->slab) 4249 - flush_slab(s, c); 4250 - 4251 - put_partials(s); 4252 - } 4253 - 
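The removed helpers above (and has_cpu_slab() just below) belong to the per-cpu slab and cpu-partial caching layer that percpu sheaves replace; what the series keeps is the lockless freelist+counters update that __slab_update_freelist()/slab_update_freelist() perform. Below is a minimal user-space sketch of that retry pattern, with simplified, hypothetical "toy_" types and a GCC/Clang __int128 compare-exchange standing in for the kernel's try_cmpxchg128 (x86-64 typically needs -mcx16 or libatomic); it is an illustration under those assumptions, not the kernel implementation.

/*
 * Not kernel code: a self-contained illustration. The real
 * struct freelist_counters, slab_update_freelist() and try_cmpxchg128()
 * live in mm/slub.c; the names below are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

/* freelist head and packed counters, updated as one 128-bit unit */
union toy_freelist_counters {
	struct {
		void *freelist;		/* first free object in the slab */
		unsigned long counters;	/* stands in for inuse/objects/frozen */
	};
	unsigned __int128 full;
};

struct toy_slab {
	union toy_freelist_counters fc;
	unsigned int objects;
};

/* replace {freelist, counters} only if both halves still match *old */
static bool toy_update_freelist(struct toy_slab *slab,
				union toy_freelist_counters *old,
				union toy_freelist_counters *new)
{
	return __atomic_compare_exchange_n(&slab->fc.full, &old->full,
					   new->full, false,
					   __ATOMIC_ACQ_REL, __ATOMIC_RELAXED);
}

/* pop one object off the freelist, retrying if another CPU raced with us */
static void *toy_get_object(struct toy_slab *slab)
{
	union toy_freelist_counters old, new;

	do {
		old = slab->fc;				/* racy snapshot */
		if (!old.freelist)
			return NULL;
		new.freelist = *(void **)old.freelist;	/* next free object */
		new.counters = old.counters + 1;	/* pretend this is inuse++ */
	} while (!toy_update_freelist(slab, &old, &new));

	return old.freelist;
}

int main(void)
{
	/* three "objects" whose first word links them: 0 -> 1 -> 2 -> NULL */
	void *objs[3] = { &objs[1], &objs[2], NULL };
	struct toy_slab slab = { .objects = 3 };

	slab.fc.freelist = &objs[0];
	slab.fc.counters = 0;

	while (toy_get_object(&slab))
		;

	printf("allocated all objects, counters=%lu\n", slab.fc.counters);
	return 0;
}

The same shape appears in the new get_from_partial_node() and get_freelist_nofreeze() hunks above: snapshot freelist and counters, compute the replacement pair, and retry if another CPU changed either half in the meantime.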
4254 - static bool has_cpu_slab(int cpu, struct kmem_cache *s) 4255 - { 4256 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4257 - 4258 - return c->slab || slub_percpu_partial(c); 3634 + return get_from_any_partial(s, pc); 4259 3635 } 4260 3636 4261 3637 static bool has_pcs_used(int cpu, struct kmem_cache *s) 4262 3638 { 4263 3639 struct slub_percpu_sheaves *pcs; 4264 3640 4265 - if (!s->cpu_sheaves) 3641 + if (!cache_has_sheaves(s)) 4266 3642 return false; 4267 3643 4268 3644 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); ··· 3876 4042 } 3877 4043 3878 4044 /* 3879 - * Flush cpu slab. 4045 + * Flush percpu sheaves 3880 4046 * 3881 4047 * Called from CPU work handler with migration disabled. 3882 4048 */ 3883 - static void flush_cpu_slab(struct work_struct *w) 4049 + static void flush_cpu_sheaves(struct work_struct *w) 3884 4050 { 3885 4051 struct kmem_cache *s; 3886 4052 struct slub_flush_work *sfw; ··· 3889 4055 3890 4056 s = sfw->s; 3891 4057 3892 - if (s->cpu_sheaves) 4058 + if (cache_has_sheaves(s)) 3893 4059 pcs_flush_all(s); 3894 - 3895 - flush_this_cpu_slab(s); 3896 4060 } 3897 4061 3898 4062 static void flush_all_cpus_locked(struct kmem_cache *s) ··· 3903 4071 3904 4072 for_each_online_cpu(cpu) { 3905 4073 sfw = &per_cpu(slub_flush, cpu); 3906 - if (!has_cpu_slab(cpu, s) && !has_pcs_used(cpu, s)) { 4074 + if (!has_pcs_used(cpu, s)) { 3907 4075 sfw->skip = true; 3908 4076 continue; 3909 4077 } 3910 - INIT_WORK(&sfw->work, flush_cpu_slab); 4078 + INIT_WORK(&sfw->work, flush_cpu_sheaves); 3911 4079 sfw->skip = false; 3912 4080 sfw->s = s; 3913 4081 queue_work_on(cpu, flushwq, &sfw->work); ··· 3992 4160 mutex_lock(&slab_mutex); 3993 4161 3994 4162 list_for_each_entry(s, &slab_caches, list) { 3995 - if (!s->cpu_sheaves) 4163 + if (!cache_has_sheaves(s)) 3996 4164 continue; 3997 4165 flush_rcu_sheaves_on_cache(s); 3998 4166 } ··· 4013 4181 4014 4182 mutex_lock(&slab_mutex); 4015 4183 list_for_each_entry(s, &slab_caches, list) { 4016 - __flush_cpu_slab(s, cpu); 4017 - if (s->cpu_sheaves) 4184 + if (cache_has_sheaves(s)) 4018 4185 __pcs_flush_all_cpu(s, cpu); 4019 4186 } 4020 4187 mutex_unlock(&slab_mutex); 4021 4188 return 0; 4022 - } 4023 - 4024 - /* 4025 - * Check if the objects in a per cpu structure fit numa 4026 - * locality expectations. 4027 - */ 4028 - static inline int node_match(struct slab *slab, int node) 4029 - { 4030 - #ifdef CONFIG_NUMA 4031 - if (node != NUMA_NO_NODE && slab_nid(slab) != node) 4032 - return 0; 4033 - #endif 4034 - return 1; 4035 4189 } 4036 4190 4037 4191 #ifdef CONFIG_SLUB_DEBUG ··· 4192 4374 return true; 4193 4375 } 4194 4376 4195 - static inline bool 4196 - __update_cpu_freelist_fast(struct kmem_cache *s, 4197 - void *freelist_old, void *freelist_new, 4198 - unsigned long tid) 4199 - { 4200 - struct freelist_tid old = { .freelist = freelist_old, .tid = tid }; 4201 - struct freelist_tid new = { .freelist = freelist_new, .tid = next_tid(tid) }; 4202 - 4203 - return this_cpu_try_cmpxchg_freelist(s->cpu_slab->freelist_tid, 4204 - &old.freelist_tid, new.freelist_tid); 4205 - } 4206 - 4207 4377 /* 4208 - * Check the slab->freelist and either transfer the freelist to the 4209 - * per cpu freelist or deactivate the slab. 4378 + * Get the slab's freelist and do not freeze it. 4210 4379 * 4211 - * The slab is still frozen if the return value is not NULL. 4380 + * Assumes the slab is isolated from node partial list and not frozen. 4212 4381 * 4213 - * If this function returns NULL then the slab has been unfrozen. 
4382 + * Assumes this is performed only for caches without debugging so we 4383 + * don't need to worry about adding the slab to the full list. 4214 4384 */ 4215 - static inline void *get_freelist(struct kmem_cache *s, struct slab *slab) 4216 - { 4217 - struct freelist_counters old, new; 4218 - 4219 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4220 - 4221 - do { 4222 - old.freelist = slab->freelist; 4223 - old.counters = slab->counters; 4224 - 4225 - new.freelist = NULL; 4226 - new.counters = old.counters; 4227 - 4228 - new.inuse = old.objects; 4229 - new.frozen = old.freelist != NULL; 4230 - 4231 - 4232 - } while (!__slab_update_freelist(s, slab, &old, &new, "get_freelist")); 4233 - 4234 - return old.freelist; 4235 - } 4236 - 4237 - /* 4238 - * Freeze the partial slab and return the pointer to the freelist. 4239 - */ 4240 - static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab) 4385 + static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab) 4241 4386 { 4242 4387 struct freelist_counters old, new; 4243 4388 ··· 4210 4429 4211 4430 new.freelist = NULL; 4212 4431 new.counters = old.counters; 4213 - VM_BUG_ON(new.frozen); 4432 + VM_WARN_ON_ONCE(new.frozen); 4214 4433 4215 4434 new.inuse = old.objects; 4216 - new.frozen = 1; 4217 4435 4218 - } while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab")); 4436 + } while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze")); 4219 4437 4220 4438 return old.freelist; 4221 - } 4222 - 4223 - /* 4224 - * Slow path. The lockless freelist is empty or we need to perform 4225 - * debugging duties. 4226 - * 4227 - * Processing is still very fast if new objects have been freed to the 4228 - * regular freelist. In that case we simply take over the regular freelist 4229 - * as the lockless freelist and zap the regular freelist. 4230 - * 4231 - * If that is not working then we fall back to the partial lists. We take the 4232 - * first element of the freelist as the object to allocate now and move the 4233 - * rest of the freelist to the lockless freelist. 4234 - * 4235 - * And if we were unable to get a new slab from the partial slab lists then 4236 - * we need to allocate a new slab. This is the slowest path since it involves 4237 - * a call to the page allocator and the setup of a new slab. 4238 - * 4239 - * Version of __slab_alloc to use when we know that preemption is 4240 - * already disabled (which is the case for bulk allocation). 4241 - */ 4242 - static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4243 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4244 - { 4245 - bool allow_spin = gfpflags_allow_spinning(gfpflags); 4246 - void *freelist; 4247 - struct slab *slab; 4248 - unsigned long flags; 4249 - struct partial_context pc; 4250 - bool try_thisnode = true; 4251 - 4252 - stat(s, ALLOC_SLOWPATH); 4253 - 4254 - reread_slab: 4255 - 4256 - slab = READ_ONCE(c->slab); 4257 - if (!slab) { 4258 - /* 4259 - * if the node is not online or has no normal memory, just 4260 - * ignore the node constraint 4261 - */ 4262 - if (unlikely(node != NUMA_NO_NODE && 4263 - !node_isset(node, slab_nodes))) 4264 - node = NUMA_NO_NODE; 4265 - goto new_slab; 4266 - } 4267 - 4268 - if (unlikely(!node_match(slab, node))) { 4269 - /* 4270 - * same as above but node_match() being false already 4271 - * implies node != NUMA_NO_NODE. 4272 - * 4273 - * We don't strictly honor pfmemalloc and NUMA preferences 4274 - * when !allow_spin because: 4275 - * 4276 - * 1. 
Most kmalloc() users allocate objects on the local node, 4277 - * so kmalloc_nolock() tries not to interfere with them by 4278 - * deactivating the cpu slab. 4279 - * 4280 - * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause 4281 - * unnecessary slab allocations even when n->partial list 4282 - * is not empty. 4283 - */ 4284 - if (!node_isset(node, slab_nodes) || 4285 - !allow_spin) { 4286 - node = NUMA_NO_NODE; 4287 - } else { 4288 - stat(s, ALLOC_NODE_MISMATCH); 4289 - goto deactivate_slab; 4290 - } 4291 - } 4292 - 4293 - /* 4294 - * By rights, we should be searching for a slab page that was 4295 - * PFMEMALLOC but right now, we are losing the pfmemalloc 4296 - * information when the page leaves the per-cpu allocator 4297 - */ 4298 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) 4299 - goto deactivate_slab; 4300 - 4301 - /* must check again c->slab in case we got preempted and it changed */ 4302 - local_lock_cpu_slab(s, flags); 4303 - 4304 - if (unlikely(slab != c->slab)) { 4305 - local_unlock_cpu_slab(s, flags); 4306 - goto reread_slab; 4307 - } 4308 - freelist = c->freelist; 4309 - if (freelist) 4310 - goto load_freelist; 4311 - 4312 - freelist = get_freelist(s, slab); 4313 - 4314 - if (!freelist) { 4315 - c->slab = NULL; 4316 - c->tid = next_tid(c->tid); 4317 - local_unlock_cpu_slab(s, flags); 4318 - stat(s, DEACTIVATE_BYPASS); 4319 - goto new_slab; 4320 - } 4321 - 4322 - stat(s, ALLOC_REFILL); 4323 - 4324 - load_freelist: 4325 - 4326 - lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock)); 4327 - 4328 - /* 4329 - * freelist is pointing to the list of objects to be used. 4330 - * slab is pointing to the slab from which the objects are obtained. 4331 - * That slab must be frozen for per cpu allocations to work. 4332 - */ 4333 - VM_BUG_ON(!c->slab->frozen); 4334 - c->freelist = get_freepointer(s, freelist); 4335 - c->tid = next_tid(c->tid); 4336 - local_unlock_cpu_slab(s, flags); 4337 - return freelist; 4338 - 4339 - deactivate_slab: 4340 - 4341 - local_lock_cpu_slab(s, flags); 4342 - if (slab != c->slab) { 4343 - local_unlock_cpu_slab(s, flags); 4344 - goto reread_slab; 4345 - } 4346 - freelist = c->freelist; 4347 - c->slab = NULL; 4348 - c->freelist = NULL; 4349 - c->tid = next_tid(c->tid); 4350 - local_unlock_cpu_slab(s, flags); 4351 - deactivate_slab(s, slab, freelist); 4352 - 4353 - new_slab: 4354 - 4355 - #ifdef CONFIG_SLUB_CPU_PARTIAL 4356 - while (slub_percpu_partial(c)) { 4357 - local_lock_cpu_slab(s, flags); 4358 - if (unlikely(c->slab)) { 4359 - local_unlock_cpu_slab(s, flags); 4360 - goto reread_slab; 4361 - } 4362 - if (unlikely(!slub_percpu_partial(c))) { 4363 - local_unlock_cpu_slab(s, flags); 4364 - /* we were preempted and partial list got empty */ 4365 - goto new_objects; 4366 - } 4367 - 4368 - slab = slub_percpu_partial(c); 4369 - slub_set_percpu_partial(c, slab); 4370 - 4371 - if (likely(node_match(slab, node) && 4372 - pfmemalloc_match(slab, gfpflags)) || 4373 - !allow_spin) { 4374 - c->slab = slab; 4375 - freelist = get_freelist(s, slab); 4376 - VM_BUG_ON(!freelist); 4377 - stat(s, CPU_PARTIAL_ALLOC); 4378 - goto load_freelist; 4379 - } 4380 - 4381 - local_unlock_cpu_slab(s, flags); 4382 - 4383 - slab->next = NULL; 4384 - __put_partials(s, slab); 4385 - } 4386 - #endif 4387 - 4388 - new_objects: 4389 - 4390 - pc.flags = gfpflags; 4391 - /* 4392 - * When a preferred node is indicated but no __GFP_THISNODE 4393 - * 4394 - * 1) try to get a partial slab from target node only by having 4395 - * __GFP_THISNODE in pc.flags for get_partial() 
4396 - * 2) if 1) failed, try to allocate a new slab from target node with 4397 - * GPF_NOWAIT | __GFP_THISNODE opportunistically 4398 - * 3) if 2) failed, retry with original gfpflags which will allow 4399 - * get_partial() try partial lists of other nodes before potentially 4400 - * allocating new page from other nodes 4401 - */ 4402 - if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4403 - && try_thisnode)) { 4404 - if (unlikely(!allow_spin)) 4405 - /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4406 - pc.flags = gfpflags | __GFP_THISNODE; 4407 - else 4408 - pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4409 - } 4410 - 4411 - pc.orig_size = orig_size; 4412 - slab = get_partial(s, node, &pc); 4413 - if (slab) { 4414 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4415 - freelist = pc.object; 4416 - /* 4417 - * For debug caches here we had to go through 4418 - * alloc_single_from_partial() so just store the 4419 - * tracking info and return the object. 4420 - * 4421 - * Due to disabled preemption we need to disallow 4422 - * blocking. The flags are further adjusted by 4423 - * gfp_nested_mask() in stack_depot itself. 4424 - */ 4425 - if (s->flags & SLAB_STORE_USER) 4426 - set_track(s, freelist, TRACK_ALLOC, addr, 4427 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4428 - 4429 - return freelist; 4430 - } 4431 - 4432 - freelist = freeze_slab(s, slab); 4433 - goto retry_load_slab; 4434 - } 4435 - 4436 - slub_put_cpu_ptr(s->cpu_slab); 4437 - slab = new_slab(s, pc.flags, node); 4438 - c = slub_get_cpu_ptr(s->cpu_slab); 4439 - 4440 - if (unlikely(!slab)) { 4441 - if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4442 - && try_thisnode) { 4443 - try_thisnode = false; 4444 - goto new_objects; 4445 - } 4446 - slab_out_of_memory(s, gfpflags, node); 4447 - return NULL; 4448 - } 4449 - 4450 - stat(s, ALLOC_SLAB); 4451 - 4452 - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 4453 - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 4454 - 4455 - if (unlikely(!freelist)) { 4456 - /* This could cause an endless loop. Fail instead. */ 4457 - if (!allow_spin) 4458 - return NULL; 4459 - goto new_objects; 4460 - } 4461 - 4462 - if (s->flags & SLAB_STORE_USER) 4463 - set_track(s, freelist, TRACK_ALLOC, addr, 4464 - gfpflags & ~(__GFP_DIRECT_RECLAIM)); 4465 - 4466 - return freelist; 4467 - } 4468 - 4469 - /* 4470 - * No other reference to the slab yet so we can 4471 - * muck around with it freely without cmpxchg 4472 - */ 4473 - freelist = slab->freelist; 4474 - slab->freelist = NULL; 4475 - slab->inuse = slab->objects; 4476 - slab->frozen = 1; 4477 - 4478 - inc_slabs_node(s, slab_nid(slab), slab->objects); 4479 - 4480 - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) { 4481 - /* 4482 - * For !pfmemalloc_match() case we don't load freelist so that 4483 - * we don't make further mismatched allocations easier. 
4484 - */ 4485 - deactivate_slab(s, slab, get_freepointer(s, freelist)); 4486 - return freelist; 4487 - } 4488 - 4489 - retry_load_slab: 4490 - 4491 - local_lock_cpu_slab(s, flags); 4492 - if (unlikely(c->slab)) { 4493 - void *flush_freelist = c->freelist; 4494 - struct slab *flush_slab = c->slab; 4495 - 4496 - c->slab = NULL; 4497 - c->freelist = NULL; 4498 - c->tid = next_tid(c->tid); 4499 - 4500 - local_unlock_cpu_slab(s, flags); 4501 - 4502 - if (unlikely(!allow_spin)) { 4503 - /* Reentrant slub cannot take locks, defer */ 4504 - defer_deactivate_slab(flush_slab, flush_freelist); 4505 - } else { 4506 - deactivate_slab(s, flush_slab, flush_freelist); 4507 - } 4508 - 4509 - stat(s, CPUSLAB_FLUSH); 4510 - 4511 - goto retry_load_slab; 4512 - } 4513 - c->slab = slab; 4514 - 4515 - goto load_freelist; 4516 - } 4517 - /* 4518 - * We disallow kprobes in ___slab_alloc() to prevent reentrance 4519 - * 4520 - * kmalloc() -> ___slab_alloc() -> local_lock_cpu_slab() protected part of 4521 - * ___slab_alloc() manipulating c->freelist -> kprobe -> bpf -> 4522 - * kmalloc_nolock() or kfree_nolock() -> __update_cpu_freelist_fast() 4523 - * manipulating c->freelist without lock. 4524 - * 4525 - * This does not prevent kprobe in functions called from ___slab_alloc() such as 4526 - * local_lock_irqsave() itself, and that is fine, we only need to protect the 4527 - * c->freelist manipulation in ___slab_alloc() itself. 4528 - */ 4529 - NOKPROBE_SYMBOL(___slab_alloc); 4530 - 4531 - /* 4532 - * A wrapper for ___slab_alloc() for contexts where preemption is not yet 4533 - * disabled. Compensates for possible cpu changes by refetching the per cpu area 4534 - * pointer. 4535 - */ 4536 - static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4537 - unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size) 4538 - { 4539 - void *p; 4540 - 4541 - #ifdef CONFIG_PREEMPT_COUNT 4542 - /* 4543 - * We may have been preempted and rescheduled on a different 4544 - * cpu before disabling preemption. Need to reload cpu area 4545 - * pointer. 4546 - */ 4547 - c = slub_get_cpu_ptr(s->cpu_slab); 4548 - #endif 4549 - if (unlikely(!gfpflags_allow_spinning(gfpflags))) { 4550 - if (local_lock_is_locked(&s->cpu_slab->lock)) { 4551 - /* 4552 - * EBUSY is an internal signal to kmalloc_nolock() to 4553 - * retry a different bucket. It's not propagated 4554 - * to the caller. 4555 - */ 4556 - p = ERR_PTR(-EBUSY); 4557 - goto out; 4558 - } 4559 - } 4560 - p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size); 4561 - out: 4562 - #ifdef CONFIG_PREEMPT_COUNT 4563 - slub_put_cpu_ptr(s->cpu_slab); 4564 - #endif 4565 - return p; 4566 - } 4567 - 4568 - static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 4569 - gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 4570 - { 4571 - struct kmem_cache_cpu *c; 4572 - struct slab *slab; 4573 - unsigned long tid; 4574 - void *object; 4575 - 4576 - redo: 4577 - /* 4578 - * Must read kmem_cache cpu data via this cpu ptr. Preemption is 4579 - * enabled. We may switch back and forth between cpus while 4580 - * reading from one cpu area. That does not matter as long 4581 - * as we end up on the original cpu again when doing the cmpxchg. 4582 - * 4583 - * We must guarantee that tid and kmem_cache_cpu are retrieved on the 4584 - * same cpu. We read first the kmem_cache_cpu pointer and use it to read 4585 - * the tid. 
If we are preempted and switched to another cpu between the 4586 - * two reads, it's OK as the two are still associated with the same cpu 4587 - * and cmpxchg later will validate the cpu. 4588 - */ 4589 - c = raw_cpu_ptr(s->cpu_slab); 4590 - tid = READ_ONCE(c->tid); 4591 - 4592 - /* 4593 - * Irqless object alloc/free algorithm used here depends on sequence 4594 - * of fetching cpu_slab's data. tid should be fetched before anything 4595 - * on c to guarantee that object and slab associated with previous tid 4596 - * won't be used with current tid. If we fetch tid first, object and 4597 - * slab could be one associated with next tid and our alloc/free 4598 - * request will be failed. In this case, we will retry. So, no problem. 4599 - */ 4600 - barrier(); 4601 - 4602 - /* 4603 - * The transaction ids are globally unique per cpu and per operation on 4604 - * a per cpu queue. Thus they can be guarantee that the cmpxchg_double 4605 - * occurs on the right processor and that there was no operation on the 4606 - * linked list in between. 4607 - */ 4608 - 4609 - object = c->freelist; 4610 - slab = c->slab; 4611 - 4612 - #ifdef CONFIG_NUMA 4613 - if (static_branch_unlikely(&strict_numa) && 4614 - node == NUMA_NO_NODE) { 4615 - 4616 - struct mempolicy *mpol = current->mempolicy; 4617 - 4618 - if (mpol) { 4619 - /* 4620 - * Special BIND rule support. If existing slab 4621 - * is in permitted set then do not redirect 4622 - * to a particular node. 4623 - * Otherwise we apply the memory policy to get 4624 - * the node we need to allocate on. 4625 - */ 4626 - if (mpol->mode != MPOL_BIND || !slab || 4627 - !node_isset(slab_nid(slab), mpol->nodes)) 4628 - 4629 - node = mempolicy_slab_node(); 4630 - } 4631 - } 4632 - #endif 4633 - 4634 - if (!USE_LOCKLESS_FAST_PATH() || 4635 - unlikely(!object || !slab || !node_match(slab, node))) { 4636 - object = __slab_alloc(s, gfpflags, node, addr, c, orig_size); 4637 - } else { 4638 - void *next_object = get_freepointer_safe(s, object); 4639 - 4640 - /* 4641 - * The cmpxchg will only match if there was no additional 4642 - * operation and if we are on the right processor. 4643 - * 4644 - * The cmpxchg does the following atomically (without lock 4645 - * semantics!) 4646 - * 1. Relocate first pointer to the current per cpu area. 4647 - * 2. Verify that tid and freelist have not been changed 4648 - * 3. If they were not changed replace tid and freelist 4649 - * 4650 - * Since this is without lock semantics the protection is only 4651 - * against code executing on this cpu *not* from access by 4652 - * other cpus. 4653 - */ 4654 - if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) { 4655 - note_cmpxchg_failure("slab_alloc", s, tid); 4656 - goto redo; 4657 - } 4658 - prefetch_freepointer(s, next_object); 4659 - stat(s, ALLOC_FASTPATH); 4660 - } 4661 - 4662 - return object; 4663 4439 } 4664 4440 4665 4441 /* ··· 4232 4894 !freeptr_outside_object(s)) 4233 4895 memset((void *)((char *)kasan_reset_tag(obj) + s->offset), 4234 4896 0, sizeof(void *)); 4897 + } 4898 + 4899 + static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab, 4900 + void **p, unsigned int count, bool allow_spin) 4901 + { 4902 + unsigned int allocated = 0; 4903 + struct kmem_cache_node *n; 4904 + bool needs_add_partial; 4905 + unsigned long flags; 4906 + void *object; 4907 + 4908 + /* 4909 + * Are we going to put the slab on the partial list? 4910 + * Note slab->inuse is 0 on a new slab. 
4911 + */ 4912 + needs_add_partial = (slab->objects > count); 4913 + 4914 + if (!allow_spin && needs_add_partial) { 4915 + 4916 + n = get_node(s, slab_nid(slab)); 4917 + 4918 + if (!spin_trylock_irqsave(&n->list_lock, flags)) { 4919 + /* Unlucky, discard newly allocated slab */ 4920 + free_new_slab_nolock(s, slab); 4921 + return 0; 4922 + } 4923 + } 4924 + 4925 + object = slab->freelist; 4926 + while (object && allocated < count) { 4927 + p[allocated] = object; 4928 + object = get_freepointer(s, object); 4929 + maybe_wipe_obj_freeptr(s, p[allocated]); 4930 + 4931 + slab->inuse++; 4932 + allocated++; 4933 + } 4934 + slab->freelist = object; 4935 + 4936 + if (needs_add_partial) { 4937 + 4938 + if (allow_spin) { 4939 + n = get_node(s, slab_nid(slab)); 4940 + spin_lock_irqsave(&n->list_lock, flags); 4941 + } 4942 + add_partial(n, slab, ADD_TO_HEAD); 4943 + spin_unlock_irqrestore(&n->list_lock, flags); 4944 + } 4945 + 4946 + inc_slabs_node(s, slab_nid(slab), slab->objects); 4947 + return allocated; 4948 + } 4949 + 4950 + /* 4951 + * Slow path. We failed to allocate via percpu sheaves or they are not available 4952 + * due to bootstrap or debugging enabled or SLUB_TINY. 4953 + * 4954 + * We try to allocate from partial slab lists and fall back to allocating a new 4955 + * slab. 4956 + */ 4957 + static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 4958 + unsigned long addr, unsigned int orig_size) 4959 + { 4960 + bool allow_spin = gfpflags_allow_spinning(gfpflags); 4961 + void *object; 4962 + struct slab *slab; 4963 + struct partial_context pc; 4964 + bool try_thisnode = true; 4965 + 4966 + stat(s, ALLOC_SLOWPATH); 4967 + 4968 + new_objects: 4969 + 4970 + pc.flags = gfpflags; 4971 + /* 4972 + * When a preferred node is indicated but no __GFP_THISNODE 4973 + * 4974 + * 1) try to get a partial slab from target node only by having 4975 + * __GFP_THISNODE in pc.flags for get_from_partial() 4976 + * 2) if 1) failed, try to allocate a new slab from target node with 4977 + * GPF_NOWAIT | __GFP_THISNODE opportunistically 4978 + * 3) if 2) failed, retry with original gfpflags which will allow 4979 + * get_from_partial() try partial lists of other nodes before 4980 + * potentially allocating new page from other nodes 4981 + */ 4982 + if (unlikely(node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 4983 + && try_thisnode)) { 4984 + if (unlikely(!allow_spin)) 4985 + /* Do not upgrade gfp to NOWAIT from more restrictive mode */ 4986 + pc.flags = gfpflags | __GFP_THISNODE; 4987 + else 4988 + pc.flags = GFP_NOWAIT | __GFP_THISNODE; 4989 + } 4990 + 4991 + pc.orig_size = orig_size; 4992 + object = get_from_partial(s, node, &pc); 4993 + if (object) 4994 + goto success; 4995 + 4996 + slab = new_slab(s, pc.flags, node); 4997 + 4998 + if (unlikely(!slab)) { 4999 + if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE) 5000 + && try_thisnode) { 5001 + try_thisnode = false; 5002 + goto new_objects; 5003 + } 5004 + slab_out_of_memory(s, gfpflags, node); 5005 + return NULL; 5006 + } 5007 + 5008 + stat(s, ALLOC_SLAB); 5009 + 5010 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5011 + object = alloc_single_from_new_slab(s, slab, orig_size, gfpflags); 5012 + 5013 + if (likely(object)) 5014 + goto success; 5015 + } else { 5016 + alloc_from_new_slab(s, slab, &object, 1, allow_spin); 5017 + 5018 + /* we don't need to check SLAB_STORE_USER here */ 5019 + if (likely(object)) 5020 + return object; 5021 + } 5022 + 5023 + if (allow_spin) 5024 + goto new_objects; 5025 + 5026 + /* This could 
cause an endless loop. Fail instead. */ 5027 + return NULL; 5028 + 5029 + success: 5030 + if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) 5031 + set_track(s, object, TRACK_ALLOC, addr, gfpflags); 5032 + 5033 + return object; 5034 + } 5035 + 5036 + static __always_inline void *__slab_alloc_node(struct kmem_cache *s, 5037 + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) 5038 + { 5039 + void *object; 5040 + 5041 + #ifdef CONFIG_NUMA 5042 + if (static_branch_unlikely(&strict_numa) && 5043 + node == NUMA_NO_NODE) { 5044 + 5045 + struct mempolicy *mpol = current->mempolicy; 5046 + 5047 + if (mpol) { 5048 + /* 5049 + * Special BIND rule support. If the local node 5050 + * is in permitted set then do not redirect 5051 + * to a particular node. 5052 + * Otherwise we apply the memory policy to get 5053 + * the node we need to allocate on. 5054 + */ 5055 + if (mpol->mode != MPOL_BIND || 5056 + !node_isset(numa_mem_id(), mpol->nodes)) 5057 + node = mempolicy_slab_node(); 5058 + } 5059 + } 5060 + #endif 5061 + 5062 + object = ___slab_alloc(s, gfpflags, node, addr, orig_size); 5063 + 5064 + return object; 4235 5065 } 4236 5066 4237 5067 static __fastpath_inline ··· 4488 4982 4489 4983 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 4490 4984 4985 + /* Bootstrap or debug cache, back off */ 4986 + if (unlikely(!cache_has_sheaves(s))) { 4987 + local_unlock(&s->cpu_sheaves->lock); 4988 + return NULL; 4989 + } 4990 + 4491 4991 if (pcs->spare && pcs->spare->size > 0) { 4492 4992 swap(pcs->main, pcs->spare); 4493 4993 return pcs; ··· 4505 4993 return NULL; 4506 4994 } 4507 4995 4508 - full = barn_replace_empty_sheaf(barn, pcs->main); 4996 + full = barn_replace_empty_sheaf(barn, pcs->main, 4997 + gfpflags_allow_spinning(gfp)); 4509 4998 4510 4999 if (full) { 4511 5000 stat(s, BARN_GET); ··· 4523 5010 empty = pcs->spare; 4524 5011 pcs->spare = NULL; 4525 5012 } else { 4526 - empty = barn_get_empty_sheaf(barn); 5013 + empty = barn_get_empty_sheaf(barn, true); 4527 5014 } 4528 5015 } 4529 5016 ··· 4565 5052 */ 4566 5053 4567 5054 if (pcs->main->size == 0) { 4568 - barn_put_empty_sheaf(barn, pcs->main); 5055 + if (!pcs->spare) 5056 + pcs->spare = pcs->main; 5057 + else 5058 + barn_put_empty_sheaf(barn, pcs->main); 4569 5059 pcs->main = full; 4570 5060 return pcs; 4571 5061 } ··· 4625 5109 * We assume the percpu sheaves contain only local objects although it's 4626 5110 * not completely guaranteed, so we verify later. 
4627 5111 */ 4628 - if (unlikely(node_requested && node != numa_mem_id())) 5112 + if (unlikely(node_requested && node != numa_mem_id())) { 5113 + stat(s, ALLOC_NODE_MISMATCH); 4629 5114 return NULL; 5115 + } 4630 5116 4631 5117 if (!local_trylock(&s->cpu_sheaves->lock)) 4632 5118 return NULL; ··· 4651 5133 */ 4652 5134 if (page_to_nid(virt_to_page(object)) != node) { 4653 5135 local_unlock(&s->cpu_sheaves->lock); 5136 + stat(s, ALLOC_NODE_MISMATCH); 4654 5137 return NULL; 4655 5138 } 4656 5139 } ··· 4660 5141 4661 5142 local_unlock(&s->cpu_sheaves->lock); 4662 5143 4663 - stat(s, ALLOC_PCS); 5144 + stat(s, ALLOC_FASTPATH); 4664 5145 4665 5146 return object; 4666 5147 } 4667 5148 4668 5149 static __fastpath_inline 4669 - unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, size_t size, void **p) 5150 + unsigned int alloc_from_pcs_bulk(struct kmem_cache *s, gfp_t gfp, size_t size, 5151 + void **p) 4670 5152 { 4671 5153 struct slub_percpu_sheaves *pcs; 4672 5154 struct slab_sheaf *main; ··· 4685 5165 struct slab_sheaf *full; 4686 5166 struct node_barn *barn; 4687 5167 5168 + if (unlikely(!cache_has_sheaves(s))) { 5169 + local_unlock(&s->cpu_sheaves->lock); 5170 + return allocated; 5171 + } 5172 + 4688 5173 if (pcs->spare && pcs->spare->size > 0) { 4689 5174 swap(pcs->main, pcs->spare); 4690 5175 goto do_alloc; ··· 4701 5176 return allocated; 4702 5177 } 4703 5178 4704 - full = barn_replace_empty_sheaf(barn, pcs->main); 5179 + full = barn_replace_empty_sheaf(barn, pcs->main, 5180 + gfpflags_allow_spinning(gfp)); 4705 5181 4706 5182 if (full) { 4707 5183 stat(s, BARN_GET); ··· 4732 5206 4733 5207 local_unlock(&s->cpu_sheaves->lock); 4734 5208 4735 - stat_add(s, ALLOC_PCS, batch); 5209 + stat_add(s, ALLOC_FASTPATH, batch); 4736 5210 4737 5211 allocated += batch; 4738 5212 ··· 4770 5244 if (unlikely(object)) 4771 5245 goto out; 4772 5246 4773 - if (s->cpu_sheaves) 4774 - object = alloc_from_pcs(s, gfpflags, node); 5247 + object = alloc_from_pcs(s, gfpflags, node); 4775 5248 4776 5249 if (!object) 4777 5250 object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); ··· 4865 5340 return ret; 4866 5341 } 4867 5342 5343 + static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, 5344 + size_t size, void **p); 5345 + 4868 5346 /* 4869 5347 * returns a sheaf that has at least the requested size 4870 5348 * when prefilling is needed, do so with given gfp flags ··· 4881 5353 struct slab_sheaf *sheaf = NULL; 4882 5354 struct node_barn *barn; 4883 5355 4884 - if (unlikely(size > s->sheaf_capacity)) { 5356 + if (unlikely(!size)) 5357 + return NULL; 4885 5358 4886 - /* 4887 - * slab_debug disables cpu sheaves intentionally so all 4888 - * prefilled sheaves become "oversize" and we give up on 4889 - * performance for the debugging. Same with SLUB_TINY. 4890 - * Creating a cache without sheaves and then requesting a 4891 - * prefilled sheaf is however not expected, so warn. 
4892 - */ 4893 - WARN_ON_ONCE(s->sheaf_capacity == 0 && 4894 - !IS_ENABLED(CONFIG_SLUB_TINY) && 4895 - !(s->flags & SLAB_DEBUG_FLAGS)); 5359 + if (unlikely(size > s->sheaf_capacity)) { 4896 5360 4897 5361 sheaf = kzalloc(struct_size(sheaf, objects, size), gfp); 4898 5362 if (!sheaf) ··· 5206 5686 gfp_t alloc_gfp = __GFP_NOWARN | __GFP_NOMEMALLOC | gfp_flags; 5207 5687 struct kmem_cache *s; 5208 5688 bool can_retry = true; 5209 - void *ret = ERR_PTR(-EBUSY); 5689 + void *ret; 5210 5690 5211 5691 VM_WARN_ON_ONCE(gfp_flags & ~(__GFP_ACCOUNT | __GFP_ZERO | 5212 5692 __GFP_NO_OBJ_EXT)); ··· 5214 5694 if (unlikely(!size)) 5215 5695 return ZERO_SIZE_PTR; 5216 5696 5217 - if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible()) 5218 - /* 5219 - * kmalloc_nolock() in PREEMPT_RT is not supported from 5220 - * non-preemptible context because local_lock becomes a 5221 - * sleeping lock on RT. 5222 - */ 5697 + /* 5698 + * See the comment for the same check in 5699 + * alloc_frozen_pages_nolock_noprof() 5700 + */ 5701 + if (IS_ENABLED(CONFIG_PREEMPT_RT) && (in_nmi() || in_hardirq())) 5223 5702 return NULL; 5703 + 5224 5704 retry: 5225 5705 if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) 5226 5706 return NULL; ··· 5229 5709 if (!(s->flags & __CMPXCHG_DOUBLE) && !kmem_cache_debug(s)) 5230 5710 /* 5231 5711 * kmalloc_nolock() is not supported on architectures that 5232 - * don't implement cmpxchg16b, but debug caches don't use 5233 - * per-cpu slab and per-cpu partial slabs. They rely on 5234 - * kmem_cache_node->list_lock, so kmalloc_nolock() can 5235 - * attempt to allocate from debug caches by 5712 + * don't implement cmpxchg16b and thus need slab_lock() 5713 + * which could be preempted by a nmi. 5714 + * But debug caches don't use that and only rely on 5715 + * kmem_cache_node->list_lock, so kmalloc_nolock() can attempt 5716 + * to allocate from debug caches by 5236 5717 * spin_trylock_irqsave(&n->list_lock, ...) 5237 5718 */ 5238 5719 return NULL; 5720 + 5721 + ret = alloc_from_pcs(s, alloc_gfp, node); 5722 + if (ret) 5723 + goto success; 5239 5724 5240 5725 /* 5241 5726 * Do not call slab_alloc_node(), since trylock mode isn't 5242 5727 * compatible with slab_pre_alloc_hook/should_failslab and 5243 5728 * kfence_alloc. Hence call __slab_alloc_node() (at most twice) 5244 5729 * and slab_post_alloc_hook() directly. 5245 - * 5246 - * In !PREEMPT_RT ___slab_alloc() manipulates (freelist,tid) pair 5247 - * in irq saved region. It assumes that the same cpu will not 5248 - * __update_cpu_freelist_fast() into the same (freelist,tid) pair. 5249 - * Therefore use in_nmi() to check whether particular bucket is in 5250 - * irq protected section. 5251 - * 5252 - * If in_nmi() && local_lock_is_locked(s->cpu_slab) then it means that 5253 - * this cpu was interrupted somewhere inside ___slab_alloc() after 5254 - * it did local_lock_irqsave(&s->cpu_slab->lock, flags). 5255 - * In this case fast path with __update_cpu_freelist_fast() is not safe. 5256 5730 */ 5257 - if (!in_nmi() || !local_lock_is_locked(&s->cpu_slab->lock)) 5258 - ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5731 + ret = __slab_alloc_node(s, alloc_gfp, node, _RET_IP_, size); 5259 5732 5260 - if (PTR_ERR(ret) == -EBUSY) { 5261 - if (can_retry) { 5262 - /* pick the next kmalloc bucket */ 5263 - size = s->object_size + 1; 5264 - /* 5265 - * Another alternative is to 5266 - * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5267 - * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5268 - * to retry from bucket of the same size. 
5269 - */ 5270 - can_retry = false; 5271 - goto retry; 5272 - } 5273 - ret = NULL; 5733 + /* 5734 + * It's possible we failed due to trylock as we preempted someone with 5735 + * the sheaves locked, and the list_lock is also held by another cpu. 5736 + * But it should be rare that multiple kmalloc buckets would have 5737 + * sheaves locked, so try a larger one. 5738 + */ 5739 + if (!ret && can_retry) { 5740 + /* pick the next kmalloc bucket */ 5741 + size = s->object_size + 1; 5742 + /* 5743 + * Another alternative is to 5744 + * if (memcg) alloc_gfp &= ~__GFP_ACCOUNT; 5745 + * else if (!memcg) alloc_gfp |= __GFP_ACCOUNT; 5746 + * to retry from bucket of the same size. 5747 + */ 5748 + can_retry = false; 5749 + goto retry; 5274 5750 } 5275 5751 5752 + success: 5276 5753 maybe_wipe_obj_freeptr(s, ret); 5277 5754 slab_post_alloc_hook(s, NULL, alloc_gfp, 1, &ret, 5278 5755 slab_want_init_on_alloc(alloc_gfp, s), size); ··· 5351 5834 /* was on full list */ 5352 5835 remove_full(s, n, slab); 5353 5836 if (!slab_free) { 5354 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 5837 + add_partial(n, slab, ADD_TO_TAIL); 5355 5838 stat(s, FREE_ADD_PARTIAL); 5356 5839 } 5357 5840 } else if (slab_free) { ··· 5389 5872 unsigned long addr) 5390 5873 5391 5874 { 5392 - bool was_frozen, was_full; 5875 + bool was_full; 5393 5876 struct freelist_counters old, new; 5394 5877 struct kmem_cache_node *n = NULL; 5395 5878 unsigned long flags; 5396 5879 bool on_node_partial; 5397 5880 5398 - stat(s, FREE_SLOWPATH); 5399 - 5400 5881 if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 5401 5882 free_to_partial_list(s, slab, head, tail, cnt, addr); 5402 5883 return; 5403 5884 } 5404 - 5405 - /* 5406 - * It is enough to test IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) below 5407 - * instead of kmem_cache_has_cpu_partial(s), because kmem_cache_debug(s) 5408 - * is the only other reason it can be false, and it is already handled 5409 - * above. 5410 - */ 5411 5885 5412 5886 do { 5413 5887 if (unlikely(n)) { ··· 5410 5902 old.counters = slab->counters; 5411 5903 5412 5904 was_full = (old.freelist == NULL); 5413 - was_frozen = old.frozen; 5414 5905 5415 5906 set_freepointer(s, tail, old.freelist); 5416 5907 ··· 5422 5915 * to (due to not being full anymore) the partial list. 5423 5916 * Unless it's frozen. 5424 5917 */ 5425 - if ((!new.inuse || was_full) && !was_frozen) { 5918 + if (!new.inuse || was_full) { 5919 + 5920 + n = get_node(s, slab_nid(slab)); 5426 5921 /* 5427 - * If slab becomes non-full and we have cpu partial 5428 - * lists, we put it there unconditionally to avoid 5429 - * taking the list_lock. Otherwise we need it. 5922 + * Speculatively acquire the list_lock. 5923 + * If the cmpxchg does not succeed then we may 5924 + * drop the list_lock without any processing. 5925 + * 5926 + * Otherwise the list_lock will synchronize with 5927 + * other processors updating the list of slabs. 5430 5928 */ 5431 - if (!(IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full)) { 5929 + spin_lock_irqsave(&n->list_lock, flags); 5432 5930 5433 - n = get_node(s, slab_nid(slab)); 5434 - /* 5435 - * Speculatively acquire the list_lock. 5436 - * If the cmpxchg does not succeed then we may 5437 - * drop the list_lock without any processing. 5438 - * 5439 - * Otherwise the list_lock will synchronize with 5440 - * other processors updating the list of slabs. 
5441 - */ 5442 - spin_lock_irqsave(&n->list_lock, flags); 5443 - 5444 - on_node_partial = slab_test_node_partial(slab); 5445 - } 5931 + on_node_partial = slab_test_node_partial(slab); 5446 5932 } 5447 5933 5448 5934 } while (!slab_update_freelist(s, slab, &old, &new, "__slab_free")); 5449 5935 5450 5936 if (likely(!n)) { 5451 - 5452 - if (likely(was_frozen)) { 5453 - /* 5454 - * The list lock was not taken therefore no list 5455 - * activity can be necessary. 5456 - */ 5457 - stat(s, FREE_FROZEN); 5458 - } else if (IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && was_full) { 5459 - /* 5460 - * If we started with a full slab then put it onto the 5461 - * per cpu partial list. 5462 - */ 5463 - put_cpu_partial(s, slab, 1); 5464 - stat(s, CPU_PARTIAL_FREE); 5465 - } 5466 - 5467 5937 /* 5468 - * In other cases we didn't take the list_lock because the slab 5469 - * was already on the partial list and will remain there. 5938 + * We didn't take the list_lock because the slab was already on 5939 + * the partial list and will remain there. 5470 5940 */ 5471 - 5472 5941 return; 5473 5942 } 5474 5943 ··· 5466 5983 5467 5984 /* 5468 5985 * Objects left in the slab. If it was not on the partial list before 5469 - * then add it. This can only happen when cache has no per cpu partial 5470 - * list otherwise we would have put it there. 5986 + * then add it. 5471 5987 */ 5472 - if (!IS_ENABLED(CONFIG_SLUB_CPU_PARTIAL) && unlikely(was_full)) { 5473 - add_partial(n, slab, DEACTIVATE_TO_TAIL); 5988 + if (unlikely(was_full)) { 5989 + add_partial(n, slab, ADD_TO_TAIL); 5474 5990 stat(s, FREE_ADD_PARTIAL); 5475 5991 } 5476 5992 spin_unlock_irqrestore(&n->list_lock, flags); ··· 5555 6073 * unlocked. 5556 6074 */ 5557 6075 static struct slub_percpu_sheaves * 5558 - __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs) 6076 + __pcs_replace_full_main(struct kmem_cache *s, struct slub_percpu_sheaves *pcs, 6077 + bool allow_spin) 5559 6078 { 5560 6079 struct slab_sheaf *empty; 5561 6080 struct node_barn *barn; ··· 5564 6081 5565 6082 restart: 5566 6083 lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock)); 6084 + 6085 + /* Bootstrap or debug cache, back off */ 6086 + if (unlikely(!cache_has_sheaves(s))) { 6087 + local_unlock(&s->cpu_sheaves->lock); 6088 + return NULL; 6089 + } 5567 6090 5568 6091 barn = get_barn(s); 5569 6092 if (!barn) { ··· 5580 6091 put_fail = false; 5581 6092 5582 6093 if (!pcs->spare) { 5583 - empty = barn_get_empty_sheaf(barn); 6094 + empty = barn_get_empty_sheaf(barn, allow_spin); 5584 6095 if (empty) { 5585 6096 pcs->spare = pcs->main; 5586 6097 pcs->main = empty; ··· 5594 6105 return pcs; 5595 6106 } 5596 6107 5597 - empty = barn_replace_full_sheaf(barn, pcs->main); 6108 + empty = barn_replace_full_sheaf(barn, pcs->main, allow_spin); 5598 6109 5599 6110 if (!IS_ERR(empty)) { 5600 6111 stat(s, BARN_PUT); ··· 5602 6113 return pcs; 5603 6114 } 5604 6115 5605 - if (PTR_ERR(empty) == -E2BIG) { 6116 + /* sheaf_flush_unused() doesn't support !allow_spin */ 6117 + if (PTR_ERR(empty) == -E2BIG && allow_spin) { 5606 6118 /* Since we got here, spare exists and is full */ 5607 6119 struct slab_sheaf *to_flush = pcs->spare; 5608 6120 ··· 5627 6137 5628 6138 alloc_empty: 5629 6139 local_unlock(&s->cpu_sheaves->lock); 6140 + 6141 + /* 6142 + * alloc_empty_sheaf() doesn't support !allow_spin and it's 6143 + * easier to fall back to freeing directly without sheaves 6144 + * than add the support (and to sheaf_flush_unused() above) 6145 + */ 6146 + if (!allow_spin) 6147 + return NULL; 5630 6148 
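To make the sheaf-swapping logic in the surrounding hunks easier to follow, here is a rough sketch of the caching hierarchy they manipulate. It is illustrative only; the struct and field names approximate the real definitions in mm/slub.c (struct slab_sheaf, struct slub_percpu_sheaves) rather than reproduce them.

/*
 * Simplified sketch. Each CPU keeps a "main" and a "spare" sheaf of free
 * object pointers; a per-node "barn" stocks full and empty sheaves so that
 * main/spare can usually be exchanged without the node list_lock. Only when
 * the barn cannot help do we fall back to the partial slab lists (and their
 * list_lock) or to allocating a new slab.
 */
struct example_sheaf {                    /* cf. struct slab_sheaf */
	unsigned int size;                /* objects currently stored */
	void *objects[];                  /* up to s->sheaf_capacity pointers */
};

struct example_percpu_sheaves {           /* cf. struct slub_percpu_sheaves */
	local_trylock_t lock;             /* fast paths use local_trylock() and bail out on contention */
	struct example_sheaf *main;       /* allocation/free fast path */
	struct example_sheaf *spare;      /* swapped with main when it runs empty or full */
	struct example_sheaf *rcu_free;   /* batches objects freed via kfree_rcu() */
};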
5631 6149 empty = alloc_empty_sheaf(s, GFP_NOWAIT); 5632 6150 if (empty) ··· 5678 6180 * The object is expected to have passed slab_free_hook() already. 5679 6181 */ 5680 6182 static __fastpath_inline 5681 - bool free_to_pcs(struct kmem_cache *s, void *object) 6183 + bool free_to_pcs(struct kmem_cache *s, void *object, bool allow_spin) 5682 6184 { 5683 6185 struct slub_percpu_sheaves *pcs; 5684 6186 ··· 5689 6191 5690 6192 if (unlikely(pcs->main->size == s->sheaf_capacity)) { 5691 6193 5692 - pcs = __pcs_replace_full_main(s, pcs); 6194 + pcs = __pcs_replace_full_main(s, pcs, allow_spin); 5693 6195 if (unlikely(!pcs)) 5694 6196 return false; 5695 6197 } ··· 5698 6200 5699 6201 local_unlock(&s->cpu_sheaves->lock); 5700 6202 5701 - stat(s, FREE_PCS); 6203 + stat(s, FREE_FASTPATH); 5702 6204 5703 6205 return true; 5704 6206 } ··· 5763 6265 free_empty_sheaf(s, sheaf); 5764 6266 } 5765 6267 6268 + /* 6269 + * kvfree_call_rcu() can be called while holding a raw_spinlock_t. Since 6270 + * __kfree_rcu_sheaf() may acquire a spinlock_t (sleeping lock on PREEMPT_RT), 6271 + * this would violate lock nesting rules. Therefore, kvfree_call_rcu() avoids 6272 + * this problem by bypassing the sheaves layer entirely on PREEMPT_RT. 6273 + * 6274 + * However, lockdep still complains that it is invalid to acquire spinlock_t 6275 + * while holding raw_spinlock_t, even on !PREEMPT_RT where spinlock_t is a 6276 + * spinning lock. Tell lockdep that acquiring spinlock_t is valid here 6277 + * by temporarily raising the wait-type to LD_WAIT_CONFIG. 6278 + */ 6279 + static DEFINE_WAIT_OVERRIDE_MAP(kfree_rcu_sheaf_map, LD_WAIT_CONFIG); 6280 + 5766 6281 bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) 5767 6282 { 5768 6283 struct slub_percpu_sheaves *pcs; 5769 6284 struct slab_sheaf *rcu_sheaf; 6285 + 6286 + if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) 6287 + return false; 6288 + 6289 + lock_map_acquire_try(&kfree_rcu_sheaf_map); 5770 6290 5771 6291 if (!local_trylock(&s->cpu_sheaves->lock)) 5772 6292 goto fail; ··· 5795 6279 5796 6280 struct slab_sheaf *empty; 5797 6281 struct node_barn *barn; 6282 + 6283 + /* Bootstrap or debug cache, fall back */ 6284 + if (unlikely(!cache_has_sheaves(s))) { 6285 + local_unlock(&s->cpu_sheaves->lock); 6286 + goto fail; 6287 + } 5798 6288 5799 6289 if (pcs->spare && pcs->spare->size == 0) { 5800 6290 pcs->rcu_free = pcs->spare; ··· 5814 6292 goto fail; 5815 6293 } 5816 6294 5817 - empty = barn_get_empty_sheaf(barn); 6295 + empty = barn_get_empty_sheaf(barn, true); 5818 6296 5819 6297 if (empty) { 5820 6298 pcs->rcu_free = empty; ··· 5868 6346 local_unlock(&s->cpu_sheaves->lock); 5869 6347 5870 6348 stat(s, FREE_RCU_SHEAF); 6349 + lock_map_release(&kfree_rcu_sheaf_map); 5871 6350 return true; 5872 6351 5873 6352 fail: 5874 6353 stat(s, FREE_RCU_SHEAF_FAIL); 6354 + lock_map_release(&kfree_rcu_sheaf_map); 5875 6355 return false; 5876 6356 } 5877 6357 ··· 5934 6410 goto no_empty; 5935 6411 5936 6412 if (!pcs->spare) { 5937 - empty = barn_get_empty_sheaf(barn); 6413 + empty = barn_get_empty_sheaf(barn, true); 5938 6414 if (!empty) 5939 6415 goto no_empty; 5940 6416 ··· 5948 6424 goto do_free; 5949 6425 } 5950 6426 5951 - empty = barn_replace_full_sheaf(barn, pcs->main); 6427 + empty = barn_replace_full_sheaf(barn, pcs->main, true); 5952 6428 if (IS_ERR(empty)) { 5953 6429 stat(s, BARN_PUT_FAIL); 5954 6430 goto no_empty; ··· 5966 6442 5967 6443 local_unlock(&s->cpu_sheaves->lock); 5968 6444 5969 - stat_add(s, FREE_PCS, batch); 6445 + stat_add(s, FREE_FASTPATH, batch); 5970 6446 
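For context, the percpu-sheaf bulk paths above serve the regular bulk API that drivers call directly. A minimal caller-side sketch follows; the cache pointer and batch size are hypothetical, and kmem_cache_alloc_bulk() is all-or-nothing, returning either the full requested count or 0.

#include <linux/kernel.h>
#include <linux/slab.h>

/* Illustrative only: allocate a batch of objects, use them, free the batch. */
static int example_bulk_cycle(struct kmem_cache *example_cache)
{
	void *objs[16];

	if (!kmem_cache_alloc_bulk(example_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return -ENOMEM;

	/* ... work with the objects; all 16 slots are valid here ... */

	kmem_cache_free_bulk(example_cache, ARRAY_SIZE(objs), objs);
	return 0;
}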
5971 6447 if (batch < size) { 5972 6448 p += batch; ··· 5988 6464 */ 5989 6465 fallback: 5990 6466 __kmem_cache_free_bulk(s, size, p); 6467 + stat_add(s, FREE_SLOWPATH, size); 5991 6468 5992 6469 flush_remote: 5993 6470 if (remote_nr) { 5994 6471 __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 6472 + stat_add(s, FREE_SLOWPATH, remote_nr); 5995 6473 if (i < size) { 5996 6474 remote_nr = 0; 5997 6475 goto next_remote_batch; ··· 6003 6477 6004 6478 struct defer_free { 6005 6479 struct llist_head objects; 6006 - struct llist_head slabs; 6007 6480 struct irq_work work; 6008 6481 }; 6009 6482 ··· 6010 6485 6011 6486 static DEFINE_PER_CPU(struct defer_free, defer_free_objects) = { 6012 6487 .objects = LLIST_HEAD_INIT(objects), 6013 - .slabs = LLIST_HEAD_INIT(slabs), 6014 6488 .work = IRQ_WORK_INIT(free_deferred_objects), 6015 6489 }; 6016 6490 6017 6491 /* 6018 6492 * In PREEMPT_RT irq_work runs in per-cpu kthread, so it's safe 6019 - * to take sleeping spin_locks from __slab_free() and deactivate_slab(). 6493 + * to take sleeping spin_locks from __slab_free(). 6020 6494 * In !PREEMPT_RT irq_work will run after local_unlock_irqrestore(). 6021 6495 */ 6022 6496 static void free_deferred_objects(struct irq_work *work) 6023 6497 { 6024 6498 struct defer_free *df = container_of(work, struct defer_free, work); 6025 6499 struct llist_head *objs = &df->objects; 6026 - struct llist_head *slabs = &df->slabs; 6027 6500 struct llist_node *llnode, *pos, *t; 6028 6501 6029 - if (llist_empty(objs) && llist_empty(slabs)) 6502 + if (llist_empty(objs)) 6030 6503 return; 6031 6504 6032 6505 llnode = llist_del_all(objs); ··· 6047 6524 set_freepointer(s, x, NULL); 6048 6525 6049 6526 __slab_free(s, slab, x, x, 1, _THIS_IP_); 6050 - } 6051 - 6052 - llnode = llist_del_all(slabs); 6053 - llist_for_each_safe(pos, t, llnode) { 6054 - struct slab *slab = container_of(pos, struct slab, llnode); 6055 - 6056 - if (slab->frozen) 6057 - deactivate_slab(slab->slab_cache, slab, slab->flush_freelist); 6058 - else 6059 - free_slab(slab->slab_cache, slab); 6527 + stat(s, FREE_SLOWPATH); 6060 6528 } 6061 6529 } 6062 6530 ··· 6064 6550 irq_work_queue(&df->work); 6065 6551 } 6066 6552 6067 - static void defer_deactivate_slab(struct slab *slab, void *flush_freelist) 6068 - { 6069 - struct defer_free *df; 6070 - 6071 - slab->flush_freelist = flush_freelist; 6072 - 6073 - guard(preempt)(); 6074 - 6075 - df = this_cpu_ptr(&defer_free_objects); 6076 - if (llist_add(&slab->llnode, &df->slabs)) 6077 - irq_work_queue(&df->work); 6078 - } 6079 - 6080 6553 void defer_free_barrier(void) 6081 6554 { 6082 6555 int cpu; 6083 6556 6084 6557 for_each_possible_cpu(cpu) 6085 6558 irq_work_sync(&per_cpu_ptr(&defer_free_objects, cpu)->work); 6086 - } 6087 - 6088 - /* 6089 - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that 6090 - * can perform fastpath freeing without additional function calls. 6091 - * 6092 - * The fastpath is only possible if we are freeing to the current cpu slab 6093 - * of this processor. This typically the case if we have just allocated 6094 - * the item before. 6095 - * 6096 - * If fastpath is not possible then fall back to __slab_free where we deal 6097 - * with all sorts of special processing. 6098 - * 6099 - * Bulk free of a freelist with several objects (all pointing to the 6100 - * same slab) possible by specifying head and tail ptr, plus objects 6101 - * count (cnt). Bulk free indicated by tail pointer being set. 
6102 - */ 6103 - static __always_inline void do_slab_free(struct kmem_cache *s, 6104 - struct slab *slab, void *head, void *tail, 6105 - int cnt, unsigned long addr) 6106 - { 6107 - /* cnt == 0 signals that it's called from kfree_nolock() */ 6108 - bool allow_spin = cnt; 6109 - struct kmem_cache_cpu *c; 6110 - unsigned long tid; 6111 - void **freelist; 6112 - 6113 - redo: 6114 - /* 6115 - * Determine the currently cpus per cpu slab. 6116 - * The cpu may change afterward. However that does not matter since 6117 - * data is retrieved via this pointer. If we are on the same cpu 6118 - * during the cmpxchg then the free will succeed. 6119 - */ 6120 - c = raw_cpu_ptr(s->cpu_slab); 6121 - tid = READ_ONCE(c->tid); 6122 - 6123 - /* Same with comment on barrier() in __slab_alloc_node() */ 6124 - barrier(); 6125 - 6126 - if (unlikely(slab != c->slab)) { 6127 - if (unlikely(!allow_spin)) { 6128 - /* 6129 - * __slab_free() can locklessly cmpxchg16 into a slab, 6130 - * but then it might need to take spin_lock or local_lock 6131 - * in put_cpu_partial() for further processing. 6132 - * Avoid the complexity and simply add to a deferred list. 6133 - */ 6134 - defer_free(s, head); 6135 - } else { 6136 - __slab_free(s, slab, head, tail, cnt, addr); 6137 - } 6138 - return; 6139 - } 6140 - 6141 - if (unlikely(!allow_spin)) { 6142 - if ((in_nmi() || !USE_LOCKLESS_FAST_PATH()) && 6143 - local_lock_is_locked(&s->cpu_slab->lock)) { 6144 - defer_free(s, head); 6145 - return; 6146 - } 6147 - cnt = 1; /* restore cnt. kfree_nolock() frees one object at a time */ 6148 - } 6149 - 6150 - if (USE_LOCKLESS_FAST_PATH()) { 6151 - freelist = READ_ONCE(c->freelist); 6152 - 6153 - set_freepointer(s, tail, freelist); 6154 - 6155 - if (unlikely(!__update_cpu_freelist_fast(s, freelist, head, tid))) { 6156 - note_cmpxchg_failure("slab_free", s, tid); 6157 - goto redo; 6158 - } 6159 - } else { 6160 - __maybe_unused unsigned long flags = 0; 6161 - 6162 - /* Update the free list under the local lock */ 6163 - local_lock_cpu_slab(s, flags); 6164 - c = this_cpu_ptr(s->cpu_slab); 6165 - if (unlikely(slab != c->slab)) { 6166 - local_unlock_cpu_slab(s, flags); 6167 - goto redo; 6168 - } 6169 - tid = c->tid; 6170 - freelist = c->freelist; 6171 - 6172 - set_freepointer(s, tail, freelist); 6173 - c->freelist = head; 6174 - c->tid = next_tid(tid); 6175 - 6176 - local_unlock_cpu_slab(s, flags); 6177 - } 6178 - stat_add(s, FREE_FASTPATH, cnt); 6179 6559 } 6180 6560 6181 6561 static __fastpath_inline ··· 6082 6674 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6083 6675 return; 6084 6676 6085 - if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6086 - slab_nid(slab) == numa_mem_id()) 6087 - && likely(!slab_test_pfmemalloc(slab))) { 6088 - if (likely(free_to_pcs(s, object))) 6677 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id()) 6678 + && likely(!slab_test_pfmemalloc(slab))) { 6679 + if (likely(free_to_pcs(s, object, true))) 6089 6680 return; 6090 6681 } 6091 6682 6092 - do_slab_free(s, slab, object, object, 1, addr); 6683 + __slab_free(s, slab, object, object, 1, addr); 6684 + stat(s, FREE_SLOWPATH); 6093 6685 } 6094 6686 6095 6687 #ifdef CONFIG_MEMCG ··· 6102 6694 alloc_tagging_slab_free_hook(s, slab, &object, 1); 6103 6695 6104 6696 if (likely(slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6105 - do_slab_free(s, slab, object, object, 1, _RET_IP_); 6697 + __slab_free(s, slab, object, object, 1, _RET_IP_); 6106 6698 } 6107 6699 #endif 6108 6700 ··· 6116 6708 * With KASAN 
enabled slab_free_freelist_hook modifies the freelist 6117 6709 * to remove objects, whose reuse must be delayed. 6118 6710 */ 6119 - if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) 6120 - do_slab_free(s, slab, head, tail, cnt, addr); 6711 + if (likely(slab_free_freelist_hook(s, &head, &tail, &cnt))) { 6712 + __slab_free(s, slab, head, tail, cnt, addr); 6713 + stat_add(s, FREE_SLOWPATH, cnt); 6714 + } 6121 6715 } 6122 6716 6123 6717 #ifdef CONFIG_SLUB_RCU_DEBUG ··· 6144 6734 return; 6145 6735 6146 6736 /* resume freeing */ 6147 - if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) 6148 - do_slab_free(s, slab, object, object, 1, _THIS_IP_); 6737 + if (slab_free_hook(s, object, slab_want_init_on_free(s), true)) { 6738 + __slab_free(s, slab, object, object, 1, _THIS_IP_); 6739 + stat(s, FREE_SLOWPATH); 6740 + } 6149 6741 } 6150 6742 #endif /* CONFIG_SLUB_RCU_DEBUG */ 6151 6743 6152 6744 #ifdef CONFIG_KASAN_GENERIC 6153 6745 void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr) 6154 6746 { 6155 - do_slab_free(cache, virt_to_slab(x), x, x, 1, addr); 6747 + __slab_free(cache, virt_to_slab(x), x, x, 1, addr); 6748 + stat(cache, FREE_SLOWPATH); 6156 6749 } 6157 6750 #endif 6158 6751 6159 - static inline struct kmem_cache *virt_to_cache(const void *obj) 6752 + static noinline void warn_free_bad_obj(struct kmem_cache *s, void *obj) 6160 6753 { 6754 + struct kmem_cache *cachep; 6161 6755 struct slab *slab; 6162 6756 6163 6757 slab = virt_to_slab(obj); 6164 - if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n", __func__)) 6165 - return NULL; 6166 - return slab->slab_cache; 6167 - } 6758 + if (WARN_ONCE(!slab, 6759 + "kmem_cache_free(%s, %p): object is not in a slab page\n", 6760 + s->name, obj)) 6761 + return; 6168 6762 6169 - static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 6170 - { 6171 - struct kmem_cache *cachep; 6763 + cachep = slab->slab_cache; 6172 6764 6173 - if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) && 6174 - !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) 6175 - return s; 6176 - 6177 - cachep = virt_to_cache(x); 6178 - if (WARN(cachep && cachep != s, 6179 - "%s: Wrong slab cache. %s but object is from %s\n", 6180 - __func__, s->name, cachep->name)) 6181 - print_tracking(cachep, x); 6182 - return cachep; 6765 + if (WARN_ONCE(cachep != s, 6766 + "kmem_cache_free(%s, %p): object belongs to different cache %s\n", 6767 + s->name, obj, cachep ? cachep->name : "(NULL)")) { 6768 + if (cachep) 6769 + print_tracking(cachep, obj); 6770 + return; 6771 + } 6183 6772 } 6184 6773 6185 6774 /** ··· 6191 6782 */ 6192 6783 void kmem_cache_free(struct kmem_cache *s, void *x) 6193 6784 { 6194 - s = cache_from_obj(s, x); 6195 - if (!s) 6196 - return; 6785 + struct slab *slab; 6786 + 6787 + slab = virt_to_slab(x); 6788 + 6789 + if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) || 6790 + kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { 6791 + 6792 + /* 6793 + * Intentionally leak the object in these cases, because it 6794 + * would be too dangerous to continue. 
6795 + */ 6796 + if (unlikely(!slab || (slab->slab_cache != s))) { 6797 + warn_free_bad_obj(s, x); 6798 + return; 6799 + } 6800 + } 6801 + 6197 6802 trace_kmem_cache_free(_RET_IP_, x, s); 6198 - slab_free(s, virt_to_slab(x), x, _RET_IP_); 6803 + slab_free(s, slab, x, _RET_IP_); 6199 6804 } 6200 6805 EXPORT_SYMBOL(kmem_cache_free); 6806 + 6807 + static inline size_t slab_ksize(struct slab *slab) 6808 + { 6809 + struct kmem_cache *s = slab->slab_cache; 6810 + 6811 + #ifdef CONFIG_SLUB_DEBUG 6812 + /* 6813 + * Debugging requires use of the padding between object 6814 + * and whatever may come after it. 6815 + */ 6816 + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 6817 + return s->object_size; 6818 + #endif 6819 + if (s->flags & SLAB_KASAN) 6820 + return s->object_size; 6821 + /* 6822 + * If we have the need to store the freelist pointer 6823 + * or any other metadata back there then we can 6824 + * only use the space before that information. 6825 + */ 6826 + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) 6827 + return s->inuse; 6828 + else if (obj_exts_in_object(s, slab)) 6829 + return s->inuse; 6830 + /* 6831 + * Else we can use all the padding etc for the allocation 6832 + */ 6833 + return s->size; 6834 + } 6835 + 6836 + static size_t __ksize(const void *object) 6837 + { 6838 + struct page *page; 6839 + struct slab *slab; 6840 + 6841 + if (unlikely(object == ZERO_SIZE_PTR)) 6842 + return 0; 6843 + 6844 + page = virt_to_page(object); 6845 + 6846 + if (unlikely(PageLargeKmalloc(page))) 6847 + return large_kmalloc_size(page); 6848 + 6849 + slab = page_slab(page); 6850 + /* Delete this after we're sure there are no users */ 6851 + if (WARN_ON(!slab)) 6852 + return page_size(page); 6853 + 6854 + #ifdef CONFIG_SLUB_DEBUG 6855 + skip_orig_size_check(slab->slab_cache, object); 6856 + #endif 6857 + 6858 + return slab_ksize(slab); 6859 + } 6860 + 6861 + /** 6862 + * ksize -- Report full size of underlying allocation 6863 + * @objp: pointer to the object 6864 + * 6865 + * This should only be used internally to query the true size of allocations. 6866 + * It is not meant to be a way to discover the usable size of an allocation 6867 + * after the fact. Instead, use kmalloc_size_roundup(). Using memory beyond 6868 + * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, 6869 + * and/or FORTIFY_SOURCE. 6870 + * 6871 + * Return: size of the actual memory used by @objp in bytes 6872 + */ 6873 + size_t ksize(const void *objp) 6874 + { 6875 + /* 6876 + * We need to first check that the pointer to the object is valid. 6877 + * The KASAN report printed from ksize() is more useful, then when 6878 + * it's printed later when the behaviour could be undefined due to 6879 + * a potential use-after-free or double-free. 6880 + * 6881 + * We use kasan_check_byte(), which is supported for the hardware 6882 + * tag-based KASAN mode, unlike kasan_check_read/write(). 6883 + * 6884 + * If the pointed to memory is invalid, we return 0 to avoid users of 6885 + * ksize() writing to and potentially corrupting the memory region. 6886 + * 6887 + * We want to perform the check before __ksize(), to avoid potentially 6888 + * crashing in __ksize() due to accessing invalid metadata. 
6889 + */ 6890 + if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) 6891 + return 0; 6892 + 6893 + return kfence_ksize(objp) ?: __ksize(objp); 6894 + } 6895 + EXPORT_SYMBOL(ksize); 6201 6896 6202 6897 static void free_large_kmalloc(struct page *page, void *object) 6203 6898 { ··· 6455 6942 * since kasan quarantine takes locks and not supported from NMI. 6456 6943 */ 6457 6944 kasan_slab_free(s, x, false, false, /* skip quarantine */true); 6458 - do_slab_free(s, slab, x, x, 0, _RET_IP_); 6945 + 6946 + if (likely(!IS_ENABLED(CONFIG_NUMA) || slab_nid(slab) == numa_mem_id())) { 6947 + if (likely(free_to_pcs(s, x, false))) 6948 + return; 6949 + } 6950 + 6951 + /* 6952 + * __slab_free() can locklessly cmpxchg16 into a slab, but then it might 6953 + * need to take spin_lock for further processing. 6954 + * Avoid the complexity and simply add to a deferred list. 6955 + */ 6956 + defer_free(s, x); 6459 6957 } 6460 6958 EXPORT_SYMBOL_GPL(kfree_nolock); 6461 6959 ··· 6837 7313 df->s = slab->slab_cache; 6838 7314 } else { 6839 7315 df->slab = slab; 6840 - df->s = cache_from_obj(s, object); /* Support for memcg */ 7316 + df->s = s; 6841 7317 } 6842 7318 6843 7319 /* Start new detached freelist */ ··· 6892 7368 if (kfence_free(df.freelist)) 6893 7369 continue; 6894 7370 6895 - do_slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 7371 + __slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, 6896 7372 _RET_IP_); 6897 7373 } while (likely(size)); 6898 7374 } ··· 6907 7383 * freeing to sheaves is so incompatible with the detached freelist so 6908 7384 * once we go that way, we have to do everything differently 6909 7385 */ 6910 - if (s && s->cpu_sheaves) { 7386 + if (s && cache_has_sheaves(s)) { 6911 7387 free_to_pcs_bulk(s, size, p); 6912 7388 return; 6913 7389 } ··· 6925 7401 } 6926 7402 EXPORT_SYMBOL(kmem_cache_free_bulk); 6927 7403 7404 + static unsigned int 7405 + __refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7406 + unsigned int max, struct kmem_cache_node *n, 7407 + bool allow_spin) 7408 + { 7409 + struct partial_bulk_context pc; 7410 + struct slab *slab, *slab2; 7411 + unsigned int refilled = 0; 7412 + unsigned long flags; 7413 + void *object; 7414 + 7415 + pc.flags = gfp; 7416 + pc.min_objects = min; 7417 + pc.max_objects = max; 7418 + 7419 + if (!get_partial_node_bulk(s, n, &pc, allow_spin)) 7420 + return 0; 7421 + 7422 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7423 + 7424 + list_del(&slab->slab_list); 7425 + 7426 + object = get_freelist_nofreeze(s, slab); 7427 + 7428 + while (object && refilled < max) { 7429 + p[refilled] = object; 7430 + object = get_freepointer(s, object); 7431 + maybe_wipe_obj_freeptr(s, p[refilled]); 7432 + 7433 + refilled++; 7434 + } 7435 + 7436 + /* 7437 + * Freelist had more objects than we can accommodate, we need to 7438 + * free them back. We can treat it like a detached freelist, just 7439 + * need to find the tail object. 
7440 + */ 7441 + if (unlikely(object)) { 7442 + void *head = object; 7443 + void *tail; 7444 + int cnt = 0; 7445 + 7446 + do { 7447 + tail = object; 7448 + cnt++; 7449 + object = get_freepointer(s, object); 7450 + } while (object); 7451 + __slab_free(s, slab, head, tail, cnt, _RET_IP_); 7452 + } 7453 + 7454 + if (refilled >= max) 7455 + break; 7456 + } 7457 + 7458 + if (unlikely(!list_empty(&pc.slabs))) { 7459 + spin_lock_irqsave(&n->list_lock, flags); 7460 + 7461 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7462 + 7463 + if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial)) 7464 + continue; 7465 + 7466 + list_del(&slab->slab_list); 7467 + add_partial(n, slab, ADD_TO_HEAD); 7468 + } 7469 + 7470 + spin_unlock_irqrestore(&n->list_lock, flags); 7471 + 7472 + /* any slabs left are completely free and for discard */ 7473 + list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) { 7474 + 7475 + list_del(&slab->slab_list); 7476 + discard_slab(s, slab); 7477 + } 7478 + } 7479 + 7480 + return refilled; 7481 + } 7482 + 7483 + #ifdef CONFIG_NUMA 7484 + static unsigned int 7485 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7486 + unsigned int max) 7487 + { 7488 + struct zonelist *zonelist; 7489 + struct zoneref *z; 7490 + struct zone *zone; 7491 + enum zone_type highest_zoneidx = gfp_zone(gfp); 7492 + unsigned int cpuset_mems_cookie; 7493 + unsigned int refilled = 0; 7494 + 7495 + /* see get_from_any_partial() for the defrag ratio description */ 7496 + if (!s->remote_node_defrag_ratio || 7497 + get_cycles() % 1024 > s->remote_node_defrag_ratio) 7498 + return 0; 7499 + 7500 + do { 7501 + cpuset_mems_cookie = read_mems_allowed_begin(); 7502 + zonelist = node_zonelist(mempolicy_slab_node(), gfp); 7503 + for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) { 7504 + struct kmem_cache_node *n; 7505 + unsigned int r; 7506 + 7507 + n = get_node(s, zone_to_nid(zone)); 7508 + 7509 + if (!n || !cpuset_zone_allowed(zone, gfp) || 7510 + n->nr_partial <= s->min_partial) 7511 + continue; 7512 + 7513 + r = __refill_objects_node(s, p, gfp, min, max, n, 7514 + /* allow_spin = */ false); 7515 + refilled += r; 7516 + 7517 + if (r >= min) { 7518 + /* 7519 + * Don't check read_mems_allowed_retry() here - 7520 + * if mems_allowed was updated in parallel, that 7521 + * was a harmless race between allocation and 7522 + * the cpuset update 7523 + */ 7524 + return refilled; 7525 + } 7526 + p += r; 7527 + min -= r; 7528 + max -= r; 7529 + } 7530 + } while (read_mems_allowed_retry(cpuset_mems_cookie)); 7531 + 7532 + return refilled; 7533 + } 7534 + #else 7535 + static inline unsigned int 7536 + __refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7537 + unsigned int max) 7538 + { 7539 + return 0; 7540 + } 7541 + #endif 7542 + 7543 + static unsigned int 7544 + refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min, 7545 + unsigned int max) 7546 + { 7547 + int local_node = numa_mem_id(); 7548 + unsigned int refilled; 7549 + struct slab *slab; 7550 + 7551 + if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp))) 7552 + return 0; 7553 + 7554 + refilled = __refill_objects_node(s, p, gfp, min, max, 7555 + get_node(s, local_node), 7556 + /* allow_spin = */ true); 7557 + if (refilled >= min) 7558 + return refilled; 7559 + 7560 + refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled, 7561 + max - refilled); 7562 + if (refilled >= min) 7563 + return refilled; 7564 + 7565 + new_slab: 7566 + 7567 + slab = 
new_slab(s, gfp, local_node); 7568 + if (!slab) 7569 + goto out; 7570 + 7571 + stat(s, ALLOC_SLAB); 7572 + 7573 + /* 7574 + * TODO: possible optimization - if we know we will consume the whole 7575 + * slab we might skip creating the freelist? 7576 + */ 7577 + refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled, 7578 + /* allow_spin = */ true); 7579 + 7580 + if (refilled < min) 7581 + goto new_slab; 7582 + 7583 + out: 7584 + return refilled; 7585 + } 7586 + 6928 7587 static inline 6929 7588 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, 6930 7589 void **p) 6931 7590 { 6932 - struct kmem_cache_cpu *c; 6933 - unsigned long irqflags; 6934 7591 int i; 6935 7592 6936 - /* 6937 - * Drain objects in the per cpu slab, while disabling local 6938 - * IRQs, which protects against PREEMPT and interrupts 6939 - * handlers invoking normal fastpath. 6940 - */ 6941 - c = slub_get_cpu_ptr(s->cpu_slab); 6942 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 7593 + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { 7594 + for (i = 0; i < size; i++) { 6943 7595 6944 - for (i = 0; i < size; i++) { 6945 - void *object = c->freelist; 6946 - 6947 - if (unlikely(!object)) { 6948 - /* 6949 - * We may have removed an object from c->freelist using 6950 - * the fastpath in the previous iteration; in that case, 6951 - * c->tid has not been bumped yet. 6952 - * Since ___slab_alloc() may reenable interrupts while 6953 - * allocating memory, we should bump c->tid now. 6954 - */ 6955 - c->tid = next_tid(c->tid); 6956 - 6957 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6958 - 6959 - /* 6960 - * Invoking slow path likely have side-effect 6961 - * of re-populating per CPU c->freelist 6962 - */ 6963 - p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, 6964 - _RET_IP_, c, s->object_size); 7596 + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_, 7597 + s->object_size); 6965 7598 if (unlikely(!p[i])) 6966 7599 goto error; 6967 7600 6968 - c = this_cpu_ptr(s->cpu_slab); 6969 7601 maybe_wipe_obj_freeptr(s, p[i]); 6970 - 6971 - local_lock_irqsave(&s->cpu_slab->lock, irqflags); 6972 - 6973 - continue; /* goto for-loop */ 6974 7602 } 6975 - c->freelist = get_freepointer(s, object); 6976 - p[i] = object; 6977 - maybe_wipe_obj_freeptr(s, p[i]); 6978 - stat(s, ALLOC_FASTPATH); 7603 + } else { 7604 + i = refill_objects(s, p, flags, size, size); 7605 + if (i < size) 7606 + goto error; 7607 + stat_add(s, ALLOC_SLOWPATH, i); 6979 7608 } 6980 - c->tid = next_tid(c->tid); 6981 - local_unlock_irqrestore(&s->cpu_slab->lock, irqflags); 6982 - slub_put_cpu_ptr(s->cpu_slab); 6983 7609 6984 7610 return i; 6985 7611 6986 7612 error: 6987 - slub_put_cpu_ptr(s->cpu_slab); 6988 7613 __kmem_cache_free_bulk(s, i, p); 6989 7614 return 0; 6990 7615 6991 7616 } 6992 7617 6993 - /* Note that interrupts must be enabled when calling this function. */ 7618 + /* 7619 + * Note that interrupts must be enabled when calling this function and gfp 7620 + * flags must allow spinning. 
7621 + */ 6994 7622 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, 6995 7623 void **p) 6996 7624 { ··· 7170 7494 size--; 7171 7495 } 7172 7496 7173 - if (s->cpu_sheaves) 7174 - i = alloc_from_pcs_bulk(s, size, p); 7497 + i = alloc_from_pcs_bulk(s, flags, size, p); 7175 7498 7176 7499 if (i < size) { 7177 7500 /* ··· 7358 7683 barn_init(barn); 7359 7684 } 7360 7685 7361 - static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) 7686 + #ifdef CONFIG_SLUB_STATS 7687 + static inline int alloc_kmem_cache_stats(struct kmem_cache *s) 7362 7688 { 7363 7689 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < 7364 7690 NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * 7365 - sizeof(struct kmem_cache_cpu)); 7691 + sizeof(struct kmem_cache_stats)); 7366 7692 7367 - /* 7368 - * Must align to double word boundary for the double cmpxchg 7369 - * instructions to work; see __pcpu_double_call_return_bool(). 7370 - */ 7371 - s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 7372 - 2 * sizeof(void *)); 7693 + s->cpu_stats = alloc_percpu(struct kmem_cache_stats); 7373 7694 7374 - if (!s->cpu_slab) 7695 + if (!s->cpu_stats) 7375 7696 return 0; 7376 - 7377 - init_kmem_cache_cpus(s); 7378 7697 7379 7698 return 1; 7380 7699 } 7700 + #endif 7381 7701 7382 7702 static int init_percpu_sheaves(struct kmem_cache *s) 7383 7703 { 7704 + static struct slab_sheaf bootstrap_sheaf = {}; 7384 7705 int cpu; 7385 7706 7386 7707 for_each_possible_cpu(cpu) { ··· 7386 7715 7387 7716 local_trylock_init(&pcs->lock); 7388 7717 7389 - pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7718 + /* 7719 + * Bootstrap sheaf has zero size so fast-path allocation fails. 7720 + * It has also size == s->sheaf_capacity, so fast-path free 7721 + * fails. In the slow paths we recognize the situation by 7722 + * checking s->sheaf_capacity. This allows fast paths to assume 7723 + * s->cpu_sheaves and pcs->main always exists and are valid. 7724 + * It's also safe to share the single static bootstrap_sheaf 7725 + * with zero-sized objects array as it's never modified. 7726 + * 7727 + * Bootstrap_sheaf also has NULL pointer to kmem_cache so we 7728 + * recognize it and not attempt to free it when destroying the 7729 + * cache. 7730 + * 7731 + * We keep bootstrap_sheaf for kmem_cache and kmem_cache_node, 7732 + * caches with debug enabled, and all caches with SLUB_TINY. 7733 + * For kmalloc caches it's used temporarily during the initial 7734 + * bootstrap. 7735 + */ 7736 + if (!s->sheaf_capacity) 7737 + pcs->main = &bootstrap_sheaf; 7738 + else 7739 + pcs->main = alloc_empty_sheaf(s, GFP_KERNEL); 7390 7740 7391 7741 if (!pcs->main) 7392 7742 return -ENOMEM; ··· 7458 7766 * No locks need to be taken here as it has just been 7459 7767 * initialized and there is no concurrent access. 
7460 7768 */ 7461 - __add_partial(n, slab, DEACTIVATE_TO_HEAD); 7769 + __add_partial(n, slab, ADD_TO_HEAD); 7462 7770 } 7463 7771 7464 7772 static void free_kmem_cache_nodes(struct kmem_cache *s) ··· 7482 7790 void __kmem_cache_release(struct kmem_cache *s) 7483 7791 { 7484 7792 cache_random_seq_destroy(s); 7485 - if (s->cpu_sheaves) 7486 - pcs_destroy(s); 7487 - #ifdef CONFIG_PREEMPT_RT 7488 - if (s->cpu_slab) 7489 - lockdep_unregister_key(&s->lock_key); 7793 + pcs_destroy(s); 7794 + #ifdef CONFIG_SLUB_STATS 7795 + free_percpu(s->cpu_stats); 7490 7796 #endif 7491 - free_percpu(s->cpu_slab); 7492 7797 free_kmem_cache_nodes(s); 7493 7798 } 7494 7799 ··· 7502 7813 continue; 7503 7814 } 7504 7815 7505 - if (s->cpu_sheaves) { 7816 + if (cache_has_sheaves(s)) { 7506 7817 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); 7507 7818 7508 7819 if (!barn) ··· 7523 7834 return 1; 7524 7835 } 7525 7836 7526 - static void set_cpu_partial(struct kmem_cache *s) 7837 + static unsigned int calculate_sheaf_capacity(struct kmem_cache *s, 7838 + struct kmem_cache_args *args) 7839 + 7527 7840 { 7528 - #ifdef CONFIG_SLUB_CPU_PARTIAL 7529 - unsigned int nr_objects; 7841 + unsigned int capacity; 7842 + size_t size; 7843 + 7844 + 7845 + if (IS_ENABLED(CONFIG_SLUB_TINY) || s->flags & SLAB_DEBUG_FLAGS) 7846 + return 0; 7530 7847 7531 7848 /* 7532 - * cpu_partial determined the maximum number of objects kept in the 7533 - * per cpu partial lists of a processor. 7534 - * 7535 - * Per cpu partial lists mainly contain slabs that just have one 7536 - * object freed. If they are used for allocation then they can be 7537 - * filled up again with minimal effort. The slab will never hit the 7538 - * per node partial lists and therefore no locking will be required. 7539 - * 7540 - * For backwards compatibility reasons, this is determined as number 7541 - * of objects, even though we now limit maximum number of pages, see 7542 - * slub_set_cpu_partial() 7849 + * Bootstrap caches can't have sheaves for now (SLAB_NO_OBJ_EXT). 7850 + * SLAB_NOLEAKTRACE caches (e.g., kmemleak's object_cache) must not 7851 + * have sheaves to avoid recursion when sheaf allocation triggers 7852 + * kmemleak tracking. 
7543 7853 */ 7544 - if (!kmem_cache_has_cpu_partial(s)) 7545 - nr_objects = 0; 7546 - else if (s->size >= PAGE_SIZE) 7547 - nr_objects = 6; 7548 - else if (s->size >= 1024) 7549 - nr_objects = 24; 7550 - else if (s->size >= 256) 7551 - nr_objects = 52; 7552 - else 7553 - nr_objects = 120; 7854 + if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) 7855 + return 0; 7554 7856 7555 - slub_set_cpu_partial(s, nr_objects); 7556 - #endif 7857 + /* 7858 + * For now we use roughly similar formula (divided by two as there are 7859 + * two percpu sheaves) as what was used for percpu partial slabs, which 7860 + * should result in similar lock contention (barn or list_lock) 7861 + */ 7862 + if (s->size >= PAGE_SIZE) 7863 + capacity = 4; 7864 + else if (s->size >= 1024) 7865 + capacity = 12; 7866 + else if (s->size >= 256) 7867 + capacity = 26; 7868 + else 7869 + capacity = 60; 7870 + 7871 + /* Increment capacity to make sheaf exactly a kmalloc size bucket */ 7872 + size = struct_size_t(struct slab_sheaf, objects, capacity); 7873 + size = kmalloc_size_roundup(size); 7874 + capacity = (size - struct_size_t(struct slab_sheaf, objects, 0)) / sizeof(void *); 7875 + 7876 + /* 7877 + * Respect an explicit request for capacity that's typically motivated by 7878 + * expected maximum size of kmem_cache_prefill_sheaf() to not end up 7879 + * using low-performance oversize sheaves 7880 + */ 7881 + return max(capacity, args->sheaf_capacity); 7557 7882 } 7558 7883 7559 7884 /* ··· 7578 7875 { 7579 7876 slab_flags_t flags = s->flags; 7580 7877 unsigned int size = s->object_size; 7878 + unsigned int aligned_size; 7581 7879 unsigned int order; 7582 7880 7583 7881 /* ··· 7602 7898 7603 7899 7604 7900 /* 7605 - * If we are Redzoning then check if there is some space between the 7606 - * end of the object and the free pointer. If not then add an 7607 - * additional word to have some bytes to store Redzone information. 7901 + * If we are Redzoning and there is no space between the end of the 7902 + * object and the following fields, add one word so the right Redzone 7903 + * is non-empty. 7608 7904 */ 7609 7905 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 7610 7906 size += sizeof(void *); ··· 7617 7913 s->inuse = size; 7618 7914 7619 7915 if (((flags & SLAB_TYPESAFE_BY_RCU) && !args->use_freeptr_offset) || 7620 - (flags & SLAB_POISON) || s->ctor || 7916 + (flags & SLAB_POISON) || 7917 + (s->ctor && !args->use_freeptr_offset) || 7621 7918 ((flags & SLAB_RED_ZONE) && 7622 7919 (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { 7623 7920 /* ··· 7639 7934 */ 7640 7935 s->offset = size; 7641 7936 size += sizeof(void *); 7642 - } else if ((flags & SLAB_TYPESAFE_BY_RCU) && args->use_freeptr_offset) { 7937 + } else if (((flags & SLAB_TYPESAFE_BY_RCU) || s->ctor) && 7938 + args->use_freeptr_offset) { 7643 7939 s->offset = args->freeptr_offset; 7644 7940 } else { 7645 7941 /* ··· 7661 7955 7662 7956 /* Save the original kmalloc request size */ 7663 7957 if (flags & SLAB_KMALLOC) 7664 - size += sizeof(unsigned int); 7958 + size += sizeof(unsigned long); 7665 7959 } 7666 7960 #endif 7667 7961 ··· 7688 7982 * offset 0. In order to align the objects we have to simply size 7689 7983 * each object to conform to the alignment. 
7690 7984 */ 7691 - size = ALIGN(size, s->align); 7985 + aligned_size = ALIGN(size, s->align); 7986 + #if defined(CONFIG_SLAB_OBJ_EXT) && defined(CONFIG_64BIT) 7987 + if (slab_args_unmergeable(args, s->flags) && 7988 + (aligned_size - size >= sizeof(struct slabobj_ext))) 7989 + s->flags |= SLAB_OBJ_EXT_IN_OBJ; 7990 + #endif 7991 + size = aligned_size; 7992 + 7692 7993 s->size = size; 7693 7994 s->reciprocal_size = reciprocal_value(size); 7694 7995 order = calculate_order(size); ··· 7713 8000 7714 8001 if (s->flags & SLAB_RECLAIM_ACCOUNT) 7715 8002 s->allocflags |= __GFP_RECLAIMABLE; 8003 + 8004 + /* 8005 + * For KMALLOC_NORMAL caches we enable sheaves later by 8006 + * bootstrap_kmalloc_sheaves() to avoid recursion 8007 + */ 8008 + if (!is_kmalloc_normal(s)) 8009 + s->sheaf_capacity = calculate_sheaf_capacity(s, args); 7716 8010 7717 8011 /* 7718 8012 * Determine the number of objects per slab ··· 7805 8085 flush_all_cpus_locked(s); 7806 8086 7807 8087 /* we might have rcu sheaves in flight */ 7808 - if (s->cpu_sheaves) 8088 + if (cache_has_sheaves(s)) 7809 8089 rcu_barrier(); 7810 8090 7811 8091 /* Attempt to free all objects */ ··· 8117 8397 if (get_node(s, nid)) 8118 8398 continue; 8119 8399 8120 - if (s->cpu_sheaves) { 8400 + if (cache_has_sheaves(s)) { 8121 8401 barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, nid); 8122 8402 8123 8403 if (!barn) { ··· 8192 8472 8193 8473 memcpy(s, static_cache, kmem_cache->object_size); 8194 8474 8195 - /* 8196 - * This runs very early, and only the boot processor is supposed to be 8197 - * up. Even if it weren't true, IRQs are not up so we couldn't fire 8198 - * IPIs around. 8199 - */ 8200 - __flush_cpu_slab(s, smp_processor_id()); 8201 8475 for_each_kmem_cache_node(s, node, n) { 8202 8476 struct slab *p; 8203 8477 ··· 8205 8491 } 8206 8492 list_add(&s->list, &slab_caches); 8207 8493 return s; 8494 + } 8495 + 8496 + /* 8497 + * Finish the sheaves initialization done normally by init_percpu_sheaves() and 8498 + * init_kmem_cache_nodes(). For normal kmalloc caches we have to bootstrap it 8499 + * since sheaves and barns are allocated by kmalloc. 
8500 + */ 8501 + static void __init bootstrap_cache_sheaves(struct kmem_cache *s) 8502 + { 8503 + struct kmem_cache_args empty_args = {}; 8504 + unsigned int capacity; 8505 + bool failed = false; 8506 + int node, cpu; 8507 + 8508 + capacity = calculate_sheaf_capacity(s, &empty_args); 8509 + 8510 + /* capacity can be 0 due to debugging or SLUB_TINY */ 8511 + if (!capacity) 8512 + return; 8513 + 8514 + for_each_node_mask(node, slab_nodes) { 8515 + struct node_barn *barn; 8516 + 8517 + barn = kmalloc_node(sizeof(*barn), GFP_KERNEL, node); 8518 + 8519 + if (!barn) { 8520 + failed = true; 8521 + goto out; 8522 + } 8523 + 8524 + barn_init(barn); 8525 + get_node(s, node)->barn = barn; 8526 + } 8527 + 8528 + for_each_possible_cpu(cpu) { 8529 + struct slub_percpu_sheaves *pcs; 8530 + 8531 + pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 8532 + 8533 + pcs->main = __alloc_empty_sheaf(s, GFP_KERNEL, capacity); 8534 + 8535 + if (!pcs->main) { 8536 + failed = true; 8537 + break; 8538 + } 8539 + } 8540 + 8541 + out: 8542 + /* 8543 + * It's still early in boot so treat this like same as a failure to 8544 + * create the kmalloc cache in the first place 8545 + */ 8546 + if (failed) 8547 + panic("Out of memory when creating kmem_cache %s\n", s->name); 8548 + 8549 + s->sheaf_capacity = capacity; 8550 + } 8551 + 8552 + static void __init bootstrap_kmalloc_sheaves(void) 8553 + { 8554 + enum kmalloc_cache_type type; 8555 + 8556 + for (type = KMALLOC_NORMAL; type <= KMALLOC_RANDOM_END; type++) { 8557 + for (int idx = 0; idx < KMALLOC_SHIFT_HIGH + 1; idx++) { 8558 + if (kmalloc_caches[type][idx]) 8559 + bootstrap_cache_sheaves(kmalloc_caches[type][idx]); 8560 + } 8561 + } 8208 8562 } 8209 8563 8210 8564 void __init kmem_cache_init(void) ··· 8318 8536 setup_kmalloc_cache_index_table(); 8319 8537 create_kmalloc_caches(); 8320 8538 8539 + bootstrap_kmalloc_sheaves(); 8540 + 8321 8541 /* Setup random freelists for each cache */ 8322 8542 init_freelist_randomization(); 8323 8543 ··· 8336 8552 { 8337 8553 flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); 8338 8554 WARN_ON(!flushwq); 8339 - } 8340 - 8341 - struct kmem_cache * 8342 - __kmem_cache_alias(const char *name, unsigned int size, unsigned int align, 8343 - slab_flags_t flags, void (*ctor)(void *)) 8344 - { 8345 - struct kmem_cache *s; 8346 - 8347 - s = find_mergeable(size, align, flags, name, ctor); 8348 - if (s) { 8349 - if (sysfs_slab_alias(s, name)) 8350 - pr_err("SLUB: Unable to add cache alias %s to sysfs\n", 8351 - name); 8352 - 8353 - s->refcount++; 8354 - 8355 - /* 8356 - * Adjust the object sizes so that we clear 8357 - * the complete object on kzalloc. 8358 - */ 8359 - s->object_size = max(s->object_size, size); 8360 - s->inuse = max(s->inuse, ALIGN(size, sizeof(void *))); 8361 - } 8362 - 8363 - return s; 8364 8555 } 8365 8556 8366 8557 int do_kmem_cache_create(struct kmem_cache *s, const char *name, ··· 8387 8628 s->min_partial = min_t(unsigned long, MAX_PARTIAL, ilog2(s->size) / 2); 8388 8629 s->min_partial = max_t(unsigned long, MIN_PARTIAL, s->min_partial); 8389 8630 8390 - set_cpu_partial(s); 8391 - 8392 - if (args->sheaf_capacity && !IS_ENABLED(CONFIG_SLUB_TINY) 8393 - && !(s->flags & SLAB_DEBUG_FLAGS)) { 8394 - s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); 8395 - if (!s->cpu_sheaves) { 8396 - err = -ENOMEM; 8397 - goto out; 8398 - } 8399 - // TODO: increase capacity to grow slab_sheaf up to next kmalloc size? 
8400 - s->sheaf_capacity = args->sheaf_capacity; 8631 + s->cpu_sheaves = alloc_percpu(struct slub_percpu_sheaves); 8632 + if (!s->cpu_sheaves) { 8633 + err = -ENOMEM; 8634 + goto out; 8401 8635 } 8402 8636 8403 8637 #ifdef CONFIG_NUMA ··· 8406 8654 if (!init_kmem_cache_nodes(s)) 8407 8655 goto out; 8408 8656 8409 - if (!alloc_kmem_cache_cpus(s)) 8657 + #ifdef CONFIG_SLUB_STATS 8658 + if (!alloc_kmem_cache_stats(s)) 8410 8659 goto out; 8660 + #endif 8411 8661 8412 - if (s->cpu_sheaves) { 8413 - err = init_percpu_sheaves(s); 8414 - if (err) 8415 - goto out; 8416 - } 8662 + err = init_percpu_sheaves(s); 8663 + if (err) 8664 + goto out; 8417 8665 8418 8666 err = 0; 8419 8667 ··· 8728 8976 if (!nodes) 8729 8977 return -ENOMEM; 8730 8978 8731 - if (flags & SO_CPU) { 8732 - int cpu; 8733 - 8734 - for_each_possible_cpu(cpu) { 8735 - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, 8736 - cpu); 8737 - int node; 8738 - struct slab *slab; 8739 - 8740 - slab = READ_ONCE(c->slab); 8741 - if (!slab) 8742 - continue; 8743 - 8744 - node = slab_nid(slab); 8745 - if (flags & SO_TOTAL) 8746 - x = slab->objects; 8747 - else if (flags & SO_OBJECTS) 8748 - x = slab->inuse; 8749 - else 8750 - x = 1; 8751 - 8752 - total += x; 8753 - nodes[node] += x; 8754 - 8755 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8756 - slab = slub_percpu_partial_read_once(c); 8757 - if (slab) { 8758 - node = slab_nid(slab); 8759 - if (flags & SO_TOTAL) 8760 - WARN_ON_ONCE(1); 8761 - else if (flags & SO_OBJECTS) 8762 - WARN_ON_ONCE(1); 8763 - else 8764 - x = data_race(slab->slabs); 8765 - total += x; 8766 - nodes[node] += x; 8767 - } 8768 - #endif 8769 - } 8770 - } 8771 - 8772 8979 /* 8773 8980 * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex" 8774 8981 * already held which will conflict with an existing lock order: ··· 8859 9148 8860 9149 static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) 8861 9150 { 8862 - unsigned int nr_partial = 0; 8863 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8864 - nr_partial = s->cpu_partial; 8865 - #endif 8866 - 8867 - return sysfs_emit(buf, "%u\n", nr_partial); 9151 + return sysfs_emit(buf, "0\n"); 8868 9152 } 8869 9153 8870 9154 static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, ··· 8871 9165 err = kstrtouint(buf, 10, &objects); 8872 9166 if (err) 8873 9167 return err; 8874 - if (objects && !kmem_cache_has_cpu_partial(s)) 9168 + if (objects) 8875 9169 return -EINVAL; 8876 9170 8877 - slub_set_cpu_partial(s, objects); 8878 - flush_all(s); 8879 9171 return length; 8880 9172 } 8881 9173 SLAB_ATTR(cpu_partial); ··· 8912 9208 8913 9209 static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) 8914 9210 { 8915 - int objects = 0; 8916 - int slabs = 0; 8917 - int cpu __maybe_unused; 8918 - int len = 0; 8919 - 8920 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8921 - for_each_online_cpu(cpu) { 8922 - struct slab *slab; 8923 - 8924 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 8925 - 8926 - if (slab) 8927 - slabs += data_race(slab->slabs); 8928 - } 8929 - #endif 8930 - 8931 - /* Approximate half-full slabs, see slub_set_cpu_partial() */ 8932 - objects = (slabs * oo_objects(s->oo)) / 2; 8933 - len += sysfs_emit_at(buf, len, "%d(%d)", objects, slabs); 8934 - 8935 - #ifdef CONFIG_SLUB_CPU_PARTIAL 8936 - for_each_online_cpu(cpu) { 8937 - struct slab *slab; 8938 - 8939 - slab = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu)); 8940 - if (slab) { 8941 - slabs = data_race(slab->slabs); 8942 - objects = (slabs * oo_objects(s->oo)) / 2; 8943 - len += sysfs_emit_at(buf, 
len, " C%d=%d(%d)", 8944 - cpu, objects, slabs); 8945 - } 8946 - } 8947 - #endif 8948 - len += sysfs_emit_at(buf, len, "\n"); 8949 - 8950 - return len; 9211 + return sysfs_emit(buf, "0(0)\n"); 8951 9212 } 8952 9213 SLAB_ATTR_RO(slabs_cpu_partial); 8953 9214 ··· 9098 9429 return -ENOMEM; 9099 9430 9100 9431 for_each_online_cpu(cpu) { 9101 - unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; 9432 + unsigned int x = per_cpu_ptr(s->cpu_stats, cpu)->stat[si]; 9102 9433 9103 9434 data[cpu] = x; 9104 9435 sum += x; ··· 9124 9455 int cpu; 9125 9456 9126 9457 for_each_online_cpu(cpu) 9127 - per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; 9458 + per_cpu_ptr(s->cpu_stats, cpu)->stat[si] = 0; 9128 9459 } 9129 9460 9130 9461 #define STAT_ATTR(si, text) \ ··· 9142 9473 } \ 9143 9474 SLAB_ATTR(text); \ 9144 9475 9145 - STAT_ATTR(ALLOC_PCS, alloc_cpu_sheaf); 9146 9476 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 9147 9477 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 9148 - STAT_ATTR(FREE_PCS, free_cpu_sheaf); 9149 9478 STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 9150 9479 STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 9151 9480 STAT_ATTR(FREE_FASTPATH, free_fastpath); 9152 9481 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 9153 - STAT_ATTR(FREE_FROZEN, free_frozen); 9154 9482 STAT_ATTR(FREE_ADD_PARTIAL, free_add_partial); 9155 9483 STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); 9156 - STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 9157 9484 STAT_ATTR(ALLOC_SLAB, alloc_slab); 9158 - STAT_ATTR(ALLOC_REFILL, alloc_refill); 9159 9485 STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); 9160 9486 STAT_ATTR(FREE_SLAB, free_slab); 9161 - STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 9162 - STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 9163 - STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); 9164 - STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 9165 - STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 9166 - STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 9167 - STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); 9168 9487 STAT_ATTR(ORDER_FALLBACK, order_fallback); 9169 - STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 9170 9488 STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 9171 - STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); 9172 - STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); 9173 - STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); 9174 - STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); 9175 9489 STAT_ATTR(SHEAF_FLUSH, sheaf_flush); 9176 9490 STAT_ATTR(SHEAF_REFILL, sheaf_refill); 9177 9491 STAT_ATTR(SHEAF_ALLOC, sheaf_alloc); ··· 9230 9578 &remote_node_defrag_ratio_attr.attr, 9231 9579 #endif 9232 9580 #ifdef CONFIG_SLUB_STATS 9233 - &alloc_cpu_sheaf_attr.attr, 9234 9581 &alloc_fastpath_attr.attr, 9235 9582 &alloc_slowpath_attr.attr, 9236 - &free_cpu_sheaf_attr.attr, 9237 9583 &free_rcu_sheaf_attr.attr, 9238 9584 &free_rcu_sheaf_fail_attr.attr, 9239 9585 &free_fastpath_attr.attr, 9240 9586 &free_slowpath_attr.attr, 9241 - &free_frozen_attr.attr, 9242 9587 &free_add_partial_attr.attr, 9243 9588 &free_remove_partial_attr.attr, 9244 - &alloc_from_partial_attr.attr, 9245 9589 &alloc_slab_attr.attr, 9246 - &alloc_refill_attr.attr, 9247 9590 &alloc_node_mismatch_attr.attr, 9248 9591 &free_slab_attr.attr, 9249 - &cpuslab_flush_attr.attr, 9250 - &deactivate_full_attr.attr, 9251 - &deactivate_empty_attr.attr, 9252 - &deactivate_to_head_attr.attr, 9253 - &deactivate_to_tail_attr.attr, 9254 - &deactivate_remote_frees_attr.attr, 9255 - &deactivate_bypass_attr.attr, 9256 9592 
&order_fallback_attr.attr, 9257 9593 &cmpxchg_double_fail_attr.attr, 9258 - &cmpxchg_double_cpu_fail_attr.attr, 9259 - &cpu_partial_alloc_attr.attr, 9260 - &cpu_partial_free_attr.attr, 9261 - &cpu_partial_node_attr.attr, 9262 - &cpu_partial_drain_attr.attr, 9263 9594 &sheaf_flush_attr.attr, 9264 9595 &sheaf_refill_attr.attr, 9265 9596 &sheaf_alloc_attr.attr, ··· 9446 9811 9447 9812 static struct saved_alias *alias_list; 9448 9813 9449 - static int sysfs_slab_alias(struct kmem_cache *s, const char *name) 9814 + int sysfs_slab_alias(struct kmem_cache *s, const char *name) 9450 9815 { 9451 9816 struct saved_alias *al; 9452 9817
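
The capacity rounding in calculate_sheaf_capacity() above ("Increment capacity to make sheaf exactly a kmalloc size bucket") can be sanity-checked with a small stand-alone model. The bucket progression and the 32-byte sheaf header below are assumptions for illustration only; the kernel uses kmalloc_size_roundup() and the real struct slab_sheaf layout. The arithmetic, though, mirrors the hunk: round the whole sheaf allocation up to a kmalloc bucket, then convert the leftover bytes back into extra object slots.

/*
 * Stand-alone model of the sheaf capacity rounding. The power-of-two
 * bucket progression and the 32-byte header are assumptions, not the
 * kernel's kmalloc_size_roundup() or struct slab_sheaf layout.
 */
#include <stdio.h>
#include <stddef.h>

static size_t bucket_roundup(size_t size)
{
	/* assumed power-of-two buckets starting at 64 bytes */
	size_t bucket = 64;

	while (bucket < size)
		bucket <<= 1;
	return bucket;
}

int main(void)
{
	const size_t header = 32;	/* assumed bytes before objects[] */
	unsigned int capacity = 26;	/* base value for a 256..1023 byte cache */
	size_t size;

	/* Round the sheaf allocation up to a whole bucket ... */
	size = bucket_roundup(header + capacity * sizeof(void *));
	/* ... and turn the leftover space into additional object slots. */
	capacity = (size - header) / sizeof(void *);

	printf("sheaf allocation %zu bytes -> capacity %u\n", size, capacity);
	return 0;
}

With these assumed numbers, a base capacity of 26 needs 240 bytes, rounds up to a 256-byte bucket, and ends up holding 28 object pointers, so no allocated space is wasted.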
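
Similarly, the SLAB_OBJ_EXT_IN_OBJ branch in calculate_sizes() only fires for unmergeable caches whose per-object alignment padding is at least sizeof(struct slabobj_ext). A minimal model of that check, using an assumed 16-byte extension record and made-up object/alignment values, looks like this:

/*
 * Model of the in-object slabobj_ext placement check. The 16-byte
 * extension size and the example object size and alignment are
 * assumptions for illustration, not values from the kernel headers.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

#define OBJ_EXT_SIZE 16		/* assumed sizeof(struct slabobj_ext) on 64-bit */

static size_t align_up(size_t size, size_t align)
{
	return (size + align - 1) & ~(align - 1);
}

int main(void)
{
	size_t size = 200;	/* object size including slub metadata */
	size_t align = 64;	/* cache alignment, e.g. cacheline aligned */
	size_t aligned_size = align_up(size, align);
	bool in_obj = (aligned_size - size) >= OBJ_EXT_SIZE;

	printf("padding %zu bytes -> obj_ext %s\n", aligned_size - size,
	       in_obj ? "placed in per-object padding" : "needs a separate array");
	return 0;
}

Here a 200-byte object aligned to 64 bytes grows to 256 bytes, leaving 56 bytes of padding, which is enough to hold the assumed extension record in-object instead of allocating a separate slabobj_ext array.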