Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

slab: add optimized sheaf refill from partial list

At this point we have sheaves enabled for all caches, but their refill
is done via __kmem_cache_alloc_bulk(), which relies on cpu (partial)
slabs, now a redundant caching layer that we are about to remove.

The refill will thus be done from slabs on the node partial list.
Introduce new functions that can do that in an optimized way, as that
is easier than modifying the __kmem_cache_alloc_bulk() call chain.

Introduce struct partial_bulk_context, a variant of struct
partial_context that is used to return a list of slabs taken from the
partial list, with the sum of their free objects falling within the
requested min and max.
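
For reference, this is the structure exactly as added to mm/slub.c in
the diff below:

	/* Structure holding parameters for get_partial_node_bulk() */
	struct partial_bulk_context {
		gfp_t flags;
		unsigned int min_objects;
		unsigned int max_objects;
		struct list_head slabs;
	};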

Introduce get_partial_node_bulk(), which removes slabs from the node's
partial list and returns them in a list. There is a racy read of
slab->counters, so make sure the non-atomic write in
__update_freelist_slow() cannot tear.
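
Condensed from the hunks below, this is the racy estimate in
get_partial_node_bulk() and the write in __update_freelist_slow() it
has to pair with:

	/* racy lower-bound estimate of free objects in the slab */
	flc.counters = data_race(READ_ONCE(slab->counters));
	slab_free = flc.objects - flc.inuse;

	/* prevent tearing for the read above */
	WRITE_ONCE(slab->counters, new->counters);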

Introduce get_freelist_nofreeze(), which grabs the freelist without
freezing the slab.
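
Condensed from the hunk below, it is a cmpxchg-style loop that takes
the whole freelist while leaving the slab unfrozen (debug checks
omitted):

	do {
		old.freelist = slab->freelist;
		old.counters = slab->counters;

		new.freelist = NULL;
		new.counters = old.counters;
		new.inuse = old.objects;
	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));

	return old.freelist;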

Introduce alloc_from_new_slab(), which can allocate multiple objects
from a newly allocated slab, where we don't need to synchronize with
freeing. In some aspects it's similar to alloc_single_from_new_slab(),
but it assumes the cache is a non-debug one so it can avoid some
actions. It supports an allow_spin parameter, which we always set to
true here, but a followup change will reuse the function in a context
where it may be false.
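
The core of the function, condensed from the diff below (the list_lock
handling for the !allow_spin case is omitted):

	object = slab->freelist;
	while (object && allocated < count) {
		p[allocated] = object;
		object = get_freepointer(s, object);
		maybe_wipe_obj_freeptr(s, p[allocated]);

		slab->inuse++;
		allocated++;
	}
	slab->freelist = object;

	/* leftover free objects mean the slab goes on the partial list */
	if (needs_add_partial)
		add_partial(n, slab, DEACTIVATE_TO_HEAD);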

Introduce __refill_objects(), which uses the functions above to fill an
array of objects. It has to handle the possibility that the slabs
contain more objects than were requested, due to concurrent freeing of
objects to those slabs. When no more slabs are available on the partial
list, it allocates new slabs. It is intended to be used only in
contexts where spinning is allowed, so add a WARN_ON_ONCE check there.
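
In outline (simplified from the full function below; returning surplus
objects via do_slab_free(), putting leftover slabs back on the partial
list, object wiping and stats are not shown):

	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
		goto new_slab;

	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
		list_del(&slab->slab_list);
		object = get_freelist_nofreeze(s, slab);

		while (object && refilled < max) {
			p[refilled] = object;
			object = get_freepointer(s, object);
			refilled++;
		}
		if (refilled >= max)
			break;
	}

	if (refilled >= min)
		goto out;

new_slab:
	slab = new_slab(s, pc.flags, node);
	if (!slab)
		goto out;

	refilled += alloc_from_new_slab(s, slab, p + refilled,
					max - refilled, true);
	if (refilled < min)
		goto new_slab;
out:
	return refilled;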

Finally, switch refill_sheaf() to use __refill_objects(). Sheaves are
only refilled from contexts that allow spinning, or even blocking.
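
The resulting call in refill_sheaf(), with min == max == to_fill, as in
the hunk below:

	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
				  to_fill, to_fill);

	sheaf->size += filled;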

Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Hao Li <hao.li@linux.dev>
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

+272 -21
mm/slub.c
···
 	void *object;
 };
 
+/* Structure holding parameters for get_partial_node_bulk() */
+struct partial_bulk_context {
+	gfp_t flags;
+	unsigned int min_objects;
+	unsigned int max_objects;
+	struct list_head slabs;
+};
+
 static inline bool kmem_cache_debug(struct kmem_cache *s)
 {
 	return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
···
 	if (slab->freelist == old->freelist &&
 	    slab->counters == old->counters) {
 		slab->freelist = new->freelist;
-		slab->counters = new->counters;
+		/* prevent tearing for the read in get_partial_node_bulk() */
+		WRITE_ONCE(slab->counters, new->counters);
 		ret = true;
 	}
 	slab_unlock(slab);
···
 	stat(s, SHEAF_FREE);
 }
 
-static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
-				   size_t size, void **p);
-
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			gfp_t gfp)
···
 	if (!to_fill)
 		return 0;
 
-	filled = __kmem_cache_alloc_bulk(s, gfp, to_fill,
-					 &sheaf->objects[sheaf->size]);
+	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
+				  to_fill, to_fill);
 
 	sheaf->size += filled;
···
 			      int drain) { }
 #endif
 static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags);
+
+static bool get_partial_node_bulk(struct kmem_cache *s,
+				  struct kmem_cache_node *n,
+				  struct partial_bulk_context *pc)
+{
+	struct slab *slab, *slab2;
+	unsigned int total_free = 0;
+	unsigned long flags;
+
+	/* Racy check to avoid taking the lock unnecessarily. */
+	if (!n || data_race(!n->nr_partial))
+		return false;
+
+	INIT_LIST_HEAD(&pc->slabs);
+
+	spin_lock_irqsave(&n->list_lock, flags);
+
+	list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+		struct freelist_counters flc;
+		unsigned int slab_free;
+
+		if (!pfmemalloc_match(slab, pc->flags))
+			continue;
+
+		/*
+		 * determine the number of free objects in the slab racily
+		 *
+		 * slab_free is a lower bound due to possible subsequent
+		 * concurrent freeing, so the caller may get more objects than
+		 * requested and must handle that
+		 */
+		flc.counters = data_race(READ_ONCE(slab->counters));
+		slab_free = flc.objects - flc.inuse;
+
+		/* we have already min and this would get us over the max */
+		if (total_free >= pc->min_objects
+		    && total_free + slab_free > pc->max_objects)
+			break;
+
+		remove_partial(n, slab);
+
+		list_add(&slab->slab_list, &pc->slabs);
+
+		total_free += slab_free;
+		if (total_free >= pc->max_objects)
+			break;
+	}
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return total_free > 0;
+}
 
 /*
  * Try to allocate a partial slab from a specific node.
···
 }
 
 /*
+ * Get the slab's freelist and do not freeze it.
+ *
+ * Assumes the slab is isolated from node partial list and not frozen.
+ *
+ * Assumes this is performed only for caches without debugging so we
+ * don't need to worry about adding the slab to the full list.
+ */
+static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *slab)
+{
+	struct freelist_counters old, new;
+
+	do {
+		old.freelist = slab->freelist;
+		old.counters = slab->counters;
+
+		new.freelist = NULL;
+		new.counters = old.counters;
+		VM_WARN_ON_ONCE(new.frozen);
+
+		new.inuse = old.objects;
+
+	} while (!slab_update_freelist(s, slab, &old, &new, "get_freelist_nofreeze"));
+
+	return old.freelist;
+}
+
+/*
  * Freeze the partial slab and return the pointer to the freelist.
  */
 static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
···
 	} while (!slab_update_freelist(s, slab, &old, &new, "freeze_slab"));
 
 	return old.freelist;
+}
+
+/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ *
+ * Note that we also wipe custom freelist pointers.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+						   void *obj)
+{
+	if (unlikely(slab_want_init_on_free(s)) && obj &&
+	    !freeptr_outside_object(s))
+		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
+			0, sizeof(void *));
+}
+
+static unsigned int alloc_from_new_slab(struct kmem_cache *s, struct slab *slab,
+					void **p, unsigned int count, bool allow_spin)
+{
+	unsigned int allocated = 0;
+	struct kmem_cache_node *n;
+	bool needs_add_partial;
+	unsigned long flags;
+	void *object;
+
+	/*
+	 * Are we going to put the slab on the partial list?
+	 * Note slab->inuse is 0 on a new slab.
+	 */
+	needs_add_partial = (slab->objects > count);
+
+	if (!allow_spin && needs_add_partial) {
+
+		n = get_node(s, slab_nid(slab));
+
+		if (!spin_trylock_irqsave(&n->list_lock, flags)) {
+			/* Unlucky, discard newly allocated slab */
+			defer_deactivate_slab(slab, NULL);
+			return 0;
+		}
+	}
+
+	object = slab->freelist;
+	while (object && allocated < count) {
+		p[allocated] = object;
+		object = get_freepointer(s, object);
+		maybe_wipe_obj_freeptr(s, p[allocated]);
+
+		slab->inuse++;
+		allocated++;
+	}
+	slab->freelist = object;
+
+	if (needs_add_partial) {
+
+		if (allow_spin) {
+			n = get_node(s, slab_nid(slab));
+			spin_lock_irqsave(&n->list_lock, flags);
+		}
+		add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	inc_slabs_node(s, slab_nid(slab), slab->objects);
+	return allocated;
 }
···
 	}
 
 	return object;
-}
-
-/*
- * If the object has been wiped upon free, make sure it's fully initialized by
- * zeroing out freelist pointer.
- *
- * Note that we also wipe custom freelist pointers.
- */
-static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
-						   void *obj)
-{
-	if (unlikely(slab_want_init_on_free(s)) && obj &&
-	    !freeptr_outside_object(s))
-		memset((void *)((char *)kasan_reset_tag(obj) + s->offset),
-			0, sizeof(void *));
 }
 
 static __fastpath_inline
···
 
 	return ret;
 }
+
+static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags,
+				   size_t size, void **p);
 
 /*
  * returns a sheaf that has at least the requested size
···
 	} while (likely(size));
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
+
+static unsigned int
+__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		 unsigned int max)
+{
+	struct partial_bulk_context pc;
+	struct slab *slab, *slab2;
+	unsigned int refilled = 0;
+	unsigned long flags;
+	void *object;
+	int node;
+
+	pc.flags = gfp;
+	pc.min_objects = min;
+	pc.max_objects = max;
+
+	node = numa_mem_id();
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	/* TODO: consider also other nodes? */
+	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
+		goto new_slab;
+
+	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+		list_del(&slab->slab_list);
+
+		object = get_freelist_nofreeze(s, slab);
+
+		while (object && refilled < max) {
+			p[refilled] = object;
+			object = get_freepointer(s, object);
+			maybe_wipe_obj_freeptr(s, p[refilled]);
+
+			refilled++;
+		}
+
+		/*
+		 * Freelist had more objects than we can accommodate, we need to
+		 * free them back. We can treat it like a detached freelist, just
+		 * need to find the tail object.
+		 */
+		if (unlikely(object)) {
+			void *head = object;
+			void *tail;
+			int cnt = 0;
+
+			do {
+				tail = object;
+				cnt++;
+				object = get_freepointer(s, object);
+			} while (object);
+			do_slab_free(s, slab, head, tail, cnt, _RET_IP_);
+		}
+
+		if (refilled >= max)
+			break;
+	}
+
+	if (unlikely(!list_empty(&pc.slabs))) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		spin_lock_irqsave(&n->list_lock, flags);
+
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			if (unlikely(!slab->inuse && n->nr_partial >= s->min_partial))
+				continue;
+
+			list_del(&slab->slab_list);
+			add_partial(n, slab, DEACTIVATE_TO_HEAD);
+		}
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
+
+		/* any slabs left are completely free and for discard */
+		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
+
+			list_del(&slab->slab_list);
+			discard_slab(s, slab);
+		}
+	}
+
+	if (likely(refilled >= min))
+		goto out;
+
+new_slab:
+
+	slab = new_slab(s, pc.flags, node);
+	if (!slab)
+		goto out;
+
+	stat(s, ALLOC_SLAB);
+
+	/*
+	 * TODO: possible optimization - if we know we will consume the whole
+	 * slab we might skip creating the freelist?
+	 */
+	refilled += alloc_from_new_slab(s, slab, p + refilled, max - refilled,
+					/* allow_spin = */ true);
+
+	if (refilled < min)
+		goto new_slab;
+out:
+
+	return refilled;
+}
 
 static inline
 int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,