Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

slab: refill sheaves from all nodes

__refill_objects() currently only attempts to get partial slabs from the
local node and then allocates new slab(s). Expand it to also try other
nodes while observing the remote node defrag ratio, similarly to
get_any_partial().
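
For reference, the throttling of remote-node refills mirrors
get_any_partial(); the gate at the top of the new __refill_objects_any()
(shown in full in the diff below) is:

	if (!s->remote_node_defrag_ratio ||
	    get_cycles() % 1024 > s->remote_node_defrag_ratio)
		return 0;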

This will prevent allocating new slabs on a node while other nodes have
many free slabs. It does mean sheaves will contain non-local objects in
that case. Allocations that care about a specific node will still be
served appropriately, but might get a slowpath allocation.

Like get_any_partial(), we do observe cpuset_zone_allowed(), although we
might be refilling a sheaf that will then be used from a different
allocation context.

We can also use the resulting refill_objects() in
__kmem_cache_alloc_bulk() for non-debug caches. This means
kmem_cache_alloc_bulk() will get better performance when sheaves are
exhausted. kmem_cache_alloc_bulk() cannot indicate a preferred node, so
it is compatible with the sheaf refill preferring the local node.
Its users also pass gfp flags that allow spinning, so document that
as a requirement.
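
For callers nothing changes as long as the gfp mask allows spinning; a
minimal usage sketch (the cache pointer and batch size are placeholders):

	void *objs[16];

	/* GFP_KERNEL allows spinning, so the bulk path may refill from sheaves */
	if (!kmem_cache_alloc_bulk(my_cache, GFP_KERNEL, ARRAY_SIZE(objs), objs))
		return -ENOMEM;

	/* ... use the objects ... */

	kmem_cache_free_bulk(my_cache, ARRAY_SIZE(objs), objs);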

Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Hao Li <hao.li@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

+106 -31
mm/slub.c
···
 }
 
 static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
-		 unsigned int max);
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+	       unsigned int max);
 
 static int refill_sheaf(struct kmem_cache *s, struct slab_sheaf *sheaf,
 			gfp_t gfp)
···
 	if (!to_fill)
 		return 0;
 
-	filled = __refill_objects(s, &sheaf->objects[sheaf->size], gfp,
-				  to_fill, to_fill);
+	filled = refill_objects(s, &sheaf->objects[sheaf->size], gfp, to_fill,
+				to_fill);
 
 	sheaf->size += filled;
 
···
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
 static unsigned int
-__refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
-		 unsigned int max)
+__refill_objects_node(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		      unsigned int max, struct kmem_cache_node *n)
 {
 	struct partial_bulk_context pc;
 	struct slab *slab, *slab2;
 	unsigned int refilled = 0;
 	unsigned long flags;
 	void *object;
-	int node;
 
 	pc.flags = gfp;
 	pc.min_objects = min;
 	pc.max_objects = max;
 
-	node = numa_mem_id();
-
-	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+	if (!get_partial_node_bulk(s, n, &pc))
 		return 0;
-
-	/* TODO: consider also other nodes? */
-	if (!get_partial_node_bulk(s, get_node(s, node), &pc))
-		goto new_slab;
 
 	list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
 
···
 	}
 
 	if (unlikely(!list_empty(&pc.slabs))) {
-		struct kmem_cache_node *n = get_node(s, node);
-
 		spin_lock_irqsave(&n->list_lock, flags);
 
 		list_for_each_entry_safe(slab, slab2, &pc.slabs, slab_list) {
···
 		}
 	}
 
+	return refilled;
+}
 
-	if (likely(refilled >= min))
-		goto out;
+#ifdef CONFIG_NUMA
+static unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		     unsigned int max)
+{
+	struct zonelist *zonelist;
+	struct zoneref *z;
+	struct zone *zone;
+	enum zone_type highest_zoneidx = gfp_zone(gfp);
+	unsigned int cpuset_mems_cookie;
+	unsigned int refilled = 0;
+
+	/* see get_from_any_partial() for the defrag ratio description */
+	if (!s->remote_node_defrag_ratio ||
+	    get_cycles() % 1024 > s->remote_node_defrag_ratio)
+		return 0;
+
+	do {
+		cpuset_mems_cookie = read_mems_allowed_begin();
+		zonelist = node_zonelist(mempolicy_slab_node(), gfp);
+		for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
+			struct kmem_cache_node *n;
+			unsigned int r;
+
+			n = get_node(s, zone_to_nid(zone));
+
+			if (!n || !cpuset_zone_allowed(zone, gfp) ||
+			    n->nr_partial <= s->min_partial)
+				continue;
+
+			r = __refill_objects_node(s, p, gfp, min, max, n);
+			refilled += r;
+
+			if (r >= min) {
+				/*
+				 * Don't check read_mems_allowed_retry() here -
+				 * if mems_allowed was updated in parallel, that
+				 * was a harmless race between allocation and
+				 * the cpuset update
+				 */
+				return refilled;
+			}
+			p += r;
+			min -= r;
+			max -= r;
+		}
+	} while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+	return refilled;
+}
+#else
+static inline unsigned int
+__refill_objects_any(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+		     unsigned int max)
+{
+	return 0;
+}
+#endif
+
+static unsigned int
+refill_objects(struct kmem_cache *s, void **p, gfp_t gfp, unsigned int min,
+	       unsigned int max)
+{
+	int local_node = numa_mem_id();
+	unsigned int refilled;
+	struct slab *slab;
+
+	if (WARN_ON_ONCE(!gfpflags_allow_spinning(gfp)))
+		return 0;
+
+	refilled = __refill_objects_node(s, p, gfp, min, max,
+					 get_node(s, local_node));
+	if (refilled >= min)
+		return refilled;
+
+	refilled += __refill_objects_any(s, p + refilled, gfp, min - refilled,
+					 max - refilled);
+	if (refilled >= min)
+		return refilled;
 
 new_slab:
 
-	slab = new_slab(s, pc.flags, node);
+	slab = new_slab(s, gfp, local_node);
 	if (!slab)
 		goto out;
 
···
 
 	if (refilled < min)
 		goto new_slab;
-out:
 
+out:
 	return refilled;
 }
 
···
 {
 	int i;
 
-	/*
-	 * TODO: this might be more efficient (if necessary) by reusing
-	 * __refill_objects()
-	 */
-	for (i = 0; i < size; i++) {
+	if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
+		for (i = 0; i < size; i++) {
 
-		p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
-				     s->object_size);
-		if (unlikely(!p[i]))
+			p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
+					     s->object_size);
+			if (unlikely(!p[i]))
+				goto error;
+
+			maybe_wipe_obj_freeptr(s, p[i]);
+		}
+	} else {
+		i = refill_objects(s, p, flags, size, size);
+		if (i < size)
 			goto error;
-
-		maybe_wipe_obj_freeptr(s, p[i]);
 	}
 
 	return i;
···
 
 }
 
-/* Note that interrupts must be enabled when calling this function. */
+/*
+ * Note that interrupts must be enabled when calling this function and gfp
+ * flags must allow spinning.
+ */
 int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size,
 				 void **p)
 {