slab: skip percpu sheaves for remote object freeing

Since we don't control the NUMA locality of objects in percpu sheaves,
allocations with node restrictions bypass them. Allocations without
restrictions may however still expect to get local objects with high
probability, and the introduction of sheaves can decrease it due to
freed objects from remote nodes ending up in percpu sheaves.

The fraction of such remote frees seems low (5% on an 8-node machine)
but it can be expected that some cache- or workload-specific corner
cases exist. We can either conclude that this is not a problem due to
the low fraction, or we can make remote frees bypass percpu sheaves and
go directly to their slabs. This will make the remote frees more
expensive, but if it's only a small fraction, most frees will still
benefit from the lower overhead of percpu sheaves.

This patch thus makes remote object freeing bypass percpu sheaves,
including bulk freeing, and kfree_rcu() via the rcu_free sheaf. However,
it's not intended to be a 100% guarantee that percpu sheaves will only
contain local objects. The refill from slabs does not provide that
guarantee in the first place, and there might be cpu migrations
happening when we need to unlock the local_lock. Avoiding all that could
be possible but complicated, so we can leave it for later investigation
whether it would be worth it. It can be expected that the more selective
freeing will itself prevent accumulation of remote objects in percpu
sheaves, so any such violations would have only short-term effects.

Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Vlastimil Babka 8 months ago 989b09b7 08294229

+40 -8

2 changed files

expand all

slab_common.c

slub.c

+5 -2

mm/slab_common.c

··· 1623 1623 1624 1624 slab = folio_slab(folio); 1625 1625 s = slab->slab_cache; 1626 - if (s->cpu_sheaves) 1627 - return __kfree_rcu_sheaf(s, obj); 1626 + if (s->cpu_sheaves) { 1627 + if (likely(!IS_ENABLED(CONFIG_NUMA) || 1628 + slab_nid(slab) == numa_mem_id())) 1629 + return __kfree_rcu_sheaf(s, obj); 1630 + } 1628 1631 1629 1632 return false; 1630 1633 }

+35 -6

mm/slub.c

··· 472 472 }; 473 473 struct kmem_cache *cache; 474 474 unsigned int size; 475 + int node; /* only used for rcu_sheaf */ 475 476 void *objects[]; 476 477 }; 477 478 ··· 5823 5822 */ 5824 5823 __rcu_free_sheaf_prepare(s, sheaf); 5825 5824 5826 - barn = get_node(s, numa_mem_id())->barn; 5825 + barn = get_node(s, sheaf->node)->barn; 5827 5826 5828 5827 /* due to slab_free_hook() */ 5829 5828 if (unlikely(sheaf->size == 0)) ··· 5913 5912 */ 5914 5913 rcu_sheaf->objects[rcu_sheaf->size++] = obj; 5915 5914 5916 - if (likely(rcu_sheaf->size < s->sheaf_capacity)) 5915 + if (likely(rcu_sheaf->size < s->sheaf_capacity)) { 5917 5916 rcu_sheaf = NULL; 5918 - else 5917 + } else { 5919 5918 pcs->rcu_free = NULL; 5919 + rcu_sheaf->node = numa_mem_id(); 5920 + } 5920 5921 5921 5922 /* 5922 5923 * we flush before local_unlock to make sure a racing ··· 5949 5946 bool init = slab_want_init_on_free(s); 5950 5947 unsigned int batch, i = 0; 5951 5948 struct node_barn *barn; 5949 + void *remote_objects[PCS_BATCH_MAX]; 5950 + unsigned int remote_nr = 0; 5951 + int node = numa_mem_id(); 5952 5952 5953 + next_remote_batch: 5953 5954 while (i < size) { 5954 5955 struct slab *slab = virt_to_slab(p[i]); 5955 5956 ··· 5963 5956 if (unlikely(!slab_free_hook(s, p[i], init, false))) { 5964 5957 p[i] = p[--size]; 5965 5958 if (!size) 5966 - return; 5959 + goto flush_remote; 5960 + continue; 5961 + } 5962 + 5963 + if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { 5964 + remote_objects[remote_nr] = p[i]; 5965 + p[i] = p[--size]; 5966 + if (++remote_nr >= PCS_BATCH_MAX) 5967 + goto flush_remote; 5967 5968 continue; 5968 5969 } 5969 5970 ··· 6041 6026 */ 6042 6027 fallback: 6043 6028 __kmem_cache_free_bulk(s, size, p); 6029 + 6030 + flush_remote: 6031 + if (remote_nr) { 6032 + __kmem_cache_free_bulk(s, remote_nr, &remote_objects[0]); 6033 + if (i < size) { 6034 + remote_nr = 0; 6035 + goto next_remote_batch; 6036 + } 6037 + } 6044 6038 } 6045 6039 6046 6040 #ifndef CONFIG_SLUB_TINY 
··· 6141 6117 if (unlikely(!slab_free_hook(s, object, slab_want_init_on_free(s), false))) 6142 6118 return; 6143 6119 6144 - if (!s->cpu_sheaves || !free_to_pcs(s, object)) 6145 - do_slab_free(s, slab, object, object, 1, addr); 6120 + if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6121 + slab_nid(slab) == numa_mem_id())) { 6122 + if (likely(free_to_pcs(s, object))) 6123 + return; 6124 + } 6125 + 6126 + do_slab_free(s, slab, object, object, 1, addr); 6146 6127 } 6147 6128 6148 6129 #ifdef CONFIG_MEMCG

Configure Feed

Configure Feed