slab: handle pfmemalloc slabs properly with sheaves

When a pfmemalloc allocation actually dips into reserves, the slab is
marked accordingly and non-pfmemalloc allocations should not be allowed
to allocate from it. The sheaves percpu caching currently doesn't follow
this rule, so implement it before we expand sheaves usage to all caches.

Make sure objects from pfmemalloc slabs don't end up in percpu sheaves.
When freeing an object from a pfmemalloc slab, skip the sheaves.
When refilling sheaves, use __GFP_NOMEMALLOC to override any pfmemalloc
context - the allocation will fall back to regular slab allocations when
sheaves are depleted and can't be refilled because of the override.

For kfree_rcu(), detect pfmemalloc slabs while processing the rcu_sheaf
in __rcu_free_sheaf_prepare() after the grace period, and simply flush
the whole sheaf if any of its objects is from a pfmemalloc slab.

For prefilled sheaves, try to refill them first with __GFP_NOMEMALLOC
and if that fails and the context is allowed to use pfmemalloc reserves,
retry without __GFP_NOMEMALLOC but then mark the sheaf as pfmemalloc, so
that it is flushed back to slabs when returned.

Link: https://patch.msgid.link/20251105-sheaves-cleanups-v1-3-b8218e1ac7ef@suse.cz
Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Vlastimil Babka 7 months ago 1ce20c28 ea6b5e57

+55 -14

1 changed file

expand all

slub.c

+55 -14

mm/slub.c

··· 469 469 struct rcu_head rcu_head; 470 470 struct list_head barn_list; 471 471 /* only used for prefilled sheafs */ 472 - unsigned int capacity; 472 + struct { 473 + unsigned int capacity; 474 + bool pfmemalloc; 475 + }; 473 476 }; 474 477 struct kmem_cache *cache; 475 478 unsigned int size; ··· 2654 2651 if (!sheaf) 2655 2652 return NULL; 2656 2653 2657 - if (refill_sheaf(s, sheaf, gfp)) { 2654 + if (refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC)) { 2658 2655 free_empty_sheaf(s, sheaf); 2659 2656 return NULL; 2660 2657 } ··· 2732 2729 sheaf->size = 0; 2733 2730 } 2734 2731 2735 - static void __rcu_free_sheaf_prepare(struct kmem_cache *s, 2732 + static bool __rcu_free_sheaf_prepare(struct kmem_cache *s, 2736 2733 struct slab_sheaf *sheaf) 2737 2734 { 2738 2735 bool init = slab_want_init_on_free(s); 2739 2736 void **p = &sheaf->objects[0]; 2740 2737 unsigned int i = 0; 2738 + bool pfmemalloc = false; 2741 2739 2742 2740 while (i < sheaf->size) { 2743 2741 struct slab *slab = virt_to_slab(p[i]); ··· 2751 2747 continue; 2752 2748 } 2753 2749 2750 + if (slab_test_pfmemalloc(slab)) 2751 + pfmemalloc = true; 2752 + 2754 2753 i++; 2755 2754 } 2755 + 2756 + return pfmemalloc; 2756 2757 } 2757 2758 2758 2759 static void rcu_free_sheaf_nobarn(struct rcu_head *head) ··· 5050 5041 return NULL; 5051 5042 5052 5043 if (empty) { 5053 - if (!refill_sheaf(s, empty, gfp)) { 5044 + if (!refill_sheaf(s, empty, gfp | __GFP_NOMEMALLOC)) { 5054 5045 full = empty; 5055 5046 } else { 5056 5047 /* ··· 5350 5341 } 5351 5342 EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); 5352 5343 5344 + static int __prefill_sheaf_pfmemalloc(struct kmem_cache *s, 5345 + struct slab_sheaf *sheaf, gfp_t gfp) 5346 + { 5347 + int ret = 0; 5348 + 5349 + ret = refill_sheaf(s, sheaf, gfp | __GFP_NOMEMALLOC); 5350 + 5351 + if (likely(!ret || !gfp_pfmemalloc_allowed(gfp))) 5352 + return ret; 5353 + 5354 + /* 5355 + * if we are allowed to, refill sheaf with pfmemalloc but then remember 5356 + * it for when it's 
returned 5357 + */ 5358 + ret = refill_sheaf(s, sheaf, gfp); 5359 + sheaf->pfmemalloc = true; 5360 + 5361 + return ret; 5362 + } 5363 + 5353 5364 /* 5354 5365 * returns a sheaf that has at least the requested size 5355 5366 * when prefilling is needed, do so with given gfp flags ··· 5404 5375 sheaf->cache = s; 5405 5376 sheaf->capacity = size; 5406 5377 5378 + /* 5379 + * we do not need to care about pfmemalloc here because oversize 5380 + * sheaves area always flushed and freed when returned 5381 + */ 5407 5382 if (!__kmem_cache_alloc_bulk(s, gfp, size, 5408 5383 &sheaf->objects[0])) { 5409 5384 kfree(sheaf); ··· 5444 5411 if (!sheaf) 5445 5412 sheaf = alloc_empty_sheaf(s, gfp); 5446 5413 5447 - if (sheaf && sheaf->size < size) { 5448 - if (refill_sheaf(s, sheaf, gfp)) { 5414 + if (sheaf) { 5415 + sheaf->capacity = s->sheaf_capacity; 5416 + sheaf->pfmemalloc = false; 5417 + 5418 + if (sheaf->size < size && 5419 + __prefill_sheaf_pfmemalloc(s, sheaf, gfp)) { 5449 5420 sheaf_flush_unused(s, sheaf); 5450 5421 free_empty_sheaf(s, sheaf); 5451 5422 sheaf = NULL; 5452 5423 } 5453 5424 } 5454 - 5455 - if (sheaf) 5456 - sheaf->capacity = s->sheaf_capacity; 5457 5425 5458 5426 return sheaf; 5459 5427 } ··· 5475 5441 struct slub_percpu_sheaves *pcs; 5476 5442 struct node_barn *barn; 5477 5443 5478 - if (unlikely(sheaf->capacity != s->sheaf_capacity)) { 5444 + if (unlikely((sheaf->capacity != s->sheaf_capacity) 5445 + || sheaf->pfmemalloc)) { 5479 5446 sheaf_flush_unused(s, sheaf); 5480 5447 kfree(sheaf); 5481 5448 return; ··· 5542 5507 5543 5508 if (likely(sheaf->capacity >= size)) { 5544 5509 if (likely(sheaf->capacity == s->sheaf_capacity)) 5545 - return refill_sheaf(s, sheaf, gfp); 5510 + return __prefill_sheaf_pfmemalloc(s, sheaf, gfp); 5546 5511 5547 5512 if (!__kmem_cache_alloc_bulk(s, gfp, sheaf->capacity - sheaf->size, 5548 5513 &sheaf->objects[sheaf->size])) { ··· 6250 6215 * handles it fine. 
The only downside is that sheaf will serve fewer 6251 6216 * allocations when reused. It only happens due to debugging, which is a 6252 6217 * performance hit anyway. 6218 + * 6219 + * If it returns true, there was at least one object from pfmemalloc 6220 + * slab so simply flush everything. 6253 6221 */ 6254 - __rcu_free_sheaf_prepare(s, sheaf); 6222 + if (__rcu_free_sheaf_prepare(s, sheaf)) 6223 + goto flush; 6255 6224 6256 6225 n = get_node(s, sheaf->node); 6257 6226 if (!n) ··· 6410 6371 continue; 6411 6372 } 6412 6373 6413 - if (unlikely(IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node)) { 6374 + if (unlikely((IS_ENABLED(CONFIG_NUMA) && slab_nid(slab) != node) 6375 + || slab_test_pfmemalloc(slab))) { 6414 6376 remote_objects[remote_nr] = p[i]; 6415 6377 p[i] = p[--size]; 6416 6378 if (++remote_nr >= PCS_BATCH_MAX) ··· 6709 6669 return; 6710 6670 6711 6671 if (s->cpu_sheaves && likely(!IS_ENABLED(CONFIG_NUMA) || 6712 - slab_nid(slab) == numa_mem_id())) { 6672 + slab_nid(slab) == numa_mem_id()) 6673 + && likely(!slab_test_pfmemalloc(slab))) { 6713 6674 if (likely(free_to_pcs(s, object))) 6714 6675 return; 6715 6676 }

Configure Feed

Configure Feed