slab: add sheaf support for batching kfree_rcu() operations

Extend the sheaf infrastructure for more efficient kfree_rcu() handling.
For caches with sheaves, maintain an rcu_free sheaf on each cpu in
addition to the main and spare sheaves.

kfree_rcu() operations will try to put objects on this sheaf. Once full,
the sheaf is detached and submitted to call_rcu() with a handler that
will try to put it in the barn or, when the barn is full, flush it to
slab pages using bulk free. Then a new empty sheaf must be obtained to
put more objects there.

It's possible that no free sheaves are available to use for a new
rcu_free sheaf, and the allocation in kfree_rcu() context can only use
GFP_NOWAIT and thus may fail. In that case, fall back to the existing
kfree_rcu() implementation.

Expected advantages:
- batching the kfree_rcu() operations, which could eventually replace
  the existing batching
- sheaves can be reused for allocations via barn instead of being
  flushed to slabs, which is more efficient
- this includes cases where only some cpus are allowed to process rcu
  callbacks (CONFIG_RCU_NOCB_CPU)

Possible disadvantage:
- objects might be waiting for more than their grace period (it is
  determined by the last object freed into the sheaf), increasing memory
  usage - but the existing batching does that too.

Only implement this for CONFIG_KVFREE_RCU_BATCHED as the tiny
implementation favors smaller memory footprint over performance.

Also for now skip the usage of rcu sheaf for CONFIG_PREEMPT_RT, as the
contexts where kfree_rcu() is called might not be compatible with taking
a barn spinlock, or with a GFP_NOWAIT allocation of a new sheaf that
takes a spinlock - the current kfree_rcu() implementation avoids doing
that.

Teach kvfree_rcu_barrier() to flush all rcu_free sheaves from all caches
that have them. This is not a cheap operation, but the barrier usage is
rare - currently kmem_cache_destroy() or on module unload.

Add CONFIG_SLUB_STATS counters free_rcu_sheaf and free_rcu_sheaf_fail to
count how many kfree_rcu() calls used the rcu_free sheaf successfully and
how many had to fall back to the existing implementation.

Reviewed-by: Harry Yoo <harry.yoo@oracle.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>

Vlastimil Babka 8 months ago ec66e0d5 2d517aa0

+295 -2

3 changed files

expand all

slab.h

slab_common.c

slub.c

mm/slab.h

··· 435 435 return !(s->flags & (SLAB_CACHE_DMA|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT)); 436 436 } 437 437 438 + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); 439 + void flush_all_rcu_sheaves(void); 440 + 438 441 #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ 439 442 SLAB_CACHE_DMA32 | SLAB_PANIC | \ 440 443 SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS | \

+26

mm/slab_common.c

··· 1608 1608 kvfree_rcu_list(head); 1609 1609 } 1610 1610 1611 + static bool kfree_rcu_sheaf(void *obj) 1612 + { 1613 + struct kmem_cache *s; 1614 + struct folio *folio; 1615 + struct slab *slab; 1616 + 1617 + if (is_vmalloc_addr(obj)) 1618 + return false; 1619 + 1620 + folio = virt_to_folio(obj); 1621 + if (unlikely(!folio_test_slab(folio))) 1622 + return false; 1623 + 1624 + slab = folio_slab(folio); 1625 + s = slab->slab_cache; 1626 + if (s->cpu_sheaves) 1627 + return __kfree_rcu_sheaf(s, obj); 1628 + 1629 + return false; 1630 + } 1631 + 1611 1632 static bool 1612 1633 need_offload_krc(struct kfree_rcu_cpu *krcp) 1613 1634 { ··· 1973 1952 if (!head) 1974 1953 might_sleep(); 1975 1954 1955 + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && kfree_rcu_sheaf(ptr)) 1956 + return; 1957 + 1976 1958 // Queue the object but don't yet schedule the batch. 1977 1959 if (debug_rcu_head_queue(ptr)) { 1978 1960 // Probable double kfree_rcu(), just leak. ··· 2049 2025 struct kfree_rcu_cpu *krcp; 2050 2026 bool queued; 2051 2027 int i, cpu; 2028 + 2029 + flush_all_rcu_sheaves(); 2052 2030 2053 2031 /* 2054 2032 * Firstly we detach objects and queue them over an RCU-batch

+266 -2

mm/slub.c

··· 367 367 ALLOC_FASTPATH, /* Allocation from cpu slab */ 368 368 ALLOC_SLOWPATH, /* Allocation by getting a new cpu slab */ 369 369 FREE_PCS, /* Free to percpu sheaf */ 370 + FREE_RCU_SHEAF, /* Free to rcu_free sheaf */ 371 + FREE_RCU_SHEAF_FAIL, /* Failed to free to a rcu_free sheaf */ 370 372 FREE_FASTPATH, /* Free to cpu slab */ 371 373 FREE_SLOWPATH, /* Freeing not to cpu slab */ 372 374 FREE_FROZEN, /* Freeing to frozen slab */ ··· 463 461 struct rcu_head rcu_head; 464 462 struct list_head barn_list; 465 463 }; 464 + struct kmem_cache *cache; 466 465 unsigned int size; 467 466 void *objects[]; 468 467 }; ··· 472 469 local_trylock_t lock; 473 470 struct slab_sheaf *main; /* never NULL when unlocked */ 474 471 struct slab_sheaf *spare; /* empty or full, may be NULL */ 472 + struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */ 475 473 }; 476 474 477 475 /* ··· 2535 2531 if (unlikely(!sheaf)) 2536 2532 return NULL; 2537 2533 2534 + sheaf->cache = s; 2535 + 2538 2536 stat(s, SHEAF_ALLOC); 2539 2537 2540 2538 return sheaf; ··· 2661 2655 sheaf->size = 0; 2662 2656 } 2663 2657 2658 + static void __rcu_free_sheaf_prepare(struct kmem_cache *s, 2659 + struct slab_sheaf *sheaf) 2660 + { 2661 + bool init = slab_want_init_on_free(s); 2662 + void **p = &sheaf->objects[0]; 2663 + unsigned int i = 0; 2664 + 2665 + while (i < sheaf->size) { 2666 + struct slab *slab = virt_to_slab(p[i]); 2667 + 2668 + memcg_slab_free_hook(s, slab, p + i, 1); 2669 + alloc_tagging_slab_free_hook(s, slab, p + i, 1); 2670 + 2671 + if (unlikely(!slab_free_hook(s, p[i], init, true))) { 2672 + p[i] = p[--sheaf->size]; 2673 + continue; 2674 + } 2675 + 2676 + i++; 2677 + } 2678 + } 2679 + 2680 + static void rcu_free_sheaf_nobarn(struct rcu_head *head) 2681 + { 2682 + struct slab_sheaf *sheaf; 2683 + struct kmem_cache *s; 2684 + 2685 + sheaf = container_of(head, struct slab_sheaf, rcu_head); 2686 + s = sheaf->cache; 2687 + 2688 + __rcu_free_sheaf_prepare(s, sheaf); 2689 + 2690 + 
sheaf_flush_unused(s, sheaf); 2691 + 2692 + free_empty_sheaf(s, sheaf); 2693 + } 2694 + 2664 2695 /* 2665 2696 * Caller needs to make sure migration is disabled in order to fully flush 2666 2697 * single cpu's sheaves ··· 2710 2667 static void pcs_flush_all(struct kmem_cache *s) 2711 2668 { 2712 2669 struct slub_percpu_sheaves *pcs; 2713 - struct slab_sheaf *spare; 2670 + struct slab_sheaf *spare, *rcu_free; 2714 2671 2715 2672 local_lock(&s->cpu_sheaves->lock); 2716 2673 pcs = this_cpu_ptr(s->cpu_sheaves); ··· 2718 2675 spare = pcs->spare; 2719 2676 pcs->spare = NULL; 2720 2677 2678 + rcu_free = pcs->rcu_free; 2679 + pcs->rcu_free = NULL; 2680 + 2721 2681 local_unlock(&s->cpu_sheaves->lock); 2722 2682 2723 2683 if (spare) { 2724 2684 sheaf_flush_unused(s, spare); 2725 2685 free_empty_sheaf(s, spare); 2726 2686 } 2687 + 2688 + if (rcu_free) 2689 + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); 2727 2690 2728 2691 sheaf_flush_main(s); 2729 2692 } ··· 2746 2697 sheaf_flush_unused(s, pcs->spare); 2747 2698 free_empty_sheaf(s, pcs->spare); 2748 2699 pcs->spare = NULL; 2700 + } 2701 + 2702 + if (pcs->rcu_free) { 2703 + call_rcu(&pcs->rcu_free->rcu_head, rcu_free_sheaf_nobarn); 2704 + pcs->rcu_free = NULL; 2749 2705 } 2750 2706 } 2751 2707 ··· 2777 2723 */ 2778 2724 2779 2725 WARN_ON(pcs->spare); 2726 + WARN_ON(pcs->rcu_free); 2780 2727 2781 2728 if (!WARN_ON(pcs->main->size)) { 2782 2729 free_empty_sheaf(s, pcs->main); ··· 3835 3780 3836 3781 pcs = per_cpu_ptr(s->cpu_sheaves, cpu); 3837 3782 3838 - return (pcs->spare || pcs->main->size); 3783 + return (pcs->spare || pcs->rcu_free || pcs->main->size); 3839 3784 } 3840 3785 3841 3786 /* ··· 3893 3838 cpus_read_lock(); 3894 3839 flush_all_cpus_locked(s); 3895 3840 cpus_read_unlock(); 3841 + } 3842 + 3843 + static void flush_rcu_sheaf(struct work_struct *w) 3844 + { 3845 + struct slub_percpu_sheaves *pcs; 3846 + struct slab_sheaf *rcu_free; 3847 + struct slub_flush_work *sfw; 3848 + struct kmem_cache *s; 3849 + 3850 
+ sfw = container_of(w, struct slub_flush_work, work); 3851 + s = sfw->s; 3852 + 3853 + local_lock(&s->cpu_sheaves->lock); 3854 + pcs = this_cpu_ptr(s->cpu_sheaves); 3855 + 3856 + rcu_free = pcs->rcu_free; 3857 + pcs->rcu_free = NULL; 3858 + 3859 + local_unlock(&s->cpu_sheaves->lock); 3860 + 3861 + if (rcu_free) 3862 + call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn); 3863 + } 3864 + 3865 + 3866 + /* needed for kvfree_rcu_barrier() */ 3867 + void flush_all_rcu_sheaves(void) 3868 + { 3869 + struct slub_flush_work *sfw; 3870 + struct kmem_cache *s; 3871 + unsigned int cpu; 3872 + 3873 + cpus_read_lock(); 3874 + mutex_lock(&slab_mutex); 3875 + 3876 + list_for_each_entry(s, &slab_caches, list) { 3877 + if (!s->cpu_sheaves) 3878 + continue; 3879 + 3880 + mutex_lock(&flush_lock); 3881 + 3882 + for_each_online_cpu(cpu) { 3883 + sfw = &per_cpu(slub_flush, cpu); 3884 + 3885 + /* 3886 + * we don't check if rcu_free sheaf exists - racing 3887 + * __kfree_rcu_sheaf() might have just removed it. 
3888 + * by executing flush_rcu_sheaf() on the cpu we make 3889 + * sure the __kfree_rcu_sheaf() finished its call_rcu() 3890 + */ 3891 + 3892 + INIT_WORK(&sfw->work, flush_rcu_sheaf); 3893 + sfw->s = s; 3894 + queue_work_on(cpu, flushwq, &sfw->work); 3895 + } 3896 + 3897 + for_each_online_cpu(cpu) { 3898 + sfw = &per_cpu(slub_flush, cpu); 3899 + flush_work(&sfw->work); 3900 + } 3901 + 3902 + mutex_unlock(&flush_lock); 3903 + } 3904 + 3905 + mutex_unlock(&slab_mutex); 3906 + cpus_read_unlock(); 3907 + 3908 + rcu_barrier(); 3896 3909 } 3897 3910 3898 3911 /* ··· 5536 5413 return true; 5537 5414 } 5538 5415 5416 + static void rcu_free_sheaf(struct rcu_head *head) 5417 + { 5418 + struct slab_sheaf *sheaf; 5419 + struct node_barn *barn; 5420 + struct kmem_cache *s; 5421 + 5422 + sheaf = container_of(head, struct slab_sheaf, rcu_head); 5423 + 5424 + s = sheaf->cache; 5425 + 5426 + /* 5427 + * This may remove some objects due to slab_free_hook() returning false, 5428 + * so that the sheaf might no longer be completely full. But it's easier 5429 + * to handle it as full (unless it became completely empty), as the code 5430 + * handles it fine. The only downside is that sheaf will serve fewer 5431 + * allocations when reused. It only happens due to debugging, which is a 5432 + * performance hit anyway. 5433 + */ 5434 + __rcu_free_sheaf_prepare(s, sheaf); 5435 + 5436 + barn = get_node(s, numa_mem_id())->barn; 5437 + 5438 + /* due to slab_free_hook() */ 5439 + if (unlikely(sheaf->size == 0)) 5440 + goto empty; 5441 + 5442 + /* 5443 + * Checking nr_full/nr_empty outside lock avoids contention in case the 5444 + * barn is at the respective limit. Due to the race we might go over the 5445 + * limit but that should be rare and harmless. 
5446 + */ 5447 + 5448 + if (data_race(barn->nr_full) < MAX_FULL_SHEAVES) { 5449 + stat(s, BARN_PUT); 5450 + barn_put_full_sheaf(barn, sheaf); 5451 + return; 5452 + } 5453 + 5454 + stat(s, BARN_PUT_FAIL); 5455 + sheaf_flush_unused(s, sheaf); 5456 + 5457 + empty: 5458 + if (data_race(barn->nr_empty) < MAX_EMPTY_SHEAVES) { 5459 + barn_put_empty_sheaf(barn, sheaf); 5460 + return; 5461 + } 5462 + 5463 + free_empty_sheaf(s, sheaf); 5464 + } 5465 + 5466 + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj) 5467 + { 5468 + struct slub_percpu_sheaves *pcs; 5469 + struct slab_sheaf *rcu_sheaf; 5470 + 5471 + if (!local_trylock(&s->cpu_sheaves->lock)) 5472 + goto fail; 5473 + 5474 + pcs = this_cpu_ptr(s->cpu_sheaves); 5475 + 5476 + if (unlikely(!pcs->rcu_free)) { 5477 + 5478 + struct slab_sheaf *empty; 5479 + struct node_barn *barn; 5480 + 5481 + if (pcs->spare && pcs->spare->size == 0) { 5482 + pcs->rcu_free = pcs->spare; 5483 + pcs->spare = NULL; 5484 + goto do_free; 5485 + } 5486 + 5487 + barn = get_barn(s); 5488 + 5489 + empty = barn_get_empty_sheaf(barn); 5490 + 5491 + if (empty) { 5492 + pcs->rcu_free = empty; 5493 + goto do_free; 5494 + } 5495 + 5496 + local_unlock(&s->cpu_sheaves->lock); 5497 + 5498 + empty = alloc_empty_sheaf(s, GFP_NOWAIT); 5499 + 5500 + if (!empty) 5501 + goto fail; 5502 + 5503 + if (!local_trylock(&s->cpu_sheaves->lock)) { 5504 + barn_put_empty_sheaf(barn, empty); 5505 + goto fail; 5506 + } 5507 + 5508 + pcs = this_cpu_ptr(s->cpu_sheaves); 5509 + 5510 + if (unlikely(pcs->rcu_free)) 5511 + barn_put_empty_sheaf(barn, empty); 5512 + else 5513 + pcs->rcu_free = empty; 5514 + } 5515 + 5516 + do_free: 5517 + 5518 + rcu_sheaf = pcs->rcu_free; 5519 + 5520 + /* 5521 + * Since we flush immediately when size reaches capacity, we never reach 5522 + * this with size already at capacity, so no OOB write is possible. 
5523 + */ 5524 + rcu_sheaf->objects[rcu_sheaf->size++] = obj; 5525 + 5526 + if (likely(rcu_sheaf->size < s->sheaf_capacity)) 5527 + rcu_sheaf = NULL; 5528 + else 5529 + pcs->rcu_free = NULL; 5530 + 5531 + /* 5532 + * we flush before local_unlock to make sure a racing 5533 + * flush_all_rcu_sheaves() doesn't miss this sheaf 5534 + */ 5535 + if (rcu_sheaf) 5536 + call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf); 5537 + 5538 + local_unlock(&s->cpu_sheaves->lock); 5539 + 5540 + stat(s, FREE_RCU_SHEAF); 5541 + return true; 5542 + 5543 + fail: 5544 + stat(s, FREE_RCU_SHEAF_FAIL); 5545 + return false; 5546 + } 5547 + 5539 5548 /* 5540 5549 * Bulk free objects to the percpu sheaves. 5541 5550 * Unlike free_to_pcs() this includes the calls to all necessary hooks ··· 7164 6909 struct kmem_cache_node *n; 7165 6910 7166 6911 flush_all_cpus_locked(s); 6912 + 6913 + /* we might have rcu sheaves in flight */ 6914 + if (s->cpu_sheaves) 6915 + rcu_barrier(); 6916 + 7167 6917 /* Attempt to free all objects */ 7168 6918 for_each_kmem_cache_node(s, node, n) { 7169 6919 if (n->barn) ··· 8544 8284 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); 8545 8285 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); 8546 8286 STAT_ATTR(FREE_PCS, free_cpu_sheaf); 8287 + STAT_ATTR(FREE_RCU_SHEAF, free_rcu_sheaf); 8288 + STAT_ATTR(FREE_RCU_SHEAF_FAIL, free_rcu_sheaf_fail); 8547 8289 STAT_ATTR(FREE_FASTPATH, free_fastpath); 8548 8290 STAT_ATTR(FREE_SLOWPATH, free_slowpath); 8549 8291 STAT_ATTR(FREE_FROZEN, free_frozen); ··· 8644 8382 &alloc_fastpath_attr.attr, 8645 8383 &alloc_slowpath_attr.attr, 8646 8384 &free_cpu_sheaf_attr.attr, 8385 + &free_rcu_sheaf_attr.attr, 8386 + &free_rcu_sheaf_fail_attr.attr, 8647 8387 &free_fastpath_attr.attr, 8648 8388 &free_slowpath_attr.attr, 8649 8389 &free_frozen_attr.attr,

Configure Feed

Configure Feed