Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

memcg: combine slab obj stock charging and accounting

When handling slab objects, we use obj_cgroup_[un]charge() for (un)charging
and mod_objcg_state() to account NR_SLAB_[UN]RECLAIMABLE_B. All these
operations use the percpu stock for performance. However, with the calls
being separate, the stock_lock is taken twice in each case.

By refactoring the code, we can turn mod_objcg_state() into
__account_obj_stock() which is called on a stock that's already locked and
validated. On the charging side we can call this function from
consume_obj_stock() when it succeeds, and refill_obj_stock() in the
fallback. We just expand parameters of these functions as necessary. The
uncharge side from __memcg_slab_free_hook() is just the call to
refill_obj_stock().

Other callers of obj_cgroup_[un]charge() (i.e. not slab) simply pass the
extra parameters as NULL/zeroes to skip the __account_obj_stock()
operation.

In __memcg_slab_post_alloc_hook() we now charge each object separately,
but that's not a problem, as we already called mod_objcg_state() for each
object separately before this change, and most allocations are non-bulk
anyway. This could be improved by batching all operations until
slab_pgdat(slab) changes.

Some preliminary benchmarking with a kfree(kmalloc()) loop of 10M
iterations with/without __GFP_ACCOUNT:

Before the patch:
kmalloc/kfree !memcg: 581390144 cycles
kmalloc/kfree memcg: 783689984 cycles

After the patch:
kmalloc/kfree memcg: 658723808 cycles

More than half of the overhead of __GFP_ACCOUNT relative to the
non-accounted case seems to be eliminated.

Link: https://lkml.kernel.org/r/20250404013913.1663035-9-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Vlastimil Babka and committed by
Andrew Morton
bc730030 42a1910c

+46 -31
+46 -31
mm/memcontrol.c
··· 2774 2774 WRITE_ONCE(stock->cached_objcg, objcg); 2775 2775 } 2776 2776 2777 - static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, 2778 - enum node_stat_item idx, int nr) 2777 + static void __account_obj_stock(struct obj_cgroup *objcg, 2778 + struct memcg_stock_pcp *stock, int nr, 2779 + struct pglist_data *pgdat, enum node_stat_item idx) 2779 2780 { 2780 - struct memcg_stock_pcp *stock; 2781 - unsigned long flags; 2782 2781 int *bytes; 2783 - 2784 - local_lock_irqsave(&memcg_stock.stock_lock, flags); 2785 - stock = this_cpu_ptr(&memcg_stock); 2786 2782 2787 2783 /* 2788 2784 * Save vmstat data in stock and skip vmstat array update unless 2789 - * accumulating over a page of vmstat data or when pgdat or idx 2790 - * changes. 2785 + * accumulating over a page of vmstat data or when pgdat changes. 2791 2786 */ 2792 - if (READ_ONCE(stock->cached_objcg) != objcg) { 2793 - replace_stock_objcg(stock, objcg); 2794 - stock->cached_pgdat = pgdat; 2795 - } else if (stock->cached_pgdat != pgdat) { 2787 + if (stock->cached_pgdat != pgdat) { 2796 2788 /* Flush the existing cached vmstat data */ 2797 2789 struct pglist_data *oldpg = stock->cached_pgdat; 2798 2790 ··· 2821 2829 } 2822 2830 if (nr) 2823 2831 __mod_objcg_mlstate(objcg, pgdat, idx, nr); 2824 - 2825 - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); 2826 2832 } 2827 2833 2828 - static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) 2834 + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 2835 + struct pglist_data *pgdat, enum node_stat_item idx) 2829 2836 { 2830 2837 struct memcg_stock_pcp *stock; 2831 2838 unsigned long flags; ··· 2836 2845 if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { 2837 2846 stock->nr_bytes -= nr_bytes; 2838 2847 ret = true; 2848 + 2849 + if (pgdat) 2850 + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); 2839 2851 } 2840 2852 2841 2853 
local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ··· 2923 2929 } 2924 2930 2925 2931 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, 2926 - bool allow_uncharge) 2932 + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, 2933 + enum node_stat_item idx) 2927 2934 { 2928 2935 struct memcg_stock_pcp *stock; 2929 2936 unsigned long flags; ··· 2939 2944 } 2940 2945 stock->nr_bytes += nr_bytes; 2941 2946 2947 + if (pgdat) 2948 + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); 2949 + 2942 2950 if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { 2943 2951 nr_pages = stock->nr_bytes >> PAGE_SHIFT; 2944 2952 stock->nr_bytes &= (PAGE_SIZE - 1); ··· 2953 2955 obj_cgroup_uncharge_pages(objcg, nr_pages); 2954 2956 } 2955 2957 2956 - int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 2958 + static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, 2959 + struct pglist_data *pgdat, enum node_stat_item idx) 2957 2960 { 2958 2961 unsigned int nr_pages, nr_bytes; 2959 2962 int ret; 2960 2963 2961 - if (consume_obj_stock(objcg, size)) 2964 + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) 2962 2965 return 0; 2963 2966 2964 2967 /* ··· 2992 2993 nr_pages += 1; 2993 2994 2994 2995 ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); 2995 - if (!ret && nr_bytes) 2996 - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); 2996 + if (!ret && (nr_bytes || pgdat)) 2997 + refill_obj_stock(objcg, nr_bytes ? 
PAGE_SIZE - nr_bytes : 0, 2998 + false, size, pgdat, idx); 2997 2999 2998 3000 return ret; 2999 3001 } 3000 3002 3003 + int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) 3004 + { 3005 + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); 3006 + } 3007 + 3001 3008 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) 3002 3009 { 3003 - refill_obj_stock(objcg, size, true); 3010 + refill_obj_stock(objcg, size, true, 0, NULL, 0); 3004 3011 } 3005 3012 3006 3013 static inline size_t obj_full_size(struct kmem_cache *s) ··· 3058 3053 return false; 3059 3054 } 3060 3055 3061 - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) 3062 - return false; 3063 - 3064 3056 for (i = 0; i < size; i++) { 3065 3057 slab = virt_to_slab(p[i]); 3066 3058 3067 3059 if (!slab_obj_exts(slab) && 3068 3060 alloc_slab_obj_exts(slab, s, flags, false)) { 3069 - obj_cgroup_uncharge(objcg, obj_full_size(s)); 3070 3061 continue; 3071 3062 } 3063 + 3064 + /* 3065 + * if we fail and size is 1, memcg_alloc_abort_single() will 3066 + * just free the object, which is ok as we have not assigned 3067 + * objcg to its obj_ext yet 3068 + * 3069 + * for larger sizes, kmem_cache_free_bulk() will uncharge 3070 + * any objects that were already charged and obj_ext assigned 3071 + * 3072 + * TODO: we could batch this until slab_pgdat(slab) changes 3073 + * between iterations, with a more complicated undo 3074 + */ 3075 + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), 3076 + slab_pgdat(slab), cache_vmstat_idx(s))) 3077 + return false; 3072 3078 3073 3079 off = obj_to_index(s, slab, p[i]); 3074 3080 obj_cgroup_get(objcg); 3075 3081 slab_obj_exts(slab)[off].objcg = objcg; 3076 - mod_objcg_state(objcg, slab_pgdat(slab), 3077 - cache_vmstat_idx(s), obj_full_size(s)); 3078 3082 } 3079 3083 3080 3084 return true; ··· 3092 3078 void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 3093 3079 void **p, int objects, struct slabobj_ext 
*obj_exts) 3094 3080 { 3081 + size_t obj_size = obj_full_size(s); 3082 + 3095 3083 for (int i = 0; i < objects; i++) { 3096 3084 struct obj_cgroup *objcg; 3097 3085 unsigned int off; ··· 3104 3088 continue; 3105 3089 3106 3090 obj_exts[off].objcg = NULL; 3107 - obj_cgroup_uncharge(objcg, obj_full_size(s)); 3108 - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), 3109 - -obj_full_size(s)); 3091 + refill_obj_stock(objcg, obj_size, true, -obj_size, 3092 + slab_pgdat(slab), cache_vmstat_idx(s)); 3110 3093 obj_cgroup_put(objcg); 3111 3094 } 3112 3095 }