Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

mm: memcontrol: convert objcg to be per-memcg per-node type

Convert objcg to be a per-memcg per-node type, so that when we reparent
LRU folios later we can hold the lru lock at the node level, avoiding
holding too many lru locks at once. (A simplified model of the new
layout follows the tag block below.)

[zhengqi.arch@bytedance.com: reset pn->orig_objcg to NULL]
Link: https://lore.kernel.org/20260309112939.31937-1-qi.zheng@linux.dev
[akpm@linux-foundation.org: fix comment typo, per Usama. Reflow comment to 80 cols]
[devnexen@gmail.com: fix obj_cgroup leak in mem_cgroup_css_online() error path]
Link: https://lore.kernel.org/20260322193631.45457-1-devnexen@gmail.com
[devnexen@gmail.com: add newline, per Qi Zheng]
Link: https://lore.kernel.org/20260323063007.7783-1-devnexen@gmail.com
Link: https://lore.kernel.org/56c04b1c5d54f75ccdc12896df6c1ca35403ecc3.1772711148.git.zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Signed-off-by: David Carlier <devnexen@gmail.com>
Acked-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Allen Pais <apais@linux.microsoft.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Chen Ridong <chenridong@huawei.com>
Cc: David Hildenbrand <david@kernel.org>
Cc: Hamza Mahfooz <hamzamahfooz@linux.microsoft.com>
Cc: Harry Yoo <harry.yoo@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Imran Khan <imran.f.khan@oracle.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kamalesh Babulal <kamalesh.babulal@oracle.com>
Cc: Lance Yang <lance.yang@linux.dev>
Cc: Liam Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Michal Koutný <mkoutny@suse.com>
Cc: Mike Rapoport <rppt@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Usama Arif <usamaarif642@gmail.com>
Cc: Vlastimil Babka <vbabka@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yosry Ahmed <yosry@kernel.org>
Cc: Yuanchu Xie <yuanchu@google.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Usama Arif <usama.arif@linux.dev>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
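
To make the locking change concrete, here is a minimal standalone userspace sketch of the per-node layout this patch introduces. All names (model_memcg, model_objcg, NR_MODEL_NODES, lock_node_lru) are hypothetical stand-ins for illustration only; the real code is in the diff below.

/*
 * Userspace model: each memcg holds one objcg per NUMA node, so
 * reparenting can proceed node by node under per-node locking.
 */
#include <stdio.h>

#define NR_MODEL_NODES 2

struct model_memcg;

struct model_objcg {
	struct model_memcg *memcg;	/* owner; rewritten on reparenting */
	int is_root;			/* replaces the old global root pointer */
};

struct model_pernode {
	struct model_objcg *objcg;	/* before the patch: one objcg per memcg */
};

struct model_memcg {
	struct model_pernode nodeinfo[NR_MODEL_NODES];
};

/*
 * Reparent one node at a time. In the kernel, each iteration can then
 * take only that node's lru lock instead of all of them at once.
 */
static void model_reparent(struct model_memcg *child, struct model_memcg *parent)
{
	int nid;

	for (nid = 0; nid < NR_MODEL_NODES; nid++) {
		struct model_objcg *objcg = child->nodeinfo[nid].objcg;

		/* lock_node_lru(nid); -- hypothetical per-node lock scope */
		child->nodeinfo[nid].objcg = NULL;
		objcg->memcg = parent;
		/* unlock_node_lru(nid); */
	}
}

int main(void)
{
	static struct model_objcg ocg[NR_MODEL_NODES];
	struct model_memcg child = { 0 }, parent = { 0 };
	int nid;

	for (nid = 0; nid < NR_MODEL_NODES; nid++) {
		ocg[nid].memcg = &child;
		child.nodeinfo[nid].objcg = &ocg[nid];
	}
	model_reparent(&child, &parent);
	printf("node 0 objcg now owned by parent: %d\n",
	       ocg[0].memcg == &parent);
	return 0;
}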

Authored by Qi Zheng and committed by Andrew Morton
01b9da29 8285917d

Diffstat: +75 -42

include/linux/memcontrol.h (+12 -11)
···
 	unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
 	struct mem_cgroup_reclaim_iter iter;
 
+	/*
+	 * objcg is wiped out as a part of the objcg reparenting process.
+	 * orig_objcg preserves a pointer (and a reference) to the original
+	 * objcg until the end of life of the memcg.
+	 */
+	struct obj_cgroup __rcu *objcg;
+	struct obj_cgroup *orig_objcg;
+	/* list of inherited objcgs, protected by objcg_lock */
+	struct list_head objcg_list;
+
 #ifdef CONFIG_MEMCG_NMI_SAFETY_REQUIRES_ATOMIC
 	/* slab stats for nmi context */
 	atomic_t slab_reclaimable;
···
 		struct list_head list;	/* protected by objcg_lock */
 		struct rcu_head rcu;
 	};
+	bool is_root;
 };
 
 /*
···
 	seqlock_t socket_pressure_seqlock;
 #endif
 	int kmemcg_id;
-	/*
-	 * memcg->objcg is wiped out as a part of the objcg repaprenting
-	 * process. memcg->orig_objcg preserves a pointer (and a reference)
-	 * to the original objcg until the end of live of memcg.
-	 */
-	struct obj_cgroup __rcu *objcg;
-	struct obj_cgroup *orig_objcg;
-	/* list of inherited objcgs, protected by objcg_lock */
-	struct list_head objcg_list;
 
 	struct memcg_vmstats_percpu __percpu *vmstats_percpu;
 
···
 #define MEMCG_CHARGE_BATCH 64U
 
 extern struct mem_cgroup *root_mem_cgroup;
-extern struct obj_cgroup *root_obj_cgroup;
 
 enum page_memcg_data_flags {
 	/* page->memcg_data is a pointer to an slabobj_ext vector */
···
 
 static inline bool obj_cgroup_is_root(const struct obj_cgroup *objcg)
 {
-	return objcg == root_obj_cgroup;
+	return objcg->is_root;
 }
 
 static inline bool mem_cgroup_disabled(void)
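
With one objcg per node, the root memcg no longer owns a single objcg that a global pointer can identify, which is why obj_cgroup_is_root() switches from a pointer comparison to a per-objcg flag. A standalone hedged sketch of that before/after, reusing the hypothetical model_objcg type (not the kernel's struct obj_cgroup):

#include <stdbool.h>

/* Hypothetical illustration type, not the kernel's struct obj_cgroup. */
struct model_objcg {
	bool is_root;
};

/*
 * Before: exactly one root objcg existed, so an identity compare
 * against a global (objcg == root_obj_cgroup) worked. After: the root
 * memcg owns one objcg per node, so root-ness is recorded in each
 * objcg itself.
 */
static inline bool model_objcg_is_root(const struct model_objcg *objcg)
{
	return objcg->is_root;
}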

include/linux/sched.h (+1 -1)
···
 	/* Used by memcontrol for targeted memcg charge: */
 	struct mem_cgroup *active_memcg;
 
-	/* Cache for current->cgroups->memcg->objcg lookups: */
+	/* Cache for current->cgroups->memcg->nodeinfo[nid]->objcg lookups: */
 	struct obj_cgroup *objcg;
 #endif
 
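
The cached task->objcg now refers to a node-specific objcg; the lookup paths in mm/memcontrol.c (below) select it by the node the CPU is currently running on, via numa_node_id(). A minimal standalone userspace model of that selection, with model_* names as illustrative assumptions:

#include <stddef.h>

#define NR_MODEL_NODES 4

struct model_objcg;

struct model_memcg {
	struct model_objcg *node_objcg[NR_MODEL_NODES];
};

/* Stand-in for the kernel's numa_node_id(). */
static int model_numa_node_id(void)
{
	return 0;
}

/* Pick the objcg for the node this CPU currently runs on. */
static struct model_objcg *model_current_objcg(struct model_memcg *memcg)
{
	return memcg->node_objcg[model_numa_node_id()];
}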

mm/memcontrol.c (+62 -30)
···
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 EXPORT_SYMBOL(root_mem_cgroup);
 
-struct obj_cgroup *root_obj_cgroup __read_mostly;
-
 /* Active memory cgroup to use from an interrupt context */
 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
···
 }
 
 static inline struct obj_cgroup *__memcg_reparent_objcgs(struct mem_cgroup *memcg,
-							 struct mem_cgroup *parent)
+							 struct mem_cgroup *parent,
+							 int nid)
 {
 	struct obj_cgroup *objcg, *iter;
+	struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+	struct mem_cgroup_per_node *parent_pn = parent->nodeinfo[nid];
 
-	objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+	objcg = rcu_replace_pointer(pn->objcg, NULL, true);
 	/* 1) Ready to reparent active objcg. */
-	list_add(&objcg->list, &memcg->objcg_list);
+	list_add(&objcg->list, &pn->objcg_list);
 	/* 2) Reparent active objcg and already reparented objcgs to parent. */
-	list_for_each_entry(iter, &memcg->objcg_list, list)
+	list_for_each_entry(iter, &pn->objcg_list, list)
 		WRITE_ONCE(iter->memcg, parent);
 	/* 3) Move already reparented objcgs to the parent's list */
-	list_splice(&memcg->objcg_list, &parent->objcg_list);
+	list_splice(&pn->objcg_list, &parent_pn->objcg_list);
 
 	return objcg;
 }
···
 {
 	struct obj_cgroup *objcg;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+	int nid;
 
-	reparent_locks(memcg, parent);
+	for_each_node(nid) {
+		reparent_locks(memcg, parent);
 
-	objcg = __memcg_reparent_objcgs(memcg, parent);
+		objcg = __memcg_reparent_objcgs(memcg, parent, nid);
 
-	reparent_unlocks(memcg, parent);
+		reparent_unlocks(memcg, parent);
 
-	percpu_ref_kill(&objcg->refcnt);
+		percpu_ref_kill(&objcg->refcnt);
+	}
 }
 
 /*
···
 
 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 {
+	int nid = numa_node_id();
+
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
-		struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
+		struct obj_cgroup *objcg = rcu_dereference(memcg->nodeinfo[nid]->objcg);
 
 		if (likely(objcg && obj_cgroup_tryget(objcg)))
 			return objcg;
···
 {
 	struct mem_cgroup *memcg;
 	struct obj_cgroup *objcg;
+	int nid = numa_node_id();
 
 	if (IS_ENABLED(CONFIG_MEMCG_NMI_UNSAFE) && in_nmi())
 		return NULL;
···
 		 * Objcg reference is kept by the task, so it's safe
 		 * to use the objcg by the current task.
 		 */
-		return objcg ? : root_obj_cgroup;
+		return objcg ? : rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
 	}
 
 	memcg = this_cpu_read(int_active_memcg);
 	if (unlikely(memcg))
 		goto from_memcg;
 
-	return root_obj_cgroup;
+	return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
 
 from_memcg:
 	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
···
 		 * away and can be used within the scope without any additional
 		 * protection.
 		 */
-		objcg = rcu_dereference_check(memcg->objcg, 1);
+		objcg = rcu_dereference_check(memcg->nodeinfo[nid]->objcg, 1);
 		if (likely(objcg))
 			return objcg;
 	}
 
-	return root_obj_cgroup;
+	return rcu_dereference_check(root_mem_cgroup->nodeinfo[nid]->objcg, 1);
 }
 
 struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio)
···
 	if (!pn->lruvec_stats_percpu)
 		goto fail;
 
+	INIT_LIST_HEAD(&pn->objcg_list);
+
 	lruvec_init(&pn->lruvec);
 	pn->memcg = memcg;
 
···
 {
 	int node;
 
-	obj_cgroup_put(memcg->orig_objcg);
+	for_each_node(node) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+		if (!pn)
+			continue;
 
-	for_each_node(node)
-		free_mem_cgroup_per_node_info(memcg->nodeinfo[node]);
+		obj_cgroup_put(pn->orig_objcg);
+		free_mem_cgroup_per_node_info(pn);
+	}
 	memcg1_free_events(memcg);
 	kfree(memcg->vmstats);
 	free_percpu(memcg->vmstats_percpu);
···
 #endif
 	memcg1_memcg_init(memcg);
 	memcg->kmemcg_id = -1;
-	INIT_LIST_HEAD(&memcg->objcg_list);
 #ifdef CONFIG_CGROUP_WRITEBACK
 	INIT_LIST_HEAD(&memcg->cgwb_list);
 	for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
···
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct obj_cgroup *objcg;
+	int nid;
 
 	memcg_online_kmem(memcg);
 
···
 	if (alloc_shrinker_info(memcg))
 		goto offline_kmem;
 
-	objcg = obj_cgroup_alloc();
-	if (!objcg)
-		goto free_shrinker;
+	for_each_node(nid) {
+		objcg = obj_cgroup_alloc();
+		if (!objcg)
+			goto free_objcg;
 
-	if (unlikely(mem_cgroup_is_root(memcg)))
-		root_obj_cgroup = objcg;
+		if (unlikely(mem_cgroup_is_root(memcg)))
+			objcg->is_root = true;
 
-	objcg->memcg = memcg;
-	rcu_assign_pointer(memcg->objcg, objcg);
-	obj_cgroup_get(objcg);
-	memcg->orig_objcg = objcg;
+		objcg->memcg = memcg;
+		rcu_assign_pointer(memcg->nodeinfo[nid]->objcg, objcg);
+		obj_cgroup_get(objcg);
+		memcg->nodeinfo[nid]->orig_objcg = objcg;
+	}
 
 	if (unlikely(mem_cgroup_is_root(memcg)) && !mem_cgroup_disabled())
 		queue_delayed_work(system_dfl_wq, &stats_flush_dwork,
···
 	xa_store(&mem_cgroup_private_ids, memcg->id.id, memcg, GFP_KERNEL);
 
 	return 0;
-free_shrinker:
+free_objcg:
+	for_each_node(nid) {
+		struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+
+		objcg = rcu_replace_pointer(pn->objcg, NULL, true);
+		if (objcg)
+			percpu_ref_kill(&objcg->refcnt);
+
+		if (pn->orig_objcg) {
+			obj_cgroup_put(pn->orig_objcg);
+			/*
+			 * Reset pn->orig_objcg to NULL to prevent
+			 * obj_cgroup_put() from being called again in
+			 * __mem_cgroup_free().
+			 */
+			pn->orig_objcg = NULL;
+		}
+	}
 	free_shrinker_info(memcg);
 offline_kmem:
 	memcg_offline_kmem(memcg);
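
The free_objcg: error path above has a subtlety worth spelling out: __mem_cgroup_free() also puts pn->orig_objcg during teardown, so the error path must NULL the pointer after dropping its reference to avoid a double put. A minimal standalone userspace sketch of the same unwind pattern, with malloc/free standing in for objcg allocation and put; all names are illustrative:

#include <stdlib.h>

#define NR_MODEL_NODES 4

struct model_pernode {
	void *orig_objcg;
};

/*
 * Allocate one objcg per node; on partial failure, unwind everything.
 * Assumes the caller passes a zero-initialized nodes[] array.
 */
static int model_online(struct model_pernode nodes[NR_MODEL_NODES])
{
	int nid;

	for (nid = 0; nid < NR_MODEL_NODES; nid++) {
		nodes[nid].orig_objcg = malloc(64);
		if (!nodes[nid].orig_objcg)
			goto unwind;
	}
	return 0;

unwind:
	for (nid = 0; nid < NR_MODEL_NODES; nid++) {
		free(nodes[nid].orig_objcg);	/* free(NULL) is a no-op */
		/*
		 * Clear the pointer so the regular teardown path (the
		 * analogue of __mem_cgroup_free()) cannot free it again.
		 */
		nodes[nid].orig_objcg = NULL;
	}
	return -1;
}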