mm, memcg: cg2 memory{.swap,}.peak write handlers

+14 -8

Documentation/admin-guide/cgroup-v2.rst

··· 1333 1333 all the existing limitations and potential future extensions. 1334 1334 1335 1335 memory.peak 1336 - A read-only single value file which exists on non-root 1337 - cgroups. 1336 + A read-write single value file which exists on non-root cgroups. 1338 1337 1339 - The max memory usage recorded for the cgroup and its 1340 - descendants since the creation of the cgroup. 1338 + The max memory usage recorded for the cgroup and its descendants since 1339 + either the creation of the cgroup or the most recent reset for that FD. 1340 + 1341 + A write of any non-empty string to this file resets it to the 1342 + current memory usage for subsequent reads through the same 1343 + file descriptor. 1341 1344 1342 1345 memory.oom.group 1343 1346 A read-write single value file which exists on non-root ··· 1666 1663 Healthy workloads are not expected to reach this limit. 1667 1664 1668 1665 memory.swap.peak 1669 - A read-only single value file which exists on non-root 1670 - cgroups. 1666 + A read-write single value file which exists on non-root cgroups. 1671 1667 1672 - The max swap usage recorded for the cgroup and its 1673 - descendants since the creation of the cgroup. 1668 + The max swap usage recorded for the cgroup and its descendants since 1669 + the creation of the cgroup or the most recent reset for that FD. 1670 + 1671 + A write of any non-empty string to this file resets it to the 1672 + current swap usage for subsequent reads through the same 1673 + file descriptor. 1674 1674 1675 1675 memory.swap.max 1676 1676 A read-write single value file which exists on non-root

+5

include/linux/cgroup-defs.h

··· 775 775 776 776 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 777 777 778 + struct cgroup_of_peak { 779 + unsigned long value; 780 + struct list_head list; 781 + }; 782 + 778 783 /** 779 784 * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups 780 785 * @tsk: target task

+3

include/linux/cgroup.h

··· 11 11 12 12 #include <linux/sched.h> 13 13 #include <linux/nodemask.h> 14 + #include <linux/list.h> 14 15 #include <linux/rculist.h> 15 16 #include <linux/cgroupstats.h> 16 17 #include <linux/fs.h> ··· 854 853 #endif /* CONFIG_CGROUP_BPF */ 855 854 856 855 struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); 856 + 857 + struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); 857 858 858 859 #endif /* _LINUX_CGROUP_H */

+5

include/linux/memcontrol.h

··· 193 193 struct page_counter memsw; /* v1 only */ 194 194 }; 195 195 196 + /* registered local peak watchers */ 197 + struct list_head memory_peaks; 198 + struct list_head swap_peaks; 199 + spinlock_t peaks_lock; 200 + 196 201 /* Range enforcement for interrupt charges */ 197 202 struct work_struct high_work; 198 203

+10 -1

include/linux/page_counter.h

··· 26 26 atomic_long_t children_low_usage; 27 27 28 28 unsigned long watermark; 29 + /* Latest cg2 reset watermark */ 30 + unsigned long local_watermark; 29 31 unsigned long failcnt; 30 32 31 33 /* Keep all the read most fields in a separate cacheline. */ ··· 86 84 87 85 static inline void page_counter_reset_watermark(struct page_counter *counter) 88 86 { 89 - counter->watermark = page_counter_read(counter); 87 + unsigned long usage = page_counter_read(counter); 88 + 89 + /* 90 + * Update local_watermark first, so it's always <= watermark 91 + * (modulo CPU/compiler re-ordering) 92 + */ 93 + counter->local_watermark = usage; 94 + counter->watermark = usage; 90 95 } 91 96 92 97 #ifdef CONFIG_MEMCG

+2

kernel/cgroup/cgroup-internal.h

··· 81 81 struct { 82 82 struct cgroup_pidlist *pidlist; 83 83 } procs1; 84 + 85 + struct cgroup_of_peak peak; 84 86 }; 85 87 86 88 /*

+7

kernel/cgroup/cgroup.c

··· 1972 1972 return -EINVAL; 1973 1973 } 1974 1974 1975 + struct cgroup_of_peak *of_peak(struct kernfs_open_file *of) 1976 + { 1977 + struct cgroup_file_ctx *ctx = of->priv; 1978 + 1979 + return &ctx->peak; 1980 + } 1981 + 1975 1982 static void apply_cgroup_root_flags(unsigned int root_flags) 1976 1983 { 1977 1984 if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {

+107 -11

mm/memcontrol.c

··· 25 25 * Copyright (C) 2020 Alibaba, Inc, Alex Shi 26 26 */ 27 27 28 + #include <linux/cgroup-defs.h> 28 29 #include <linux/page_counter.h> 29 30 #include <linux/memcontrol.h> 30 31 #include <linux/cgroup.h> ··· 42 41 #include <linux/rcupdate.h> 43 42 #include <linux/limits.h> 44 43 #include <linux/export.h> 44 + #include <linux/list.h> 45 45 #include <linux/mutex.h> 46 46 #include <linux/rbtree.h> 47 47 #include <linux/slab.h> ··· 3552 3550 3553 3551 INIT_WORK(&memcg->high_work, high_work_func); 3554 3552 vmpressure_init(&memcg->vmpressure); 3553 + INIT_LIST_HEAD(&memcg->memory_peaks); 3554 + INIT_LIST_HEAD(&memcg->swap_peaks); 3555 + spin_lock_init(&memcg->peaks_lock); 3555 3556 memcg->socket_pressure = jiffies; 3556 3557 memcg1_memcg_init(memcg); 3557 3558 memcg->kmemcg_id = -1; ··· 3949 3944 return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; 3950 3945 } 3951 3946 3952 - static u64 memory_peak_read(struct cgroup_subsys_state *css, 3953 - struct cftype *cft) 3954 - { 3955 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3947 + #define OFP_PEAK_UNSET (((-1UL))) 3956 3948 3957 - return (u64)memcg->memory.watermark * PAGE_SIZE; 3949 + static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc) 3950 + { 3951 + struct cgroup_of_peak *ofp = of_peak(sf->private); 3952 + u64 fd_peak = READ_ONCE(ofp->value), peak; 3953 + 3954 + /* User wants global or local peak? 
*/ 3955 + if (fd_peak == OFP_PEAK_UNSET) 3956 + peak = pc->watermark; 3957 + else 3958 + peak = max(fd_peak, READ_ONCE(pc->local_watermark)); 3959 + 3960 + seq_printf(sf, "%llu\n", peak * PAGE_SIZE); 3961 + return 0; 3958 3962 } 3963 + 3964 + static int memory_peak_show(struct seq_file *sf, void *v) 3965 + { 3966 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 3967 + 3968 + return peak_show(sf, v, &memcg->memory); 3969 + } 3970 + 3971 + static int peak_open(struct kernfs_open_file *of) 3972 + { 3973 + struct cgroup_of_peak *ofp = of_peak(of); 3974 + 3975 + ofp->value = OFP_PEAK_UNSET; 3976 + return 0; 3977 + } 3978 + 3979 + static void peak_release(struct kernfs_open_file *of) 3980 + { 3981 + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 3982 + struct cgroup_of_peak *ofp = of_peak(of); 3983 + 3984 + if (ofp->value == OFP_PEAK_UNSET) { 3985 + /* fast path (no writes on this fd) */ 3986 + return; 3987 + } 3988 + spin_lock(&memcg->peaks_lock); 3989 + list_del(&ofp->list); 3990 + spin_unlock(&memcg->peaks_lock); 3991 + } 3992 + 3993 + static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes, 3994 + loff_t off, struct page_counter *pc, 3995 + struct list_head *watchers) 3996 + { 3997 + unsigned long usage; 3998 + struct cgroup_of_peak *peer_ctx; 3999 + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4000 + struct cgroup_of_peak *ofp = of_peak(of); 4001 + 4002 + spin_lock(&memcg->peaks_lock); 4003 + 4004 + usage = page_counter_read(pc); 4005 + WRITE_ONCE(pc->local_watermark, usage); 4006 + 4007 + list_for_each_entry(peer_ctx, watchers, list) 4008 + if (usage > peer_ctx->value) 4009 + WRITE_ONCE(peer_ctx->value, usage); 4010 + 4011 + /* initial write, register watcher */ 4012 + if (ofp->value == -1) 4013 + list_add(&ofp->list, watchers); 4014 + 4015 + WRITE_ONCE(ofp->value, usage); 4016 + spin_unlock(&memcg->peaks_lock); 4017 + 4018 + return nbytes; 4019 + } 4020 + 4021 + static ssize_t 
memory_peak_write(struct kernfs_open_file *of, char *buf, 4022 + size_t nbytes, loff_t off) 4023 + { 4024 + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 4025 + 4026 + return peak_write(of, buf, nbytes, off, &memcg->memory, 4027 + &memcg->memory_peaks); 4028 + } 4029 + 4030 + #undef OFP_PEAK_UNSET 3959 4031 3960 4032 static int memory_min_show(struct seq_file *m, void *v) 3961 4033 { ··· 4383 4301 { 4384 4302 .name = "peak", 4385 4303 .flags = CFTYPE_NOT_ON_ROOT, 4386 - .read_u64 = memory_peak_read, 4304 + .open = peak_open, 4305 + .release = peak_release, 4306 + .seq_show = memory_peak_show, 4307 + .write = memory_peak_write, 4387 4308 }, 4388 4309 { 4389 4310 .name = "min", ··· 5178 5093 return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; 5179 5094 } 5180 5095 5181 - static u64 swap_peak_read(struct cgroup_subsys_state *css, 5182 - struct cftype *cft) 5096 + static int swap_peak_show(struct seq_file *sf, void *v) 5183 5097 { 5184 - struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5098 + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf)); 5185 5099 5186 - return (u64)memcg->swap.watermark * PAGE_SIZE; 5100 + return peak_show(sf, v, &memcg->swap); 5101 + } 5102 + 5103 + static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf, 5104 + size_t nbytes, loff_t off) 5105 + { 5106 + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); 5107 + 5108 + return peak_write(of, buf, nbytes, off, &memcg->swap, 5109 + &memcg->swap_peaks); 5187 5110 } 5188 5111 5189 5112 static int swap_high_show(struct seq_file *m, void *v) ··· 5275 5182 { 5276 5183 .name = "swap.peak", 5277 5184 .flags = CFTYPE_NOT_ON_ROOT, 5278 - .read_u64 = swap_peak_read, 5185 + .open = peak_open, 5186 + .release = peak_release, 5187 + .seq_show = swap_peak_show, 5188 + .write = swap_peak_write, 5279 5189 }, 5280 5190 { 5281 5191 .name = "swap.events",

+21 -8

mm/page_counter.c

··· 87 87 /* 88 88 * This is indeed racy, but we can live with some 89 89 * inaccuracy in the watermark. 90 + * 91 + * Notably, we have two watermarks to allow for both a globally 92 + * visible peak and one that can be reset at a smaller scope. 93 + * 94 + * Since we reset both watermarks when the global reset occurs, 95 + * we can guarantee that watermark >= local_watermark, so we 96 + * don't need to do both comparisons every time. 97 + * 98 + * On systems with branch predictors, the inner condition should 99 + * be almost free. 90 100 */ 91 - if (new > READ_ONCE(c->watermark)) 92 - WRITE_ONCE(c->watermark, new); 101 + if (new > READ_ONCE(c->local_watermark)) { 102 + WRITE_ONCE(c->local_watermark, new); 103 + if (new > READ_ONCE(c->watermark)) 104 + WRITE_ONCE(c->watermark, new); 105 + } 93 106 } 94 107 } 95 108 ··· 153 140 if (protection) 154 141 propagate_protected_usage(c, new); 155 142 156 - /* 157 - * Just like with failcnt, we can live with some 158 - * inaccuracy in the watermark. 159 - */ 160 - if (new > READ_ONCE(c->watermark)) 161 - WRITE_ONCE(c->watermark, new); 143 + /* see comment on page_counter_charge */ 144 + if (new > READ_ONCE(c->local_watermark)) { 145 + WRITE_ONCE(c->local_watermark, new); 146 + if (new > READ_ONCE(c->watermark)) 147 + WRITE_ONCE(c->watermark, new); 148 + } 162 149 } 163 150 return true; 164 151

Configure Feed

Configure Feed