Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'cgroup-for-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

- Allow css_rstat_updated() in NMI context to enable memory accounting
for allocations in NMI context.

- /proc/cgroups doesn't contain useful information for cgroup2 and was
updated to only show v1 controllers. This unfortunately broke
something in the wild. Add an option to bring back the old behavior
to ease transition.

- selftest updates and other cleanups.

* tag 'cgroup-for-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: Add compatibility option for content of /proc/cgroups
selftests/cgroup: fix cpu.max tests
cgroup: llist: avoid memory tears for llist_node
selftests: cgroup: Fix missing newline in test_zswap_writeback_one
selftests: cgroup: Allow longer timeout for kmem_dead_cgroups cleanup
memcg: cgroup: call css_rstat_updated irrespective of in_nmi()
cgroup: remove per-cpu per-subsystem locks
cgroup: make css_rstat_updated nmi safe
cgroup: support to enable nmi-safe css_rstat_updated
selftests: cgroup: Fix compilation on pre-cgroupns kernels
selftests: cgroup: Optionally set up v1 environment
selftests: cgroup: Add support for named v1 hierarchies in test_core
selftests: cgroup_util: Add helpers for testing named v1 hierarchies
Documentation: cgroup: add section explaining controller availability
cgroup: Drop sock_cgroup_classid() dummy implementation

+267 -208
+9
Documentation/admin-guide/cgroup-v2.rst
···
 Controlling Controllers
 -----------------------
 
+Availablity
+~~~~~~~~~~~
+
+A controller is available in a cgroup when it is supported by the kernel (i.e.,
+compiled in, not disabled and not attached to a v1 hierarchy) and listed in the
+"cgroup.controllers" file. Availability means the controller's interface files
+are exposed in the cgroup’s directory, allowing the distribution of the target
+resource to be observed or controlled within that cgroup.
+
 Enabling and Disabling
 ~~~~~~~~~~~~~~~~~~~~~~
 
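
As a concrete illustration of the new section: reading a cgroup's "cgroup.controllers" file (for example /sys/fs/cgroup/<group>/cgroup.controllers on a typical v2 mount) shows which controllers are available there, and an available controller has its interface files (memory.max, io.weight, and so on, depending on kernel config) exposed in that cgroup's directory; making a controller available in a child requires enabling it in the parent's "cgroup.subtree_control", which is exactly what the following "Enabling and Disabling" section describes.
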
+8
Documentation/admin-guide/kernel-parameters.txt
···
 			named mounts. Specifying both "all" and "named" disables
 			all v1 hierarchies.
 
+	cgroup_v1_proc=	[KNL] Show also missing controllers in /proc/cgroups
+			Format: { "true" | "false" }
+			/proc/cgroups lists only v1 controllers by default.
+			This compatibility option enables listing also v2
+			controllers (whose v1 code is not compiled!), so that
+			semi-legacy software can check this file to decide
+			about usage of v2 (sic) controllers.
+
 	cgroup_favordynmods= [KNL] Enable or Disable favordynmods.
 			Format: { "true" | "false" }
 			Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS.
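
In practice this is a boot-time switch: appending cgroup_v1_proc=true to the kernel command line restores the older /proc/cgroups output that also lists controllers only usable on the v2 hierarchy, while the default (false) keeps the v1-only listing. This is the compatibility option referred to in the merge message above.
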
+8 -13
include/linux/cgroup-defs.h
···
 	 * Child cgroups with stat updates on this cpu since the last read
 	 * are linked on the parent's ->updated_children through
 	 * ->updated_next. updated_children is terminated by its container css.
-	 *
-	 * In addition to being more compact, singly-linked list pointing to
-	 * the css makes it unnecessary for each per-cpu struct to point back
-	 * to the associated css.
-	 *
-	 * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
+
+	struct llist_node lnode;			/* lockless list for update */
+	struct cgroup_subsys_state *owner;		/* back pointer */
 };
 
 /*
···
 	unsigned int depends_on;
 
 	spinlock_t rstat_ss_lock;
-	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
+	struct llist_head __percpu *lhead;	/* lockless update list head */
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
···
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	return READ_ONCE(skcd->classid);
-#else
-	return 0;
-#endif
 }
+#endif
 
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
···
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	WRITE_ONCE(skcd->classid, classid);
-#endif
 }
+#endif
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
 
+3 -3
include/linux/llist.h
···
  */
 static inline void init_llist_node(struct llist_node *node)
 {
-	node->next = node;
+	WRITE_ONCE(node->next, node);
 }
 
 /**
···
  */
 static inline bool llist_on_list(const struct llist_node *node)
 {
-	return node->next != node;
+	return READ_ONCE(node->next) != node;
 }
 
 /**
···
 
 static inline struct llist_node *llist_next(struct llist_node *node)
 {
-	return node->next;
+	return READ_ONCE(node->next);
 }
 
 /**
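
All three hunks rely on the same convention, which the rstat rework below builds on: an llist_node that is not queued anywhere points to itself, so "am I on a list?" can be answered without taking any lock, and the accesses are now wrapped in READ_ONCE()/WRITE_ONCE() so a reader racing with a concurrent insertion sees either the old or the new pointer rather than a torn value. A minimal userspace sketch of that convention (illustrative only, plain assignments in place of the kernel's annotated accessors):

	#include <stdio.h>

	struct llist_node { struct llist_node *next; };

	/* "not on a list" is encoded as a self-pointer rather than NULL */
	static void init_llist_node(struct llist_node *node)
	{
		node->next = node;
	}

	static int llist_on_list(const struct llist_node *node)
	{
		return node->next != node;
	}

	int main(void)
	{
		struct llist_node n;

		init_llist_node(&n);
		printf("queued? %d\n", llist_on_list(&n));	/* 0: points to itself */

		n.next = NULL;	/* roughly what claiming/llist_add() does to the node */
		printf("queued? %d\n", llist_on_list(&n));	/* 1: no longer a self-pointer */
		return 0;
	}
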
-47
include/trace/events/cgroup.h
···
 	TP_ARGS(cgrp, cpu, contended)
 );
 
-/*
- * Related to per CPU locks:
- *  global rstat_base_cpu_lock for base stats
- *  cgroup_subsys::rstat_ss_cpu_lock for subsystem stats
- */
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
+12 -2
kernel/cgroup/cgroup-v1.c
···
 /* disable named v1 mounts */
 static bool cgroup_no_v1_named;
 
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
 /*
  * pidlist destructions need to be flushed on cgroup destruction. Use a
  * separate workqueue as flush domain.
···
 	 */
 
 	for_each_subsys(ss, i) {
-		if (cgroup1_subsys_absent(ss))
-			continue;
 		cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
+		if (!proc_show_all && cgroup1_subsys_absent(ss))
+			continue;
 
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->legacy_name, ss->root->hierarchy_id,
···
 	return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+	return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
+92 -105
kernel/cgroup/rstat.c
···
 #include <trace/events/cgroup.h>
 
 static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
···
 	return &rstat_base_lock;
 }
 
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
 {
-	if (ss) {
-		/*
-		 * Depending on config, the subsystem per-cpu lock type may be an
-		 * empty struct. In enviromnents where this is the case, allocation
-		 * of this field is not performed in ss_rstat_init(). Avoid a
-		 * cpu-based offset relative to NULL by returning early. When the
-		 * lock type is zero in size, the corresponding lock functions are
-		 * no-ops so passing them NULL is acceptable.
-		 */
-		if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
-			return NULL;
-
-		return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
-	}
-
-	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
-				  const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-	unsigned long flags;
-	bool contended;
-
-	/*
-	 * The _irqsave() is needed because the locks used for flushing are
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
-	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
-	 * kernel. The raw_spinlock_t below disables interrupts on both
-	 * configurations. The _irqsave() ensures that interrupts are always
-	 * disabled and later restored.
-	 */
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
-	if (contended) {
-		if (fast_path)
-			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
-		else
-			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
-		raw_spin_lock_irqsave(cpu_lock, flags);
-	}
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
-	else
-		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
-	return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
-			   unsigned long flags, const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
-	else
-		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
 }
 
 /**
···
  * @css: target cgroup subsystem state
  * @cpu: cpu on which rstat_cpu was updated
  *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
  */
 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
-	unsigned long flags;
+	struct llist_head *lhead;
+	struct css_rstat_cpu *rstatc;
+	struct css_rstat_cpu __percpu *rstatc_pcpu;
+	struct llist_node *self;
 
 	/*
 	 * Since bpf programs can call this function, prevent access to
···
 	if (!css_uses_rstat(css))
 		return;
 
+	lockdep_assert_preemption_disabled();
+
 	/*
-	 * Speculative already-on-list test. This may race leading to
-	 * temporary inaccuracies, which is fine.
-	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @css is on the list by
-	 * testing the next pointer for NULL.
+	 * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+	 * the requests from nmi context.
 	 */
-	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
 		return;
 
-	flags = _css_rstat_cpu_lock(css, cpu, true);
+	rstatc = css_rstat_cpu(css, cpu);
+	/*
+	 * If already on list return. This check is racy and smp_mb() is needed
+	 * to pair it with the smp_mb() in css_process_update_tree() if the
+	 * guarantee that the updated stats are visible to concurrent flusher is
+	 * needed.
+	 */
+	if (llist_on_list(&rstatc->lnode))
+		return;
 
+	/*
+	 * This function can be renentered by irqs and nmis for the same cgroup
+	 * and may try to insert the same per-cpu lnode into the llist. Note
+	 * that llist_add() does not protect against such scenarios.
+	 *
+	 * To protect against such stacked contexts of irqs/nmis, we use the
+	 * fact that lnode points to itself when not on a list and then use
+	 * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+	 * which will call llist_add(). The losers can assume the insertion is
+	 * successful and the winner will eventually add the per-cpu lnode to
+	 * the llist.
+	 */
+	self = &rstatc->lnode;
+	rstatc_pcpu = css->rstat_cpu;
+	if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
+		return;
+
+	lhead = ss_lhead_cpu(css->ss, cpu);
+	llist_add(&rstatc->lnode, lhead);
+}
+
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
 	/* put @css and all ancestors on the corresponding updated lists */
 	while (true) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
···
 
 		css = parent;
 	}
+}
 
-	_css_rstat_cpu_unlock(css, cpu, flags, true);
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+	struct llist_node *lnode;
+
+	while ((lnode = llist_del_first_init(lhead))) {
+		struct css_rstat_cpu *rstatc;
+
+		/*
+		 * smp_mb() is needed here (more specifically in between
+		 * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where etiher the
+		 * updater should add itself on the lockless list or the
+		 * flusher flush the stats updated by the updater who have
+		 * observed that they are already on the list. The
+		 * corresponding barrier pair for this one should be before
+		 * css_rstat_updated() by the user.
+		 *
+		 * For now, there aren't any such user, so not adding the
+		 * barrier here but if such a use-case arise, please add
+		 * smp_mb() here.
+		 */
+
+		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+		__css_process_update_tree(rstatc->owner, cpu);
+	}
 }
 
 /**
···
 {
 	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
-	unsigned long flags;
 
-	flags = _css_rstat_cpu_lock(root, cpu, false);
+	css_process_update_tree(root->ss, cpu);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
-		goto unlock_ret;
+		return NULL;
 
 	/*
 	 * Unlink @root from its parent. As the updated_children list is
···
 	rstatc->updated_children = root;
 	if (child != root)
 		head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
-	_css_rstat_cpu_unlock(root, cpu, flags, false);
+
 	return head;
 }
 
···
 	for_each_possible_cpu(cpu) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		rstatc->updated_children = css;
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
 
 		if (is_self) {
 			struct cgroup_rstat_base_cpu *rstatbc;
···
 {
 	int cpu;
 
-	/*
-	 * Depending on config, the subsystem per-cpu lock type may be an empty
-	 * struct. Avoid allocating a size of zero in this case.
-	 */
-	if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
-		ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
-		if (!ss->rstat_ss_cpu_lock)
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead)
 			return -ENOMEM;
 	}
 
 	spin_lock_init(ss_rstat_lock(ss));
 	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
 
 	return 0;
 }
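
The long comment inside css_rstat_updated() above is the core idea of the lockless rework: when an irq or nmi re-enters the updater on the same CPU, exactly one context is allowed to "claim" the per-cpu node (by swapping its self-pointer to NULL) before the single llist_add() happens; every other context backs off. A rough userspace analog of that claim step, using C11 atomics where the kernel uses this_cpu_cmpxchg() (a sketch, not the kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	struct node {
		_Atomic(struct node *) next;
	};

	/* Returns 1 only for the single caller that wins the claim. */
	static int try_claim(struct node *n)
	{
		struct node *expected = n;	/* self-pointer means "not queued yet" */

		return atomic_compare_exchange_strong(&n->next, &expected, NULL);
	}

	int main(void)
	{
		struct node n;

		atomic_store(&n.next, &n);	/* initialised off-list: points to itself */

		printf("first claim:  %d\n", try_claim(&n));	/* 1: this context would llist_add() */
		printf("second claim: %d\n", try_claim(&n));	/* 0: a re-entrant context backs off */
		return 0;
	}

The losers can return immediately because the winner is guaranteed to complete the insertion; the flusher later drains the per-cpu llist with llist_del_first_init(), which restores each node's self-pointer, and only then builds the parent/child update tree.
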
+5 -5
mm/memcontrol.c
···
 	if (!val)
 		return;
 
-	/* TODO: add to cgroup update tree once it is nmi-safe. */
-	if (!in_nmi())
-		css_rstat_updated(&memcg->css, cpu);
+	css_rstat_updated(&memcg->css, cpu);
 	statc_pcpu = memcg->vmstats_percpu;
 	for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
 		statc = this_cpu_ptr(statc_pcpu);
···
 	} else {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
 
-		/* TODO: add to cgroup update tree once it is nmi-safe. */
+		/* preemption is disabled in_nmi(). */
+		css_rstat_updated(&memcg->css, smp_processor_id());
 		if (idx == NR_SLAB_RECLAIMABLE_B)
 			atomic_add(nr, &pn->slab_reclaimable);
 		else
···
 	if (likely(!in_nmi())) {
 		mod_memcg_state(memcg, MEMCG_KMEM, val);
 	} else {
-		/* TODO: add to cgroup update tree once it is nmi-safe. */
+		/* preemption is disabled in_nmi(). */
+		css_rstat_updated(&memcg->css, smp_processor_id());
 		atomic_add(val, &memcg->kmem_stat);
 	}
 }
+3 -1
tools/testing/selftests/cgroup/lib/cgroup_util.c
···
 #include "cgroup_util.h"
 #include "../../clone3/clone3_selftests.h"
 
+bool cg_test_v1_named;
+
 /* Returns read len on success, or -errno on failure. */
 ssize_t read_text(const char *path, char *buf, size_t max_len)
 {
···
 
 int cg_enter_current_thread(const char *cgroup)
 {
-	return cg_write(cgroup, "cgroup.threads", "0");
+	return cg_write(cgroup, CG_THREADS_FILE, "0");
 }
 
 int cg_run(const char *cgroup,
+5
tools/testing/selftests/cgroup/lib/include/cgroup_util.h
···
 
 #define TEST_UID 65534 /* usually nobody, any !root is fine */
 
+#define CG_THREADS_FILE (!cg_test_v1_named ? "cgroup.threads" : "tasks")
+#define CG_NAMED_NAME "selftest"
+#define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s"))
+
 /*
  * Checks if two given values differ by less than err% of their sum.
  */
···
 extern int cg_prepare_for_wait(const char *cgroup);
 extern int memcg_prepare_for_wait(const char *cgroup);
 extern int cg_wait_for(int fd);
+extern bool cg_test_v1_named;
+74 -10
tools/testing/selftests/cgroup/test_core.c
···
 #include <linux/sched.h>
 #include <sys/types.h>
 #include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <fcntl.h>
···
 #include "cgroup_util.h"
 
 static bool nsdelegate;
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP 0
+#endif
 
 static int touch_anon(char *buf, size_t size)
 {
···
 	char *cg_test_c = NULL, *cg_test_d = NULL;
 	int cgroup_fd = -EBADF;
 	pid_t pid;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	cg_test_a = cg_name(root, "cg_test_a");
 	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
···
 	int ret = KSFT_FAIL;
 	char *grandparent = NULL, *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	grandparent = cg_name(root, "cg_test_grandparent");
 	parent = cg_name(root, "cg_test_grandparent/cg_test_parent");
 	child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child");
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
 	if (!parent || !child)
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
-	if (cg_read_strstr(root, "cgroup.controllers", "cpu") ||
+	if (cg_test_v1_named ||
+	    cg_read_strstr(root, "cgroup.controllers", "cpu") ||
 	    cg_write(root, "cgroup.subtree_control", "+cpu")) {
 		ret = KSFT_SKIP;
 		goto cleanup;
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
 	if (!parent || !child)
···
 {
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
···
 {
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
···
 	}
 
 	cg_enter_current(dst);
-	if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1)
+	if (cg_read_lc(dst, CG_THREADS_FILE) != n_threads + 1)
 		goto cleanup;
 
 	ret = KSFT_PASS;
···
 	char lines[3][PATH_MAX];
 
 	for (g = 1; g < 3; ++g)
-		snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0]));
+		snprintf(lines[g], sizeof(lines[g]), CG_PATH_FORMAT, grps[g] + strlen(grps[0]));
 
 	for (i = 0; i < n_iterations; ++i) {
 		cg_enter_current_thread(grps[(i % 2) + 1]);
···
 	if (cg_create(grps[2]))
 		goto cleanup;
 
-	if (cg_write(grps[1], "cgroup.type", "threaded"))
-		goto cleanup;
-	if (cg_write(grps[2], "cgroup.type", "threaded"))
-		goto cleanup;
+	if (!cg_test_v1_named) {
+		if (cg_write(grps[1], "cgroup.type", "threaded"))
+			goto cleanup;
+		if (cg_write(grps[2], "cgroup.type", "threaded"))
+			goto cleanup;
+	}
 
 	if (cg_enter_current(grps[1]))
 		goto cleanup;
···
 	if (retval)
 		goto cleanup;
 
-	snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0]));
+	snprintf(line, sizeof(line), CG_PATH_FORMAT, grps[1] + strlen(grps[0]));
 	if (proc_read_strstr(0, 1, "cgroup", line))
 		goto cleanup;
 
···
 	return ret;
 }
 
+static int setup_named_v1_root(char *root, size_t len, const char *name)
+{
+	char options[PATH_MAX];
+	int r;
+
+	r = snprintf(root, len, "/mnt/cg_selftest");
+	if (r < 0)
+		return r;
+
+	r = snprintf(options, sizeof(options), "none,name=%s", name);
+	if (r < 0)
+		return r;
+
+	r = mkdir(root, 0755);
+	if (r < 0 && errno != EEXIST)
+		return r;
+
+	r = mount("none", root, "cgroup", 0, options);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+static void cleanup_named_v1_root(char *root)
+{
+	if (!cg_test_v1_named)
+		return;
+	umount(root);
+	rmdir(root);
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
···
 	char root[PATH_MAX];
 	int i, ret = EXIT_SUCCESS;
 
-	if (cg_find_unified_root(root, sizeof(root), &nsdelegate))
-		ksft_exit_skip("cgroup v2 isn't mounted\n");
+	if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) {
+		if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME))
+			ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n");
+		cg_test_v1_named = true;
+		goto post_v2_setup;
+	}
 
 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
 			ksft_exit_skip("Failed to set memory controller\n");
 
+post_v2_setup:
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		switch (tests[i].fn(root)) {
 		case KSFT_PASS:
···
 		}
 	}
 
+	cleanup_named_v1_root(root);
 	return ret;
 }
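
For context, setup_named_v1_root() above does programmatically what one would otherwise do by hand to obtain a controller-less, named v1 hierarchy, roughly equivalent to "mount -t cgroup -o none,name=selftest none /mnt/cg_selftest". On such a hierarchy processes are listed through a "tasks" file rather than "cgroup.threads", and /proc/<pid>/cgroup carries a ":name=selftest:" entry, which is exactly what the CG_THREADS_FILE and CG_PATH_FORMAT helpers in cgroup_util.h switch between.
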
+43 -20
tools/testing/selftests/cgroup/test_cpu.c
···
 
 #define _GNU_SOURCE
 #include <linux/limits.h>
+#include <sys/param.h>
 #include <sys/sysinfo.h>
 #include <sys/wait.h>
 #include <errno.h>
···
 static int test_cpucg_max(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long usage_usec, user_usec;
-	long usage_seconds = 1;
-	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	long quota_usec = 1000;
+	long default_period_usec = 100000; /* cpu.max's default period */
+	long duration_seconds = 1;
+
+	long duration_usec = duration_seconds * USEC_PER_SEC;
+	long usage_usec, n_periods, remainder_usec, expected_usage_usec;
 	char *cpucg;
+	char quota_buf[32];
+
+	snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec);
 
 	cpucg = cg_name(root, "cpucg_test");
 	if (!cpucg)
···
 	if (cg_create(cpucg))
 		goto cleanup;
 
-	if (cg_write(cpucg, "cpu.max", "1000"))
+	if (cg_write(cpucg, "cpu.max", quota_buf))
 		goto cleanup;
 
 	struct cpu_hog_func_param param = {
 		.nprocs = 1,
 		.ts = {
-			.tv_sec = usage_seconds,
+			.tv_sec = duration_seconds,
 			.tv_nsec = 0,
 		},
 		.clock_type = CPU_HOG_CLOCK_WALL,
···
 		goto cleanup;
 
 	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
-	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
-	if (user_usec <= 0)
+	if (usage_usec <= 0)
 		goto cleanup;
 
-	if (user_usec >= expected_usage_usec)
-		goto cleanup;
+	/*
+	 * The following calculation applies only since
+	 * the cpu hog is set to run as per wall-clock time
+	 */
+	n_periods = duration_usec / default_period_usec;
+	remainder_usec = duration_usec - n_periods * default_period_usec;
+	expected_usage_usec
+		= n_periods * quota_usec + MIN(remainder_usec, quota_usec);
 
-	if (values_close(usage_usec, expected_usage_usec, 95))
+	if (!values_close(usage_usec, expected_usage_usec, 10))
 		goto cleanup;
 
 	ret = KSFT_PASS;
···
 static int test_cpucg_max_nested(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long usage_usec, user_usec;
-	long usage_seconds = 1;
-	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	long quota_usec = 1000;
+	long default_period_usec = 100000; /* cpu.max's default period */
+	long duration_seconds = 1;
+
+	long duration_usec = duration_seconds * USEC_PER_SEC;
+	long usage_usec, n_periods, remainder_usec, expected_usage_usec;
 	char *parent, *child;
+	char quota_buf[32];
+
+	snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec);
 
 	parent = cg_name(root, "cpucg_parent");
 	child = cg_name(parent, "cpucg_child");
···
 	if (cg_create(child))
 		goto cleanup;
 
-	if (cg_write(parent, "cpu.max", "1000"))
+	if (cg_write(parent, "cpu.max", quota_buf))
 		goto cleanup;
 
 	struct cpu_hog_func_param param = {
 		.nprocs = 1,
 		.ts = {
-			.tv_sec = usage_seconds,
+			.tv_sec = duration_seconds,
 			.tv_nsec = 0,
 		},
 		.clock_type = CPU_HOG_CLOCK_WALL,
···
 		goto cleanup;
 
 	usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec");
-	user_usec = cg_read_key_long(child, "cpu.stat", "user_usec");
-	if (user_usec <= 0)
+	if (usage_usec <= 0)
 		goto cleanup;
 
-	if (user_usec >= expected_usage_usec)
-		goto cleanup;
+	/*
+	 * The following calculation applies only since
+	 * the cpu hog is set to run as per wall-clock time
+	 */
+	n_periods = duration_usec / default_period_usec;
+	remainder_usec = duration_usec - n_periods * default_period_usec;
+	expected_usage_usec
+		= n_periods * quota_usec + MIN(remainder_usec, quota_usec);
 
-	if (values_close(usage_usec, expected_usage_usec, 95))
+	if (!values_close(usage_usec, expected_usage_usec, 10))
 		goto cleanup;
 
 	ret = KSFT_PASS;
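
Worked through with the constants used above: quota_usec = 1000 against the default 100000 usec period, with a wall-clock hog of duration_seconds = 1, gives duration_usec = 1000000, n_periods = 1000000 / 100000 = 10, remainder_usec = 0, and therefore expected_usage_usec = 10 * 1000 + MIN(0, 1000) = 10000 usec. The rewritten check thus expects roughly 10 ms of CPU time over the one-second run, within the values_close() tolerance, instead of the old, much looser comparison against a full second of usage.
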
+4 -1
tools/testing/selftests/cgroup/test_kmem.c
···
 	char *parent;
 	long dead;
 	int i;
+	int max_time = 20;
 
 	parent = cg_name(root, "kmem_dead_cgroups_test");
 	if (!parent)
···
 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
 		goto cleanup;
 
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < max_time; i++) {
 		dead = cg_read_key_long(parent, "cgroup.stat",
 					"nr_dying_descendants ");
 		if (dead == 0) {
···
 		 * let's wait a bit and repeat.
 		 */
 		sleep(1);
+		if (i > 5)
+			printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead);
 	}
 
 cleanup:
+1 -1
tools/testing/selftests/cgroup/test_zswap.c
···
 		return -1;
 
 	if (wb != !!zswpwb_after) {
-		ksft_print_msg("zswpwb_after is %ld while wb is %s",
+		ksft_print_msg("zswpwb_after is %ld while wb is %s\n",
 			       zswpwb_after, wb ? "enabled" : "disabled");
 		return -1;
 	}