Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'cgroup-for-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

- Allow css_rstat_updated() in NMI context to enable memory accounting
for allocations in NMI context.

- /proc/cgroups doesn't contain useful information for cgroup2 and was
updated to only show v1 controllers. This unfortunately broke
something in the wild. Add an option to bring back the old behavior
to ease transition.

- selftest updates and other cleanups.

* tag 'cgroup-for-6.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: Add compatibility option for content of /proc/cgroups
selftests/cgroup: fix cpu.max tests
cgroup: llist: avoid memory tears for llist_node
selftests: cgroup: Fix missing newline in test_zswap_writeback_one
selftests: cgroup: Allow longer timeout for kmem_dead_cgroups cleanup
memcg: cgroup: call css_rstat_updated irrespective of in_nmi()
cgroup: remove per-cpu per-subsystem locks
cgroup: make css_rstat_updated nmi safe
cgroup: support to enable nmi-safe css_rstat_updated
selftests: cgroup: Fix compilation on pre-cgroupns kernels
selftests: cgroup: Optionally set up v1 environment
selftests: cgroup: Add support for named v1 hierarchies in test_core
selftests: cgroup_util: Add helpers for testing named v1 hierarchies
Documentation: cgroup: add section explaining controller availability
cgroup: Drop sock_cgroup_classid() dummy implementation

+267 -208
+9
Documentation/admin-guide/cgroup-v2.rst
···
 Controlling Controllers
 -----------------------
 
+Availablity
+~~~~~~~~~~~
+
+A controller is available in a cgroup when it is supported by the kernel (i.e.,
+compiled in, not disabled and not attached to a v1 hierarchy) and listed in the
+"cgroup.controllers" file. Availability means the controller's interface files
+are exposed in the cgroup’s directory, allowing the distribution of the target
+resource to be observed or controlled within that cgroup.
+
 Enabling and Disabling
 ~~~~~~~~~~~~~~~~~~~~~~
 
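
As a concrete illustration of the new section: reading a cgroup's "cgroup.controllers" file (for example /sys/fs/cgroup/<group>/cgroup.controllers on a typical v2 mount) shows which controllers are available there, and an available controller has its interface files (memory.max, io.weight, and so on, depending on kernel config) exposed in that cgroup's directory; making a controller available in a child requires enabling it in the parent's "cgroup.subtree_control", which is exactly what the following "Enabling and Disabling" section describes.
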
+8
Documentation/admin-guide/kernel-parameters.txt
···
 			named mounts. Specifying both "all" and "named" disables
 			all v1 hierarchies.
 
+	cgroup_v1_proc=	[KNL] Show also missing controllers in /proc/cgroups
+			Format: { "true" | "false" }
+			/proc/cgroups lists only v1 controllers by default.
+			This compatibility option enables listing also v2
+			controllers (whose v1 code is not compiled!), so that
+			semi-legacy software can check this file to decide
+			about usage of v2 (sic) controllers.
+
 	cgroup_favordynmods= [KNL] Enable or Disable favordynmods.
 			Format: { "true" | "false" }
 			Defaults to the value of CONFIG_CGROUP_FAVOR_DYNMODS.
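
In practice this is a boot-time switch: appending cgroup_v1_proc=true to the kernel command line restores the older /proc/cgroups output that also lists controllers only usable on the v2 hierarchy, while the default (false) keeps the v1-only listing. This is the compatibility option referred to in the merge message above.
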
+8 -13
include/linux/cgroup-defs.h
···
 	 * Child cgroups with stat updates on this cpu since the last read
 	 * are linked on the parent's ->updated_children through
 	 * ->updated_next. updated_children is terminated by its container css.
-	 *
-	 * In addition to being more compact, singly-linked list pointing to
-	 * the css makes it unnecessary for each per-cpu struct to point back
-	 * to the associated css.
-	 *
-	 * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
 	 */
 	struct cgroup_subsys_state *updated_children;
 	struct cgroup_subsys_state *updated_next;	/* NULL if not on the list */
+
+	struct llist_node lnode;			/* lockless list for update */
+	struct cgroup_subsys_state *owner;		/* back pointer */
 };
 
 /*
···
 	unsigned int depends_on;
 
 	spinlock_t rstat_ss_lock;
-	raw_spinlock_t __percpu *rstat_ss_cpu_lock;
+	struct llist_head __percpu *lhead;	/* lockless update list head */
 };
 
 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
···
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline u32 sock_cgroup_classid(const struct sock_cgroup_data *skcd)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	return READ_ONCE(skcd->classid);
-#else
-	return 0;
-#endif
 }
+#endif
 
 static inline void sock_cgroup_set_prioidx(struct sock_cgroup_data *skcd,
 					   u16 prioidx)
···
 #endif
 }
 
+#ifdef CONFIG_CGROUP_NET_CLASSID
 static inline void sock_cgroup_set_classid(struct sock_cgroup_data *skcd,
 					   u32 classid)
 {
-#ifdef CONFIG_CGROUP_NET_CLASSID
 	WRITE_ONCE(skcd->classid, classid);
-#endif
 }
+#endif
 
 #else	/* CONFIG_SOCK_CGROUP_DATA */
 
+3 -3
include/linux/llist.h
···
  */
 static inline void init_llist_node(struct llist_node *node)
 {
-	node->next = node;
+	WRITE_ONCE(node->next, node);
 }
 
 /**
···
  */
 static inline bool llist_on_list(const struct llist_node *node)
 {
-	return node->next != node;
+	return READ_ONCE(node->next) != node;
 }
 
 /**
···
 
 static inline struct llist_node *llist_next(struct llist_node *node)
 {
-	return node->next;
+	return READ_ONCE(node->next);
 }
 
 /**
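
All three hunks rely on the same convention, which the rstat rework below builds on: an llist_node that is not queued anywhere points to itself, so "am I on a list?" can be answered without taking any lock, and the accesses are now wrapped in READ_ONCE()/WRITE_ONCE() so a reader racing with a concurrent insertion sees either the old or the new pointer rather than a torn value. A minimal userspace sketch of that convention (illustrative only, plain assignments in place of the kernel's annotated accessors):

	#include <stdio.h>

	struct llist_node { struct llist_node *next; };

	/* "not on a list" is encoded as a self-pointer rather than NULL */
	static void init_llist_node(struct llist_node *node)
	{
		node->next = node;
	}

	static int llist_on_list(const struct llist_node *node)
	{
		return node->next != node;
	}

	int main(void)
	{
		struct llist_node n;

		init_llist_node(&n);
		printf("queued? %d\n", llist_on_list(&n));	/* 0: points to itself */

		n.next = NULL;	/* roughly what claiming/llist_add() does to the node */
		printf("queued? %d\n", llist_on_list(&n));	/* 1: no longer a self-pointer */
		return 0;
	}
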
-47
include/trace/events/cgroup.h
···
 	TP_ARGS(cgrp, cpu, contended)
 );
 
-/*
- * Related to per CPU locks:
- *  global rstat_base_cpu_lock for base stats
- *  cgroup_subsys::rstat_ss_cpu_lock for subsystem stats
- */
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
-DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath,
-
-	TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
-
-	TP_ARGS(cgrp, cpu, contended)
-);
-
 #endif /* _TRACE_CGROUP_H */
 
 /* This part must be outside protection */
+12 -2
kernel/cgroup/cgroup-v1.c
···
 /* disable named v1 mounts */
 static bool cgroup_no_v1_named;
 
+/* Show unavailable controllers in /proc/cgroups */
+static bool proc_show_all;
+
 /*
  * pidlist destructions need to be flushed on cgroup destruction. Use a
  * separate workqueue as flush domain.
···
 	 */
 
 	for_each_subsys(ss, i) {
-		if (cgroup1_subsys_absent(ss))
-			continue;
 		cgrp_v1_visible |= ss->root != &cgrp_dfl_root;
+
+		if (!proc_show_all && cgroup1_subsys_absent(ss))
+			continue;
 
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->legacy_name, ss->root->hierarchy_id,
···
 	return 1;
 }
 __setup("cgroup_no_v1=", cgroup_no_v1);
+
+static int __init cgroup_v1_proc(char *str)
+{
+	return (kstrtobool(str, &proc_show_all) == 0);
+}
+__setup("cgroup_v1_proc=", cgroup_v1_proc);
+92 -105
kernel/cgroup/rstat.c
···
 #include <trace/events/cgroup.h>
 
 static DEFINE_SPINLOCK(rstat_base_lock);
-static DEFINE_PER_CPU(raw_spinlock_t, rstat_base_cpu_lock);
+static DEFINE_PER_CPU(struct llist_head, rstat_backlog_list);
 
 static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu);
 
···
 	return &rstat_base_lock;
 }
 
-static raw_spinlock_t *ss_rstat_cpu_lock(struct cgroup_subsys *ss, int cpu)
+static inline struct llist_head *ss_lhead_cpu(struct cgroup_subsys *ss, int cpu)
 {
-	if (ss) {
-		/*
-		 * Depending on config, the subsystem per-cpu lock type may be an
-		 * empty struct. In enviromnents where this is the case, allocation
-		 * of this field is not performed in ss_rstat_init(). Avoid a
-		 * cpu-based offset relative to NULL by returning early. When the
-		 * lock type is zero in size, the corresponding lock functions are
-		 * no-ops so passing them NULL is acceptable.
-		 */
-		if (sizeof(*ss->rstat_ss_cpu_lock) == 0)
-			return NULL;
-
-		return per_cpu_ptr(ss->rstat_ss_cpu_lock, cpu);
-	}
-
-	return per_cpu_ptr(&rstat_base_cpu_lock, cpu);
-}
-
-/*
- * Helper functions for rstat per CPU locks.
- *
- * This makes it easier to diagnose locking issues and contention in
- * production environments. The parameter @fast_path determine the
- * tracepoints being added, allowing us to diagnose "flush" related
- * operations without handling high-frequency fast-path "update" events.
- */
-static __always_inline
-unsigned long _css_rstat_cpu_lock(struct cgroup_subsys_state *css, int cpu,
-				  const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-	unsigned long flags;
-	bool contended;
-
-	/*
-	 * The _irqsave() is needed because the locks used for flushing are
-	 * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring this lock
-	 * with the _irq() suffix only disables interrupts on a non-PREEMPT_RT
-	 * kernel. The raw_spinlock_t below disables interrupts on both
-	 * configurations. The _irqsave() ensures that interrupts are always
-	 * disabled and later restored.
-	 */
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	contended = !raw_spin_trylock_irqsave(cpu_lock, flags);
-	if (contended) {
-		if (fast_path)
-			trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended);
-		else
-			trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended);
-
-		raw_spin_lock_irqsave(cpu_lock, flags);
-	}
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended);
-	else
-		trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended);
-
-	return flags;
-}
-
-static __always_inline
-void _css_rstat_cpu_unlock(struct cgroup_subsys_state *css, int cpu,
-			   unsigned long flags, const bool fast_path)
-{
-	struct cgroup *cgrp = css->cgroup;
-	raw_spinlock_t *cpu_lock;
-
-	if (fast_path)
-		trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false);
-	else
-		trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false);
-
-	cpu_lock = ss_rstat_cpu_lock(css->ss, cpu);
-	raw_spin_unlock_irqrestore(cpu_lock, flags);
+	if (ss)
+		return per_cpu_ptr(ss->lhead, cpu);
+	return per_cpu_ptr(&rstat_backlog_list, cpu);
 }
 
 /**
···
  * @css: target cgroup subsystem state
  * @cpu: cpu on which rstat_cpu was updated
  *
- * @css's rstat_cpu on @cpu was updated. Put it on the parent's matching
- * rstat_cpu->updated_children list. See the comment on top of
- * css_rstat_cpu definition for details.
+ * Atomically inserts the css in the ss's llist for the given cpu. This is
+ * reentrant safe i.e. safe against softirq, hardirq and nmi. The ss's llist
+ * will be processed at the flush time to create the update tree.
+ *
+ * NOTE: if the user needs the guarantee that the updater either add itself in
+ * the lockless list or the concurrent flusher flushes its updated stats, a
+ * memory barrier is needed before the call to css_rstat_updated() i.e. a
+ * barrier after updating the per-cpu stats and before calling
+ * css_rstat_updated().
  */
 __bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu)
 {
-	unsigned long flags;
+	struct llist_head *lhead;
+	struct css_rstat_cpu *rstatc;
+	struct css_rstat_cpu __percpu *rstatc_pcpu;
+	struct llist_node *self;
 
 	/*
 	 * Since bpf programs can call this function, prevent access to
···
 	if (!css_uses_rstat(css))
 		return;
 
+	lockdep_assert_preemption_disabled();
+
 	/*
-	 * Speculative already-on-list test. This may race leading to
-	 * temporary inaccuracies, which is fine.
-	 *
-	 * Because @parent's updated_children is terminated with @parent
-	 * instead of NULL, we can tell whether @css is on the list by
-	 * testing the next pointer for NULL.
+	 * For archs withnot nmi safe cmpxchg or percpu ops support, ignore
+	 * the requests from nmi context.
 	 */
-	if (data_race(css_rstat_cpu(css, cpu)->updated_next))
+	if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+	     !IS_ENABLED(CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS)) && in_nmi())
 		return;
 
-	flags = _css_rstat_cpu_lock(css, cpu, true);
+	rstatc = css_rstat_cpu(css, cpu);
+	/*
+	 * If already on list return. This check is racy and smp_mb() is needed
+	 * to pair it with the smp_mb() in css_process_update_tree() if the
+	 * guarantee that the updated stats are visible to concurrent flusher is
+	 * needed.
+	 */
+	if (llist_on_list(&rstatc->lnode))
+		return;
 
+	/*
+	 * This function can be renentered by irqs and nmis for the same cgroup
+	 * and may try to insert the same per-cpu lnode into the llist. Note
+	 * that llist_add() does not protect against such scenarios.
+	 *
+	 * To protect against such stacked contexts of irqs/nmis, we use the
+	 * fact that lnode points to itself when not on a list and then use
+	 * this_cpu_cmpxchg() to atomically set to NULL to select the winner
+	 * which will call llist_add(). The losers can assume the insertion is
+	 * successful and the winner will eventually add the per-cpu lnode to
+	 * the llist.
+	 */
+	self = &rstatc->lnode;
+	rstatc_pcpu = css->rstat_cpu;
+	if (this_cpu_cmpxchg(rstatc_pcpu->lnode.next, self, NULL) != self)
+		return;
+
+	lhead = ss_lhead_cpu(css->ss, cpu);
+	llist_add(&rstatc->lnode, lhead);
+}
+
+static void __css_process_update_tree(struct cgroup_subsys_state *css, int cpu)
+{
 	/* put @css and all ancestors on the corresponding updated lists */
 	while (true) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
···
 
 		css = parent;
 	}
+}
 
-	_css_rstat_cpu_unlock(css, cpu, flags, true);
+static void css_process_update_tree(struct cgroup_subsys *ss, int cpu)
+{
+	struct llist_head *lhead = ss_lhead_cpu(ss, cpu);
+	struct llist_node *lnode;
+
+	while ((lnode = llist_del_first_init(lhead))) {
+		struct css_rstat_cpu *rstatc;
+
+		/*
+		 * smp_mb() is needed here (more specifically in between
+		 * init_llist_node() and per-cpu stats flushing) if the
+		 * guarantee is required by a rstat user where etiher the
+		 * updater should add itself on the lockless list or the
+		 * flusher flush the stats updated by the updater who have
+		 * observed that they are already on the list. The
+		 * corresponding barrier pair for this one should be before
+		 * css_rstat_updated() by the user.
+		 *
+		 * For now, there aren't any such user, so not adding the
+		 * barrier here but if such a use-case arise, please add
+		 * smp_mb() here.
+		 */
+
+		rstatc = container_of(lnode, struct css_rstat_cpu, lnode);
+		__css_process_update_tree(rstatc->owner, cpu);
+	}
 }
 
 /**
···
 {
 	struct css_rstat_cpu *rstatc = css_rstat_cpu(root, cpu);
 	struct cgroup_subsys_state *head = NULL, *parent, *child;
-	unsigned long flags;
 
-	flags = _css_rstat_cpu_lock(root, cpu, false);
+	css_process_update_tree(root->ss, cpu);
 
 	/* Return NULL if this subtree is not on-list */
 	if (!rstatc->updated_next)
-		goto unlock_ret;
+		return NULL;
 
 	/*
 	 * Unlink @root from its parent. As the updated_children list is
···
 	rstatc->updated_children = root;
 	if (child != root)
 		head = css_rstat_push_children(head, child, cpu);
-unlock_ret:
-	_css_rstat_cpu_unlock(root, cpu, flags, false);
+
 	return head;
 }
 
···
 	for_each_possible_cpu(cpu) {
 		struct css_rstat_cpu *rstatc = css_rstat_cpu(css, cpu);
 
-		rstatc->updated_children = css;
+		rstatc->owner = rstatc->updated_children = css;
+		init_llist_node(&rstatc->lnode);
 
 		if (is_self) {
 			struct cgroup_rstat_base_cpu *rstatbc;
···
 {
 	int cpu;
 
-	/*
-	 * Depending on config, the subsystem per-cpu lock type may be an empty
-	 * struct. Avoid allocating a size of zero in this case.
-	 */
-	if (ss && sizeof(*ss->rstat_ss_cpu_lock)) {
-		ss->rstat_ss_cpu_lock = alloc_percpu(raw_spinlock_t);
-		if (!ss->rstat_ss_cpu_lock)
+	if (ss) {
+		ss->lhead = alloc_percpu(struct llist_head);
+		if (!ss->lhead)
 			return -ENOMEM;
 	}
 
 	spin_lock_init(ss_rstat_lock(ss));
 	for_each_possible_cpu(cpu)
-		raw_spin_lock_init(ss_rstat_cpu_lock(ss, cpu));
+		init_llist_head(ss_lhead_cpu(ss, cpu));
 
 	return 0;
 }
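
The long comment inside css_rstat_updated() above is the core idea of the lockless rework: when an irq or nmi re-enters the updater on the same CPU, exactly one context is allowed to "claim" the per-cpu node (by swapping its self-pointer to NULL) before the single llist_add() happens; every other context backs off. A rough userspace analog of that claim step, using C11 atomics where the kernel uses this_cpu_cmpxchg() (a sketch, not the kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	struct node {
		_Atomic(struct node *) next;
	};

	/* Returns 1 only for the single caller that wins the claim. */
	static int try_claim(struct node *n)
	{
		struct node *expected = n;	/* self-pointer means "not queued yet" */

		return atomic_compare_exchange_strong(&n->next, &expected, NULL);
	}

	int main(void)
	{
		struct node n;

		atomic_store(&n.next, &n);	/* initialised off-list: points to itself */

		printf("first claim:  %d\n", try_claim(&n));	/* 1: this context would llist_add() */
		printf("second claim: %d\n", try_claim(&n));	/* 0: a re-entrant context backs off */
		return 0;
	}

The losers can return immediately because the winner is guaranteed to complete the insertion; the flusher later drains the per-cpu llist with llist_del_first_init(), which restores each node's self-pointer, and only then builds the parent/child update tree.
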
+5 -5
mm/memcontrol.c
···
 	if (!val)
 		return;
 
-	/* TODO: add to cgroup update tree once it is nmi-safe. */
-	if (!in_nmi())
-		css_rstat_updated(&memcg->css, cpu);
+	css_rstat_updated(&memcg->css, cpu);
 	statc_pcpu = memcg->vmstats_percpu;
 	for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) {
 		statc = this_cpu_ptr(statc_pcpu);
···
 	} else {
 		struct mem_cgroup_per_node *pn = memcg->nodeinfo[pgdat->node_id];
 
-		/* TODO: add to cgroup update tree once it is nmi-safe. */
+		/* preemption is disabled in_nmi(). */
+		css_rstat_updated(&memcg->css, smp_processor_id());
 		if (idx == NR_SLAB_RECLAIMABLE_B)
 			atomic_add(nr, &pn->slab_reclaimable);
 		else
···
 	if (likely(!in_nmi())) {
 		mod_memcg_state(memcg, MEMCG_KMEM, val);
 	} else {
-		/* TODO: add to cgroup update tree once it is nmi-safe. */
+		/* preemption is disabled in_nmi(). */
+		css_rstat_updated(&memcg->css, smp_processor_id());
 		atomic_add(val, &memcg->kmem_stat);
 	}
 }
+3 -1
tools/testing/selftests/cgroup/lib/cgroup_util.c
···
 #include "cgroup_util.h"
 #include "../../clone3/clone3_selftests.h"
 
+bool cg_test_v1_named;
+
 /* Returns read len on success, or -errno on failure. */
 ssize_t read_text(const char *path, char *buf, size_t max_len)
 {
···
 
 int cg_enter_current_thread(const char *cgroup)
 {
-	return cg_write(cgroup, "cgroup.threads", "0");
+	return cg_write(cgroup, CG_THREADS_FILE, "0");
 }
 
 int cg_run(const char *cgroup,
+5
tools/testing/selftests/cgroup/lib/include/cgroup_util.h
···
 
 #define TEST_UID 65534 /* usually nobody, any !root is fine */
 
+#define CG_THREADS_FILE (!cg_test_v1_named ? "cgroup.threads" : "tasks")
+#define CG_NAMED_NAME "selftest"
+#define CG_PATH_FORMAT (!cg_test_v1_named ? "0::%s" : (":name=" CG_NAMED_NAME ":%s"))
+
 /*
  * Checks if two given values differ by less than err% of their sum.
  */
···
 extern int cg_prepare_for_wait(const char *cgroup);
 extern int memcg_prepare_for_wait(const char *cgroup);
 extern int cg_wait_for(int fd);
+extern bool cg_test_v1_named;
+74 -10
tools/testing/selftests/cgroup/test_core.c
···
 #include <linux/sched.h>
 #include <sys/types.h>
 #include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
 #include <sys/wait.h>
 #include <unistd.h>
 #include <fcntl.h>
···
 #include "cgroup_util.h"
 
 static bool nsdelegate;
+#ifndef CLONE_NEWCGROUP
+#define CLONE_NEWCGROUP 0
+#endif
 
 static int touch_anon(char *buf, size_t size)
 {
···
 	char *cg_test_c = NULL, *cg_test_d = NULL;
 	int cgroup_fd = -EBADF;
 	pid_t pid;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	cg_test_a = cg_name(root, "cg_test_a");
 	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
···
 	int ret = KSFT_FAIL;
 	char *grandparent = NULL, *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	grandparent = cg_name(root, "cg_test_grandparent");
 	parent = cg_name(root, "cg_test_grandparent/cg_test_parent");
 	child = cg_name(root, "cg_test_grandparent/cg_test_parent/cg_test_child");
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
 	if (!parent || !child)
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
-	if (cg_read_strstr(root, "cgroup.controllers", "cpu") ||
+	if (cg_test_v1_named ||
+	    cg_read_strstr(root, "cgroup.controllers", "cpu") ||
 	    cg_write(root, "cgroup.subtree_control", "+cpu")) {
 		ret = KSFT_SKIP;
 		goto cleanup;
···
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
 
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
+
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
 	if (!parent || !child)
···
 {
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
···
 {
 	int ret = KSFT_FAIL;
 	char *parent = NULL, *child = NULL;
+
+	if (cg_test_v1_named)
+		return KSFT_SKIP;
 
 	parent = cg_name(root, "cg_test_parent");
 	child = cg_name(root, "cg_test_parent/cg_test_child");
···
 	}
 
 	cg_enter_current(dst);
-	if (cg_read_lc(dst, "cgroup.threads") != n_threads + 1)
+	if (cg_read_lc(dst, CG_THREADS_FILE) != n_threads + 1)
 		goto cleanup;
 
 	ret = KSFT_PASS;
···
 	char lines[3][PATH_MAX];
 
 	for (g = 1; g < 3; ++g)
-		snprintf(lines[g], sizeof(lines[g]), "0::%s", grps[g] + strlen(grps[0]));
+		snprintf(lines[g], sizeof(lines[g]), CG_PATH_FORMAT, grps[g] + strlen(grps[0]));
 
 	for (i = 0; i < n_iterations; ++i) {
 		cg_enter_current_thread(grps[(i % 2) + 1]);
···
 	if (cg_create(grps[2]))
 		goto cleanup;
 
-	if (cg_write(grps[1], "cgroup.type", "threaded"))
-		goto cleanup;
-	if (cg_write(grps[2], "cgroup.type", "threaded"))
-		goto cleanup;
+	if (!cg_test_v1_named) {
+		if (cg_write(grps[1], "cgroup.type", "threaded"))
+			goto cleanup;
+		if (cg_write(grps[2], "cgroup.type", "threaded"))
+			goto cleanup;
+	}
 
 	if (cg_enter_current(grps[1]))
 		goto cleanup;
···
 	if (retval)
 		goto cleanup;
 
-	snprintf(line, sizeof(line), "0::%s", grps[1] + strlen(grps[0]));
+	snprintf(line, sizeof(line), CG_PATH_FORMAT, grps[1] + strlen(grps[0]));
 	if (proc_read_strstr(0, 1, "cgroup", line))
 		goto cleanup;
 
···
 	return ret;
 }
 
+static int setup_named_v1_root(char *root, size_t len, const char *name)
+{
+	char options[PATH_MAX];
+	int r;
+
+	r = snprintf(root, len, "/mnt/cg_selftest");
+	if (r < 0)
+		return r;
+
+	r = snprintf(options, sizeof(options), "none,name=%s", name);
+	if (r < 0)
+		return r;
+
+	r = mkdir(root, 0755);
+	if (r < 0 && errno != EEXIST)
+		return r;
+
+	r = mount("none", root, "cgroup", 0, options);
+	if (r < 0)
+		return r;
+
+	return 0;
+}
+
+static void cleanup_named_v1_root(char *root)
+{
+	if (!cg_test_v1_named)
+		return;
+	umount(root);
+	rmdir(root);
+}
+
 #define T(x) { x, #x }
 struct corecg_test {
 	int (*fn)(const char *root);
···
 	char root[PATH_MAX];
 	int i, ret = EXIT_SUCCESS;
 
-	if (cg_find_unified_root(root, sizeof(root), &nsdelegate))
-		ksft_exit_skip("cgroup v2 isn't mounted\n");
+	if (cg_find_unified_root(root, sizeof(root), &nsdelegate)) {
+		if (setup_named_v1_root(root, sizeof(root), CG_NAMED_NAME))
+			ksft_exit_skip("cgroup v2 isn't mounted and could not setup named v1 hierarchy\n");
+		cg_test_v1_named = true;
+		goto post_v2_setup;
+	}
 
 	if (cg_read_strstr(root, "cgroup.subtree_control", "memory"))
 		if (cg_write(root, "cgroup.subtree_control", "+memory"))
 			ksft_exit_skip("Failed to set memory controller\n");
 
+post_v2_setup:
 	for (i = 0; i < ARRAY_SIZE(tests); i++) {
 		switch (tests[i].fn(root)) {
 		case KSFT_PASS:
···
 		}
 	}
 
+	cleanup_named_v1_root(root);
 	return ret;
 }
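
For context, setup_named_v1_root() above does programmatically what one would otherwise do by hand to obtain a controller-less, named v1 hierarchy, roughly equivalent to "mount -t cgroup -o none,name=selftest none /mnt/cg_selftest". On such a hierarchy processes are listed through a "tasks" file rather than "cgroup.threads", and /proc/<pid>/cgroup carries a ":name=selftest:" entry, which is exactly what the CG_THREADS_FILE and CG_PATH_FORMAT helpers in cgroup_util.h switch between.
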
+43 -20
tools/testing/selftests/cgroup/test_cpu.c
···
 
 #define _GNU_SOURCE
 #include <linux/limits.h>
+#include <sys/param.h>
 #include <sys/sysinfo.h>
 #include <sys/wait.h>
 #include <errno.h>
···
 static int test_cpucg_max(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long usage_usec, user_usec;
-	long usage_seconds = 1;
-	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	long quota_usec = 1000;
+	long default_period_usec = 100000; /* cpu.max's default period */
+	long duration_seconds = 1;
+
+	long duration_usec = duration_seconds * USEC_PER_SEC;
+	long usage_usec, n_periods, remainder_usec, expected_usage_usec;
 	char *cpucg;
+	char quota_buf[32];
+
+	snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec);
 
 	cpucg = cg_name(root, "cpucg_test");
 	if (!cpucg)
···
 	if (cg_create(cpucg))
 		goto cleanup;
 
-	if (cg_write(cpucg, "cpu.max", "1000"))
+	if (cg_write(cpucg, "cpu.max", quota_buf))
 		goto cleanup;
 
 	struct cpu_hog_func_param param = {
 		.nprocs = 1,
 		.ts = {
-			.tv_sec = usage_seconds,
+			.tv_sec = duration_seconds,
 			.tv_nsec = 0,
 		},
 		.clock_type = CPU_HOG_CLOCK_WALL,
···
 		goto cleanup;
 
 	usage_usec = cg_read_key_long(cpucg, "cpu.stat", "usage_usec");
-	user_usec = cg_read_key_long(cpucg, "cpu.stat", "user_usec");
-	if (user_usec <= 0)
+	if (usage_usec <= 0)
 		goto cleanup;
 
-	if (user_usec >= expected_usage_usec)
-		goto cleanup;
+	/*
+	 * The following calculation applies only since
+	 * the cpu hog is set to run as per wall-clock time
+	 */
+	n_periods = duration_usec / default_period_usec;
+	remainder_usec = duration_usec - n_periods * default_period_usec;
+	expected_usage_usec
+		= n_periods * quota_usec + MIN(remainder_usec, quota_usec);
 
-	if (values_close(usage_usec, expected_usage_usec, 95))
+	if (!values_close(usage_usec, expected_usage_usec, 10))
 		goto cleanup;
 
 	ret = KSFT_PASS;
···
 static int test_cpucg_max_nested(const char *root)
 {
 	int ret = KSFT_FAIL;
-	long usage_usec, user_usec;
-	long usage_seconds = 1;
-	long expected_usage_usec = usage_seconds * USEC_PER_SEC;
+	long quota_usec = 1000;
+	long default_period_usec = 100000; /* cpu.max's default period */
+	long duration_seconds = 1;
+
+	long duration_usec = duration_seconds * USEC_PER_SEC;
+	long usage_usec, n_periods, remainder_usec, expected_usage_usec;
 	char *parent, *child;
+	char quota_buf[32];
+
+	snprintf(quota_buf, sizeof(quota_buf), "%ld", quota_usec);
 
 	parent = cg_name(root, "cpucg_parent");
 	child = cg_name(parent, "cpucg_child");
···
 	if (cg_create(child))
 		goto cleanup;
 
-	if (cg_write(parent, "cpu.max", "1000"))
+	if (cg_write(parent, "cpu.max", quota_buf))
 		goto cleanup;
 
 	struct cpu_hog_func_param param = {
 		.nprocs = 1,
 		.ts = {
-			.tv_sec = usage_seconds,
+			.tv_sec = duration_seconds,
 			.tv_nsec = 0,
 		},
 		.clock_type = CPU_HOG_CLOCK_WALL,
···
 		goto cleanup;
 
 	usage_usec = cg_read_key_long(child, "cpu.stat", "usage_usec");
-	user_usec = cg_read_key_long(child, "cpu.stat", "user_usec");
-	if (user_usec <= 0)
+	if (usage_usec <= 0)
 		goto cleanup;
 
-	if (user_usec >= expected_usage_usec)
-		goto cleanup;
+	/*
+	 * The following calculation applies only since
+	 * the cpu hog is set to run as per wall-clock time
+	 */
+	n_periods = duration_usec / default_period_usec;
+	remainder_usec = duration_usec - n_periods * default_period_usec;
+	expected_usage_usec
+		= n_periods * quota_usec + MIN(remainder_usec, quota_usec);
 
-	if (values_close(usage_usec, expected_usage_usec, 95))
+	if (!values_close(usage_usec, expected_usage_usec, 10))
 		goto cleanup;
 
 	ret = KSFT_PASS;
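
Worked through with the constants used above: quota_usec = 1000 against the default 100000 usec period, with a wall-clock hog of duration_seconds = 1, gives duration_usec = 1000000, n_periods = 1000000 / 100000 = 10, remainder_usec = 0, and therefore expected_usage_usec = 10 * 1000 + MIN(0, 1000) = 10000 usec. The rewritten check thus expects roughly 10 ms of CPU time over the one-second run, within the values_close() tolerance, instead of the old, much looser comparison against a full second of usage.
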
+4 -1
tools/testing/selftests/cgroup/test_kmem.c
···
 	char *parent;
 	long dead;
 	int i;
+	int max_time = 20;
 
 	parent = cg_name(root, "kmem_dead_cgroups_test");
 	if (!parent)
···
 	if (cg_run_in_subcgroups(parent, alloc_dcache, (void *)100, 30))
 		goto cleanup;
 
-	for (i = 0; i < 5; i++) {
+	for (i = 0; i < max_time; i++) {
 		dead = cg_read_key_long(parent, "cgroup.stat",
 					"nr_dying_descendants ");
 		if (dead == 0) {
···
 		 * let's wait a bit and repeat.
 		 */
 		sleep(1);
+		if (i > 5)
+			printf("Waiting time longer than 5s; wait: %ds (dead: %ld)\n", i, dead);
 	}
 
 cleanup:
+1 -1
tools/testing/selftests/cgroup/test_zswap.c
···
 		return -1;
 
 	if (wb != !!zswpwb_after) {
-		ksft_print_msg("zswpwb_after is %ld while wb is %s",
+		ksft_print_msg("zswpwb_after is %ld while wb is %s\n",
 			       zswpwb_after, wb ? "enabled" : "disabled");
 		return -1;
 	}