Merge tag 'cgroup-for-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

+28 -5

Documentation/admin-guide/cgroup-v2.rst

··· 15 15 16 16 .. CONTENTS 17 17 18 + [Whenever any new section is added to this document, please also add 19 + an entry here.] 20 + 18 21 1. Introduction 19 22 1-1. Terminology 20 23 1-2. What is cgroup? ··· 28 25 2-2-2. Threads 29 26 2-3. [Un]populated Notification 30 27 2-4. Controlling Controllers 31 - 2-4-1. Enabling and Disabling 32 - 2-4-2. Top-down Constraint 33 - 2-4-3. No Internal Process Constraint 28 + 2-4-1. Availability 29 + 2-4-2. Enabling and Disabling 30 + 2-4-3. Top-down Constraint 31 + 2-4-4. No Internal Process Constraint 34 32 2-5. Delegation 35 33 2-5-1. Model of Delegation 36 34 2-5-2. Delegation Containment ··· 65 61 5-4-1. PID Interface Files 66 62 5-5. Cpuset 67 63 5.5-1. Cpuset Interface Files 68 - 5-6. Device 64 + 5-6. Device controller 69 65 5-7. RDMA 70 66 5-7-1. RDMA Interface Files 71 67 5-8. DMEM 68 + 5-8-1. DMEM Interface Files 72 69 5-9. HugeTLB 73 70 5.9-1. HugeTLB Interface Files 74 71 5-10. Misc 75 - 5.10-1 Miscellaneous cgroup Interface Files 72 + 5.10-1 Misc Interface Files 76 73 5.10-2 Migration and Ownership 77 74 5-11. Others 78 75 5-11-1. perf_event ··· 1005 1000 nr_dying_subsys_<cgroup_subsys> 1006 1001 Total number of dying cgroup subsystems (e.g. memory 1007 1002 cgroup) at and beneath the current cgroup. 1003 + 1004 + cgroup.stat.local 1005 + A read-only flat-keyed file which exists in non-root cgroups. 1006 + The following entry is defined: 1007 + 1008 + frozen_usec 1009 + Cumulative time that this cgroup has spent between freezing and 1010 + thawing, regardless of whether by self or ancestor groups. 1011 + NB: (not) reaching "frozen" state is not accounted here. 1012 + 1013 + Using the following ASCII representation of a cgroup's freezer 1014 + state, :: 1015 + 1016 + 1 _____ 1017 + frozen 0 __/ \__ 1018 + ab cd 1019 + 1020 + the duration being measured is the span between a and c. 1008 1021 1009 1022 cgroup.freeze 1010 1023 A read-write single value file which exists on non-root cgroups.

+41 -2

include/linux/cgroup-defs.h

··· 91 91 * cgroup_threadgroup_rwsem. This makes hot path operations such as 92 92 * forks and exits into the slow path and more expensive. 93 93 * 94 + * Alleviate the contention between fork, exec, exit operations and 95 + * writing to cgroup.procs by taking a per threadgroup rwsem instead of 96 + * the global cgroup_threadgroup_rwsem. Fork and other operations 97 + * from threads in different thread groups no longer contend with 98 + * writing to cgroup.procs. 99 + * 94 100 * The static usage pattern of creating a cgroup, enabling controllers, 95 101 * and then seeding it with CLONE_INTO_CGROUP doesn't require write 96 102 * locking cgroup_threadgroup_rwsem and thus doesn't benefit from ··· 144 138 __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ 145 139 __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ 146 140 __CFTYPE_ADDED = (1 << 18), 141 + }; 142 + 143 + enum cgroup_attach_lock_mode { 144 + /* Default */ 145 + CGRP_ATTACH_LOCK_GLOBAL, 146 + 147 + /* When pid=0 && threadgroup=false, see comments in cgroup_procs_write_start */ 148 + CGRP_ATTACH_LOCK_NONE, 149 + 150 + /* When favordynmods is on, see comments above CGRP_ROOT_FAVOR_DYNMODS */ 151 + CGRP_ATTACH_LOCK_PER_THREADGROUP, 147 152 }; 148 153 149 154 /* ··· 450 433 * frozen, SIGSTOPped, and PTRACEd. 451 434 */ 452 435 int nr_frozen_tasks; 436 + 437 + /* Freeze time data consistency protection */ 438 + seqcount_t freeze_seq; 439 + 440 + /* 441 + * Most recent time the cgroup was requested to freeze. 442 + * Accesses guarded by freeze_seq counter. Writes serialized 443 + * by css_set_lock. 444 + */ 445 + u64 freeze_start_nsec; 446 + 447 + /* 448 + * Total duration the cgroup has spent freezing. 449 + * Accesses guarded by freeze_seq counter. Writes serialized 450 + * by css_set_lock. 
451 + */ 452 + u64 frozen_nsec; 453 453 }; 454 454 455 455 struct cgroup { ··· 780 746 int (*can_attach)(struct cgroup_taskset *tset); 781 747 void (*cancel_attach)(struct cgroup_taskset *tset); 782 748 void (*attach)(struct cgroup_taskset *tset); 783 - void (*post_attach)(void); 784 749 int (*can_fork)(struct task_struct *task, 785 750 struct css_set *cset); 786 751 void (*cancel_fork)(struct task_struct *task, struct css_set *cset); ··· 855 822 }; 856 823 857 824 extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 825 + extern bool cgroup_enable_per_threadgroup_rwsem; 858 826 859 827 struct cgroup_of_peak { 860 828 unsigned long value; ··· 867 833 * @tsk: target task 868 834 * 869 835 * Allows cgroup operations to synchronize against threadgroup changes 870 - * using a percpu_rw_semaphore. 836 + * using a global percpu_rw_semaphore and a per threadgroup rw_semaphore when 837 + * favordynmods is on. See the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 871 838 */ 872 839 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) 873 840 { 874 841 percpu_down_read(&cgroup_threadgroup_rwsem); 842 + if (cgroup_enable_per_threadgroup_rwsem) 843 + down_read(&tsk->signal->cgroup_threadgroup_rwsem); 875 844 } 876 845 877 846 /** ··· 885 848 */ 886 849 static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) 887 850 { 851 + if (cgroup_enable_per_threadgroup_rwsem) 852 + up_read(&tsk->signal->cgroup_threadgroup_rwsem); 888 853 percpu_up_read(&cgroup_threadgroup_rwsem); 889 854 } 890 855

+5

include/linux/cgroup.h

··· 355 355 return css->flags & CSS_DYING; 356 356 } 357 357 358 + static inline bool css_is_online(struct cgroup_subsys_state *css) 359 + { 360 + return css->flags & CSS_ONLINE; 361 + } 362 + 358 363 static inline bool css_is_self(struct cgroup_subsys_state *css) 359 364 { 360 365 if (css == &css->cgroup->self) {

+4

include/linux/sched/signal.h

··· 226 226 struct tty_audit_buf *tty_audit_buf; 227 227 #endif 228 228 229 + #ifdef CONFIG_CGROUPS 230 + struct rw_semaphore cgroup_threadgroup_rwsem; 231 + #endif 232 + 229 233 /* 230 234 * Thread is the potential origin of an oom condition; kill first on 231 235 * oom

+3

init/init_task.c

··· 27 27 }, 28 28 .multiprocess = HLIST_HEAD_INIT, 29 29 .rlim = INIT_RLIMITS, 30 + #ifdef CONFIG_CGROUPS 31 + .cgroup_threadgroup_rwsem = __RWSEM_INITIALIZER(init_signals.cgroup_threadgroup_rwsem), 32 + #endif 30 33 .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), 31 34 .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), 32 35 #ifdef CONFIG_POSIX_TIMERS

+7 -4

kernel/cgroup/cgroup-internal.h

··· 249 249 250 250 int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, 251 251 bool threadgroup); 252 - void cgroup_attach_lock(bool lock_threadgroup); 253 - void cgroup_attach_unlock(bool lock_threadgroup); 252 + void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, 253 + struct task_struct *tsk); 254 + void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, 255 + struct task_struct *tsk); 254 256 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, 255 - bool *locked) 257 + enum cgroup_attach_lock_mode *lock_mode) 256 258 __acquires(&cgroup_threadgroup_rwsem); 257 - void cgroup_procs_write_finish(struct task_struct *task, bool locked) 259 + void cgroup_procs_write_finish(struct task_struct *task, 260 + enum cgroup_attach_lock_mode lock_mode) 258 261 __releases(&cgroup_threadgroup_rwsem); 259 262 260 263 void cgroup_lock_and_drain_offline(struct cgroup *cgrp);

+10 -9

kernel/cgroup/cgroup-v1.c

··· 10 10 #include <linux/sched/task.h> 11 11 #include <linux/magic.h> 12 12 #include <linux/slab.h> 13 + #include <linux/string.h> 13 14 #include <linux/vmalloc.h> 14 15 #include <linux/delayacct.h> 15 16 #include <linux/pid_namespace.h> ··· 69 68 int retval = 0; 70 69 71 70 cgroup_lock(); 72 - cgroup_attach_lock(true); 71 + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); 73 72 for_each_root(root) { 74 73 struct cgroup *from_cgrp; 75 74 ··· 81 80 if (retval) 82 81 break; 83 82 } 84 - cgroup_attach_unlock(true); 83 + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); 85 84 cgroup_unlock(); 86 85 87 86 return retval; ··· 118 117 119 118 cgroup_lock(); 120 119 121 - cgroup_attach_lock(true); 120 + cgroup_attach_lock(CGRP_ATTACH_LOCK_GLOBAL, NULL); 122 121 123 122 /* all tasks in @from are being moved, all csets are source */ 124 123 spin_lock_irq(&css_set_lock); ··· 154 153 } while (task && !ret); 155 154 out_err: 156 155 cgroup_migrate_finish(&mgctx); 157 - cgroup_attach_unlock(true); 156 + cgroup_attach_unlock(CGRP_ATTACH_LOCK_GLOBAL, NULL); 158 157 cgroup_unlock(); 159 158 return ret; 160 159 } ··· 503 502 struct task_struct *task; 504 503 const struct cred *cred, *tcred; 505 504 ssize_t ret; 506 - bool locked; 505 + enum cgroup_attach_lock_mode lock_mode; 507 506 508 507 cgrp = cgroup_kn_lock_live(of->kn, false); 509 508 if (!cgrp) 510 509 return -ENODEV; 511 510 512 - task = cgroup_procs_write_start(buf, threadgroup, &locked); 511 + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); 513 512 ret = PTR_ERR_OR_ZERO(task); 514 513 if (ret) 515 514 goto out_unlock; ··· 532 531 ret = cgroup_attach_task(cgrp, task, threadgroup); 533 532 534 533 out_finish: 535 - cgroup_procs_write_finish(task, locked); 534 + cgroup_procs_write_finish(task, lock_mode); 536 535 out_unlock: 537 536 cgroup_kn_unlock(of->kn); 538 537 ··· 1134 1133 1135 1134 if (ctx->release_agent) { 1136 1135 spin_lock(&release_agent_path_lock); 1137 - strcpy(root->release_agent_path, 
ctx->release_agent); 1136 + strscpy(root->release_agent_path, ctx->release_agent); 1138 1137 spin_unlock(&release_agent_path_lock); 1139 1138 } 1140 1139 ··· 1326 1325 * Cap @max_active to 1 too. 1327 1326 */ 1328 1327 cgroup_pidlist_destroy_wq = alloc_workqueue("cgroup_pidlist_destroy", 1329 - 0, 1); 1328 + WQ_PERCPU, 1); 1330 1329 BUG_ON(!cgroup_pidlist_destroy_wq); 1331 1330 return 0; 1332 1331 }

+152 -47

kernel/cgroup/cgroup.c

··· 125 125 /* 126 126 * cgroup destruction makes heavy use of work items and there can be a lot 127 127 * of concurrent destructions. Use a separate workqueue so that cgroup 128 - * destruction work items don't end up filling up max_active of system_wq 128 + * destruction work items don't end up filling up max_active of system_percpu_wq 129 129 * which may lead to deadlock. 130 130 * 131 131 * A cgroup destruction should enqueue work sequentially to: ··· 239 239 static u16 have_canfork_callback __read_mostly; 240 240 241 241 static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS); 242 + 243 + /* 244 + * Write protected by cgroup_mutex and write-lock of cgroup_threadgroup_rwsem, 245 + * read protected by either. 246 + * 247 + * Can only be turned on, but not turned off. 248 + */ 249 + bool cgroup_enable_per_threadgroup_rwsem __read_mostly; 242 250 243 251 /* cgroup namespace for init task */ 244 252 struct cgroup_namespace init_cgroup_ns = { ··· 1335 1327 { 1336 1328 bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; 1337 1329 1338 - /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ 1330 + /* 1331 + * see the comment above CGRP_ROOT_FAVOR_DYNMODS definition. 1332 + * favordynmods can flip while task is between 1333 + * cgroup_threadgroup_change_begin() and end(), so down_write global 1334 + * cgroup_threadgroup_rwsem to synchronize them. 1335 + * 1336 + * Once cgroup_enable_per_threadgroup_rwsem is enabled, holding 1337 + * cgroup_threadgroup_rwsem doesn't exclude tasks between 1338 + * cgroup_threadgroup_change_begin() and end() and thus it's unsafe to 1339 + * turn off. As the scenario is unlikely, simply disallow disabling once 1340 + * enabled and print out a warning. 
1341 + */ 1342 + percpu_down_write(&cgroup_threadgroup_rwsem); 1339 1343 if (favor && !favoring) { 1344 + cgroup_enable_per_threadgroup_rwsem = true; 1340 1345 rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); 1341 1346 root->flags |= CGRP_ROOT_FAVOR_DYNMODS; 1342 1347 } else if (!favor && favoring) { 1348 + if (cgroup_enable_per_threadgroup_rwsem) 1349 + pr_warn_once("cgroup favordynmods: per threadgroup rwsem mechanism can't be disabled\n"); 1343 1350 rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); 1344 1351 root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; 1345 1352 } 1353 + percpu_up_write(&cgroup_threadgroup_rwsem); 1346 1354 } 1347 1355 1348 1356 static int cgroup_init_root_id(struct cgroup_root *root) ··· 2508 2484 2509 2485 /** 2510 2486 * cgroup_attach_lock - Lock for ->attach() 2511 - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem 2487 + * @lock_mode: whether acquire and acquire which rwsem 2488 + * @tsk: thread group to lock 2512 2489 * 2513 2490 * cgroup migration sometimes needs to stabilize threadgroups against forks and 2514 2491 * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach() ··· 2529 2504 * Resolve the situation by always acquiring cpus_read_lock() before optionally 2530 2505 * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that 2531 2506 * CPU hotplug is disabled on entry. 2507 + * 2508 + * When favordynmods is enabled, take per threadgroup rwsem to reduce overhead 2509 + * on dynamic cgroup modifications. see the comment above 2510 + * CGRP_ROOT_FAVOR_DYNMODS definition. 2511 + * 2512 + * tsk is not NULL only when writing to cgroup.procs. 
2532 2513 */ 2533 - void cgroup_attach_lock(bool lock_threadgroup) 2514 + void cgroup_attach_lock(enum cgroup_attach_lock_mode lock_mode, 2515 + struct task_struct *tsk) 2534 2516 { 2535 2517 cpus_read_lock(); 2536 - if (lock_threadgroup) 2518 + 2519 + switch (lock_mode) { 2520 + case CGRP_ATTACH_LOCK_NONE: 2521 + break; 2522 + case CGRP_ATTACH_LOCK_GLOBAL: 2537 2523 percpu_down_write(&cgroup_threadgroup_rwsem); 2524 + break; 2525 + case CGRP_ATTACH_LOCK_PER_THREADGROUP: 2526 + down_write(&tsk->signal->cgroup_threadgroup_rwsem); 2527 + break; 2528 + default: 2529 + pr_warn("cgroup: Unexpected attach lock mode."); 2530 + break; 2531 + } 2538 2532 } 2539 2533 2540 2534 /** 2541 2535 * cgroup_attach_unlock - Undo cgroup_attach_lock() 2542 - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem 2536 + * @lock_mode: whether release and release which rwsem 2537 + * @tsk: thread group to lock 2543 2538 */ 2544 - void cgroup_attach_unlock(bool lock_threadgroup) 2539 + void cgroup_attach_unlock(enum cgroup_attach_lock_mode lock_mode, 2540 + struct task_struct *tsk) 2545 2541 { 2546 - if (lock_threadgroup) 2542 + switch (lock_mode) { 2543 + case CGRP_ATTACH_LOCK_NONE: 2544 + break; 2545 + case CGRP_ATTACH_LOCK_GLOBAL: 2547 2546 percpu_up_write(&cgroup_threadgroup_rwsem); 2547 + break; 2548 + case CGRP_ATTACH_LOCK_PER_THREADGROUP: 2549 + up_write(&tsk->signal->cgroup_threadgroup_rwsem); 2550 + break; 2551 + default: 2552 + pr_warn("cgroup: Unexpected attach lock mode."); 2553 + break; 2554 + } 2555 + 2548 2556 cpus_read_unlock(); 2549 2557 } 2550 2558 ··· 3027 2969 3028 2970 /* look up all src csets */ 3029 2971 spin_lock_irq(&css_set_lock); 3030 - rcu_read_lock(); 3031 2972 task = leader; 3032 2973 do { 3033 2974 cgroup_migrate_add_src(task_css_set(task), dst_cgrp, &mgctx); 3034 2975 if (!threadgroup) 3035 2976 break; 3036 2977 } while_each_thread(leader, task); 3037 - rcu_read_unlock(); 3038 2978 spin_unlock_irq(&css_set_lock); 3039 2979 3040 2980 /* prepare dst 
csets and commit */ ··· 3049 2993 } 3050 2994 3051 2995 struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, 3052 - bool *threadgroup_locked) 2996 + enum cgroup_attach_lock_mode *lock_mode) 3053 2997 { 3054 2998 struct task_struct *tsk; 3055 2999 pid_t pid; ··· 3057 3001 if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) 3058 3002 return ERR_PTR(-EINVAL); 3059 3003 3060 - /* 3061 - * If we migrate a single thread, we don't care about threadgroup 3062 - * stability. If the thread is `current`, it won't exit(2) under our 3063 - * hands or change PID through exec(2). We exclude 3064 - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write 3065 - * callers by cgroup_mutex. 3066 - * Therefore, we can skip the global lock. 3067 - */ 3068 - lockdep_assert_held(&cgroup_mutex); 3069 - *threadgroup_locked = pid || threadgroup; 3070 - cgroup_attach_lock(*threadgroup_locked); 3071 - 3004 + retry_find_task: 3072 3005 rcu_read_lock(); 3073 3006 if (pid) { 3074 3007 tsk = find_task_by_vpid(pid); 3075 3008 if (!tsk) { 3076 3009 tsk = ERR_PTR(-ESRCH); 3077 - goto out_unlock_threadgroup; 3010 + goto out_unlock_rcu; 3078 3011 } 3079 3012 } else { 3080 3013 tsk = current; ··· 3080 3035 */ 3081 3036 if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) { 3082 3037 tsk = ERR_PTR(-EINVAL); 3083 - goto out_unlock_threadgroup; 3038 + goto out_unlock_rcu; 3039 + } 3040 + get_task_struct(tsk); 3041 + rcu_read_unlock(); 3042 + 3043 + /* 3044 + * If we migrate a single thread, we don't care about threadgroup 3045 + * stability. If the thread is `current`, it won't exit(2) under our 3046 + * hands or change PID through exec(2). We exclude 3047 + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write callers 3048 + * by cgroup_mutex. Therefore, we can skip the global lock. 
3049 + */ 3050 + lockdep_assert_held(&cgroup_mutex); 3051 + 3052 + if (pid || threadgroup) { 3053 + if (cgroup_enable_per_threadgroup_rwsem) 3054 + *lock_mode = CGRP_ATTACH_LOCK_PER_THREADGROUP; 3055 + else 3056 + *lock_mode = CGRP_ATTACH_LOCK_GLOBAL; 3057 + } else { 3058 + *lock_mode = CGRP_ATTACH_LOCK_NONE; 3084 3059 } 3085 3060 3086 - get_task_struct(tsk); 3087 - goto out_unlock_rcu; 3061 + cgroup_attach_lock(*lock_mode, tsk); 3088 3062 3089 - out_unlock_threadgroup: 3090 - cgroup_attach_unlock(*threadgroup_locked); 3091 - *threadgroup_locked = false; 3063 + if (threadgroup) { 3064 + if (!thread_group_leader(tsk)) { 3065 + /* 3066 + * A race with de_thread from another thread's exec() 3067 + * may strip us of our leadership. If this happens, 3068 + * throw this task away and try again. 3069 + */ 3070 + cgroup_attach_unlock(*lock_mode, tsk); 3071 + put_task_struct(tsk); 3072 + goto retry_find_task; 3073 + } 3074 + } 3075 + 3076 + return tsk; 3077 + 3092 3078 out_unlock_rcu: 3093 3079 rcu_read_unlock(); 3094 3080 return tsk; 3095 3081 } 3096 3082 3097 - void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked) 3083 + void cgroup_procs_write_finish(struct task_struct *task, 3084 + enum cgroup_attach_lock_mode lock_mode) 3098 3085 { 3099 - struct cgroup_subsys *ss; 3100 - int ssid; 3086 + cgroup_attach_unlock(lock_mode, task); 3101 3087 3102 3088 /* release reference from cgroup_procs_write_start() */ 3103 3089 put_task_struct(task); 3104 - 3105 - cgroup_attach_unlock(threadgroup_locked); 3106 - 3107 - for_each_subsys(ss, ssid) 3108 - if (ss->post_attach) 3109 - ss->post_attach(); 3110 3090 } 3111 3091 3112 3092 static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) ··· 3183 3113 struct cgroup_subsys_state *d_css; 3184 3114 struct cgroup *dsct; 3185 3115 struct css_set *src_cset; 3116 + enum cgroup_attach_lock_mode lock_mode; 3186 3117 bool has_tasks; 3187 3118 int ret; 3188 3119 ··· 3215 3144 * write-locking can be skipped 
safely. 3216 3145 */ 3217 3146 has_tasks = !list_empty(&mgctx.preloaded_src_csets); 3218 - cgroup_attach_lock(has_tasks); 3147 + 3148 + if (has_tasks) 3149 + lock_mode = CGRP_ATTACH_LOCK_GLOBAL; 3150 + else 3151 + lock_mode = CGRP_ATTACH_LOCK_NONE; 3152 + 3153 + cgroup_attach_lock(lock_mode, NULL); 3219 3154 3220 3155 /* NULL dst indicates self on default hierarchy */ 3221 3156 ret = cgroup_migrate_prepare_dst(&mgctx); ··· 3242 3165 ret = cgroup_migrate_execute(&mgctx); 3243 3166 out_finish: 3244 3167 cgroup_migrate_finish(&mgctx); 3245 - cgroup_attach_unlock(has_tasks); 3168 + cgroup_attach_unlock(lock_mode, NULL); 3246 3169 return ret; 3247 3170 } 3248 3171 ··· 3862 3785 cgroup_subsys[ssid]->name, dying_cnt[ssid]); 3863 3786 } 3864 3787 rcu_read_unlock(); 3788 + return 0; 3789 + } 3790 + 3791 + static int cgroup_core_local_stat_show(struct seq_file *seq, void *v) 3792 + { 3793 + struct cgroup *cgrp = seq_css(seq)->cgroup; 3794 + unsigned int sequence; 3795 + u64 freeze_time; 3796 + 3797 + do { 3798 + sequence = read_seqcount_begin(&cgrp->freezer.freeze_seq); 3799 + freeze_time = cgrp->freezer.frozen_nsec; 3800 + /* Add in current freezer interval if the cgroup is freezing. 
*/ 3801 + if (test_bit(CGRP_FREEZE, &cgrp->flags)) 3802 + freeze_time += (ktime_get_ns() - 3803 + cgrp->freezer.freeze_start_nsec); 3804 + } while (read_seqcount_retry(&cgrp->freezer.freeze_seq, sequence)); 3805 + 3806 + do_div(freeze_time, NSEC_PER_USEC); 3807 + seq_printf(seq, "frozen_usec %llu\n", freeze_time); 3808 + 3865 3809 return 0; 3866 3810 } 3867 3811 ··· 5365 5267 struct task_struct *task; 5366 5268 const struct cred *saved_cred; 5367 5269 ssize_t ret; 5368 - bool threadgroup_locked; 5270 + enum cgroup_attach_lock_mode lock_mode; 5369 5271 5370 5272 dst_cgrp = cgroup_kn_lock_live(of->kn, false); 5371 5273 if (!dst_cgrp) 5372 5274 return -ENODEV; 5373 5275 5374 - task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked); 5276 + task = cgroup_procs_write_start(buf, threadgroup, &lock_mode); 5375 5277 ret = PTR_ERR_OR_ZERO(task); 5376 5278 if (ret) 5377 5279 goto out_unlock; ··· 5397 5299 ret = cgroup_attach_task(dst_cgrp, task, threadgroup); 5398 5300 5399 5301 out_finish: 5400 - cgroup_procs_write_finish(task, threadgroup_locked); 5302 + cgroup_procs_write_finish(task, lock_mode); 5401 5303 out_unlock: 5402 5304 cgroup_kn_unlock(of->kn); 5403 5305 ··· 5477 5379 { 5478 5380 .name = "cgroup.stat", 5479 5381 .seq_show = cgroup_stat_show, 5382 + }, 5383 + { 5384 + .name = "cgroup.stat.local", 5385 + .flags = CFTYPE_NOT_ON_ROOT, 5386 + .seq_show = cgroup_core_local_stat_show, 5480 5387 }, 5481 5388 { 5482 5389 .name = "cgroup.freeze", ··· 5892 5789 * if the parent has to be frozen, the child has too. 5893 5790 */ 5894 5791 cgrp->freezer.e_freeze = parent->freezer.e_freeze; 5792 + seqcount_init(&cgrp->freezer.freeze_seq); 5895 5793 if (cgrp->freezer.e_freeze) { 5896 5794 /* 5897 5795 * Set the CGRP_FREEZE flag, so when a process will be ··· 5901 5797 * consider it frozen immediately. 
5902 5798 */ 5903 5799 set_bit(CGRP_FREEZE, &cgrp->flags); 5800 + cgrp->freezer.freeze_start_nsec = ktime_get_ns(); 5904 5801 set_bit(CGRP_FROZEN, &cgrp->flags); 5905 5802 } 5906 5803 ··· 6457 6352 * We would prefer to do this in cgroup_init() above, but that 6458 6353 * is called before init_workqueues(): so leave this until after. 6459 6354 */ 6460 - cgroup_offline_wq = alloc_workqueue("cgroup_offline", 0, 1); 6355 + cgroup_offline_wq = alloc_workqueue("cgroup_offline", WQ_PERCPU, 1); 6461 6356 BUG_ON(!cgroup_offline_wq); 6462 6357 6463 - cgroup_release_wq = alloc_workqueue("cgroup_release", 0, 1); 6358 + cgroup_release_wq = alloc_workqueue("cgroup_release", WQ_PERCPU, 1); 6464 6359 BUG_ON(!cgroup_release_wq); 6465 6360 6466 - cgroup_free_wq = alloc_workqueue("cgroup_free", 0, 1); 6361 + cgroup_free_wq = alloc_workqueue("cgroup_free", WQ_PERCPU, 1); 6467 6362 BUG_ON(!cgroup_free_wq); 6468 6363 return 0; 6469 6364 }

+3 -2

kernel/cgroup/cpuset-internal.h

··· 38 38 39 39 /* bits in struct cpuset flags field */ 40 40 typedef enum { 41 - CS_ONLINE, 42 41 CS_CPU_EXCLUSIVE, 43 42 CS_MEM_EXCLUSIVE, 44 43 CS_MEM_HARDWALL, ··· 201 202 /* convenient tests for these bits */ 202 203 static inline bool is_cpuset_online(struct cpuset *cs) 203 204 { 204 - return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css); 205 + return css_is_online(&cs->css) && !css_is_dying(&cs->css); 205 206 } 206 207 207 208 static inline int is_cpu_exclusive(const struct cpuset *cs) ··· 276 277 ssize_t cpuset_write_resmask(struct kernfs_open_file *of, 277 278 char *buf, size_t nbytes, loff_t off); 278 279 int cpuset_common_seq_show(struct seq_file *sf, void *v); 280 + void cpuset_full_lock(void); 281 + void cpuset_full_unlock(void); 279 282 280 283 /* 281 284 * cpuset-v1.c

+4 -8

kernel/cgroup/cpuset-v1.c

··· 169 169 cpuset_filetype_t type = cft->private; 170 170 int retval = -ENODEV; 171 171 172 - cpus_read_lock(); 173 - cpuset_lock(); 172 + cpuset_full_lock(); 174 173 if (!is_cpuset_online(cs)) 175 174 goto out_unlock; 176 175 ··· 183 184 break; 184 185 } 185 186 out_unlock: 186 - cpuset_unlock(); 187 - cpus_read_unlock(); 187 + cpuset_full_unlock(); 188 188 return retval; 189 189 } 190 190 ··· 452 454 cpuset_filetype_t type = cft->private; 453 455 int retval = 0; 454 456 455 - cpus_read_lock(); 456 - cpuset_lock(); 457 + cpuset_full_lock(); 457 458 if (!is_cpuset_online(cs)) { 458 459 retval = -ENODEV; 459 460 goto out_unlock; ··· 495 498 break; 496 499 } 497 500 out_unlock: 498 - cpuset_unlock(); 499 - cpus_read_unlock(); 501 + cpuset_full_unlock(); 500 502 return retval; 501 503 } 502 504

+408 -354

kernel/cgroup/cpuset.c

··· 40 40 #include <linux/sched/isolation.h> 41 41 #include <linux/wait.h> 42 42 #include <linux/workqueue.h> 43 + #include <linux/task_work.h> 43 44 44 45 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); 45 46 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); ··· 132 131 #define PRS_INVALID_ROOT -1 133 132 #define PRS_INVALID_ISOLATED -2 134 133 135 - static inline bool is_prs_invalid(int prs_state) 136 - { 137 - return prs_state < 0; 138 - } 139 - 140 134 /* 141 135 * Temporary cpumasks for working with partitions that are passed among 142 136 * functions to avoid memory allocation in inner functions. ··· 155 159 cs->nr_deadline_tasks--; 156 160 } 157 161 158 - static inline int is_partition_valid(const struct cpuset *cs) 162 + static inline bool is_partition_valid(const struct cpuset *cs) 159 163 { 160 164 return cs->partition_root_state > 0; 161 165 } 162 166 163 - static inline int is_partition_invalid(const struct cpuset *cs) 167 + static inline bool is_partition_invalid(const struct cpuset *cs) 164 168 { 165 169 return cs->partition_root_state < 0; 170 + } 171 + 172 + static inline bool cs_is_member(const struct cpuset *cs) 173 + { 174 + return cs->partition_root_state == PRS_MEMBER; 166 175 } 167 176 168 177 /* ··· 208 207 * parallel, we may leave an offline CPU in cpu_allowed or some other masks. 209 208 */ 210 209 static struct cpuset top_cpuset = { 211 - .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | 210 + .flags = BIT(CS_CPU_EXCLUSIVE) | 212 211 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), 213 212 .partition_root_state = PRS_ROOT, 214 213 .relax_domain_level = -1, ··· 251 250 252 251 static DEFINE_MUTEX(cpuset_mutex); 253 252 253 + /** 254 + * cpuset_lock - Acquire the global cpuset mutex 255 + * 256 + * This locks the global cpuset mutex to prevent modifications to cpuset 257 + * hierarchy and configurations. This helper alone is not enough to make modifications. 
258 + */ 254 259 void cpuset_lock(void) 255 260 { 256 261 mutex_lock(&cpuset_mutex); ··· 265 258 void cpuset_unlock(void) 266 259 { 267 260 mutex_unlock(&cpuset_mutex); 261 + } 262 + 263 + /** 264 + * cpuset_full_lock - Acquire full protection for cpuset modification 265 + * 266 + * Takes both CPU hotplug read lock (cpus_read_lock()) and cpuset mutex 267 + * to safely modify cpuset data. 268 + */ 269 + void cpuset_full_lock(void) 270 + { 271 + cpus_read_lock(); 272 + mutex_lock(&cpuset_mutex); 273 + } 274 + 275 + void cpuset_full_unlock(void) 276 + { 277 + mutex_unlock(&cpuset_mutex); 278 + cpus_read_unlock(); 268 279 } 269 280 270 281 static DEFINE_SPINLOCK(callback_lock); ··· 436 411 } 437 412 438 413 /** 439 - * alloc_cpumasks - allocate three cpumasks for cpuset 440 - * @cs: the cpuset that have cpumasks to be allocated. 441 - * @tmp: the tmpmasks structure pointer 414 + * alloc_cpumasks - Allocate an array of cpumask variables 415 + * @pmasks: Pointer to array of cpumask_var_t pointers 416 + * @size: Number of cpumasks to allocate 442 417 * Return: 0 if successful, -ENOMEM otherwise. 443 418 * 444 - * Only one of the two input arguments should be non-NULL. 419 + * Allocates @size cpumasks and initializes them to empty. Returns 0 on 420 + * success, -ENOMEM on allocation failure. On failure, any previously 421 + * allocated cpumasks are freed. 
445 422 */ 446 - static inline int alloc_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 423 + static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size) 447 424 { 448 - cpumask_var_t *pmask1, *pmask2, *pmask3, *pmask4; 425 + int i; 449 426 450 - if (cs) { 451 - pmask1 = &cs->cpus_allowed; 452 - pmask2 = &cs->effective_cpus; 453 - pmask3 = &cs->effective_xcpus; 454 - pmask4 = &cs->exclusive_cpus; 455 - } else { 456 - pmask1 = &tmp->new_cpus; 457 - pmask2 = &tmp->addmask; 458 - pmask3 = &tmp->delmask; 459 - pmask4 = NULL; 427 + for (i = 0; i < size; i++) { 428 + if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) { 429 + while (--i >= 0) 430 + free_cpumask_var(*pmasks[i]); 431 + return -ENOMEM; 432 + } 460 433 } 461 - 462 - if (!zalloc_cpumask_var(pmask1, GFP_KERNEL)) 463 - return -ENOMEM; 464 - 465 - if (!zalloc_cpumask_var(pmask2, GFP_KERNEL)) 466 - goto free_one; 467 - 468 - if (!zalloc_cpumask_var(pmask3, GFP_KERNEL)) 469 - goto free_two; 470 - 471 - if (pmask4 && !zalloc_cpumask_var(pmask4, GFP_KERNEL)) 472 - goto free_three; 473 - 474 - 475 434 return 0; 476 - 477 - free_three: 478 - free_cpumask_var(*pmask3); 479 - free_two: 480 - free_cpumask_var(*pmask2); 481 - free_one: 482 - free_cpumask_var(*pmask1); 483 - return -ENOMEM; 484 435 } 485 436 486 437 /** 487 - * free_cpumasks - free cpumasks in a tmpmasks structure 488 - * @cs: the cpuset that have cpumasks to be free. 438 + * alloc_tmpmasks - Allocate temporary cpumasks for cpuset operations. 439 + * @tmp: Pointer to tmpmasks structure to populate 440 + * Return: 0 on success, -ENOMEM on allocation failure 441 + */ 442 + static inline int alloc_tmpmasks(struct tmpmasks *tmp) 443 + { 444 + /* 445 + * Array of pointers to the three cpumask_var_t fields in tmpmasks. 
446 + * Note: Array size must match actual number of masks (3) 447 + */ 448 + cpumask_var_t *pmask[3] = { 449 + &tmp->new_cpus, 450 + &tmp->addmask, 451 + &tmp->delmask 452 + }; 453 + 454 + return alloc_cpumasks(pmask, ARRAY_SIZE(pmask)); 455 + } 456 + 457 + /** 458 + * free_tmpmasks - free cpumasks in a tmpmasks structure 489 459 * @tmp: the tmpmasks structure pointer 490 460 */ 491 - static inline void free_cpumasks(struct cpuset *cs, struct tmpmasks *tmp) 461 + static inline void free_tmpmasks(struct tmpmasks *tmp) 492 462 { 493 - if (cs) { 494 - free_cpumask_var(cs->cpus_allowed); 495 - free_cpumask_var(cs->effective_cpus); 496 - free_cpumask_var(cs->effective_xcpus); 497 - free_cpumask_var(cs->exclusive_cpus); 498 - } 499 - if (tmp) { 500 - free_cpumask_var(tmp->new_cpus); 501 - free_cpumask_var(tmp->addmask); 502 - free_cpumask_var(tmp->delmask); 503 - } 463 + if (!tmp) 464 + return; 465 + 466 + free_cpumask_var(tmp->new_cpus); 467 + free_cpumask_var(tmp->addmask); 468 + free_cpumask_var(tmp->delmask); 504 469 } 505 470 506 471 /** 507 - * alloc_trial_cpuset - allocate a trial cpuset 508 - * @cs: the cpuset that the trial cpuset duplicates 472 + * dup_or_alloc_cpuset - Duplicate or allocate a new cpuset 473 + * @cs: Source cpuset to duplicate (NULL for a fresh allocation) 474 + * 475 + * Creates a new cpuset by either: 476 + * 1. Duplicating an existing cpuset (if @cs is non-NULL), or 477 + * 2. Allocating a fresh cpuset with zero-initialized masks (if @cs is NULL) 478 + * 479 + * Return: Pointer to newly allocated cpuset on success, NULL on failure 509 480 */ 510 - static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) 481 + static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs) 511 482 { 512 483 struct cpuset *trial; 513 484 514 - trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); 485 + /* Allocate base structure */ 486 + trial = cs ? 
kmemdup(cs, sizeof(*cs), GFP_KERNEL) : 487 + kzalloc(sizeof(*cs), GFP_KERNEL); 515 488 if (!trial) 516 489 return NULL; 517 490 518 - if (alloc_cpumasks(trial, NULL)) { 491 + /* Setup cpumask pointer array */ 492 + cpumask_var_t *pmask[4] = { 493 + &trial->cpus_allowed, 494 + &trial->effective_cpus, 495 + &trial->effective_xcpus, 496 + &trial->exclusive_cpus 497 + }; 498 + 499 + if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) { 519 500 kfree(trial); 520 501 return NULL; 521 502 } 522 503 523 - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 524 - cpumask_copy(trial->effective_cpus, cs->effective_cpus); 525 - cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); 526 - cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); 504 + /* Copy masks if duplicating */ 505 + if (cs) { 506 + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 507 + cpumask_copy(trial->effective_cpus, cs->effective_cpus); 508 + cpumask_copy(trial->effective_xcpus, cs->effective_xcpus); 509 + cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus); 510 + } 511 + 527 512 return trial; 528 513 } 529 514 ··· 543 508 */ 544 509 static inline void free_cpuset(struct cpuset *cs) 545 510 { 546 - free_cpumasks(cs, NULL); 511 + free_cpumask_var(cs->cpus_allowed); 512 + free_cpumask_var(cs->effective_cpus); 513 + free_cpumask_var(cs->effective_xcpus); 514 + free_cpumask_var(cs->exclusive_cpus); 547 515 kfree(cs); 548 516 } 549 517 ··· 576 538 if (cpumask_intersects(xcpus1, xcpus2)) 577 539 return false; 578 540 return true; 541 + } 542 + 543 + /** 544 + * cpus_excl_conflict - Check if two cpusets have exclusive CPU conflicts 545 + * @cs1: first cpuset to check 546 + * @cs2: second cpuset to check 547 + * 548 + * Returns: true if CPU exclusivity conflict exists, false otherwise 549 + * 550 + * Conflict detection rules: 551 + * 1. If either cpuset is CPU exclusive, they must be mutually exclusive 552 + * 2. exclusive_cpus masks cannot intersect between cpusets 553 + * 3. 
The allowed CPUs of one cpuset cannot be a subset of another's exclusive CPUs 554 + */ 555 + static inline bool cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) 556 + { 557 + /* If either cpuset is exclusive, check if they are mutually exclusive */ 558 + if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2)) 559 + return !cpusets_are_exclusive(cs1, cs2); 560 + 561 + /* Exclusive_cpus cannot intersect */ 562 + if (cpumask_intersects(cs1->exclusive_cpus, cs2->exclusive_cpus)) 563 + return true; 564 + 565 + /* The cpus_allowed of one cpuset cannot be a subset of another cpuset's exclusive_cpus */ 566 + if (!cpumask_empty(cs1->cpus_allowed) && 567 + cpumask_subset(cs1->cpus_allowed, cs2->exclusive_cpus)) 568 + return true; 569 + 570 + if (!cpumask_empty(cs2->cpus_allowed) && 571 + cpumask_subset(cs2->cpus_allowed, cs1->exclusive_cpus)) 572 + return true; 573 + 574 + return false; 575 + } 576 + 577 + static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2) 578 + { 579 + if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2))) 580 + return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 581 + return false; 579 582 } 580 583 581 584 /* ··· 700 621 */ 701 622 ret = -EINVAL; 702 623 cpuset_for_each_child(c, css, par) { 703 - bool txset, cxset; /* Are exclusive_cpus set? */ 704 - 705 624 if (c == cur) 706 625 continue; 707 - 708 - txset = !cpumask_empty(trial->exclusive_cpus); 709 - cxset = !cpumask_empty(c->exclusive_cpus); 710 - if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) || 711 - (txset && cxset)) { 712 - if (!cpusets_are_exclusive(trial, c)) 713 - goto out; 714 - } else if (txset || cxset) { 715 - struct cpumask *xcpus, *acpus; 716 - 717 - /* 718 - * When just one of the exclusive_cpus's is set, 719 - * cpus_allowed of the other cpuset, if set, cannot be 720 - * a subset of it or none of those CPUs will be 721 - * available if these exclusive CPUs are activated. 
722 - */ 723 - if (txset) { 724 - xcpus = trial->exclusive_cpus; 725 - acpus = c->cpus_allowed; 726 - } else { 727 - xcpus = c->exclusive_cpus; 728 - acpus = trial->cpus_allowed; 729 - } 730 - if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus)) 731 - goto out; 732 - } 733 - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && 734 - nodes_intersects(trial->mems_allowed, c->mems_allowed)) 626 + if (cpus_excl_conflict(trial, c)) 627 + goto out; 628 + if (mems_excl_conflict(trial, c)) 735 629 goto out; 736 630 } 737 631 ··· 1415 1363 } 1416 1364 EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated); 1417 1365 1418 - /* 1419 - * compute_effective_exclusive_cpumask - compute effective exclusive CPUs 1420 - * @cs: cpuset 1421 - * @xcpus: effective exclusive CPUs value to be set 1422 - * @real_cs: the real cpuset (can be NULL) 1423 - * Return: 0 if there is no sibling conflict, > 0 otherwise 1366 + /** 1367 + * rm_siblings_excl_cpus - Remove exclusive CPUs that are used by sibling cpusets 1368 + * @parent: Parent cpuset containing all siblings 1369 + * @cs: Current cpuset (will be skipped) 1370 + * @excpus: exclusive effective CPU mask to modify 1424 1371 * 1425 - * If exclusive_cpus isn't explicitly set or a real_cs is provided, we have to 1426 - * scan the sibling cpusets and exclude their exclusive_cpus or effective_xcpus 1427 - * as well. The provision of real_cs means that a cpumask is being changed and 1428 - * the given cs is a trial one. 1372 + * This function ensures the given @excpus mask doesn't include any CPUs that 1373 + * are exclusively allocated to sibling cpusets. It walks through all siblings 1374 + * of @cs under @parent and removes their exclusive CPUs from @excpus. 
1429 1375 */ 1430 - static int compute_effective_exclusive_cpumask(struct cpuset *cs, 1431 - struct cpumask *xcpus, 1432 - struct cpuset *real_cs) 1376 + static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs, 1377 + struct cpumask *excpus) 1433 1378 { 1434 1379 struct cgroup_subsys_state *css; 1435 - struct cpuset *parent = parent_cs(cs); 1436 1380 struct cpuset *sibling; 1437 1381 int retval = 0; 1438 1382 1439 - if (!xcpus) 1440 - xcpus = cs->effective_xcpus; 1441 - 1442 - cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus); 1443 - 1444 - if (!real_cs) { 1445 - if (!cpumask_empty(cs->exclusive_cpus)) 1446 - return 0; 1447 - } else { 1448 - cs = real_cs; 1449 - } 1383 + if (cpumask_empty(excpus)) 1384 + return retval; 1450 1385 1451 1386 /* 1452 1387 * Exclude exclusive CPUs from siblings ··· 1443 1404 if (sibling == cs) 1444 1405 continue; 1445 1406 1446 - if (cpumask_intersects(xcpus, sibling->exclusive_cpus)) { 1447 - cpumask_andnot(xcpus, xcpus, sibling->exclusive_cpus); 1407 + if (cpumask_intersects(excpus, sibling->exclusive_cpus)) { 1408 + cpumask_andnot(excpus, excpus, sibling->exclusive_cpus); 1448 1409 retval++; 1449 1410 continue; 1450 1411 } 1451 - if (cpumask_intersects(xcpus, sibling->effective_xcpus)) { 1452 - cpumask_andnot(xcpus, xcpus, sibling->effective_xcpus); 1412 + if (cpumask_intersects(excpus, sibling->effective_xcpus)) { 1413 + cpumask_andnot(excpus, excpus, sibling->effective_xcpus); 1453 1414 retval++; 1454 1415 } 1455 1416 } 1456 1417 rcu_read_unlock(); 1418 + 1457 1419 return retval; 1420 + } 1421 + 1422 + /* 1423 + * compute_excpus - compute effective exclusive CPUs 1424 + * @cs: cpuset 1425 + * @xcpus: effective exclusive CPUs value to be set 1426 + * Return: 0 if there is no sibling conflict, > 0 otherwise 1427 + * 1428 + * If exclusive_cpus isn't explicitly set , we have to scan the sibling cpusets 1429 + * and exclude their exclusive_cpus or effective_xcpus as well. 
1430 + */ 1431 + static int compute_excpus(struct cpuset *cs, struct cpumask *excpus) 1432 + { 1433 + struct cpuset *parent = parent_cs(cs); 1434 + 1435 + cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus); 1436 + 1437 + if (!cpumask_empty(cs->exclusive_cpus)) 1438 + return 0; 1439 + 1440 + return rm_siblings_excl_cpus(parent, cs, excpus); 1441 + } 1442 + 1443 + /* 1444 + * compute_trialcs_excpus - Compute effective exclusive CPUs for a trial cpuset 1445 + * @trialcs: The trial cpuset containing the proposed new configuration 1446 + * @cs: The original cpuset that the trial configuration is based on 1447 + * Return: 0 if successful with no sibling conflict, >0 if a conflict is found 1448 + * 1449 + * Computes the effective_xcpus for a trial configuration. @cs is provided to represent 1450 + * the real cs. 1451 + */ 1452 + static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs) 1453 + { 1454 + struct cpuset *parent = parent_cs(trialcs); 1455 + struct cpumask *excpus = trialcs->effective_xcpus; 1456 + 1457 + /* trialcs is member, cpuset.cpus has no impact to excpus */ 1458 + if (cs_is_member(cs)) 1459 + cpumask_and(excpus, trialcs->exclusive_cpus, 1460 + parent->effective_xcpus); 1461 + else 1462 + cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus); 1463 + 1464 + return rm_siblings_excl_cpus(parent, cs, excpus); 1458 1465 } 1459 1466 1460 1467 static inline bool is_remote_partition(struct cpuset *cs) ··· 1544 1459 * Note that creating a remote partition with any local partition root 1545 1460 * above it or remote partition root underneath it is not allowed. 
1546 1461 */ 1547 - compute_effective_exclusive_cpumask(cs, tmp->new_cpus, NULL); 1462 + compute_excpus(cs, tmp->new_cpus); 1548 1463 WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus)); 1549 1464 if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) || 1550 1465 cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) ··· 1593 1508 cs->partition_root_state = PRS_MEMBER; 1594 1509 1595 1510 /* effective_xcpus may need to be changed */ 1596 - compute_effective_exclusive_cpumask(cs, NULL, NULL); 1511 + compute_excpus(cs, cs->effective_xcpus); 1597 1512 reset_partition_data(cs); 1598 1513 spin_unlock_irq(&callback_lock); 1599 1514 update_unbound_workqueue_cpumask(isolcpus_updated); ··· 1762 1677 old_prs = new_prs = cs->partition_root_state; 1763 1678 1764 1679 if (cmd == partcmd_invalidate) { 1765 - if (is_prs_invalid(old_prs)) 1680 + if (is_partition_invalid(cs)) 1766 1681 return 0; 1767 1682 1768 1683 /* ··· 1794 1709 1795 1710 if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) { 1796 1711 /* 1797 - * Need to call compute_effective_exclusive_cpumask() in case 1712 + * Need to call compute_excpus() in case 1798 1713 * exclusive_cpus not set. Sibling conflict should only happen 1799 1714 * if exclusive_cpus isn't set. 1800 1715 */ 1801 1716 xcpus = tmp->delmask; 1802 - if (compute_effective_exclusive_cpumask(cs, xcpus, NULL)) 1717 + if (compute_excpus(cs, xcpus)) 1803 1718 WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus)); 1719 + new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; 1804 1720 1805 1721 /* 1806 1722 * Enabling partition root is not allowed if its ··· 1813 1727 if (prstate_housekeeping_conflict(new_prs, xcpus)) 1814 1728 return PERR_HKEEPING; 1815 1729 1816 - /* 1817 - * A parent can be left with no CPU as long as there is no 1818 - * task directly associated with the parent partition. 
1819 - */ 1820 - if (nocpu) 1730 + if (tasks_nocpu_error(parent, cs, xcpus)) 1821 1731 return PERR_NOCPUS; 1822 1732 1823 1733 /* ··· 1830 1748 1831 1749 deleting = true; 1832 1750 subparts_delta++; 1833 - new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED; 1834 1751 } else if (cmd == partcmd_disable) { 1835 1752 /* 1836 1753 * May need to add cpus back to parent's effective_cpus ··· 1869 1788 * For invalid partition: 1870 1789 * delmask = newmask & parent->effective_xcpus 1871 1790 */ 1872 - if (is_prs_invalid(old_prs)) { 1791 + if (is_partition_invalid(cs)) { 1873 1792 adding = false; 1874 1793 deleting = cpumask_and(tmp->delmask, 1875 1794 newmask, parent->effective_xcpus); ··· 1918 1837 * A partition error happens when parent has tasks and all 1919 1838 * its effective CPUs will have to be distributed out. 1920 1839 */ 1921 - WARN_ON_ONCE(!is_partition_valid(parent)); 1922 1840 if (nocpu) { 1923 1841 part_error = PERR_NOCPUS; 1924 1842 if (is_partition_valid(cs)) ··· 2076 1996 * 2) All the effective_cpus will be used up and cp 2077 1997 * has tasks 2078 1998 */ 2079 - compute_effective_exclusive_cpumask(cs, new_ecpus, NULL); 1999 + compute_excpus(cs, new_ecpus); 2080 2000 cpumask_and(new_ecpus, new_ecpus, cpu_active_mask); 2081 2001 2082 2002 rcu_read_lock(); ··· 2155 2075 * its value is being processed. 
2156 2076 */ 2157 2077 if (remote && (cp != cs)) { 2158 - compute_effective_exclusive_cpumask(cp, tmp->new_cpus, NULL); 2078 + compute_excpus(cp, tmp->new_cpus); 2159 2079 if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) { 2160 2080 pos_css = css_rightmost_descendant(pos_css); 2161 2081 continue; ··· 2257 2177 cpumask_copy(cp->effective_cpus, tmp->new_cpus); 2258 2178 cp->partition_root_state = new_prs; 2259 2179 if (!cpumask_empty(cp->exclusive_cpus) && (cp != cs)) 2260 - compute_effective_exclusive_cpumask(cp, NULL, NULL); 2180 + compute_excpus(cp, cp->effective_xcpus); 2261 2181 2262 2182 /* 2263 2183 * Make sure effective_xcpus is properly set for a valid ··· 2364 2284 rcu_read_unlock(); 2365 2285 } 2366 2286 2367 - /** 2368 - * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 2369 - * @cs: the cpuset to consider 2370 - * @trialcs: trial cpuset 2371 - * @buf: buffer of cpu numbers written to this cpuset 2372 - */ 2373 - static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2374 - const char *buf) 2287 + static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask) 2375 2288 { 2376 2289 int retval; 2377 - struct tmpmasks tmp; 2290 + 2291 + retval = cpulist_parse(buf, out_mask); 2292 + if (retval < 0) 2293 + return retval; 2294 + if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed)) 2295 + return -EINVAL; 2296 + 2297 + return 0; 2298 + } 2299 + 2300 + /** 2301 + * validate_partition - Validate a cpuset partition configuration 2302 + * @cs: The cpuset to validate 2303 + * @trialcs: The trial cpuset containing proposed configuration changes 2304 + * 2305 + * If any validation check fails, the appropriate error code is set in the 2306 + * cpuset's prs_err field. 
2307 + * 2308 + * Return: PRS error code (0 if valid, non-zero error code if invalid) 2309 + */ 2310 + static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs) 2311 + { 2378 2312 struct cpuset *parent = parent_cs(cs); 2379 - bool invalidate = false; 2380 - bool force = false; 2381 - int old_prs = cs->partition_root_state; 2382 2313 2383 - /* top_cpuset.cpus_allowed tracks cpu_active_mask; it's read-only */ 2384 - if (cs == &top_cpuset) 2385 - return -EACCES; 2314 + if (cs_is_member(trialcs)) 2315 + return PERR_NONE; 2386 2316 2387 - /* 2388 - * An empty cpus_allowed is ok only if the cpuset has no tasks. 2389 - * Since cpulist_parse() fails on an empty mask, we special case 2390 - * that parsing. The validate_change() call ensures that cpusets 2391 - * with tasks have cpus. 2392 - */ 2393 - if (!*buf) { 2394 - cpumask_clear(trialcs->cpus_allowed); 2395 - if (cpumask_empty(trialcs->exclusive_cpus)) 2396 - cpumask_clear(trialcs->effective_xcpus); 2397 - } else { 2398 - retval = cpulist_parse(buf, trialcs->cpus_allowed); 2399 - if (retval < 0) 2400 - return retval; 2317 + if (cpumask_empty(trialcs->effective_xcpus)) 2318 + return PERR_INVCPUS; 2401 2319 2402 - if (!cpumask_subset(trialcs->cpus_allowed, 2403 - top_cpuset.cpus_allowed)) 2404 - return -EINVAL; 2320 + if (prstate_housekeeping_conflict(trialcs->partition_root_state, 2321 + trialcs->effective_xcpus)) 2322 + return PERR_HKEEPING; 2405 2323 2406 - /* 2407 - * When exclusive_cpus isn't explicitly set, it is constrained 2408 - * by cpus_allowed and parent's effective_xcpus. Otherwise, 2409 - * trialcs->effective_xcpus is used as a temporary cpumask 2410 - * for checking validity of the partition root. 
2411 - */ 2412 - trialcs->partition_root_state = PRS_MEMBER; 2413 - if (!cpumask_empty(trialcs->exclusive_cpus) || is_partition_valid(cs)) 2414 - compute_effective_exclusive_cpumask(trialcs, NULL, cs); 2415 - } 2324 + if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) 2325 + return PERR_NOCPUS; 2416 2326 2417 - /* Nothing to do if the cpus didn't change */ 2418 - if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 2419 - return 0; 2327 + return PERR_NONE; 2328 + } 2420 2329 2421 - if (alloc_cpumasks(NULL, &tmp)) 2422 - return -ENOMEM; 2423 - 2424 - if (old_prs) { 2425 - if (is_partition_valid(cs) && 2426 - cpumask_empty(trialcs->effective_xcpus)) { 2427 - invalidate = true; 2428 - cs->prs_err = PERR_INVCPUS; 2429 - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2430 - invalidate = true; 2431 - cs->prs_err = PERR_HKEEPING; 2432 - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2433 - invalidate = true; 2434 - cs->prs_err = PERR_NOCPUS; 2435 - } 2436 - } 2437 - 2438 - /* 2439 - * Check all the descendants in update_cpumasks_hier() if 2440 - * effective_xcpus is to be changed. 2441 - */ 2442 - force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2330 + static int cpus_allowed_validate_change(struct cpuset *cs, struct cpuset *trialcs, 2331 + struct tmpmasks *tmp) 2332 + { 2333 + int retval; 2334 + struct cpuset *parent = parent_cs(cs); 2443 2335 2444 2336 retval = validate_change(cs, trialcs); 2445 2337 ··· 2426 2374 * partition. However, any conflicting sibling partitions 2427 2375 * have to be marked as invalid too. 
2428 2376 */ 2429 - invalidate = true; 2377 + trialcs->prs_err = PERR_NOTEXCL; 2430 2378 rcu_read_lock(); 2431 2379 cpuset_for_each_child(cp, css, parent) { 2432 2380 struct cpumask *xcpus = user_xcpus(trialcs); ··· 2434 2382 if (is_partition_valid(cp) && 2435 2383 cpumask_intersects(xcpus, cp->effective_xcpus)) { 2436 2384 rcu_read_unlock(); 2437 - update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, &tmp); 2385 + update_parent_effective_cpumask(cp, partcmd_invalidate, NULL, tmp); 2438 2386 rcu_read_lock(); 2439 2387 } 2440 2388 } 2441 2389 rcu_read_unlock(); 2442 2390 retval = 0; 2443 2391 } 2392 + return retval; 2393 + } 2444 2394 2395 + /** 2396 + * partition_cpus_change - Handle partition state changes due to CPU mask updates 2397 + * @cs: The target cpuset being modified 2398 + * @trialcs: The trial cpuset containing proposed configuration changes 2399 + * @tmp: Temporary masks for intermediate calculations 2400 + * 2401 + * This function handles partition state transitions triggered by CPU mask changes. 2402 + * CPU modifications may cause a partition to be disabled or require state updates. 
2403 + */ 2404 + static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs, 2405 + struct tmpmasks *tmp) 2406 + { 2407 + enum prs_errcode prs_err; 2408 + 2409 + if (cs_is_member(cs)) 2410 + return; 2411 + 2412 + prs_err = validate_partition(cs, trialcs); 2413 + if (prs_err) 2414 + trialcs->prs_err = cs->prs_err = prs_err; 2415 + 2416 + if (is_remote_partition(cs)) { 2417 + if (trialcs->prs_err) 2418 + remote_partition_disable(cs, tmp); 2419 + else 2420 + remote_cpus_update(cs, trialcs->exclusive_cpus, 2421 + trialcs->effective_xcpus, tmp); 2422 + } else { 2423 + if (trialcs->prs_err) 2424 + update_parent_effective_cpumask(cs, partcmd_invalidate, 2425 + NULL, tmp); 2426 + else 2427 + update_parent_effective_cpumask(cs, partcmd_update, 2428 + trialcs->effective_xcpus, tmp); 2429 + } 2430 + } 2431 + 2432 + /** 2433 + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 2434 + * @cs: the cpuset to consider 2435 + * @trialcs: trial cpuset 2436 + * @buf: buffer of cpu numbers written to this cpuset 2437 + */ 2438 + static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, 2439 + const char *buf) 2440 + { 2441 + int retval; 2442 + struct tmpmasks tmp; 2443 + bool force = false; 2444 + int old_prs = cs->partition_root_state; 2445 + 2446 + retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed); 2447 + if (retval < 0) 2448 + return retval; 2449 + 2450 + /* Nothing to do if the cpus didn't change */ 2451 + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 2452 + return 0; 2453 + 2454 + if (alloc_tmpmasks(&tmp)) 2455 + return -ENOMEM; 2456 + 2457 + compute_trialcs_excpus(trialcs, cs); 2458 + trialcs->prs_err = PERR_NONE; 2459 + 2460 + retval = cpus_allowed_validate_change(cs, trialcs, &tmp); 2445 2461 if (retval < 0) 2446 2462 goto out_free; 2447 2463 2448 - if (is_partition_valid(cs) || 2449 - (is_partition_invalid(cs) && !invalidate)) { 2450 - struct cpumask *xcpus = trialcs->effective_xcpus; 2464 + /* 2465 
+ * Check all the descendants in update_cpumasks_hier() if 2466 + * effective_xcpus is to be changed. 2467 + */ 2468 + force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus); 2451 2469 2452 - if (cpumask_empty(xcpus) && is_partition_invalid(cs)) 2453 - xcpus = trialcs->cpus_allowed; 2454 - 2455 - /* 2456 - * Call remote_cpus_update() to handle valid remote partition 2457 - */ 2458 - if (is_remote_partition(cs)) 2459 - remote_cpus_update(cs, NULL, xcpus, &tmp); 2460 - else if (invalidate) 2461 - update_parent_effective_cpumask(cs, partcmd_invalidate, 2462 - NULL, &tmp); 2463 - else 2464 - update_parent_effective_cpumask(cs, partcmd_update, 2465 - xcpus, &tmp); 2466 - } 2470 + partition_cpus_change(cs, trialcs, &tmp); 2467 2471 2468 2472 spin_lock_irq(&callback_lock); 2469 2473 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); ··· 2535 2427 if (cs->partition_root_state) 2536 2428 update_partition_sd_lb(cs, old_prs); 2537 2429 out_free: 2538 - free_cpumasks(NULL, &tmp); 2430 + free_tmpmasks(&tmp); 2539 2431 return retval; 2540 2432 } 2541 2433 ··· 2552 2444 { 2553 2445 int retval; 2554 2446 struct tmpmasks tmp; 2555 - struct cpuset *parent = parent_cs(cs); 2556 - bool invalidate = false; 2557 2447 bool force = false; 2558 2448 int old_prs = cs->partition_root_state; 2559 2449 2560 - if (!*buf) { 2561 - cpumask_clear(trialcs->exclusive_cpus); 2562 - cpumask_clear(trialcs->effective_xcpus); 2563 - } else { 2564 - retval = cpulist_parse(buf, trialcs->exclusive_cpus); 2565 - if (retval < 0) 2566 - return retval; 2567 - } 2450 + retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus); 2451 + if (retval < 0) 2452 + return retval; 2568 2453 2569 2454 /* Nothing to do if the CPUs didn't change */ 2570 2455 if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus)) 2571 2456 return 0; 2572 2457 2573 - if (*buf) { 2574 - trialcs->partition_root_state = PRS_MEMBER; 2575 - /* 2576 - * Reject the change if there is exclusive CPUs conflict with 2577 - * 
the siblings. 2578 - */ 2579 - if (compute_effective_exclusive_cpumask(trialcs, NULL, cs)) 2580 - return -EINVAL; 2581 - } 2458 + /* 2459 + * Reject the change if there is exclusive CPUs conflict with 2460 + * the siblings. 2461 + */ 2462 + if (compute_trialcs_excpus(trialcs, cs)) 2463 + return -EINVAL; 2582 2464 2583 2465 /* 2584 2466 * Check all the descendants in update_cpumasks_hier() if ··· 2580 2482 if (retval) 2581 2483 return retval; 2582 2484 2583 - if (alloc_cpumasks(NULL, &tmp)) 2485 + if (alloc_tmpmasks(&tmp)) 2584 2486 return -ENOMEM; 2585 2487 2586 - if (old_prs) { 2587 - if (cpumask_empty(trialcs->effective_xcpus)) { 2588 - invalidate = true; 2589 - cs->prs_err = PERR_INVCPUS; 2590 - } else if (prstate_housekeeping_conflict(old_prs, trialcs->effective_xcpus)) { 2591 - invalidate = true; 2592 - cs->prs_err = PERR_HKEEPING; 2593 - } else if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus)) { 2594 - invalidate = true; 2595 - cs->prs_err = PERR_NOCPUS; 2596 - } 2488 + trialcs->prs_err = PERR_NONE; 2489 + partition_cpus_change(cs, trialcs, &tmp); 2597 2490 2598 - if (is_remote_partition(cs)) { 2599 - if (invalidate) 2600 - remote_partition_disable(cs, &tmp); 2601 - else 2602 - remote_cpus_update(cs, trialcs->exclusive_cpus, 2603 - trialcs->effective_xcpus, &tmp); 2604 - } else if (invalidate) { 2605 - update_parent_effective_cpumask(cs, partcmd_invalidate, 2606 - NULL, &tmp); 2607 - } else { 2608 - update_parent_effective_cpumask(cs, partcmd_update, 2609 - trialcs->effective_xcpus, &tmp); 2610 - } 2611 - } 2612 2491 spin_lock_irq(&callback_lock); 2613 2492 cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); 2614 2493 cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); ··· 2605 2530 if (cs->partition_root_state) 2606 2531 update_partition_sd_lb(cs, old_prs); 2607 2532 2608 - free_cpumasks(NULL, &tmp); 2533 + free_tmpmasks(&tmp); 2609 2534 return 0; 2610 2535 } 2611 2536 ··· 2657 2582 } 2658 2583 } 2659 2584 2660 - static void 
cpuset_post_attach(void) 2585 + static void flush_migrate_mm_task_workfn(struct callback_head *head) 2661 2586 { 2662 2587 flush_workqueue(cpuset_migrate_mm_wq); 2588 + kfree(head); 2589 + } 2590 + 2591 + static void schedule_flush_migrate_mm(void) 2592 + { 2593 + struct callback_head *flush_cb; 2594 + 2595 + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); 2596 + if (!flush_cb) 2597 + return; 2598 + 2599 + init_task_work(flush_cb, flush_migrate_mm_task_workfn); 2600 + 2601 + if (task_work_add(current, flush_cb, TWA_RESUME)) 2602 + kfree(flush_cb); 2663 2603 } 2664 2604 2665 2605 /* ··· 2840 2750 int retval; 2841 2751 2842 2752 /* 2843 - * top_cpuset.mems_allowed tracks node_stats[N_MEMORY]; 2844 - * it's read-only 2845 - */ 2846 - if (cs == &top_cpuset) { 2847 - retval = -EACCES; 2848 - goto done; 2849 - } 2850 - 2851 - /* 2852 2753 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 2853 - * Since nodelist_parse() fails on an empty mask, we special case 2854 - * that parsing. The validate_change() call ensures that cpusets 2855 - * with tasks have memory. 2754 + * The validate_change() call ensures that cpusets with tasks have memory. 
2856 2755 */ 2857 - if (!*buf) { 2858 - nodes_clear(trialcs->mems_allowed); 2859 - } else { 2860 - retval = nodelist_parse(buf, trialcs->mems_allowed); 2861 - if (retval < 0) 2862 - goto done; 2756 + retval = nodelist_parse(buf, trialcs->mems_allowed); 2757 + if (retval < 0) 2758 + goto done; 2863 2759 2864 - if (!nodes_subset(trialcs->mems_allowed, 2865 - top_cpuset.mems_allowed)) { 2866 - retval = -EINVAL; 2867 - goto done; 2868 - } 2760 + if (!nodes_subset(trialcs->mems_allowed, 2761 + top_cpuset.mems_allowed)) { 2762 + retval = -EINVAL; 2763 + goto done; 2869 2764 } 2870 2765 2871 2766 if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed)) { ··· 2901 2826 int spread_flag_changed; 2902 2827 int err; 2903 2828 2904 - trialcs = alloc_trial_cpuset(cs); 2829 + trialcs = dup_or_alloc_cpuset(cs); 2905 2830 if (!trialcs) 2906 2831 return -ENOMEM; 2907 2832 ··· 2959 2884 /* 2960 2885 * Treat a previously invalid partition root as if it is a "member". 2961 2886 */ 2962 - if (new_prs && is_prs_invalid(old_prs)) 2887 + if (new_prs && is_partition_invalid(cs)) 2963 2888 old_prs = PRS_MEMBER; 2964 2889 2965 - if (alloc_cpumasks(NULL, &tmpmask)) 2890 + if (alloc_tmpmasks(&tmpmask)) 2966 2891 return -ENOMEM; 2967 2892 2968 2893 err = update_partition_exclusive_flag(cs, new_prs); ··· 3058 2983 notify_partition_change(cs, old_prs); 3059 2984 if (force_sd_rebuild) 3060 2985 rebuild_sched_domains_locked(); 3061 - free_cpumasks(NULL, &tmpmask); 2986 + free_tmpmasks(&tmpmask); 3062 2987 return 0; 3063 2988 } 3064 2989 ··· 3216 3141 struct cpuset *cs; 3217 3142 struct cpuset *oldcs = cpuset_attach_old_cs; 3218 3143 bool cpus_updated, mems_updated; 3144 + bool queue_task_work = false; 3219 3145 3220 3146 cgroup_taskset_first(tset, &css); 3221 3147 cs = css_cs(css); ··· 3267 3191 * @old_mems_allowed is the right nodesets that we 3268 3192 * migrate mm from. 
3269 3193 */ 3270 - if (is_memory_migrate(cs)) 3194 + if (is_memory_migrate(cs)) { 3271 3195 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 3272 3196 &cpuset_attach_nodemask_to); 3273 - else 3197 + queue_task_work = true; 3198 + } else 3274 3199 mmput(mm); 3275 3200 } 3276 3201 } 3277 3202 3278 3203 out: 3204 + if (queue_task_work) 3205 + schedule_flush_migrate_mm(); 3279 3206 cs->old_mems_allowed = cpuset_attach_nodemask_to; 3280 3207 3281 3208 if (cs->nr_migrate_dl_tasks) { ··· 3302 3223 struct cpuset *trialcs; 3303 3224 int retval = -ENODEV; 3304 3225 3226 + /* root is read-only */ 3227 + if (cs == &top_cpuset) 3228 + return -EACCES; 3229 + 3305 3230 buf = strstrip(buf); 3306 - cpus_read_lock(); 3307 - mutex_lock(&cpuset_mutex); 3231 + cpuset_full_lock(); 3308 3232 if (!is_cpuset_online(cs)) 3309 3233 goto out_unlock; 3310 3234 3311 - trialcs = alloc_trial_cpuset(cs); 3235 + trialcs = dup_or_alloc_cpuset(cs); 3312 3236 if (!trialcs) { 3313 3237 retval = -ENOMEM; 3314 3238 goto out_unlock; ··· 3336 3254 if (force_sd_rebuild) 3337 3255 rebuild_sched_domains_locked(); 3338 3256 out_unlock: 3339 - mutex_unlock(&cpuset_mutex); 3340 - cpus_read_unlock(); 3341 - flush_workqueue(cpuset_migrate_mm_wq); 3257 + cpuset_full_unlock(); 3258 + if (of_cft(of)->private == FILE_MEMLIST) 3259 + schedule_flush_migrate_mm(); 3342 3260 return retval ?: nbytes; 3343 3261 } 3344 3262 ··· 3440 3358 else 3441 3359 return -EINVAL; 3442 3360 3443 - cpus_read_lock(); 3444 - mutex_lock(&cpuset_mutex); 3361 + cpuset_full_lock(); 3445 3362 if (is_cpuset_online(cs)) 3446 3363 retval = update_prstate(cs, val); 3447 - mutex_unlock(&cpuset_mutex); 3448 - cpus_read_unlock(); 3364 + cpuset_full_unlock(); 3449 3365 return retval ?: nbytes; 3450 3366 } 3451 3367 ··· 3542 3462 if (!parent_css) 3543 3463 return &top_cpuset.css; 3544 3464 3545 - cs = kzalloc(sizeof(*cs), GFP_KERNEL); 3465 + cs = dup_or_alloc_cpuset(NULL); 3546 3466 if (!cs) 3547 3467 return ERR_PTR(-ENOMEM); 3548 - 3549 - if 
(alloc_cpumasks(cs, NULL)) { 3550 - kfree(cs); 3551 - return ERR_PTR(-ENOMEM); 3552 - } 3553 3468 3554 3469 __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3555 3470 fmeter_init(&cs->fmeter); ··· 3568 3493 if (!parent) 3569 3494 return 0; 3570 3495 3571 - cpus_read_lock(); 3572 - mutex_lock(&cpuset_mutex); 3573 - 3574 - set_bit(CS_ONLINE, &cs->flags); 3496 + cpuset_full_lock(); 3575 3497 if (is_spread_page(parent)) 3576 3498 set_bit(CS_SPREAD_PAGE, &cs->flags); 3577 3499 if (is_spread_slab(parent)) ··· 3620 3548 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3621 3549 spin_unlock_irq(&callback_lock); 3622 3550 out_unlock: 3623 - mutex_unlock(&cpuset_mutex); 3624 - cpus_read_unlock(); 3551 + cpuset_full_unlock(); 3625 3552 return 0; 3626 3553 } 3627 3554 ··· 3635 3564 { 3636 3565 struct cpuset *cs = css_cs(css); 3637 3566 3638 - cpus_read_lock(); 3639 - mutex_lock(&cpuset_mutex); 3640 - 3567 + cpuset_full_lock(); 3641 3568 if (!cpuset_v2() && is_sched_load_balance(cs)) 3642 3569 cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 3643 3570 3644 3571 cpuset_dec(); 3645 - clear_bit(CS_ONLINE, &cs->flags); 3646 - 3647 - mutex_unlock(&cpuset_mutex); 3648 - cpus_read_unlock(); 3572 + cpuset_full_unlock(); 3649 3573 } 3650 3574 3651 3575 /* ··· 3652 3586 { 3653 3587 struct cpuset *cs = css_cs(css); 3654 3588 3655 - cpus_read_lock(); 3656 - mutex_lock(&cpuset_mutex); 3657 - 3589 + cpuset_full_lock(); 3658 3590 /* Reset valid partition back to member */ 3659 3591 if (is_partition_valid(cs)) 3660 3592 update_prstate(cs, PRS_MEMBER); 3661 - 3662 - mutex_unlock(&cpuset_mutex); 3663 - cpus_read_unlock(); 3664 - 3593 + cpuset_full_unlock(); 3665 3594 } 3666 3595 3667 3596 static void cpuset_css_free(struct cgroup_subsys_state *css) ··· 3785 3724 .can_attach = cpuset_can_attach, 3786 3725 .cancel_attach = cpuset_cancel_attach, 3787 3726 .attach = cpuset_attach, 3788 - .post_attach = cpuset_post_attach, 3789 3727 .bind = cpuset_bind, 3790 3728 .can_fork = cpuset_can_fork, 
3791 3729 .cancel_fork = cpuset_cancel_fork, ··· 3988 3928 bool on_dfl = is_in_v2_mode(); 3989 3929 struct tmpmasks tmp, *ptmp = NULL; 3990 3930 3991 - if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3931 + if (on_dfl && !alloc_tmpmasks(&tmp)) 3992 3932 ptmp = &tmp; 3993 3933 3994 3934 lockdep_assert_cpus_held(); ··· 4068 4008 if (force_sd_rebuild) 4069 4009 rebuild_sched_domains_cpuslocked(); 4070 4010 4071 - free_cpumasks(NULL, ptmp); 4011 + free_tmpmasks(ptmp); 4072 4012 } 4073 4013 4074 4014 void cpuset_update_active_cpus(void) ··· 4133 4073 struct cpuset *cs; 4134 4074 4135 4075 spin_lock_irqsave(&callback_lock, flags); 4136 - rcu_read_lock(); 4137 4076 4138 4077 cs = task_cs(tsk); 4139 4078 if (cs != &top_cpuset) ··· 4154 4095 cpumask_copy(pmask, possible_mask); 4155 4096 } 4156 4097 4157 - rcu_read_unlock(); 4158 4098 spin_unlock_irqrestore(&callback_lock, flags); 4159 4099 } 4160 4100 ··· 4226 4168 unsigned long flags; 4227 4169 4228 4170 spin_lock_irqsave(&callback_lock, flags); 4229 - rcu_read_lock(); 4230 4171 guarantee_online_mems(task_cs(tsk), &mask); 4231 - rcu_read_unlock(); 4232 4172 spin_unlock_irqrestore(&callback_lock, flags); 4233 4173 4234 4174 return mask; ··· 4321 4265 /* Not hardwall and node outside mems_allowed: scan up cpusets */ 4322 4266 spin_lock_irqsave(&callback_lock, flags); 4323 4267 4324 - rcu_read_lock(); 4325 4268 cs = nearest_hardwall_ancestor(task_cs(current)); 4326 4269 allowed = node_isset(node, cs->mems_allowed); 4327 - rcu_read_unlock(); 4328 4270 4329 4271 spin_unlock_irqrestore(&callback_lock, flags); 4330 4272 return allowed;

-4

kernel/cgroup/debug.c

··· 49 49 return -ENODEV; 50 50 51 51 spin_lock_irq(&css_set_lock); 52 - rcu_read_lock(); 53 52 cset = task_css_set(current); 54 53 refcnt = refcount_read(&cset->refcount); 55 54 seq_printf(seq, "css_set %pK %d", cset, refcnt); ··· 66 67 seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, 67 68 css, css->id); 68 69 } 69 - rcu_read_unlock(); 70 70 spin_unlock_irq(&css_set_lock); 71 71 cgroup_kn_unlock(of->kn); 72 72 return 0; ··· 93 95 return -ENOMEM; 94 96 95 97 spin_lock_irq(&css_set_lock); 96 - rcu_read_lock(); 97 98 cset = task_css_set(current); 98 99 list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { 99 100 struct cgroup *c = link->cgrp; ··· 101 104 seq_printf(seq, "Root %d group %s\n", 102 105 c->root->hierarchy_id, name_buf); 103 106 } 104 - rcu_read_unlock(); 105 107 spin_unlock_irq(&css_set_lock); 106 108 kfree(name_buf); 107 109 return 0;

+12 -4

kernel/cgroup/freezer.c

··· 171 171 /* 172 172 * Freeze or unfreeze all tasks in the given cgroup. 173 173 */ 174 - static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) 174 + static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze, u64 ts_nsec) 175 175 { 176 176 struct css_task_iter it; 177 177 struct task_struct *task; ··· 179 179 lockdep_assert_held(&cgroup_mutex); 180 180 181 181 spin_lock_irq(&css_set_lock); 182 - if (freeze) 182 + write_seqcount_begin(&cgrp->freezer.freeze_seq); 183 + if (freeze) { 183 184 set_bit(CGRP_FREEZE, &cgrp->flags); 184 - else 185 + cgrp->freezer.freeze_start_nsec = ts_nsec; 186 + } else { 185 187 clear_bit(CGRP_FREEZE, &cgrp->flags); 188 + cgrp->freezer.frozen_nsec += (ts_nsec - 189 + cgrp->freezer.freeze_start_nsec); 190 + } 191 + write_seqcount_end(&cgrp->freezer.freeze_seq); 186 192 spin_unlock_irq(&css_set_lock); 187 193 188 194 if (freeze) ··· 266 260 struct cgroup *parent; 267 261 struct cgroup *dsct; 268 262 bool applied = false; 263 + u64 ts_nsec; 269 264 bool old_e; 270 265 271 266 lockdep_assert_held(&cgroup_mutex); ··· 278 271 return; 279 272 280 273 cgrp->freezer.freeze = freeze; 274 + ts_nsec = ktime_get_ns(); 281 275 282 276 /* 283 277 * Propagate changes downwards the cgroup tree. ··· 306 298 /* 307 299 * Do change actual state: freeze or unfreeze. 308 300 */ 309 - cgroup_do_freeze(dsct, freeze); 301 + cgroup_do_freeze(dsct, freeze, ts_nsec); 310 302 applied = true; 311 303 } 312 304

+4

kernel/fork.c

··· 1688 1688 tty_audit_fork(sig); 1689 1689 sched_autogroup_fork(sig); 1690 1690 1691 + #ifdef CONFIG_CGROUPS 1692 + init_rwsem(&sig->cgroup_threadgroup_rwsem); 1693 + #endif 1694 + 1691 1695 sig->oom_score_adj = current->signal->oom_score_adj; 1692 1696 sig->oom_score_adj_min = current->signal->oom_score_adj_min; 1693 1697

+12

tools/testing/selftests/cgroup/lib/cgroup_util.c

··· 522 522 return strstr(buf, option) != NULL; 523 523 } 524 524 525 + int cgroup_feature(const char *feature) 526 + { 527 + char buf[PAGE_SIZE]; 528 + ssize_t read; 529 + 530 + read = read_text("/sys/kernel/cgroup/features", buf, sizeof(buf)); 531 + if (read < 0) 532 + return read; 533 + 534 + return strstr(buf, feature) != NULL; 535 + } 536 + 525 537 ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size) 526 538 { 527 539 char path[PATH_MAX];

+1

tools/testing/selftests/cgroup/lib/include/cgroup_util.h

··· 60 60 extern int cg_wait_for_proc_count(const char *cgroup, int count); 61 61 extern int cg_killall(const char *cgroup); 62 62 int proc_mount_contains(const char *option); 63 + int cgroup_feature(const char *feature); 63 64 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); 64 65 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); 65 66 extern pid_t clone_into_cgroup(int cgroup_fd);

+663

tools/testing/selftests/cgroup/test_freezer.c

··· 804 804 return ret; 805 805 } 806 806 807 + /* 808 + * Get the current frozen_usec for the cgroup. 809 + */ 810 + static long cg_check_freezetime(const char *cgroup) 811 + { 812 + return cg_read_key_long(cgroup, "cgroup.stat.local", 813 + "frozen_usec "); 814 + } 815 + 816 + /* 817 + * Test that the freeze time will behave as expected for an empty cgroup. 818 + */ 819 + static int test_cgfreezer_time_empty(const char *root) 820 + { 821 + int ret = KSFT_FAIL; 822 + char *cgroup = NULL; 823 + long prev, curr; 824 + 825 + cgroup = cg_name(root, "cg_time_test_empty"); 826 + if (!cgroup) 827 + goto cleanup; 828 + 829 + /* 830 + * 1) Create an empty cgroup and check that its freeze time 831 + * is 0. 832 + */ 833 + if (cg_create(cgroup)) 834 + goto cleanup; 835 + 836 + curr = cg_check_freezetime(cgroup); 837 + if (curr < 0) { 838 + ret = KSFT_SKIP; 839 + goto cleanup; 840 + } 841 + if (curr > 0) { 842 + debug("Expect time (%ld) to be 0\n", curr); 843 + goto cleanup; 844 + } 845 + 846 + if (cg_freeze_nowait(cgroup, true)) 847 + goto cleanup; 848 + 849 + /* 850 + * 2) Sleep for 1000 us. Check that the freeze time is at 851 + * least 1000 us. 852 + */ 853 + usleep(1000); 854 + curr = cg_check_freezetime(cgroup); 855 + if (curr < 1000) { 856 + debug("Expect time (%ld) to be at least 1000 us\n", 857 + curr); 858 + goto cleanup; 859 + } 860 + 861 + /* 862 + * 3) Unfreeze the cgroup. Check that the freeze time is 863 + * larger than at 2). 864 + */ 865 + if (cg_freeze_nowait(cgroup, false)) 866 + goto cleanup; 867 + prev = curr; 868 + curr = cg_check_freezetime(cgroup); 869 + if (curr <= prev) { 870 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 871 + curr, prev); 872 + goto cleanup; 873 + } 874 + 875 + /* 876 + * 4) Check the freeze time again to ensure that it has not 877 + * changed. 
878 + */ 879 + prev = curr; 880 + curr = cg_check_freezetime(cgroup); 881 + if (curr != prev) { 882 + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", 883 + curr, prev); 884 + goto cleanup; 885 + } 886 + 887 + ret = KSFT_PASS; 888 + 889 + cleanup: 890 + if (cgroup) 891 + cg_destroy(cgroup); 892 + free(cgroup); 893 + return ret; 894 + } 895 + 896 + /* 897 + * A simple test for cgroup freezer time accounting. This test follows 898 + * the same flow as test_cgfreezer_time_empty, but with a single process 899 + * in the cgroup. 900 + */ 901 + static int test_cgfreezer_time_simple(const char *root) 902 + { 903 + int ret = KSFT_FAIL; 904 + char *cgroup = NULL; 905 + long prev, curr; 906 + 907 + cgroup = cg_name(root, "cg_time_test_simple"); 908 + if (!cgroup) 909 + goto cleanup; 910 + 911 + /* 912 + * 1) Create a cgroup and check that its freeze time is 0. 913 + */ 914 + if (cg_create(cgroup)) 915 + goto cleanup; 916 + 917 + curr = cg_check_freezetime(cgroup); 918 + if (curr < 0) { 919 + ret = KSFT_SKIP; 920 + goto cleanup; 921 + } 922 + if (curr > 0) { 923 + debug("Expect time (%ld) to be 0\n", curr); 924 + goto cleanup; 925 + } 926 + 927 + /* 928 + * 2) Populate the cgroup with one child and check that the 929 + * freeze time is still 0. 930 + */ 931 + cg_run_nowait(cgroup, child_fn, NULL); 932 + prev = curr; 933 + curr = cg_check_freezetime(cgroup); 934 + if (curr > prev) { 935 + debug("Expect time (%ld) to be 0\n", curr); 936 + goto cleanup; 937 + } 938 + 939 + if (cg_freeze_nowait(cgroup, true)) 940 + goto cleanup; 941 + 942 + /* 943 + * 3) Sleep for 1000 us. Check that the freeze time is at 944 + * least 1000 us. 945 + */ 946 + usleep(1000); 947 + prev = curr; 948 + curr = cg_check_freezetime(cgroup); 949 + if (curr < 1000) { 950 + debug("Expect time (%ld) to be at least 1000 us\n", 951 + curr); 952 + goto cleanup; 953 + } 954 + 955 + /* 956 + * 4) Unfreeze the cgroup. Check that the freeze time is 957 + * larger than at 3). 
958 + */ 959 + if (cg_freeze_nowait(cgroup, false)) 960 + goto cleanup; 961 + prev = curr; 962 + curr = cg_check_freezetime(cgroup); 963 + if (curr <= prev) { 964 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 965 + curr, prev); 966 + goto cleanup; 967 + } 968 + 969 + /* 970 + * 5) Sleep for 1000 us. Check that the freeze time is the 971 + * same as at 4). 972 + */ 973 + usleep(1000); 974 + prev = curr; 975 + curr = cg_check_freezetime(cgroup); 976 + if (curr != prev) { 977 + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", 978 + curr, prev); 979 + goto cleanup; 980 + } 981 + 982 + ret = KSFT_PASS; 983 + 984 + cleanup: 985 + if (cgroup) 986 + cg_destroy(cgroup); 987 + free(cgroup); 988 + return ret; 989 + } 990 + 991 + /* 992 + * Test that freezer time accounting works as expected, even while we're 993 + * populating a cgroup with processes. 994 + */ 995 + static int test_cgfreezer_time_populate(const char *root) 996 + { 997 + int ret = KSFT_FAIL; 998 + char *cgroup = NULL; 999 + long prev, curr; 1000 + int i; 1001 + 1002 + cgroup = cg_name(root, "cg_time_test_populate"); 1003 + if (!cgroup) 1004 + goto cleanup; 1005 + 1006 + if (cg_create(cgroup)) 1007 + goto cleanup; 1008 + 1009 + curr = cg_check_freezetime(cgroup); 1010 + if (curr < 0) { 1011 + ret = KSFT_SKIP; 1012 + goto cleanup; 1013 + } 1014 + if (curr > 0) { 1015 + debug("Expect time (%ld) to be 0\n", curr); 1016 + goto cleanup; 1017 + } 1018 + 1019 + /* 1020 + * 1) Populate the cgroup with 100 processes. Check that 1021 + * the freeze time is 0. 1022 + */ 1023 + for (i = 0; i < 100; i++) 1024 + cg_run_nowait(cgroup, child_fn, NULL); 1025 + prev = curr; 1026 + curr = cg_check_freezetime(cgroup); 1027 + if (curr != prev) { 1028 + debug("Expect time (%ld) to be 0\n", curr); 1029 + goto cleanup; 1030 + } 1031 + 1032 + /* 1033 + * 2) Wait for the group to become fully populated. Check 1034 + * that the freeze time is 0. 
1035 + */ 1036 + if (cg_wait_for_proc_count(cgroup, 100)) 1037 + goto cleanup; 1038 + prev = curr; 1039 + curr = cg_check_freezetime(cgroup); 1040 + if (curr != prev) { 1041 + debug("Expect time (%ld) to be 0\n", curr); 1042 + goto cleanup; 1043 + } 1044 + 1045 + /* 1046 + * 3) Freeze the cgroup and then populate it with 100 more 1047 + * processes. Check that the freeze time continues to grow. 1048 + */ 1049 + if (cg_freeze_nowait(cgroup, true)) 1050 + goto cleanup; 1051 + prev = curr; 1052 + curr = cg_check_freezetime(cgroup); 1053 + if (curr <= prev) { 1054 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1055 + curr, prev); 1056 + goto cleanup; 1057 + } 1058 + 1059 + for (i = 0; i < 100; i++) 1060 + cg_run_nowait(cgroup, child_fn, NULL); 1061 + prev = curr; 1062 + curr = cg_check_freezetime(cgroup); 1063 + if (curr <= prev) { 1064 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1065 + curr, prev); 1066 + goto cleanup; 1067 + } 1068 + 1069 + /* 1070 + * 4) Wait for the group to become fully populated. Check 1071 + * that the freeze time is larger than at 3). 1072 + */ 1073 + if (cg_wait_for_proc_count(cgroup, 200)) 1074 + goto cleanup; 1075 + prev = curr; 1076 + curr = cg_check_freezetime(cgroup); 1077 + if (curr <= prev) { 1078 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1079 + curr, prev); 1080 + goto cleanup; 1081 + } 1082 + 1083 + /* 1084 + * 5) Unfreeze the cgroup. Check that the freeze time is 1085 + * larger than at 4). 1086 + */ 1087 + if (cg_freeze_nowait(cgroup, false)) 1088 + goto cleanup; 1089 + prev = curr; 1090 + curr = cg_check_freezetime(cgroup); 1091 + if (curr <= prev) { 1092 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1093 + curr, prev); 1094 + goto cleanup; 1095 + } 1096 + 1097 + /* 1098 + * 6) Kill the processes. Check that the freeze time is the 1099 + * same as it was at 5). 
1100 + */ 1101 + if (cg_killall(cgroup)) 1102 + goto cleanup; 1103 + prev = curr; 1104 + curr = cg_check_freezetime(cgroup); 1105 + if (curr != prev) { 1106 + debug("Expect time (%ld) to be unchanged from previous check (%ld)\n", 1107 + curr, prev); 1108 + goto cleanup; 1109 + } 1110 + 1111 + /* 1112 + * 7) Freeze and unfreeze the cgroup. Check that the freeze 1113 + * time is larger than it was at 6). 1114 + */ 1115 + if (cg_freeze_nowait(cgroup, true)) 1116 + goto cleanup; 1117 + if (cg_freeze_nowait(cgroup, false)) 1118 + goto cleanup; 1119 + prev = curr; 1120 + curr = cg_check_freezetime(cgroup); 1121 + if (curr <= prev) { 1122 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1123 + curr, prev); 1124 + goto cleanup; 1125 + } 1126 + 1127 + ret = KSFT_PASS; 1128 + 1129 + cleanup: 1130 + if (cgroup) 1131 + cg_destroy(cgroup); 1132 + free(cgroup); 1133 + return ret; 1134 + } 1135 + 1136 + /* 1137 + * Test that frozen time for a cgroup continues to work as expected, 1138 + * even as processes are migrated. Frozen cgroup A's freeze time should 1139 + * continue to increase and running cgroup B's should stay 0. 
1140 + */ 1141 + static int test_cgfreezer_time_migrate(const char *root) 1142 + { 1143 + long prev_A, curr_A, curr_B; 1144 + char *cgroup[2] = {0}; 1145 + int ret = KSFT_FAIL; 1146 + int pid; 1147 + 1148 + cgroup[0] = cg_name(root, "cg_time_test_migrate_A"); 1149 + if (!cgroup[0]) 1150 + goto cleanup; 1151 + 1152 + cgroup[1] = cg_name(root, "cg_time_test_migrate_B"); 1153 + if (!cgroup[1]) 1154 + goto cleanup; 1155 + 1156 + if (cg_create(cgroup[0])) 1157 + goto cleanup; 1158 + 1159 + if (cg_check_freezetime(cgroup[0]) < 0) { 1160 + ret = KSFT_SKIP; 1161 + goto cleanup; 1162 + } 1163 + 1164 + if (cg_create(cgroup[1])) 1165 + goto cleanup; 1166 + 1167 + pid = cg_run_nowait(cgroup[0], child_fn, NULL); 1168 + if (pid < 0) 1169 + goto cleanup; 1170 + 1171 + if (cg_wait_for_proc_count(cgroup[0], 1)) 1172 + goto cleanup; 1173 + 1174 + curr_A = cg_check_freezetime(cgroup[0]); 1175 + if (curr_A) { 1176 + debug("Expect time (%ld) to be 0\n", curr_A); 1177 + goto cleanup; 1178 + } 1179 + curr_B = cg_check_freezetime(cgroup[1]); 1180 + if (curr_B) { 1181 + debug("Expect time (%ld) to be 0\n", curr_B); 1182 + goto cleanup; 1183 + } 1184 + 1185 + /* 1186 + * Freeze cgroup A. 1187 + */ 1188 + if (cg_freeze_wait(cgroup[0], true)) 1189 + goto cleanup; 1190 + prev_A = curr_A; 1191 + curr_A = cg_check_freezetime(cgroup[0]); 1192 + if (curr_A <= prev_A) { 1193 + debug("Expect time (%ld) to be > 0\n", curr_A); 1194 + goto cleanup; 1195 + } 1196 + 1197 + /* 1198 + * Migrate from A (frozen) to B (running). 
1199 + */ 1200 + if (cg_enter(cgroup[1], pid)) 1201 + goto cleanup; 1202 + 1203 + usleep(1000); 1204 + curr_B = cg_check_freezetime(cgroup[1]); 1205 + if (curr_B) { 1206 + debug("Expect time (%ld) to be 0\n", curr_B); 1207 + goto cleanup; 1208 + } 1209 + 1210 + prev_A = curr_A; 1211 + curr_A = cg_check_freezetime(cgroup[0]); 1212 + if (curr_A <= prev_A) { 1213 + debug("Expect time (%ld) to be more than previous check (%ld)\n", 1214 + curr_A, prev_A); 1215 + goto cleanup; 1216 + } 1217 + 1218 + ret = KSFT_PASS; 1219 + 1220 + cleanup: 1221 + if (cgroup[0]) 1222 + cg_destroy(cgroup[0]); 1223 + free(cgroup[0]); 1224 + if (cgroup[1]) 1225 + cg_destroy(cgroup[1]); 1226 + free(cgroup[1]); 1227 + return ret; 1228 + } 1229 + 1230 + /* 1231 + * The test creates a cgroup and freezes it. Then it creates a child cgroup. 1232 + * After that it checks that the child cgroup has a non-zero freeze time 1233 + * that is less than the parent's. Next, it freezes the child, unfreezes 1234 + * the parent, and sleeps. Finally, it checks that the child's freeze 1235 + * time has grown larger than the parent's. 
1236 + */ 1237 + static int test_cgfreezer_time_parent(const char *root) 1238 + { 1239 + char *parent, *child = NULL; 1240 + int ret = KSFT_FAIL; 1241 + long ptime, ctime; 1242 + 1243 + parent = cg_name(root, "cg_test_parent_A"); 1244 + if (!parent) 1245 + goto cleanup; 1246 + 1247 + child = cg_name(parent, "cg_test_parent_B"); 1248 + if (!child) 1249 + goto cleanup; 1250 + 1251 + if (cg_create(parent)) 1252 + goto cleanup; 1253 + 1254 + if (cg_check_freezetime(parent) < 0) { 1255 + ret = KSFT_SKIP; 1256 + goto cleanup; 1257 + } 1258 + 1259 + if (cg_freeze_wait(parent, true)) 1260 + goto cleanup; 1261 + 1262 + usleep(1000); 1263 + if (cg_create(child)) 1264 + goto cleanup; 1265 + 1266 + if (cg_check_frozen(child, true)) 1267 + goto cleanup; 1268 + 1269 + /* 1270 + * Since the parent was frozen the entire time the child cgroup 1271 + * was being created, we expect the parent's freeze time to be 1272 + * larger than the child's. 1273 + * 1274 + * Ideally, we would be able to check both times simultaneously, 1275 + * but here we get the child's after we get the parent's. 
1276 + */ 1277 + ptime = cg_check_freezetime(parent); 1278 + ctime = cg_check_freezetime(child); 1279 + if (ptime <= ctime) { 1280 + debug("Expect ptime (%ld) > ctime (%ld)\n", ptime, ctime); 1281 + goto cleanup; 1282 + } 1283 + 1284 + if (cg_freeze_nowait(child, true)) 1285 + goto cleanup; 1286 + 1287 + if (cg_freeze_wait(parent, false)) 1288 + goto cleanup; 1289 + 1290 + if (cg_check_frozen(child, true)) 1291 + goto cleanup; 1292 + 1293 + usleep(100000); 1294 + 1295 + ctime = cg_check_freezetime(child); 1296 + ptime = cg_check_freezetime(parent); 1297 + 1298 + if (ctime <= ptime) { 1299 + debug("Expect ctime (%ld) > ptime (%ld)\n", ctime, ptime); 1300 + goto cleanup; 1301 + } 1302 + 1303 + ret = KSFT_PASS; 1304 + 1305 + cleanup: 1306 + if (child) 1307 + cg_destroy(child); 1308 + free(child); 1309 + if (parent) 1310 + cg_destroy(parent); 1311 + free(parent); 1312 + return ret; 1313 + } 1314 + 1315 + /* 1316 + * The test creates a parent cgroup and a child cgroup. Then, it freezes 1317 + * the child and checks that the child's freeze time is greater than the 1318 + * parent's, which should be zero. 
1319 + */ 1320 + static int test_cgfreezer_time_child(const char *root) 1321 + { 1322 + char *parent, *child = NULL; 1323 + int ret = KSFT_FAIL; 1324 + long ptime, ctime; 1325 + 1326 + parent = cg_name(root, "cg_test_child_A"); 1327 + if (!parent) 1328 + goto cleanup; 1329 + 1330 + child = cg_name(parent, "cg_test_child_B"); 1331 + if (!child) 1332 + goto cleanup; 1333 + 1334 + if (cg_create(parent)) 1335 + goto cleanup; 1336 + 1337 + if (cg_check_freezetime(parent) < 0) { 1338 + ret = KSFT_SKIP; 1339 + goto cleanup; 1340 + } 1341 + 1342 + if (cg_create(child)) 1343 + goto cleanup; 1344 + 1345 + if (cg_freeze_wait(child, true)) 1346 + goto cleanup; 1347 + 1348 + ctime = cg_check_freezetime(child); 1349 + ptime = cg_check_freezetime(parent); 1350 + if (ptime != 0) { 1351 + debug("Expect ptime (%ld) to be 0\n", ptime); 1352 + goto cleanup; 1353 + } 1354 + 1355 + if (ctime <= ptime) { 1356 + debug("Expect ctime (%ld) <= ptime (%ld)\n", ctime, ptime); 1357 + goto cleanup; 1358 + } 1359 + 1360 + ret = KSFT_PASS; 1361 + 1362 + cleanup: 1363 + if (child) 1364 + cg_destroy(child); 1365 + free(child); 1366 + if (parent) 1367 + cg_destroy(parent); 1368 + free(parent); 1369 + return ret; 1370 + } 1371 + 1372 + /* 1373 + * The test creates the following hierarchy: 1374 + * A 1375 + * | 1376 + * B 1377 + * | 1378 + * C 1379 + * 1380 + * Then it freezes the cgroups in the order C, B, A. 1381 + * Then it unfreezes the cgroups in the order A, B, C. 1382 + * Then it checks that C's freeze time is larger than B's and 1383 + * that B's is larger than A's. 
1384 + */ 1385 + static int test_cgfreezer_time_nested(const char *root) 1386 + { 1387 + char *cgroup[3] = {0}; 1388 + int ret = KSFT_FAIL; 1389 + long time[3] = {0}; 1390 + int i; 1391 + 1392 + cgroup[0] = cg_name(root, "cg_test_time_A"); 1393 + if (!cgroup[0]) 1394 + goto cleanup; 1395 + 1396 + cgroup[1] = cg_name(cgroup[0], "B"); 1397 + if (!cgroup[1]) 1398 + goto cleanup; 1399 + 1400 + cgroup[2] = cg_name(cgroup[1], "C"); 1401 + if (!cgroup[2]) 1402 + goto cleanup; 1403 + 1404 + if (cg_create(cgroup[0])) 1405 + goto cleanup; 1406 + 1407 + if (cg_check_freezetime(cgroup[0]) < 0) { 1408 + ret = KSFT_SKIP; 1409 + goto cleanup; 1410 + } 1411 + 1412 + if (cg_create(cgroup[1])) 1413 + goto cleanup; 1414 + 1415 + if (cg_create(cgroup[2])) 1416 + goto cleanup; 1417 + 1418 + if (cg_freeze_nowait(cgroup[2], true)) 1419 + goto cleanup; 1420 + 1421 + if (cg_freeze_nowait(cgroup[1], true)) 1422 + goto cleanup; 1423 + 1424 + if (cg_freeze_nowait(cgroup[0], true)) 1425 + goto cleanup; 1426 + 1427 + usleep(1000); 1428 + 1429 + if (cg_freeze_nowait(cgroup[0], false)) 1430 + goto cleanup; 1431 + 1432 + if (cg_freeze_nowait(cgroup[1], false)) 1433 + goto cleanup; 1434 + 1435 + if (cg_freeze_nowait(cgroup[2], false)) 1436 + goto cleanup; 1437 + 1438 + time[2] = cg_check_freezetime(cgroup[2]); 1439 + time[1] = cg_check_freezetime(cgroup[1]); 1440 + time[0] = cg_check_freezetime(cgroup[0]); 1441 + 1442 + if (time[2] <= time[1]) { 1443 + debug("Expect C's time (%ld) > B's time (%ld)", time[2], time[1]); 1444 + goto cleanup; 1445 + } 1446 + 1447 + if (time[1] <= time[0]) { 1448 + debug("Expect B's time (%ld) > A's time (%ld)", time[1], time[0]); 1449 + goto cleanup; 1450 + } 1451 + 1452 + ret = KSFT_PASS; 1453 + 1454 + cleanup: 1455 + for (i = 2; i >= 0 && cgroup[i]; i--) { 1456 + cg_destroy(cgroup[i]); 1457 + free(cgroup[i]); 1458 + } 1459 + 1460 + return ret; 1461 + } 1462 + 807 1463 #define T(x) { x, #x } 808 1464 struct cgfreezer_test { 809 1465 int (*fn)(const char *root); ··· 
1475 819 T(test_cgfreezer_stopped), 1476 820 T(test_cgfreezer_ptraced), 1477 821 T(test_cgfreezer_vfork), 822 + T(test_cgfreezer_time_empty), 823 + T(test_cgfreezer_time_simple), 824 + T(test_cgfreezer_time_populate), 825 + T(test_cgfreezer_time_migrate), 826 + T(test_cgfreezer_time_parent), 827 + T(test_cgfreezer_time_child), 828 + T(test_cgfreezer_time_nested), 1478 829 }; 1479 830 #undef T 1480 831

+3

tools/testing/selftests/cgroup/test_pids.c

··· 77 77 char *cg_parent = NULL, *cg_child = NULL; 78 78 int pid; 79 79 80 + if (cgroup_feature("pids_localevents") <= 0) 81 + return KSFT_SKIP; 82 + 80 83 cg_parent = cg_name(root, "pids_parent"); 81 84 cg_child = cg_name(cg_parent, "pids_child"); 82 85 if (!cg_parent || !cg_child)

Configure Feed

Configure Feed