Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

- Fix cgroup rmdir racing with dying tasks.

Deferred task cgroup unlink introduced a window where cgroup.procs
is empty but the cgroup is still populated, causing rmdir to fail
with -EBUSY and selftests to fail.

Make rmdir wait for dying tasks to fully leave and fix selftests to
not depend on synchronous populated updates.

- Fix cpuset v1 task migration failure from empty cpusets under strict
security policies.

When CPU hotplug removes the last CPU from a v1 cpuset, tasks must be
migrated to an ancestor without a security_task_setscheduler() check
that would block the migration.

* tag 'cgroup-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup/cpuset: Skip security check for hotplug induced v1 task migration
cgroup/cpuset: Simplify setsched decision check in task iteration loop of cpuset_can_attach()
cgroup: Fix cgroup_drain_dying() testing the wrong condition
selftests/cgroup: Don't require synchronous populated update on task exit
cgroup: Wait for dying tasks to leave on rmdir

+131 -16
+3
include/linux/cgroup-defs.h
··· 609 609 /* used to wait for offlining of csses */ 610 610 wait_queue_head_t offline_waitq; 611 611 612 + /* used by cgroup_rmdir() to wait for dying tasks to leave */ 613 + wait_queue_head_t dying_populated_waitq; 614 + 612 615 /* used to schedule release agent */ 613 616 struct work_struct release_agent_work; 614 617
+85 -3
kernel/cgroup/cgroup.c
··· 2126 2126 #endif 2127 2127 2128 2128 init_waitqueue_head(&cgrp->offline_waitq); 2129 + init_waitqueue_head(&cgrp->dying_populated_waitq); 2129 2130 INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent); 2130 2131 } 2131 2132 ··· 6225 6224 return 0; 6226 6225 }; 6227 6226 6227 + /** 6228 + * cgroup_drain_dying - wait for dying tasks to leave before rmdir 6229 + * @cgrp: the cgroup being removed 6230 + * 6231 + * cgroup.procs and cgroup.threads use css_task_iter which filters out 6232 + * PF_EXITING tasks so that userspace doesn't see tasks that have already been 6233 + * reaped via waitpid(). However, cgroup_has_tasks() - which tests whether the 6234 + * cgroup has non-empty css_sets - is only updated when dying tasks pass through 6235 + * cgroup_task_dead() in finish_task_switch(). This creates a window where 6236 + * cgroup.procs reads empty but cgroup_has_tasks() is still true, making rmdir 6237 + * fail with -EBUSY from cgroup_destroy_locked() even though userspace sees no 6238 + * tasks. 6239 + * 6240 + * This function aligns cgroup_has_tasks() with what userspace can observe. If 6241 + * cgroup_has_tasks() but the task iterator sees nothing (all remaining tasks are 6242 + * PF_EXITING), we wait for cgroup_task_dead() to finish processing them. As the 6243 + * window between PF_EXITING and cgroup_task_dead() is short, the wait is brief. 6244 + * 6245 + * This function only concerns itself with this cgroup's own dying tasks. 6246 + * Whether the cgroup has children is cgroup_destroy_locked()'s problem. 6247 + * 6248 + * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we 6249 + * retry the full check from scratch. 6250 + * 6251 + * Must be called with cgroup_mutex held. 
6252 + */ 6253 + static int cgroup_drain_dying(struct cgroup *cgrp) 6254 + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 6255 + { 6256 + struct css_task_iter it; 6257 + struct task_struct *task; 6258 + DEFINE_WAIT(wait); 6259 + 6260 + lockdep_assert_held(&cgroup_mutex); 6261 + retry: 6262 + if (!cgroup_has_tasks(cgrp)) 6263 + return 0; 6264 + 6265 + /* Same iterator as cgroup.threads - if any task is visible, it's busy */ 6266 + css_task_iter_start(&cgrp->self, 0, &it); 6267 + task = css_task_iter_next(&it); 6268 + css_task_iter_end(&it); 6269 + 6270 + if (task) 6271 + return -EBUSY; 6272 + 6273 + /* 6274 + * All remaining tasks are PF_EXITING and will pass through 6275 + * cgroup_task_dead() shortly. Wait for a kick and retry. 6276 + * 6277 + * cgroup_has_tasks() can't transition from false to true while we're 6278 + * holding cgroup_mutex, but the true to false transition happens 6279 + * under css_set_lock (via cgroup_task_dead()). We must retest and 6280 + * prepare_to_wait() under css_set_lock. Otherwise, the transition 6281 + * can happen between our first test and prepare_to_wait(), and we 6282 + * sleep with no one to wake us. 
6283 + */ 6284 + spin_lock_irq(&css_set_lock); 6285 + if (!cgroup_has_tasks(cgrp)) { 6286 + spin_unlock_irq(&css_set_lock); 6287 + return 0; 6288 + } 6289 + prepare_to_wait(&cgrp->dying_populated_waitq, &wait, 6290 + TASK_UNINTERRUPTIBLE); 6291 + spin_unlock_irq(&css_set_lock); 6292 + mutex_unlock(&cgroup_mutex); 6293 + schedule(); 6294 + finish_wait(&cgrp->dying_populated_waitq, &wait); 6295 + mutex_lock(&cgroup_mutex); 6296 + goto retry; 6297 + } 6298 + 6228 6299 int cgroup_rmdir(struct kernfs_node *kn) 6229 6300 { 6230 6301 struct cgroup *cgrp; ··· 6306 6233 if (!cgrp) 6307 6234 return 0; 6308 6235 6309 - ret = cgroup_destroy_locked(cgrp); 6310 - if (!ret) 6311 - TRACE_CGROUP_PATH(rmdir, cgrp); 6236 + ret = cgroup_drain_dying(cgrp); 6237 + if (!ret) { 6238 + ret = cgroup_destroy_locked(cgrp); 6239 + if (!ret) 6240 + TRACE_CGROUP_PATH(rmdir, cgrp); 6241 + } 6312 6242 6313 6243 cgroup_kn_unlock(kn); 6314 6244 return ret; ··· 7071 6995 7072 6996 static void do_cgroup_task_dead(struct task_struct *tsk) 7073 6997 { 6998 + struct cgrp_cset_link *link; 7074 6999 struct css_set *cset; 7075 7000 unsigned long flags; 7076 7001 ··· 7084 7007 /* matches the signal->live check in css_task_iter_advance() */ 7085 7008 if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live)) 7086 7009 list_add_tail(&tsk->cg_list, &cset->dying_tasks); 7010 + 7011 + /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */ 7012 + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) 7013 + if (waitqueue_active(&link->cgrp->dying_populated_waitq)) 7014 + wake_up(&link->cgrp->dying_populated_waitq); 7087 7015 7088 7016 if (dl_task(tsk)) 7089 7017 dec_dl_tasks_cs(tsk);
+20 -9
kernel/cgroup/cpuset.c
··· 2988 2988 struct cgroup_subsys_state *css; 2989 2989 struct cpuset *cs, *oldcs; 2990 2990 struct task_struct *task; 2991 - bool cpus_updated, mems_updated; 2991 + bool setsched_check; 2992 2992 int ret; 2993 2993 2994 2994 /* used later by cpuset_attach() */ ··· 3003 3003 if (ret) 3004 3004 goto out_unlock; 3005 3005 3006 - cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); 3007 - mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3006 + /* 3007 + * Skip rights over task setsched check in v2 when nothing changes, 3008 + * migration permission derives from hierarchy ownership in 3009 + * cgroup_procs_write_permission()). 3010 + */ 3011 + setsched_check = !cpuset_v2() || 3012 + !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) || 3013 + !nodes_equal(cs->effective_mems, oldcs->effective_mems); 3014 + 3015 + /* 3016 + * A v1 cpuset with tasks will have no CPU left only when CPU hotplug 3017 + * brings the last online CPU offline as users are not allowed to empty 3018 + * cpuset.cpus when there are active tasks inside. When that happens, 3019 + * we should allow tasks to migrate out without security check to make 3020 + * sure they will be able to run after migration. 3021 + */ 3022 + if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus)) 3023 + setsched_check = false; 3008 3024 3009 3025 cgroup_taskset_for_each(task, css, tset) { 3010 3026 ret = task_can_attach(task); 3011 3027 if (ret) 3012 3028 goto out_unlock; 3013 3029 3014 - /* 3015 - * Skip rights over task check in v2 when nothing changes, 3016 - * migration permission derives from hierarchy ownership in 3017 - * cgroup_procs_write_permission()). 3018 - */ 3019 - if (!cpuset_v2() || (cpus_updated || mems_updated)) { 3030 + if (setsched_check) { 3020 3031 ret = security_task_setscheduler(task); 3021 3032 if (ret) 3022 3033 goto out_unlock;
+15
tools/testing/selftests/cgroup/lib/cgroup_util.c
··· 123 123 return ret; 124 124 } 125 125 126 + int cg_read_strcmp_wait(const char *cgroup, const char *control, 127 + const char *expected) 128 + { 129 + int i, ret; 130 + 131 + for (i = 0; i < 100; i++) { 132 + ret = cg_read_strcmp(cgroup, control, expected); 133 + if (!ret) 134 + return ret; 135 + usleep(10000); 136 + } 137 + 138 + return ret; 139 + } 140 + 126 141 int cg_read_strstr(const char *cgroup, const char *control, const char *needle) 127 142 { 128 143 char buf[PAGE_SIZE];
+2
tools/testing/selftests/cgroup/lib/include/cgroup_util.h
··· 61 61 char *buf, size_t len); 62 62 extern int cg_read_strcmp(const char *cgroup, const char *control, 63 63 const char *expected); 64 + extern int cg_read_strcmp_wait(const char *cgroup, const char *control, 65 + const char *expected); 64 66 extern int cg_read_strstr(const char *cgroup, const char *control, 65 67 const char *needle); 66 68 extern long cg_read_long(const char *cgroup, const char *control);
+2 -1
tools/testing/selftests/cgroup/test_core.c
··· 233 233 if (err) 234 234 goto cleanup; 235 235 236 - if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n")) 236 + if (cg_read_strcmp_wait(cg_test_d, "cgroup.events", 237 + "populated 0\n")) 237 238 goto cleanup; 238 239 239 240 /* Remove cgroup. */
+4 -3
tools/testing/selftests/cgroup/test_kill.c
··· 86 86 wait_for_pid(pids[i]); 87 87 88 88 if (ret == KSFT_PASS && 89 - cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) 89 + cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n")) 90 90 ret = KSFT_FAIL; 91 91 92 92 if (cgroup) ··· 190 190 wait_for_pid(pids[i]); 191 191 192 192 if (ret == KSFT_PASS && 193 - cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n")) 193 + cg_read_strcmp_wait(cgroup[0], "cgroup.events", 194 + "populated 0\n")) 194 195 ret = KSFT_FAIL; 195 196 196 197 for (i = 9; i >= 0 && cgroup[i]; i--) { ··· 252 251 wait_for_pid(pid); 253 252 254 253 if (ret == KSFT_PASS && 255 - cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) 254 + cg_read_strcmp_wait(cgroup, "cgroup.events", "populated 0\n")) 256 255 ret = KSFT_FAIL; 257 256 258 257 if (cgroup)