Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

cgroup: Wait for dying tasks to leave on rmdir

a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup") hid PF_EXITING
tasks from cgroup.procs so that systemd doesn't see tasks that have already
been reaped via waitpid(). However, the populated counter (nr_populated_csets)
is only decremented when the task later passes through cgroup_task_dead() in
finish_task_switch(). This means cgroup.procs can appear empty while the
cgroup is still populated, causing rmdir to fail with -EBUSY.

Fix this by making cgroup_rmdir() wait for dying tasks to fully leave. If the
cgroup is populated but all remaining tasks have PF_EXITING set (the task
iterator returns none due to the existing filter), wait for a kick from
cgroup_task_dead() and retry. The wait is brief as tasks are removed from the
cgroup's css_set between the PF_EXITING assertion in do_exit() and
cgroup_task_dead() in finish_task_switch().

v2: The cgroup_is_populated() true-to-false transition happens under
css_set_lock, not cgroup_mutex, so retest under css_set_lock before sleeping
to avoid missed wakeups (Sebastian).

Fixes: a72f73c4dd9b ("cgroup: Don't expose dead tasks in cgroup")
Reported-by: kernel test robot <oliver.sang@intel.com>
Closes: https://lore.kernel.org/oe-lkp/202603222104.2c81684e-lkp@intel.com
Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Bert Karwatzki <spasswolf@web.de>
Cc: Michal Koutny <mkoutny@suse.com>
Cc: cgroups@vger.kernel.org

+86 -3
+3
include/linux/cgroup-defs.h
···
 609  609      /* used to wait for offlining of csses */
 610  610      wait_queue_head_t offline_waitq;
 611  611
      612 +    /* used by cgroup_rmdir() to wait for dying tasks to leave */
      613 +    wait_queue_head_t dying_populated_waitq;
      614 +
 612  615      /* used to schedule release agent */
 613  616      struct work_struct release_agent_work;
 614  617
+83 -3
kernel/cgroup/cgroup.c
···
2126 2126  #endif
2127 2127
2128 2128      init_waitqueue_head(&cgrp->offline_waitq);
     2129 +    init_waitqueue_head(&cgrp->dying_populated_waitq);
2129 2130      INIT_WORK(&cgrp->release_agent_work, cgroup1_release_agent);
2130 2131  }
2131 2132
···
6225 6224      return 0;
6226 6225  };
6227 6226
     6227 + /**
     6228 +  * cgroup_drain_dying - wait for dying tasks to leave before rmdir
     6229 +  * @cgrp: the cgroup being removed
     6230 +  *
     6231 +  * The PF_EXITING filter in css_task_iter_advance() hides exiting tasks from
     6232 +  * cgroup.procs so that userspace (e.g. systemd) doesn't see tasks that have
     6233 +  * already been reaped via waitpid(). However, the populated counter
     6234 +  * (nr_populated_csets) is only decremented when the task later passes through
     6235 +  * cgroup_task_dead() in finish_task_switch(). This creates a window where
     6236 +  * cgroup.procs appears empty but cgroup_is_populated() is still true, causing
     6237 +  * rmdir to fail with -EBUSY.
     6238 +  *
     6239 +  * This function bridges that gap. If the cgroup is populated but all remaining
     6240 +  * tasks have PF_EXITING set, we wait for cgroup_task_dead() to process them.
     6241 +  * Tasks are removed from the cgroup's css_set in cgroup_task_dead() called from
     6242 +  * finish_task_switch(). As the window between PF_EXITING and cgroup_task_dead()
     6243 +  * is short, the number of PF_EXITING tasks on the list is small and the wait
     6244 +  * is brief.
     6245 +  *
     6246 +  * Each cgroup_task_dead() kicks the waitqueue via cset->cgrp_links, and we
     6247 +  * retry the full check from scratch.
     6248 +  *
     6249 +  * Must be called with cgroup_mutex held.
     6250 +  */
     6251 + static int cgroup_drain_dying(struct cgroup *cgrp)
     6252 +     __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
     6253 + {
     6254 +     struct css_task_iter it;
     6255 +     struct task_struct *task;
     6256 +     DEFINE_WAIT(wait);
     6257 +
     6258 +     lockdep_assert_held(&cgroup_mutex);
     6259 + retry:
     6260 +     if (!cgroup_is_populated(cgrp))
     6261 +         return 0;
     6262 +
     6263 +     /* Same iterator as cgroup.threads - if any task is visible, it's busy */
     6264 +     css_task_iter_start(&cgrp->self, 0, &it);
     6265 +     task = css_task_iter_next(&it);
     6266 +     css_task_iter_end(&it);
     6267 +
     6268 +     if (task)
     6269 +         return -EBUSY;
     6270 +
     6271 +     /*
     6272 +      * All remaining tasks are PF_EXITING and will pass through
     6273 +      * cgroup_task_dead() shortly. Wait for a kick and retry.
     6274 +      *
     6275 +      * cgroup_is_populated() can't transition from false to true while
     6276 +      * we're holding cgroup_mutex, but the true to false transition
     6277 +      * happens under css_set_lock (via cgroup_task_dead()). We must
     6278 +      * retest and prepare_to_wait() under css_set_lock. Otherwise, the
     6279 +      * transition can happen between our first test and
     6280 +      * prepare_to_wait(), and we sleep with no one to wake us.
     6281 +      */
     6282 +     spin_lock_irq(&css_set_lock);
     6283 +     if (!cgroup_is_populated(cgrp)) {
     6284 +         spin_unlock_irq(&css_set_lock);
     6285 +         return 0;
     6286 +     }
     6287 +     prepare_to_wait(&cgrp->dying_populated_waitq, &wait,
     6288 +                     TASK_UNINTERRUPTIBLE);
     6289 +     spin_unlock_irq(&css_set_lock);
     6290 +     mutex_unlock(&cgroup_mutex);
     6291 +     schedule();
     6292 +     finish_wait(&cgrp->dying_populated_waitq, &wait);
     6293 +     mutex_lock(&cgroup_mutex);
     6294 +     goto retry;
     6295 + }
     6296 +
6228 6297  int cgroup_rmdir(struct kernfs_node *kn)
6229 6298  {
6230 6299      struct cgroup *cgrp;
···
6304 6233      if (!cgrp)
6305 6234          return 0;
6306 6235
6307      -     ret = cgroup_destroy_locked(cgrp);
6308      -     if (!ret)
6309      -         TRACE_CGROUP_PATH(rmdir, cgrp);
     6236 +     ret = cgroup_drain_dying(cgrp);
     6237 +     if (!ret) {
     6238 +         ret = cgroup_destroy_locked(cgrp);
     6239 +         if (!ret)
     6240 +             TRACE_CGROUP_PATH(rmdir, cgrp);
     6241 +     }
6310 6242
6311 6243      cgroup_kn_unlock(kn);
6312 6244      return ret;
···
7069 6995
7070 6996  static void do_cgroup_task_dead(struct task_struct *tsk)
7071 6997  {
     6998 +     struct cgrp_cset_link *link;
7072 6999      struct css_set *cset;
7073 7000      unsigned long flags;
···
7082 7007      /* matches the signal->live check in css_task_iter_advance() */
7083 7008      if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
7084 7009          list_add_tail(&tsk->cg_list, &cset->dying_tasks);
     7010 +
     7011 +     /* kick cgroup_drain_dying() waiters, see cgroup_rmdir() */
     7012 +     list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
     7013 +         if (waitqueue_active(&link->cgrp->dying_populated_waitq))
     7014 +             wake_up(&link->cgrp->dying_populated_waitq);
7085 7015
7086 7016      if (dl_task(tsk))
7087 7017          dec_dl_tasks_cs(tsk);