cpuset: Defer flushing of the cpuset_migrate_mm_wq to task_work

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Currently, in cpuset_attach(), we have to wait synchronously for
flush_workqueue() to complete. The time taken to flush
cpuset_migrate_mm_wq depends on the amount of mm migration initiated by
cpusets at that time. When the cpuset.mems of a cgroup that occupies a
large amount of memory is modified, it may trigger extensive mm migration,
causing cpuset_attach() to block in flush_workqueue() for an extended
period. This is dangerous because cpuset_attach() runs inside the
cgroup_mutex critical section, so it may ultimately cause all
cgroup-related operations in the system to be blocked.

This patch attempts to defer the flush_workqueue() operation until
returning to userspace by using task_work, as originally proposed by
Tejun [1], so that the flush happens after cgroup_mutex is dropped. That
way we keep the operation synchronous while avoiding bothering anyone else.

[1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883

Originally-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Chuyi Zhou <zhouchuyi@bytedance.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Chuyi Zhou and committed by

Tejun Heo 9 months ago 3514309e c0fb16ef

+24 -5

1 changed file

expand all

kernel

cgroup

cpuset.c

+24 -5

kernel/cgroup/cpuset.c

··· 40 40 #include <linux/sched/isolation.h> 41 41 #include <linux/wait.h> 42 42 #include <linux/workqueue.h> 43 + #include <linux/task_work.h> 43 44 44 45 DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key); 45 46 DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); ··· 2620 2619 } 2621 2620 } 2622 2621 2623 - static void cpuset_post_attach(void) 2622 + static void flush_migrate_mm_task_workfn(struct callback_head *head) 2624 2623 { 2625 2624 flush_workqueue(cpuset_migrate_mm_wq); 2625 + kfree(head); 2626 + } 2627 + 2628 + static void schedule_flush_migrate_mm(void) 2629 + { 2630 + struct callback_head *flush_cb; 2631 + 2632 + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL); 2633 + if (!flush_cb) 2634 + return; 2635 + 2636 + init_task_work(flush_cb, flush_migrate_mm_task_workfn); 2637 + 2638 + if (task_work_add(current, flush_cb, TWA_RESUME)) 2639 + kfree(flush_cb); 2626 2640 } 2627 2641 2628 2642 /* ··· 3194 3178 struct cpuset *cs; 3195 3179 struct cpuset *oldcs = cpuset_attach_old_cs; 3196 3180 bool cpus_updated, mems_updated; 3181 + bool queue_task_work = false; 3197 3182 3198 3183 cgroup_taskset_first(tset, &css); 3199 3184 cs = css_cs(css); ··· 3245 3228 * @old_mems_allowed is the right nodesets that we 3246 3229 * migrate mm from. 
3247 3230 */ 3248 - if (is_memory_migrate(cs)) 3231 + if (is_memory_migrate(cs)) { 3249 3232 cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 3250 3233 &cpuset_attach_nodemask_to); 3251 - else 3234 + queue_task_work = true; 3235 + } else 3252 3236 mmput(mm); 3253 3237 } 3254 3238 } 3255 3239 3256 3240 out: 3241 + if (queue_task_work) 3242 + schedule_flush_migrate_mm(); 3257 3243 cs->old_mems_allowed = cpuset_attach_nodemask_to; 3258 3244 3259 3245 if (cs->nr_migrate_dl_tasks) { ··· 3312 3292 out_unlock: 3313 3293 cpuset_full_unlock(); 3314 3294 if (of_cft(of)->private == FILE_MEMLIST) 3315 - flush_workqueue(cpuset_migrate_mm_wq); 3295 + schedule_flush_migrate_mm(); 3316 3296 return retval ?: nbytes; 3317 3297 } 3318 3298 ··· 3759 3739 .can_attach = cpuset_can_attach, 3760 3740 .cancel_attach = cpuset_cancel_attach, 3761 3741 .attach = cpuset_attach, 3762 - .post_attach = cpuset_post_attach, 3763 3742 .bind = cpuset_bind, 3764 3743 .can_fork = cpuset_can_fork, 3765 3744 .cancel_fork = cpuset_cancel_fork,

Configure Feed

Configure Feed