Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched_ext: Add cgroup support

Add sched_ext_ops operations to init/exit cgroups, and track task migrations
and config changes. A BPF scheduler may implement none or only a
subset of cgroup features. The implemented features can be indicated using
%SCX_OPS_HAS_CGROUP_* flags. If cgroup configuration makes use of features
that are not implemented, a warning is triggered.

While a BPF scheduler is being enabled and disabled, relevant cgroup
operations are locked out using scx_cgroup_rwsem. This avoids situations
like task prep taking place while the task is being moved across cgroups,
making things easier for BPF schedulers.

v7: - cgroup interface file visibility toggling is dropped in favor of just
warning messages. Dynamically changing interface visibility caused more
confusion than it helped.

v6: - Updated to reflect the removal of SCX_KF_SLEEPABLE.

- Updated to use CONFIG_GROUP_SCHED_WEIGHT and fixes for
!CONFIG_FAIR_GROUP_SCHED && CONFIG_EXT_GROUP_SCHED.

v5: - Flipped the locking order between scx_cgroup_rwsem and
cpus_read_lock() to avoid locking order conflict w/ cpuset. Better
documentation around locking.

- sched_move_task() takes an early exit if the source and destination
are identical. This triggered the warning in scx_cgroup_can_attach()
as it left p->scx.cgrp_moving_from uncleared. Updated the cgroup
migration path so that ops.cgroup_prep_move() is skipped for identity
migrations so that its invocations always match ops.cgroup_move()
one-to-one.

v4: - Example schedulers moved into their own patches.

- Fix build failure when !CONFIG_CGROUP_SCHED, reported by Andrea Righi.

v3: - Make scx_example_pair switch all tasks by default.

- Convert to BPF inline iterators.

- scx_bpf_task_cgroup() is added to determine the current cgroup from
CPU controller's POV. This allows BPF schedulers to accurately track
CPU cgroup membership.

- scx_example_flatcg added. This demonstrates flattened hierarchy
implementation of CPU cgroup control and shows significant performance
improvement when cgroups which are nested multiple levels are under
competition.

v2: - Build fixes for different CONFIG combinations.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
Reported-by: kernel test robot <lkp@intel.com>
Cc: Andrea Righi <andrea.righi@canonical.com>

+636 -19
+3
include/linux/sched/ext.h
··· 188 188 bool disallow; /* reject switching into SCX */ 189 189 190 190 /* cold fields */ 191 + #ifdef CONFIG_EXT_GROUP_SCHED 192 + struct cgroup *cgrp_moving_from; 193 + #endif 191 194 /* must be the last field, see init_scx_entity() */ 192 195 struct list_head tasks_node; 193 196 };
+6
init/Kconfig
··· 1055 1055 realtime bandwidth for them. 1056 1056 See Documentation/scheduler/sched-rt-group.rst for more information. 1057 1057 1058 + config EXT_GROUP_SCHED 1059 + bool 1060 + depends on SCHED_CLASS_EXT && CGROUP_SCHED 1061 + select GROUP_SCHED_WEIGHT 1062 + default y 1063 + 1058 1064 endif #CGROUP_SCHED 1059 1065 1060 1066 config SCHED_MM_CID
+57 -10
kernel/sched/core.c
··· 8364 8364 root_task_group.shares = ROOT_TASK_GROUP_LOAD; 8365 8365 init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); 8366 8366 #endif /* CONFIG_FAIR_GROUP_SCHED */ 8367 + #ifdef CONFIG_EXT_GROUP_SCHED 8368 + root_task_group.scx_weight = CGROUP_WEIGHT_DFL; 8369 + #endif /* CONFIG_EXT_GROUP_SCHED */ 8367 8370 #ifdef CONFIG_RT_GROUP_SCHED 8368 8371 root_task_group.rt_se = (struct sched_rt_entity **)ptr; 8369 8372 ptr += nr_cpu_ids * sizeof(void **); ··· 8804 8801 if (!alloc_rt_sched_group(tg, parent)) 8805 8802 goto err; 8806 8803 8804 + scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); 8807 8805 alloc_uclamp_sched_group(tg, parent); 8808 8806 8809 8807 return tg; ··· 8932 8928 put_prev_task(rq, tsk); 8933 8929 8934 8930 sched_change_group(tsk, group); 8931 + scx_move_task(tsk); 8935 8932 8936 8933 if (queued) 8937 8934 enqueue_task(rq, tsk, queue_flags); ··· 8970 8965 { 8971 8966 struct task_group *tg = css_tg(css); 8972 8967 struct task_group *parent = css_tg(css->parent); 8968 + int ret; 8969 + 8970 + ret = scx_tg_online(tg); 8971 + if (ret) 8972 + return ret; 8973 8973 8974 8974 if (parent) 8975 8975 sched_online_group(tg, parent); ··· 8987 8977 #endif 8988 8978 8989 8979 return 0; 8980 + } 8981 + 8982 + static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) 8983 + { 8984 + struct task_group *tg = css_tg(css); 8985 + 8986 + scx_tg_offline(tg); 8990 8987 } 8991 8988 8992 8989 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ··· 9013 8996 sched_unregister_group(tg); 9014 8997 } 9015 8998 9016 - #ifdef CONFIG_RT_GROUP_SCHED 9017 8999 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 9018 9000 { 9001 + #ifdef CONFIG_RT_GROUP_SCHED 9019 9002 struct task_struct *task; 9020 9003 struct cgroup_subsys_state *css; 9021 9004 ··· 9023 9006 if (!sched_rt_can_attach(css_tg(css), task)) 9024 9007 return -EINVAL; 9025 9008 } 9026 - return 0; 9027 - } 9028 9009 #endif 9010 + return scx_cgroup_can_attach(tset); 9011 + } 9029 9012 
9030 9013 static void cpu_cgroup_attach(struct cgroup_taskset *tset) 9031 9014 { ··· 9034 9017 9035 9018 cgroup_taskset_for_each(task, css, tset) 9036 9019 sched_move_task(task); 9020 + 9021 + scx_cgroup_finish_attach(); 9022 + } 9023 + 9024 + static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) 9025 + { 9026 + scx_cgroup_cancel_attach(tset); 9037 9027 } 9038 9028 9039 9029 #ifdef CONFIG_UCLAMP_TASK_GROUP ··· 9220 9196 #ifdef CONFIG_GROUP_SCHED_WEIGHT 9221 9197 static unsigned long tg_weight(struct task_group *tg) 9222 9198 { 9199 + #ifdef CONFIG_FAIR_GROUP_SCHED 9223 9200 return scale_load_down(tg->shares); 9201 + #else 9202 + return sched_weight_from_cgroup(tg->scx_weight); 9203 + #endif 9224 9204 } 9225 9205 9226 9206 static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 9227 9207 struct cftype *cftype, u64 shareval) 9228 9208 { 9209 + int ret; 9210 + 9229 9211 if (shareval > scale_load_down(ULONG_MAX)) 9230 9212 shareval = MAX_SHARES; 9231 - return sched_group_set_shares(css_tg(css), scale_load(shareval)); 9213 + ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); 9214 + if (!ret) 9215 + scx_group_set_weight(css_tg(css), 9216 + sched_weight_to_cgroup(shareval)); 9217 + return ret; 9232 9218 } 9233 9219 9234 9220 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, ··· 9629 9595 static int cpu_idle_write_s64(struct cgroup_subsys_state *css, 9630 9596 struct cftype *cft, s64 idle) 9631 9597 { 9632 - return sched_group_set_idle(css_tg(css), idle); 9598 + int ret; 9599 + 9600 + ret = sched_group_set_idle(css_tg(css), idle); 9601 + if (!ret) 9602 + scx_group_set_idle(css_tg(css), idle); 9603 + return ret; 9633 9604 } 9634 9605 #endif 9635 9606 ··· 9761 9722 struct cftype *cft, u64 cgrp_weight) 9762 9723 { 9763 9724 unsigned long weight; 9725 + int ret; 9764 9726 9765 9727 if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) 9766 9728 return -ERANGE; 9767 9729 9768 9730 weight = 
sched_weight_from_cgroup(cgrp_weight); 9769 9731 9770 - return sched_group_set_shares(css_tg(css), scale_load(weight)); 9732 + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); 9733 + if (!ret) 9734 + scx_group_set_weight(css_tg(css), cgrp_weight); 9735 + return ret; 9771 9736 } 9772 9737 9773 9738 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, ··· 9796 9753 struct cftype *cft, s64 nice) 9797 9754 { 9798 9755 unsigned long weight; 9799 - int idx; 9756 + int idx, ret; 9800 9757 9801 9758 if (nice < MIN_NICE || nice > MAX_NICE) 9802 9759 return -ERANGE; ··· 9805 9762 idx = array_index_nospec(idx, 40); 9806 9763 weight = sched_prio_to_weight[idx]; 9807 9764 9808 - return sched_group_set_shares(css_tg(css), scale_load(weight)); 9765 + ret = sched_group_set_shares(css_tg(css), scale_load(weight)); 9766 + if (!ret) 9767 + scx_group_set_weight(css_tg(css), 9768 + sched_weight_to_cgroup(weight)); 9769 + return ret; 9809 9770 } 9810 9771 #endif /* CONFIG_GROUP_SCHED_WEIGHT */ 9811 9772 ··· 9925 9878 struct cgroup_subsys cpu_cgrp_subsys = { 9926 9879 .css_alloc = cpu_cgroup_css_alloc, 9927 9880 .css_online = cpu_cgroup_css_online, 9881 + .css_offline = cpu_cgroup_css_offline, 9928 9882 .css_released = cpu_cgroup_css_released, 9929 9883 .css_free = cpu_cgroup_css_free, 9930 9884 .css_extra_stat_show = cpu_extra_stat_show, 9931 9885 .css_local_stat_show = cpu_local_stat_show, 9932 - #ifdef CONFIG_RT_GROUP_SCHED 9933 9886 .can_attach = cpu_cgroup_can_attach, 9934 - #endif 9935 9887 .attach = cpu_cgroup_attach, 9888 + .cancel_attach = cpu_cgroup_cancel_attach, 9936 9889 .legacy_cftypes = cpu_legacy_files, 9937 9890 .dfl_cftypes = cpu_files, 9938 9891 .early_init = true,
+510 -9
kernel/sched/ext.c
··· 116 116 */ 117 117 SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, 118 118 119 + /* 120 + * CPU cgroup support flags 121 + */ 122 + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ 123 + 119 124 SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | 120 125 SCX_OPS_ENQ_LAST | 121 126 SCX_OPS_ENQ_EXITING | 122 - SCX_OPS_SWITCH_PARTIAL, 127 + SCX_OPS_SWITCH_PARTIAL | 128 + SCX_OPS_HAS_CGROUP_WEIGHT, 123 129 }; 124 130 125 131 /* argument container for ops.init_task() */ ··· 135 129 * to the scheduler transition path. 136 130 */ 137 131 bool fork; 132 + #ifdef CONFIG_EXT_GROUP_SCHED 133 + /* the cgroup the task is joining */ 134 + struct cgroup *cgroup; 135 + #endif 138 136 }; 139 137 140 138 /* argument container for ops.exit_task() */ 141 139 struct scx_exit_task_args { 142 140 /* Whether the task exited before running on sched_ext. */ 143 141 bool cancelled; 142 + }; 143 + 144 + /* argument container for ops->cgroup_init() */ 145 + struct scx_cgroup_init_args { 146 + /* the weight of the cgroup [1..10000] */ 147 + u32 weight; 144 148 }; 145 149 146 150 enum scx_cpu_preempt_reason { ··· 517 501 */ 518 502 void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); 519 503 504 + #ifdef CONFIG_EXT_GROUP_SCHED 505 + /** 506 + * cgroup_init - Initialize a cgroup 507 + * @cgrp: cgroup being initialized 508 + * @args: init arguments, see the struct definition 509 + * 510 + * Either the BPF scheduler is being loaded or @cgrp created, initialize 511 + * @cgrp for sched_ext. This operation may block. 512 + * 513 + * Return 0 for success, -errno for failure. An error return while 514 + * loading will abort loading of the BPF scheduler. During cgroup 515 + * creation, it will abort the specific cgroup creation. 
516 + */ 517 + s32 (*cgroup_init)(struct cgroup *cgrp, 518 + struct scx_cgroup_init_args *args); 519 + 520 + /** 521 + * cgroup_exit - Exit a cgroup 522 + * @cgrp: cgroup being exited 523 + * 524 + * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit 525 + * @cgrp for sched_ext. This operation may block. 526 + */ 527 + void (*cgroup_exit)(struct cgroup *cgrp); 528 + 529 + /** 530 + * cgroup_prep_move - Prepare a task to be moved to a different cgroup 531 + * @p: task being moved 532 + * @from: cgroup @p is being moved from 533 + * @to: cgroup @p is being moved to 534 + * 535 + * Prepare @p for move from cgroup @from to @to. This operation may 536 + * block and can be used for allocations. 537 + * 538 + * Return 0 for success, -errno for failure. An error return aborts the 539 + * migration. 540 + */ 541 + s32 (*cgroup_prep_move)(struct task_struct *p, 542 + struct cgroup *from, struct cgroup *to); 543 + 544 + /** 545 + * cgroup_move - Commit cgroup move 546 + * @p: task being moved 547 + * @from: cgroup @p is being moved from 548 + * @to: cgroup @p is being moved to 549 + * 550 + * Commit the move. @p is dequeued during this operation. 551 + */ 552 + void (*cgroup_move)(struct task_struct *p, 553 + struct cgroup *from, struct cgroup *to); 554 + 555 + /** 556 + * cgroup_cancel_move - Cancel cgroup move 557 + * @p: task whose cgroup move is being canceled 558 + * @from: cgroup @p was being moved from 559 + * @to: cgroup @p was being moved to 560 + * 561 + * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). 562 + * Undo the preparation. 563 + */ 564 + void (*cgroup_cancel_move)(struct task_struct *p, 565 + struct cgroup *from, struct cgroup *to); 566 + 567 + /** 568 + * cgroup_set_weight - A cgroup's weight is being changed 569 + * @cgrp: cgroup whose weight is being updated 570 + * @weight: new weight [1..10000] 571 + * 572 + * Update @cgrp's weight to @weight. 
573 + */ 574 + void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); 575 + #endif /* CONFIG_CGROUPS */ 576 + 520 577 /* 521 578 * All online ops must come before ops.cpu_online(). 522 579 */ ··· 770 681 * return after the target CPU finishes picking the next task. 771 682 */ 772 683 SCX_KICK_WAIT = 1LLU << 2, 684 + }; 685 + 686 + enum scx_tg_flags { 687 + SCX_TG_ONLINE = 1U << 0, 688 + SCX_TG_INITED = 1U << 1, 773 689 }; 774 690 775 691 enum scx_ops_enable_state { ··· 3329 3235 resched_curr(rq); 3330 3236 } 3331 3237 3238 + #ifdef CONFIG_EXT_GROUP_SCHED 3239 + static struct cgroup *tg_cgrp(struct task_group *tg) 3240 + { 3241 + /* 3242 + * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, 3243 + * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the 3244 + * root cgroup. 3245 + */ 3246 + if (tg && tg->css.cgroup) 3247 + return tg->css.cgroup; 3248 + else 3249 + return &cgrp_dfl_root.cgrp; 3250 + } 3251 + 3252 + #define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), 3253 + 3254 + #else /* CONFIG_EXT_GROUP_SCHED */ 3255 + 3256 + #define SCX_INIT_TASK_ARGS_CGROUP(tg) 3257 + 3258 + #endif /* CONFIG_EXT_GROUP_SCHED */ 3259 + 3332 3260 static enum scx_task_state scx_get_task_state(const struct task_struct *p) 3333 3261 { 3334 3262 return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; ··· 3395 3279 3396 3280 if (SCX_HAS_OP(init_task)) { 3397 3281 struct scx_init_task_args args = { 3282 + SCX_INIT_TASK_ARGS_CGROUP(tg) 3398 3283 .fork = fork, 3399 3284 }; 3400 3285 ··· 3460 3343 scx_set_task_state(p, SCX_TASK_ENABLED); 3461 3344 3462 3345 if (SCX_HAS_OP(set_weight)) 3463 - SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight); 3346 + SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); 3464 3347 } 3465 3348 3466 3349 static void scx_ops_disable_task(struct task_struct *p) ··· 3672 3555 } 3673 3556 #endif 3674 3557 3558 + #ifdef CONFIG_EXT_GROUP_SCHED 3559 + 3560 + DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); 
3561 + static bool cgroup_warned_missing_weight; 3562 + static bool cgroup_warned_missing_idle; 3563 + 3564 + static void scx_cgroup_warn_missing_weight(struct task_group *tg) 3565 + { 3566 + if (scx_ops_enable_state() == SCX_OPS_DISABLED || 3567 + cgroup_warned_missing_weight) 3568 + return; 3569 + 3570 + if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) 3571 + return; 3572 + 3573 + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", 3574 + scx_ops.name); 3575 + cgroup_warned_missing_weight = true; 3576 + } 3577 + 3578 + static void scx_cgroup_warn_missing_idle(struct task_group *tg) 3579 + { 3580 + if (scx_ops_enable_state() == SCX_OPS_DISABLED || 3581 + cgroup_warned_missing_idle) 3582 + return; 3583 + 3584 + if (!tg->idle) 3585 + return; 3586 + 3587 + pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", 3588 + scx_ops.name); 3589 + cgroup_warned_missing_idle = true; 3590 + } 3591 + 3592 + int scx_tg_online(struct task_group *tg) 3593 + { 3594 + int ret = 0; 3595 + 3596 + WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); 3597 + 3598 + percpu_down_read(&scx_cgroup_rwsem); 3599 + 3600 + scx_cgroup_warn_missing_weight(tg); 3601 + 3602 + if (SCX_HAS_OP(cgroup_init)) { 3603 + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; 3604 + 3605 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3606 + tg->css.cgroup, &args); 3607 + if (!ret) 3608 + tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; 3609 + else 3610 + ret = ops_sanitize_err("cgroup_init", ret); 3611 + } else { 3612 + tg->scx_flags |= SCX_TG_ONLINE; 3613 + } 3614 + 3615 + percpu_up_read(&scx_cgroup_rwsem); 3616 + return ret; 3617 + } 3618 + 3619 + void scx_tg_offline(struct task_group *tg) 3620 + { 3621 + WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); 3622 + 3623 + percpu_down_read(&scx_cgroup_rwsem); 3624 + 3625 + if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) 3626 + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, 
tg->css.cgroup); 3627 + tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); 3628 + 3629 + percpu_up_read(&scx_cgroup_rwsem); 3630 + } 3631 + 3632 + int scx_cgroup_can_attach(struct cgroup_taskset *tset) 3633 + { 3634 + struct cgroup_subsys_state *css; 3635 + struct task_struct *p; 3636 + int ret; 3637 + 3638 + /* released in scx_finish/cancel_attach() */ 3639 + percpu_down_read(&scx_cgroup_rwsem); 3640 + 3641 + if (!scx_enabled()) 3642 + return 0; 3643 + 3644 + cgroup_taskset_for_each(p, css, tset) { 3645 + struct cgroup *from = tg_cgrp(task_group(p)); 3646 + struct cgroup *to = tg_cgrp(css_tg(css)); 3647 + 3648 + WARN_ON_ONCE(p->scx.cgrp_moving_from); 3649 + 3650 + /* 3651 + * sched_move_task() omits identity migrations. Let's match the 3652 + * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() 3653 + * always match one-to-one. 3654 + */ 3655 + if (from == to) 3656 + continue; 3657 + 3658 + if (SCX_HAS_OP(cgroup_prep_move)) { 3659 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_prep_move, 3660 + p, from, css->cgroup); 3661 + if (ret) 3662 + goto err; 3663 + } 3664 + 3665 + p->scx.cgrp_moving_from = from; 3666 + } 3667 + 3668 + return 0; 3669 + 3670 + err: 3671 + cgroup_taskset_for_each(p, css, tset) { 3672 + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) 3673 + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, 3674 + p->scx.cgrp_moving_from, css->cgroup); 3675 + p->scx.cgrp_moving_from = NULL; 3676 + } 3677 + 3678 + percpu_up_read(&scx_cgroup_rwsem); 3679 + return ops_sanitize_err("cgroup_prep_move", ret); 3680 + } 3681 + 3682 + void scx_move_task(struct task_struct *p) 3683 + { 3684 + if (!scx_enabled()) 3685 + return; 3686 + 3687 + /* 3688 + * We're called from sched_move_task() which handles both cgroup and 3689 + * autogroup moves. Ignore the latter. 
3690 + * 3691 + * Also ignore exiting tasks, because in the exit path tasks transition 3692 + * from the autogroup to the root group, so task_group_is_autogroup() 3693 + * alone isn't able to catch exiting autogroup tasks. This is safe for 3694 + * cgroup_move(), because cgroup migrations never happen for PF_EXITING 3695 + * tasks. 3696 + */ 3697 + if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING)) 3698 + return; 3699 + 3700 + /* 3701 + * @p must have ops.cgroup_prep_move() called on it and thus 3702 + * cgrp_moving_from set. 3703 + */ 3704 + if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) 3705 + SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, 3706 + p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); 3707 + p->scx.cgrp_moving_from = NULL; 3708 + } 3709 + 3710 + void scx_cgroup_finish_attach(void) 3711 + { 3712 + percpu_up_read(&scx_cgroup_rwsem); 3713 + } 3714 + 3715 + void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) 3716 + { 3717 + struct cgroup_subsys_state *css; 3718 + struct task_struct *p; 3719 + 3720 + if (!scx_enabled()) 3721 + goto out_unlock; 3722 + 3723 + cgroup_taskset_for_each(p, css, tset) { 3724 + if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) 3725 + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_cancel_move, p, 3726 + p->scx.cgrp_moving_from, css->cgroup); 3727 + p->scx.cgrp_moving_from = NULL; 3728 + } 3729 + out_unlock: 3730 + percpu_up_read(&scx_cgroup_rwsem); 3731 + } 3732 + 3733 + void scx_group_set_weight(struct task_group *tg, unsigned long weight) 3734 + { 3735 + percpu_down_read(&scx_cgroup_rwsem); 3736 + 3737 + if (tg->scx_weight != weight) { 3738 + if (SCX_HAS_OP(cgroup_set_weight)) 3739 + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, 3740 + tg_cgrp(tg), weight); 3741 + tg->scx_weight = weight; 3742 + } 3743 + 3744 + percpu_up_read(&scx_cgroup_rwsem); 3745 + } 3746 + 3747 + void scx_group_set_idle(struct task_group *tg, bool idle) 3748 + { 3749 + 
percpu_down_read(&scx_cgroup_rwsem); 3750 + scx_cgroup_warn_missing_idle(tg); 3751 + percpu_up_read(&scx_cgroup_rwsem); 3752 + } 3753 + 3754 + static void scx_cgroup_lock(void) 3755 + { 3756 + percpu_down_write(&scx_cgroup_rwsem); 3757 + } 3758 + 3759 + static void scx_cgroup_unlock(void) 3760 + { 3761 + percpu_up_write(&scx_cgroup_rwsem); 3762 + } 3763 + 3764 + #else /* CONFIG_EXT_GROUP_SCHED */ 3765 + 3766 + static inline void scx_cgroup_lock(void) {} 3767 + static inline void scx_cgroup_unlock(void) {} 3768 + 3769 + #endif /* CONFIG_EXT_GROUP_SCHED */ 3770 + 3675 3771 /* 3676 3772 * Omitted operations: 3677 3773 * ··· 4015 3685 out_unlock_rcu: 4016 3686 rcu_read_unlock(); 4017 3687 } 3688 + 3689 + #ifdef CONFIG_EXT_GROUP_SCHED 3690 + static void scx_cgroup_exit(void) 3691 + { 3692 + struct cgroup_subsys_state *css; 3693 + 3694 + percpu_rwsem_assert_held(&scx_cgroup_rwsem); 3695 + 3696 + /* 3697 + * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 3698 + * cgroups and exit all the inited ones, all online cgroups are exited. 3699 + */ 3700 + rcu_read_lock(); 3701 + css_for_each_descendant_post(css, &root_task_group.css) { 3702 + struct task_group *tg = css_tg(css); 3703 + 3704 + if (!(tg->scx_flags & SCX_TG_INITED)) 3705 + continue; 3706 + tg->scx_flags &= ~SCX_TG_INITED; 3707 + 3708 + if (!scx_ops.cgroup_exit) 3709 + continue; 3710 + 3711 + if (WARN_ON_ONCE(!css_tryget(css))) 3712 + continue; 3713 + rcu_read_unlock(); 3714 + 3715 + SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); 3716 + 3717 + rcu_read_lock(); 3718 + css_put(css); 3719 + } 3720 + rcu_read_unlock(); 3721 + } 3722 + 3723 + static int scx_cgroup_init(void) 3724 + { 3725 + struct cgroup_subsys_state *css; 3726 + int ret; 3727 + 3728 + percpu_rwsem_assert_held(&scx_cgroup_rwsem); 3729 + 3730 + cgroup_warned_missing_weight = false; 3731 + cgroup_warned_missing_idle = false; 3732 + 3733 + /* 3734 + * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. 
If we walk 3735 + * cgroups and init, all online cgroups are initialized. 3736 + */ 3737 + rcu_read_lock(); 3738 + css_for_each_descendant_pre(css, &root_task_group.css) { 3739 + struct task_group *tg = css_tg(css); 3740 + struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; 3741 + 3742 + scx_cgroup_warn_missing_weight(tg); 3743 + scx_cgroup_warn_missing_idle(tg); 3744 + 3745 + if ((tg->scx_flags & 3746 + (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) 3747 + continue; 3748 + 3749 + if (!scx_ops.cgroup_init) { 3750 + tg->scx_flags |= SCX_TG_INITED; 3751 + continue; 3752 + } 3753 + 3754 + if (WARN_ON_ONCE(!css_tryget(css))) 3755 + continue; 3756 + rcu_read_unlock(); 3757 + 3758 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3759 + css->cgroup, &args); 3760 + if (ret) { 3761 + css_put(css); 3762 + return ret; 3763 + } 3764 + tg->scx_flags |= SCX_TG_INITED; 3765 + 3766 + rcu_read_lock(); 3767 + css_put(css); 3768 + } 3769 + rcu_read_unlock(); 3770 + 3771 + return 0; 3772 + } 3773 + 3774 + #else 3775 + static void scx_cgroup_exit(void) {} 3776 + static int scx_cgroup_init(void) { return 0; } 3777 + #endif 4018 3778 4019 3779 4020 3780 /******************************************************************************** ··· 4398 3978 WRITE_ONCE(scx_switching_all, false); 4399 3979 4400 3980 /* 4401 - * Avoid racing against fork. See scx_ops_enable() for explanation on 4402 - * the locking order. 3981 + * Avoid racing against fork and cgroup changes. See scx_ops_enable() 3982 + * for explanation on the locking order. 
4403 3983 */ 4404 3984 percpu_down_write(&scx_fork_rwsem); 4405 3985 cpus_read_lock(); 3986 + scx_cgroup_lock(); 4406 3987 4407 3988 spin_lock_irq(&scx_tasks_lock); 4408 3989 scx_task_iter_init(&sti); ··· 4439 4018 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 4440 4019 synchronize_rcu(); 4441 4020 4021 + scx_cgroup_exit(); 4022 + 4023 + scx_cgroup_unlock(); 4442 4024 cpus_read_unlock(); 4443 4025 percpu_up_write(&scx_fork_rwsem); 4444 4026 ··· 4998 4574 scx_watchdog_timeout / 2); 4999 4575 5000 4576 /* 5001 - * Lock out forks before opening the floodgate so that they don't wander 5002 - * into the operations prematurely. 4577 + * Lock out forks, cgroup on/offlining and moves before opening the 4578 + * floodgate so that they don't wander into the operations prematurely. 5003 4579 * 5004 - * We don't need to keep the CPUs stable but grab cpus_read_lock() to 5005 - * ease future locking changes for cgroup suport. 4580 + * We don't need to keep the CPUs stable but static_branch_*() requires 4581 + * cpus_read_lock() and scx_cgroup_rwsem must nest inside 4582 + * cpu_hotplug_lock because of the following dependency chain: 4583 + * 4584 + * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem 4585 + * 4586 + * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use 4587 + * static_branch_*_cpuslocked(). 5006 4588 * 5007 4589 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 5008 4590 * following dependency chain: ··· 5017 4587 */ 5018 4588 percpu_down_write(&scx_fork_rwsem); 5019 4589 cpus_read_lock(); 4590 + scx_cgroup_lock(); 5020 4591 5021 4592 check_hotplug_seq(ops); 5022 4593 ··· 5039 4608 } else { 5040 4609 static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 5041 4610 } 4611 + 4612 + /* 4613 + * All cgroups should be initialized before letting in tasks. cgroup 4614 + * on/offlining and task migrations are already locked out. 
4615 + */ 4616 + ret = scx_cgroup_init(); 4617 + if (ret) 4618 + goto err_disable_unlock_all; 5042 4619 5043 4620 static_branch_enable_cpuslocked(&__scx_ops_enabled); 5044 4621 ··· 5139 4700 5140 4701 spin_unlock_irq(&scx_tasks_lock); 5141 4702 preempt_enable(); 4703 + scx_cgroup_unlock(); 5142 4704 cpus_read_unlock(); 5143 4705 percpu_up_write(&scx_fork_rwsem); 5144 4706 ··· 5174 4734 return ret; 5175 4735 5176 4736 err_disable_unlock_all: 4737 + scx_cgroup_unlock(); 5177 4738 percpu_up_write(&scx_fork_rwsem); 5178 4739 err_disable_unlock_cpus: 5179 4740 cpus_read_unlock(); ··· 5369 4928 5370 4929 switch (moff) { 5371 4930 case offsetof(struct sched_ext_ops, init_task): 4931 + #ifdef CONFIG_EXT_GROUP_SCHED 4932 + case offsetof(struct sched_ext_ops, cgroup_init): 4933 + case offsetof(struct sched_ext_ops, cgroup_exit): 4934 + case offsetof(struct sched_ext_ops, cgroup_prep_move): 4935 + #endif 5372 4936 case offsetof(struct sched_ext_ops, cpu_online): 5373 4937 case offsetof(struct sched_ext_ops, cpu_offline): 5374 4938 case offsetof(struct sched_ext_ops, init): ··· 5448 5002 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} 5449 5003 static void enable_stub(struct task_struct *p) {} 5450 5004 static void disable_stub(struct task_struct *p) {} 5005 + #ifdef CONFIG_EXT_GROUP_SCHED 5006 + static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } 5007 + static void cgroup_exit_stub(struct cgroup *cgrp) {} 5008 + static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } 5009 + static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 5010 + static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} 5011 + static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} 5012 + #endif 5451 5013 static void cpu_online_stub(s32 cpu) {} 5452 5014 
static void cpu_offline_stub(s32 cpu) {} 5453 5015 static s32 init_stub(void) { return -EINVAL; } ··· 5485 5031 .exit_task = exit_task_stub, 5486 5032 .enable = enable_stub, 5487 5033 .disable = disable_stub, 5034 + #ifdef CONFIG_EXT_GROUP_SCHED 5035 + .cgroup_init = cgroup_init_stub, 5036 + .cgroup_exit = cgroup_exit_stub, 5037 + .cgroup_prep_move = cgroup_prep_move_stub, 5038 + .cgroup_move = cgroup_move_stub, 5039 + .cgroup_cancel_move = cgroup_cancel_move_stub, 5040 + .cgroup_set_weight = cgroup_set_weight_stub, 5041 + #endif 5488 5042 .cpu_online = cpu_online_stub, 5489 5043 .cpu_offline = cpu_offline_stub, 5490 5044 .init = init_stub, ··· 5742 5280 * definitions so that BPF scheduler implementations can use them 5743 5281 * through the generated vmlinux.h. 5744 5282 */ 5745 - WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT); 5283 + WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | 5284 + SCX_TG_ONLINE); 5746 5285 5747 5286 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 5748 5287 init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); ··· 6803 6340 return cpu_rq(cpu); 6804 6341 } 6805 6342 6343 + /** 6344 + * scx_bpf_task_cgroup - Return the sched cgroup of a task 6345 + * @p: task of interest 6346 + * 6347 + * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with 6348 + * from the scheduler's POV. SCX operations should use this function to 6349 + * determine @p's current cgroup as, unlike following @p->cgroups, 6350 + * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all 6351 + * rq-locked operations. Can be called on the parameter tasks of rq-locked 6352 + * operations. The restriction guarantees that @p's rq is locked by the caller. 
6353 + */ 6354 + #ifdef CONFIG_CGROUP_SCHED 6355 + __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) 6356 + { 6357 + struct task_group *tg = p->sched_task_group; 6358 + struct cgroup *cgrp = &cgrp_dfl_root.cgrp; 6359 + 6360 + if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) 6361 + goto out; 6362 + 6363 + /* 6364 + * A task_group may either be a cgroup or an autogroup. In the latter 6365 + * case, @tg->css.cgroup is %NULL. A task_group can't become the other 6366 + * kind once created. 6367 + */ 6368 + if (tg && tg->css.cgroup) 6369 + cgrp = tg->css.cgroup; 6370 + else 6371 + cgrp = &cgrp_dfl_root.cgrp; 6372 + out: 6373 + cgroup_get(cgrp); 6374 + return cgrp; 6375 + } 6376 + #endif 6377 + 6806 6378 __bpf_kfunc_end_defs(); 6807 6379 6808 6380 BTF_KFUNCS_START(scx_kfunc_ids_any) ··· 6866 6368 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) 6867 6369 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) 6868 6370 BTF_ID_FLAGS(func, scx_bpf_cpu_rq) 6371 + #ifdef CONFIG_CGROUP_SCHED 6372 + BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) 6373 + #endif 6869 6374 BTF_KFUNCS_END(scx_kfunc_ids_any) 6870 6375 6871 6376 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
+22
kernel/sched/ext.h
··· 67 67 #else 68 68 static inline void scx_update_idle(struct rq *rq, bool idle) {} 69 69 #endif 70 + 71 + #ifdef CONFIG_CGROUP_SCHED 72 + #ifdef CONFIG_EXT_GROUP_SCHED 73 + int scx_tg_online(struct task_group *tg); 74 + void scx_tg_offline(struct task_group *tg); 75 + int scx_cgroup_can_attach(struct cgroup_taskset *tset); 76 + void scx_move_task(struct task_struct *p); 77 + void scx_cgroup_finish_attach(void); 78 + void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); 79 + void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); 80 + void scx_group_set_idle(struct task_group *tg, bool idle); 81 + #else /* CONFIG_EXT_GROUP_SCHED */ 82 + static inline int scx_tg_online(struct task_group *tg) { return 0; } 83 + static inline void scx_tg_offline(struct task_group *tg) {} 84 + static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } 85 + static inline void scx_move_task(struct task_struct *p) {} 86 + static inline void scx_cgroup_finish_attach(void) {} 87 + static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} 88 + static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} 89 + static inline void scx_group_set_idle(struct task_group *tg, bool idle) {} 90 + #endif /* CONFIG_EXT_GROUP_SCHED */ 91 + #endif /* CONFIG_CGROUP_SCHED */
+5
kernel/sched/sched.h
··· 459 459 struct rt_bandwidth rt_bandwidth; 460 460 #endif 461 461 462 + #ifdef CONFIG_EXT_GROUP_SCHED 463 + u32 scx_flags; /* SCX_TG_* */ 464 + u32 scx_weight; 465 + #endif 466 + 462 467 struct rcu_head rcu; 463 468 struct list_head list; 464 469
+1
tools/sched_ext/include/scx/common.bpf.h
··· 61 61 bool scx_bpf_task_running(const struct task_struct *p) __ksym; 62 62 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 63 63 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 64 + struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; 64 65 65 66 static inline __attribute__((format(printf, 1, 2))) 66 67 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
+32
tools/testing/selftests/sched_ext/maximal.bpf.c
··· 95 95 void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) 96 96 {} 97 97 98 + s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, 99 + struct scx_cgroup_init_args *args) 100 + { 101 + return 0; 102 + } 103 + 104 + void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) 105 + {} 106 + 107 + s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, 108 + struct cgroup *from, struct cgroup *to) 109 + { 110 + return 0; 111 + } 112 + 113 + void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, 114 + struct cgroup *from, struct cgroup *to) 115 + {} 116 + 117 + void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, 118 + struct cgroup *from, struct cgroup *to) 119 + {} 120 + 121 + void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) 122 + {} 123 + 98 124 s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) 99 125 { 100 126 return 0; ··· 152 126 .enable = maximal_enable, 153 127 .exit_task = maximal_exit_task, 154 128 .disable = maximal_disable, 129 + .cgroup_init = maximal_cgroup_init, 130 + .cgroup_exit = maximal_cgroup_exit, 131 + .cgroup_prep_move = maximal_cgroup_prep_move, 132 + .cgroup_move = maximal_cgroup_move, 133 + .cgroup_cancel_move = maximal_cgroup_cancel_move, 134 + .cgroup_set_weight = maximal_cgroup_set_weight, 155 135 .init = maximal_init, 156 136 .exit = maximal_exit, 157 137 .name = "maximal",