Merge tag 'sched_ext-for-6.12-rc1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+172 -148

kernel/sched/ext.c

··· 9 9 #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 10 10 11 11 enum scx_consts { 12 + SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, 12 13 SCX_DSP_DFL_MAX_BATCH = 32, 13 14 SCX_DSP_MAX_LOOPS = 32, 14 15 SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, ··· 779 778 }; 780 779 781 780 enum scx_ops_enable_state { 782 - SCX_OPS_PREPPING, 783 781 SCX_OPS_ENABLING, 784 782 SCX_OPS_ENABLED, 785 783 SCX_OPS_DISABLING, ··· 786 786 }; 787 787 788 788 static const char *scx_ops_enable_state_str[] = { 789 - [SCX_OPS_PREPPING] = "prepping", 790 789 [SCX_OPS_ENABLING] = "enabling", 791 790 [SCX_OPS_ENABLED] = "enabled", 792 791 [SCX_OPS_DISABLING] = "disabling", ··· 853 854 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 854 855 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); 855 856 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); 857 + static bool scx_ops_init_task_enabled; 856 858 static bool scx_switching_all; 857 859 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 858 860 ··· 925 925 */ 926 926 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 927 927 928 - /* dispatch queues */ 929 - static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; 928 + /* 929 + * Dispatch queues. 930 + * 931 + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is 932 + * to avoid live-locking in bypass mode where all tasks are dispatched to 933 + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't 934 + * sufficient, it can be further split. 935 + */ 936 + static struct scx_dispatch_q **global_dsqs; 930 937 931 938 static const struct rhashtable_params dsq_hash_params = { 932 939 .key_len = 8, ··· 1034 1027 static bool u32_before(u32 a, u32 b) 1035 1028 { 1036 1029 return (s32)(a - b) < 0; 1030 + } 1031 + 1032 + static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) 1033 + { 1034 + return global_dsqs[cpu_to_node(task_cpu(p))]; 1035 + } 1036 + 1037 + static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1038 + { 1039 + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1037 1040 } 1038 1041 1039 1042 /* ··· 1654 1637 scx_ops_error("attempting to dispatch to a destroyed dsq"); 1655 1638 /* fall back to the global dsq */ 1656 1639 raw_spin_unlock(&dsq->lock); 1657 - dsq = &scx_dsq_global; 1640 + dsq = find_global_dsq(p); 1658 1641 raw_spin_lock(&dsq->lock); 1659 1642 } 1660 1643 } ··· 1820 1803 raw_spin_unlock(&dsq->lock); 1821 1804 } 1822 1805 1823 - static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1824 - { 1825 - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1826 - } 1827 - 1828 - static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) 1829 - { 1830 - lockdep_assert(rcu_read_lock_any_held()); 1831 - 1832 - if (dsq_id == SCX_DSQ_GLOBAL) 1833 - return &scx_dsq_global; 1834 - else 1835 - return find_user_dsq(dsq_id); 1836 - } 1837 - 1838 1806 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, 1839 1807 struct task_struct *p) 1840 1808 { ··· 1832 1830 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1833 1831 1834 1832 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1835 - return &scx_dsq_global; 1833 + return find_global_dsq(p); 1836 1834 1837 1835 return &cpu_rq(cpu)->scx.local_dsq; 1838 1836 } 1839 1837 1840 - dsq = find_non_local_dsq(dsq_id); 1838 + if (dsq_id == SCX_DSQ_GLOBAL) 1839 + dsq = find_global_dsq(p); 1840 + else 1841 + dsq = find_user_dsq(dsq_id); 1842 + 1841 1843 if (unlikely(!dsq)) { 1842 1844 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", 1843 1845 dsq_id, p->comm, p->pid); 1844 - return &scx_dsq_global; 1846 + return find_global_dsq(p); 1845 1847 } 1846 1848 1847 1849 return dsq; ··· 1944 1938 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1945 1939 int sticky_cpu) 1946 1940 { 1941 + bool bypassing = scx_rq_bypassing(rq); 1947 1942 struct task_struct **ddsp_taskp; 1948 1943 unsigned long qseq; 1949 1944 ··· 1962 1955 if (!scx_rq_online(rq)) 1963 1956 goto local; 1964 1957 1965 - if (scx_rq_bypassing(rq)) 1958 + if (bypassing) 1966 1959 goto global; 1967 1960 1968 1961 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) ··· 2017 2010 2018 2011 global: 2019 2012 touch_core_sched(rq, p); /* see the comment in local: */ 2020 - p->scx.slice = SCX_SLICE_DFL; 2021 - dispatch_enqueue(&scx_dsq_global, p, enq_flags); 2013 + p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; 2014 + dispatch_enqueue(find_global_dsq(p), p, enq_flags); 2022 2015 } 2023 2016 2024 2017 static bool task_runnable(const struct task_struct *p) ··· 2364 2357 } 2365 2358 } 2366 2359 #else /* CONFIG_SMP */ 2360 + static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } 2367 2361 static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } 2368 2362 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } 2369 2363 #endif /* CONFIG_SMP */ ··· 2404 2396 return false; 2405 2397 } 2406 2398 2399 + static bool consume_global_dsq(struct rq *rq) 2400 + { 2401 + int node = cpu_to_node(cpu_of(rq)); 2402 + 2403 + return consume_dispatch_q(rq, global_dsqs[node]); 2404 + } 2405 + 2407 2406 /** 2408 2407 * dispatch_to_local_dsq - Dispatch a task to a local dsq 2409 2408 * @rq: current rq which is locked ··· 2444 2429 2445 2430 #ifdef CONFIG_SMP 2446 2431 if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { 2447 - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 2432 + dispatch_enqueue(find_global_dsq(p), p, 2433 + enq_flags | SCX_ENQ_CLEAR_OPSS); 2448 2434 return; 2449 2435 } 2450 2436 ··· 2645 2629 if (rq->scx.local_dsq.nr) 2646 2630 goto has_tasks; 2647 2631 2648 - if (consume_dispatch_q(rq, &scx_dsq_global)) 2632 + if (consume_global_dsq(rq)) 2649 2633 goto has_tasks; 2650 2634 2651 2635 if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) ··· 2670 2654 2671 2655 if (rq->scx.local_dsq.nr) 2672 2656 goto has_tasks; 2673 - if (consume_dispatch_q(rq, &scx_dsq_global)) 2657 + if (consume_global_dsq(rq)) 2674 2658 goto has_tasks; 2675 2659 2676 2660 /* ··· 3074 3058 * there is an idle core elsewhere on the system. 3075 3059 */ 3076 3060 cpu = smp_processor_id(); 3077 - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && 3061 + if ((wake_flags & SCX_WAKE_SYNC) && 3078 3062 !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && 3079 3063 cpu_rq(cpu)->scx.local_dsq.nr == 0) { 3080 3064 if (cpumask_test_cpu(cpu, p->cpus_ptr)) 3081 3065 goto cpu_found; 3082 - } 3083 - 3084 - if (p->nr_cpus_allowed == 1) { 3085 - if (test_and_clear_cpu_idle(prev_cpu)) { 3086 - cpu = prev_cpu; 3087 - goto cpu_found; 3088 - } else { 3089 - return prev_cpu; 3090 - } 3091 3066 } 3092 3067 3093 3068 /* ··· 3557 3550 { 3558 3551 percpu_rwsem_assert_held(&scx_fork_rwsem); 3559 3552 3560 - if (scx_enabled()) 3553 + if (scx_ops_init_task_enabled) 3561 3554 return scx_ops_init_task(p, task_group(p), true); 3562 3555 else 3563 3556 return 0; ··· 3565 3558 3566 3559 void scx_post_fork(struct task_struct *p) 3567 3560 { 3568 - if (scx_enabled()) { 3561 + if (scx_ops_init_task_enabled) { 3569 3562 scx_set_task_state(p, SCX_TASK_READY); 3570 3563 3571 3564 /* ··· 3697 3690 #ifdef CONFIG_EXT_GROUP_SCHED 3698 3691 3699 3692 DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); 3693 + static bool scx_cgroup_enabled; 3700 3694 static bool cgroup_warned_missing_weight; 3701 3695 static bool cgroup_warned_missing_idle; 3702 3696 ··· 3717 3709 3718 3710 static void scx_cgroup_warn_missing_idle(struct task_group *tg) 3719 3711 { 3720 - if (scx_ops_enable_state() == SCX_OPS_DISABLED || 3721 - cgroup_warned_missing_idle) 3712 + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) 3722 3713 return; 3723 3714 3724 3715 if (!tg->idle) ··· 3738 3731 3739 3732 scx_cgroup_warn_missing_weight(tg); 3740 3733 3741 - if (SCX_HAS_OP(cgroup_init)) { 3742 - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; 3734 + if (scx_cgroup_enabled) { 3735 + if (SCX_HAS_OP(cgroup_init)) { 3736 + struct scx_cgroup_init_args args = 3737 + { .weight = tg->scx_weight }; 3743 3738 3744 - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3745 - tg->css.cgroup, &args); 3746 - if (!ret) 3739 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3740 + tg->css.cgroup, &args); 3741 + if (ret) 3742 + ret = ops_sanitize_err("cgroup_init", ret); 3743 + } 3744 + if (ret == 0) 3747 3745 tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; 3748 - else 3749 - ret = ops_sanitize_err("cgroup_init", ret); 3750 3746 } else { 3751 3747 tg->scx_flags |= SCX_TG_ONLINE; 3752 3748 } ··· 3780 3770 /* released in scx_finish/cancel_attach() */ 3781 3771 percpu_down_read(&scx_cgroup_rwsem); 3782 3772 3783 - if (!scx_enabled()) 3773 + if (!scx_cgroup_enabled) 3784 3774 return 0; 3785 3775 3786 3776 cgroup_taskset_for_each(p, css, tset) { ··· 3823 3813 3824 3814 void scx_move_task(struct task_struct *p) 3825 3815 { 3826 - if (!scx_enabled()) 3816 + if (!scx_cgroup_enabled) 3827 3817 return; 3828 3818 3829 3819 /* ··· 3859 3849 struct cgroup_subsys_state *css; 3860 3850 struct task_struct *p; 3861 3851 3862 - if (!scx_enabled()) 3852 + if (!scx_cgroup_enabled) 3863 3853 goto out_unlock; 3864 3854 3865 3855 cgroup_taskset_for_each(p, css, tset) { ··· 3876 3866 { 3877 3867 percpu_down_read(&scx_cgroup_rwsem); 3878 3868 3879 - if (tg->scx_weight != weight) { 3869 + if (scx_cgroup_enabled && tg->scx_weight != weight) { 3880 3870 if (SCX_HAS_OP(cgroup_set_weight)) 3881 3871 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, 3882 3872 tg_cgrp(tg), weight); ··· 4048 4038 4049 4039 percpu_rwsem_assert_held(&scx_cgroup_rwsem); 4050 4040 4041 + WARN_ON_ONCE(!scx_cgroup_enabled); 4042 + scx_cgroup_enabled = false; 4043 + 4051 4044 /* 4052 4045 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4053 4046 * cgroups and exit all the inited ones, all online cgroups are exited. ··· 4125 4112 css_put(css); 4126 4113 } 4127 4114 rcu_read_unlock(); 4115 + 4116 + WARN_ON_ONCE(scx_cgroup_enabled); 4117 + scx_cgroup_enabled = true; 4128 4118 4129 4119 return 0; 4130 4120 } ··· 4447 4431 WRITE_ONCE(scx_switching_all, false); 4448 4432 4449 4433 /* 4450 - * Avoid racing against fork and cgroup changes. See scx_ops_enable() 4451 - * for explanation on the locking order. 4434 + * Shut down cgroup support before tasks so that the cgroup attach path 4435 + * doesn't race against scx_ops_exit_task(). 4452 4436 */ 4453 - percpu_down_write(&scx_fork_rwsem); 4454 - cpus_read_lock(); 4455 4437 scx_cgroup_lock(); 4438 + scx_cgroup_exit(); 4439 + scx_cgroup_unlock(); 4456 4440 4457 - spin_lock_irq(&scx_tasks_lock); 4458 - scx_task_iter_init(&sti); 4459 4441 /* 4460 4442 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones 4461 4443 * must be switched out and exited synchronously. 4462 4444 */ 4445 + percpu_down_write(&scx_fork_rwsem); 4446 + 4447 + scx_ops_init_task_enabled = false; 4448 + 4449 + spin_lock_irq(&scx_tasks_lock); 4450 + scx_task_iter_init(&sti); 4463 4451 while ((p = scx_task_iter_next_locked(&sti))) { 4464 4452 const struct sched_class *old_class = p->sched_class; 4465 4453 struct sched_enq_and_set_ctx ctx; ··· 4481 4461 } 4482 4462 scx_task_iter_exit(&sti); 4483 4463 spin_unlock_irq(&scx_tasks_lock); 4464 + percpu_up_write(&scx_fork_rwsem); 4484 4465 4485 4466 /* no task is on scx, turn off all the switches and flush in-progress calls */ 4486 - static_branch_disable_cpuslocked(&__scx_ops_enabled); 4467 + static_branch_disable(&__scx_ops_enabled); 4487 4468 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 4488 - static_branch_disable_cpuslocked(&scx_has_op[i]); 4489 - static_branch_disable_cpuslocked(&scx_ops_enq_last); 4490 - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); 4491 - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); 4492 - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 4469 + static_branch_disable(&scx_has_op[i]); 4470 + static_branch_disable(&scx_ops_enq_last); 4471 + static_branch_disable(&scx_ops_enq_exiting); 4472 + static_branch_disable(&scx_ops_cpu_preempt); 4473 + static_branch_disable(&scx_builtin_idle_enabled); 4493 4474 synchronize_rcu(); 4494 - 4495 - scx_cgroup_exit(); 4496 - 4497 - scx_cgroup_unlock(); 4498 - cpus_read_unlock(); 4499 - percpu_up_write(&scx_fork_rwsem); 4500 4475 4501 4476 if (ei->kind >= SCX_EXIT_ERROR) { 4502 4477 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ··· 4944 4929 struct scx_task_iter sti; 4945 4930 struct task_struct *p; 4946 4931 unsigned long timeout; 4947 - int i, cpu, ret; 4932 + int i, cpu, node, ret; 4948 4933 4949 4934 if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 4950 4935 cpu_possible_mask)) { ··· 4961 4946 ret = -ENOMEM; 4962 4947 goto err_unlock; 4963 4948 } 4949 + } 4950 + 4951 + if (!global_dsqs) { 4952 + struct scx_dispatch_q **dsqs; 4953 + 4954 + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); 4955 + if (!dsqs) { 4956 + ret = -ENOMEM; 4957 + goto err_unlock; 4958 + } 4959 + 4960 + for_each_node_state(node, N_POSSIBLE) { 4961 + struct scx_dispatch_q *dsq; 4962 + 4963 + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); 4964 + if (!dsq) { 4965 + for_each_node_state(node, N_POSSIBLE) 4966 + kfree(dsqs[node]); 4967 + kfree(dsqs); 4968 + ret = -ENOMEM; 4969 + goto err_unlock; 4970 + } 4971 + 4972 + init_dsq(dsq, SCX_DSQ_GLOBAL); 4973 + dsqs[node] = dsq; 4974 + } 4975 + 4976 + global_dsqs = dsqs; 4964 4977 } 4965 4978 4966 4979 if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ··· 5014 4971 } 5015 4972 5016 4973 /* 5017 - * Set scx_ops, transition to PREPPING and clear exit info to arm the 4974 + * Set scx_ops, transition to ENABLING and clear exit info to arm the 5018 4975 * disable path. Failure triggers full disabling from here on. 5019 4976 */ 5020 4977 scx_ops = *ops; 5021 4978 5022 - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != 4979 + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != 5023 4980 SCX_OPS_DISABLED); 5024 4981 5025 4982 atomic_set(&scx_exit_kind, SCX_EXIT_NONE); ··· 5040 4997 ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); 5041 4998 if (ret) { 5042 4999 ret = ops_sanitize_err("init", ret); 5043 - goto err_disable_unlock_cpus; 5000 + cpus_read_unlock(); 5001 + goto err_disable; 5044 5002 } 5045 5003 } 5046 5004 ··· 5049 5005 if (((void (**)(void))ops)[i]) 5050 5006 static_branch_enable_cpuslocked(&scx_has_op[i]); 5051 5007 5008 + check_hotplug_seq(ops); 5052 5009 cpus_read_unlock(); 5053 5010 5054 5011 ret = validate_ops(ops); ··· 5077 5032 scx_watchdog_timeout / 2); 5078 5033 5079 5034 /* 5080 - * Lock out forks, cgroup on/offlining and moves before opening the 5081 - * floodgate so that they don't wander into the operations prematurely. 5082 - * 5083 - * We don't need to keep the CPUs stable but static_branch_*() requires 5084 - * cpus_read_lock() and scx_cgroup_rwsem must nest inside 5085 - * cpu_hotplug_lock because of the following dependency chain: 5086 - * 5087 - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem 5088 - * 5089 - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use 5090 - * static_branch_*_cpuslocked(). 5091 - * 5092 - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 5093 - * following dependency chain: 5094 - * 5095 - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock 5035 + * Once __scx_ops_enabled is set, %current can be switched to SCX 5036 + * anytime. This can lead to stalls as some BPF schedulers (e.g. 5037 + * userspace scheduling) may not function correctly before all tasks are 5038 + * switched. Init in bypass mode to guarantee forward progress. 5096 5039 */ 5097 - percpu_down_write(&scx_fork_rwsem); 5098 - cpus_read_lock(); 5099 - scx_cgroup_lock(); 5100 - 5101 - check_hotplug_seq(ops); 5040 + scx_ops_bypass(true); 5102 5041 5103 5042 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 5104 5043 if (((void (**)(void))ops)[i]) 5105 - static_branch_enable_cpuslocked(&scx_has_op[i]); 5044 + static_branch_enable(&scx_has_op[i]); 5106 5045 5107 5046 if (ops->flags & SCX_OPS_ENQ_LAST) 5108 - static_branch_enable_cpuslocked(&scx_ops_enq_last); 5047 + static_branch_enable(&scx_ops_enq_last); 5109 5048 5110 5049 if (ops->flags & SCX_OPS_ENQ_EXITING) 5111 - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); 5050 + static_branch_enable(&scx_ops_enq_exiting); 5112 5051 if (scx_ops.cpu_acquire || scx_ops.cpu_release) 5113 - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); 5052 + static_branch_enable(&scx_ops_cpu_preempt); 5114 5053 5115 5054 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 5116 5055 reset_idle_masks(); 5117 - static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); 5056 + static_branch_enable(&scx_builtin_idle_enabled); 5118 5057 } else { 5119 - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 5058 + static_branch_disable(&scx_builtin_idle_enabled); 5120 5059 } 5121 5060 5122 5061 /* 5123 - * All cgroups should be initialized before letting in tasks. cgroup 5124 - * on/offlining and task migrations are already locked out. 5062 + * Lock out forks, cgroup on/offlining and moves before opening the 5063 + * floodgate so that they don't wander into the operations prematurely. 5125 5064 */ 5126 - ret = scx_cgroup_init(); 5127 - if (ret) 5128 - goto err_disable_unlock_all; 5065 + percpu_down_write(&scx_fork_rwsem); 5129 5066 5130 - static_branch_enable_cpuslocked(&__scx_ops_enabled); 5067 + WARN_ON_ONCE(scx_ops_init_task_enabled); 5068 + scx_ops_init_task_enabled = true; 5131 5069 5132 5070 /* 5133 5071 * Enable ops for every task. Fork is excluded by scx_fork_rwsem ··· 5118 5090 * leaving as sched_ext_free() can handle both prepped and enabled 5119 5091 * tasks. Prep all tasks first and then enable them with preemption 5120 5092 * disabled. 5093 + * 5094 + * All cgroups should be initialized before scx_ops_init_task() so that 5095 + * the BPF scheduler can reliably track each task's cgroup membership 5096 + * from scx_ops_init_task(). Lock out cgroup on/offlining and task 5097 + * migrations while tasks are being initialized so that 5098 + * scx_cgroup_can_attach() never sees uninitialized tasks. 5121 5099 */ 5122 - spin_lock_irq(&scx_tasks_lock); 5100 + scx_cgroup_lock(); 5101 + ret = scx_cgroup_init(); 5102 + if (ret) 5103 + goto err_disable_unlock_all; 5123 5104 5105 + spin_lock_irq(&scx_tasks_lock); 5124 5106 scx_task_iter_init(&sti); 5125 5107 while ((p = scx_task_iter_next_locked(&sti))) { 5126 5108 /* ··· 5155 5117 goto err_disable_unlock_all; 5156 5118 } 5157 5119 5120 + scx_set_task_state(p, SCX_TASK_READY); 5121 + 5158 5122 put_task_struct(p); 5159 5123 spin_lock_irq(&scx_tasks_lock); 5160 5124 } 5161 5125 scx_task_iter_exit(&sti); 5126 + spin_unlock_irq(&scx_tasks_lock); 5127 + scx_cgroup_unlock(); 5128 + percpu_up_write(&scx_fork_rwsem); 5162 5129 5163 5130 /* 5164 - * All tasks are prepped but are still ops-disabled. Ensure that 5165 - * %current can't be scheduled out and switch everyone. 5166 - * preempt_disable() is necessary because we can't guarantee that 5167 - * %current won't be starved if scheduled out while switching. 5131 + * All tasks are READY. It's safe to turn on scx_enabled() and switch 5132 + * all eligible tasks. 5168 5133 */ 5169 - preempt_disable(); 5134 + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 5135 + static_branch_enable(&__scx_ops_enabled); 5170 5136 5171 5137 /* 5172 - * From here on, the disable path must assume that tasks have ops 5173 - * enabled and need to be recovered. 5174 - * 5175 - * Transition to ENABLING fails iff the BPF scheduler has already 5176 - * triggered scx_bpf_error(). Returning an error code here would lose 5177 - * the recorded error information. Exit indicating success so that the 5178 - * error is notified through ops.exit() with all the details. 5179 - */ 5180 - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { 5181 - preempt_enable(); 5182 - spin_unlock_irq(&scx_tasks_lock); 5183 - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 5184 - ret = 0; 5185 - goto err_disable_unlock_all; 5186 - } 5187 - 5188 - /* 5189 - * We're fully committed and can't fail. The PREPPED -> ENABLED 5138 + * We're fully committed and can't fail. The task READY -> ENABLED 5190 5139 * transitions here are synchronized against sched_ext_free() through 5191 5140 * scx_tasks_lock. 5192 5141 */ 5193 - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 5194 - 5142 + percpu_down_write(&scx_fork_rwsem); 5143 + spin_lock_irq(&scx_tasks_lock); 5195 5144 scx_task_iter_init(&sti); 5196 5145 while ((p = scx_task_iter_next_locked(&sti))) { 5197 5146 const struct sched_class *old_class = p->sched_class; ··· 5186 5161 5187 5162 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5188 5163 5189 - scx_set_task_state(p, SCX_TASK_READY); 5190 5164 __setscheduler_prio(p, p->prio); 5191 5165 check_class_changing(task_rq(p), p, old_class); 5192 5166 ··· 5194 5170 check_class_changed(task_rq(p), p, old_class, p->prio); 5195 5171 } 5196 5172 scx_task_iter_exit(&sti); 5197 - 5198 5173 spin_unlock_irq(&scx_tasks_lock); 5199 - preempt_enable(); 5200 - scx_cgroup_unlock(); 5201 - cpus_read_unlock(); 5202 5174 percpu_up_write(&scx_fork_rwsem); 5203 5175 5204 - /* see above ENABLING transition for the explanation on exiting with 0 */ 5176 + scx_ops_bypass(false); 5177 + 5178 + /* 5179 + * Returning an error code here would lose the recorded error 5180 + * information. Exit indicating success so that the error is notified 5181 + * through ops.exit() with all the details. 5182 + */ 5205 5183 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 5206 5184 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 5207 5185 ret = 0; ··· 5238 5212 err_disable_unlock_all: 5239 5213 scx_cgroup_unlock(); 5240 5214 percpu_up_write(&scx_fork_rwsem); 5241 - err_disable_unlock_cpus: 5242 - cpus_read_unlock(); 5215 + scx_ops_bypass(false); 5243 5216 err_disable: 5244 5217 mutex_unlock(&scx_ops_enable_mutex); 5245 5218 /* must be fully disabled before returning */ ··· 5807 5782 SCX_TG_ONLINE); 5808 5783 5809 5784 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 5810 - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); 5811 5785 #ifdef CONFIG_SMP 5812 5786 BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 5813 5787 BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); ··· 6082 6058 if (dst_dsq->id == SCX_DSQ_LOCAL) { 6083 6059 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 6084 6060 if (!task_can_run_on_remote_rq(p, dst_rq, true)) { 6085 - dst_dsq = &scx_dsq_global; 6061 + dst_dsq = find_global_dsq(p); 6086 6062 dst_rq = src_rq; 6087 6063 } 6088 6064 } else { ··· 6199 6175 6200 6176 flush_dispatch_buf(dspc->rq); 6201 6177 6202 - dsq = find_non_local_dsq(dsq_id); 6178 + dsq = find_user_dsq(dsq_id); 6203 6179 if (unlikely(!dsq)) { 6204 6180 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); 6205 6181 return false; ··· 6520 6496 goto out; 6521 6497 } 6522 6498 } else { 6523 - dsq = find_non_local_dsq(dsq_id); 6499 + dsq = find_user_dsq(dsq_id); 6524 6500 if (dsq) { 6525 6501 ret = READ_ONCE(dsq->nr); 6526 6502 goto out; ··· 6569 6545 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 6570 6546 return -EINVAL; 6571 6547 6572 - kit->dsq = find_non_local_dsq(dsq_id); 6548 + kit->dsq = find_user_dsq(dsq_id); 6573 6549 if (!kit->dsq) 6574 6550 return -ENOENT; 6575 6551

+15

tools/sched_ext/include/scx/common.bpf.h

··· 7 7 #ifndef __SCX_COMMON_BPF_H 8 8 #define __SCX_COMMON_BPF_H 9 9 10 + #ifdef LSP 11 + #define __bpf__ 12 + #include "../vmlinux/vmlinux.h" 13 + #else 10 14 #include "vmlinux.h" 15 + #endif 16 + 11 17 #include <bpf/bpf_helpers.h> 12 18 #include <bpf/bpf_tracing.h> 13 19 #include <asm-generic/errno.h> ··· 315 309 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; 316 310 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, 317 311 const struct cpumask *src2) __ksym; 312 + u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; 313 + 314 + /* 315 + * Access a cpumask in read-only mode (typically to check bits). 316 + */ 317 + const struct cpumask *cast_mask(struct bpf_cpumask *mask) 318 + { 319 + return (const struct cpumask *)mask; 320 + } 318 321 319 322 /* rcu */ 320 323 void bpf_rcu_read_lock(void) __ksym;

+19

tools/sched_ext/include/scx/compat.bpf.h

··· 15 15 __ret; \ 16 16 }) 17 17 18 + /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ 19 + #define __COMPAT_scx_bpf_task_cgroup(p) \ 20 + (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ 21 + scx_bpf_task_cgroup((p)) : NULL) 22 + 23 + /* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ 24 + #define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ 25 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ 26 + scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) 27 + #define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ 28 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ 29 + scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) 30 + #define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ 31 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ 32 + scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) 33 + #define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ 34 + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ 35 + scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) 36 + 18 37 /* 19 38 * Define sched_ext_ops. This may be expanded to define multiple variants for 20 39 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().

+4

tools/sched_ext/include/scx/user_exit_info.h

··· 25 25 26 26 #ifdef __bpf__ 27 27 28 + #ifdef LSP 29 + #include "../vmlinux/vmlinux.h" 30 + #else 28 31 #include "vmlinux.h" 32 + #endif 29 33 #include <bpf/bpf_core_read.h> 30 34 31 35 #define UEI_DEFINE(__name) \

+20 -12

tools/sched_ext/scx_flatcg.bpf.c

··· 49 49 /* 50 50 * Maximum amount of retries to find a valid cgroup. 51 51 */ 52 - #define CGROUP_MAX_RETRIES 1024 52 + enum { 53 + FALLBACK_DSQ = 0, 54 + CGROUP_MAX_RETRIES = 1024, 55 + }; 53 56 54 57 char _license[] SEC("license") = "GPL"; 55 58 ··· 228 225 break; 229 226 230 227 /* 231 - * We can be oppotunistic here and not grab the 228 + * We can be opportunistic here and not grab the 232 229 * cgv_tree_lock and deal with the occasional races. 233 230 * However, hweight updates are already cached and 234 231 * relatively low-frequency. Let's just do the ··· 261 258 * and thus can't be updated and repositioned. Instead, we collect the 262 259 * vtime deltas separately and apply it asynchronously here. 263 260 */ 264 - delta = cgc->cvtime_delta; 265 - __sync_fetch_and_sub(&cgc->cvtime_delta, delta); 261 + delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); 266 262 cvtime = cgv_node->cvtime + delta; 267 263 268 264 /* ··· 380 378 scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); 381 379 } else { 382 380 stat_inc(FCG_STAT_GLOBAL); 383 - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 381 + scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); 384 382 } 385 383 return; 386 384 } 387 385 388 - cgrp = scx_bpf_task_cgroup(p); 386 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 389 387 cgc = find_cgrp_ctx(cgrp); 390 388 if (!cgc) 391 389 goto out_release; ··· 511 509 { 512 510 struct cgroup *cgrp; 513 511 514 - cgrp = scx_bpf_task_cgroup(p); 512 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 515 513 update_active_weight_sums(cgrp, true); 516 514 bpf_cgroup_release(cgrp); 517 515 } ··· 524 522 if (fifo_sched) 525 523 return; 526 524 527 - cgrp = scx_bpf_task_cgroup(p); 525 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 528 526 cgc = find_cgrp_ctx(cgrp); 529 527 if (cgc) { 530 528 /* ··· 567 565 if (!taskc->bypassed_at) 568 566 return; 569 567 570 - cgrp = scx_bpf_task_cgroup(p); 568 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 571 569 cgc = find_cgrp_ctx(cgrp); 572 570 if (cgc) { 573 571 __sync_fetch_and_add(&cgc->cvtime_delta, ··· 581 579 { 582 580 struct cgroup *cgrp; 583 581 584 - cgrp = scx_bpf_task_cgroup(p); 582 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 585 583 update_active_weight_sums(cgrp, false); 586 584 bpf_cgroup_release(cgrp); 587 585 } ··· 783 781 pick_next_cgroup: 784 782 cpuc->cur_at = now; 785 783 786 - if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { 784 + if (scx_bpf_consume(FALLBACK_DSQ)) { 787 785 cpuc->cur_cgid = 0; 788 786 return; 789 787 } ··· 840 838 int ret; 841 839 842 840 /* 843 - * Technically incorrect as cgroup ID is full 64bit while dq ID is 841 + * Technically incorrect as cgroup ID is full 64bit while dsq ID is 844 842 * 63bit. Should not be a problem in practice and easy to spot in the 845 843 * unlikely case that it breaks. 846 844 */ ··· 928 926 p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; 929 927 } 930 928 929 + s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) 930 + { 931 + return scx_bpf_create_dsq(FALLBACK_DSQ, -1); 932 + } 933 + 931 934 void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) 932 935 { 933 936 UEI_RECORD(uei, ei); ··· 951 944 .cgroup_init = (void *)fcg_cgroup_init, 952 945 .cgroup_exit = (void *)fcg_cgroup_exit, 953 946 .cgroup_move = (void *)fcg_cgroup_move, 947 + .init = (void *)fcg_init, 954 948 .exit = (void *)fcg_exit, 955 949 .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, 956 950 .name = "flatcg");

+6 -6

tools/sched_ext/scx_qmap.bpf.c

··· 318 318 319 319 if (tctx->highpri) { 320 320 /* exercise the set_*() and vtime interface too */ 321 - scx_bpf_dispatch_from_dsq_set_slice( 321 + __COMPAT_scx_bpf_dispatch_from_dsq_set_slice( 322 322 BPF_FOR_EACH_ITER, slice_ns * 2); 323 - scx_bpf_dispatch_from_dsq_set_vtime( 323 + __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime( 324 324 BPF_FOR_EACH_ITER, highpri_seq++); 325 - scx_bpf_dispatch_vtime_from_dsq( 325 + __COMPAT_scx_bpf_dispatch_vtime_from_dsq( 326 326 BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 327 327 } 328 328 } ··· 340 340 else 341 341 cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); 342 342 343 - if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, 344 - SCX_DSQ_LOCAL_ON | cpu, 345 - SCX_ENQ_PREEMPT)) { 343 + if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, 344 + SCX_DSQ_LOCAL_ON | cpu, 345 + SCX_ENQ_PREEMPT)) { 346 346 if (cpu == this_cpu) { 347 347 dispatched = true; 348 348 __sync_fetch_and_add(&nr_expedited_local, 1);

Configure Feed

Configure Feed