Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched_ext-for-6.12-rc1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

- When sched_ext is in bypass mode (e.g. while disabling the BPF
scheduler), it used a single DSQ to implement global FIFO scheduling,
as all it has to do is to guarantee reasonable forward progress.

On multi-socket machines, this can lead to live-lock conditions under
certain workloads. Fixed by splitting the queue used for FIFO
scheduling per NUMA node. This required several preparation patches.

- Hotplug tests on powerpc could reliably trigger deadlock while
enabling a BPF scheduler.

This was caused by cpu_hotplug_lock nesting inside scx_fork_rwsem while
the CPU hotplug path tried to fork a new thread while holding
cpu_hotplug_lock.

Fixed by restructuring locking in the enable and disable paths so that
the two locks are not coupled. This required several preparation
patches which also fixed a couple of other issues in the enable path.

- A build fix for !CONFIG_SMP

- Userspace tooling sync and updates

* tag 'sched_ext-for-6.12-rc1-fixes-1' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
sched_ext: Remove redundant p->nr_cpus_allowed checker
sched_ext: Decouple locks in scx_ops_enable()
sched_ext: Decouple locks in scx_ops_disable_workfn()
sched_ext: Add scx_cgroup_enabled to gate cgroup operations and fix scx_tg_online()
sched_ext: Enable scx_ops_init_task() separately
sched_ext: Fix SCX_TASK_INIT -> SCX_TASK_READY transitions in scx_ops_enable()
sched_ext: Initialize in bypass mode
sched_ext: Remove SCX_OPS_PREPPING
sched_ext: Relocate check_hotplug_seq() call in scx_ops_enable()
sched_ext: Use shorter slice while bypassing
sched_ext: Split the global DSQ per NUMA node
sched_ext: Relocate find_user_dsq()
sched_ext: Allow only user DSQs for scx_bpf_consume(), scx_bpf_dsq_nr_queued() and bpf_iter_scx_dsq_new()
scx_flatcg: Use a user DSQ for fallback instead of SCX_DSQ_GLOBAL
tools/sched_ext: Receive misc updates from SCX repo
sched_ext: Add __COMPAT helpers for features added during v6.12 devel cycle
sched_ext: Build fix for !CONFIG_SMP

+236 -166
+172 -148
kernel/sched/ext.c
··· 9 9 #define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) 10 10 11 11 enum scx_consts { 12 + SCX_SLICE_BYPASS = SCX_SLICE_DFL / 4, 12 13 SCX_DSP_DFL_MAX_BATCH = 32, 13 14 SCX_DSP_MAX_LOOPS = 32, 14 15 SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, ··· 779 778 }; 780 779 781 780 enum scx_ops_enable_state { 782 - SCX_OPS_PREPPING, 783 781 SCX_OPS_ENABLING, 784 782 SCX_OPS_ENABLED, 785 783 SCX_OPS_DISABLING, ··· 786 786 }; 787 787 788 788 static const char *scx_ops_enable_state_str[] = { 789 - [SCX_OPS_PREPPING] = "prepping", 790 789 [SCX_OPS_ENABLING] = "enabling", 791 790 [SCX_OPS_ENABLED] = "enabled", 792 791 [SCX_OPS_DISABLING] = "disabling", ··· 853 854 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); 854 855 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); 855 856 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); 857 + static bool scx_ops_init_task_enabled; 856 858 static bool scx_switching_all; 857 859 DEFINE_STATIC_KEY_FALSE(__scx_switched_all); 858 860 ··· 925 925 */ 926 926 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); 927 927 928 - /* dispatch queues */ 929 - static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; 928 + /* 929 + * Dispatch queues. 930 + * 931 + * The global DSQ (%SCX_DSQ_GLOBAL) is split per-node for scalability. This is 932 + * to avoid live-locking in bypass mode where all tasks are dispatched to 933 + * %SCX_DSQ_GLOBAL and all CPUs consume from it. If per-node split isn't 934 + * sufficient, it can be further split. 
935 + */ 936 + static struct scx_dispatch_q **global_dsqs; 930 937 931 938 static const struct rhashtable_params dsq_hash_params = { 932 939 .key_len = 8, ··· 1034 1027 static bool u32_before(u32 a, u32 b) 1035 1028 { 1036 1029 return (s32)(a - b) < 0; 1030 + } 1031 + 1032 + static struct scx_dispatch_q *find_global_dsq(struct task_struct *p) 1033 + { 1034 + return global_dsqs[cpu_to_node(task_cpu(p))]; 1035 + } 1036 + 1037 + static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1038 + { 1039 + return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1037 1040 } 1038 1041 1039 1042 /* ··· 1654 1637 scx_ops_error("attempting to dispatch to a destroyed dsq"); 1655 1638 /* fall back to the global dsq */ 1656 1639 raw_spin_unlock(&dsq->lock); 1657 - dsq = &scx_dsq_global; 1640 + dsq = find_global_dsq(p); 1658 1641 raw_spin_lock(&dsq->lock); 1659 1642 } 1660 1643 } ··· 1820 1803 raw_spin_unlock(&dsq->lock); 1821 1804 } 1822 1805 1823 - static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) 1824 - { 1825 - return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); 1826 - } 1827 - 1828 - static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) 1829 - { 1830 - lockdep_assert(rcu_read_lock_any_held()); 1831 - 1832 - if (dsq_id == SCX_DSQ_GLOBAL) 1833 - return &scx_dsq_global; 1834 - else 1835 - return find_user_dsq(dsq_id); 1836 - } 1837 - 1838 1806 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, 1839 1807 struct task_struct *p) 1840 1808 { ··· 1832 1830 s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; 1833 1831 1834 1832 if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) 1835 - return &scx_dsq_global; 1833 + return find_global_dsq(p); 1836 1834 1837 1835 return &cpu_rq(cpu)->scx.local_dsq; 1838 1836 } 1839 1837 1840 - dsq = find_non_local_dsq(dsq_id); 1838 + if (dsq_id == SCX_DSQ_GLOBAL) 1839 + dsq = find_global_dsq(p); 1840 + else 1841 + dsq = find_user_dsq(dsq_id); 1842 + 1841 1843 if (unlikely(!dsq)) { 
1842 1844 scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", 1843 1845 dsq_id, p->comm, p->pid); 1844 - return &scx_dsq_global; 1846 + return find_global_dsq(p); 1845 1847 } 1846 1848 1847 1849 return dsq; ··· 1944 1938 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, 1945 1939 int sticky_cpu) 1946 1940 { 1941 + bool bypassing = scx_rq_bypassing(rq); 1947 1942 struct task_struct **ddsp_taskp; 1948 1943 unsigned long qseq; 1949 1944 ··· 1962 1955 if (!scx_rq_online(rq)) 1963 1956 goto local; 1964 1957 1965 - if (scx_rq_bypassing(rq)) 1958 + if (bypassing) 1966 1959 goto global; 1967 1960 1968 1961 if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) ··· 2017 2010 2018 2011 global: 2019 2012 touch_core_sched(rq, p); /* see the comment in local: */ 2020 - p->scx.slice = SCX_SLICE_DFL; 2021 - dispatch_enqueue(&scx_dsq_global, p, enq_flags); 2013 + p->scx.slice = bypassing ? SCX_SLICE_BYPASS : SCX_SLICE_DFL; 2014 + dispatch_enqueue(find_global_dsq(p), p, enq_flags); 2022 2015 } 2023 2016 2024 2017 static bool task_runnable(const struct task_struct *p) ··· 2364 2357 } 2365 2358 } 2366 2359 #else /* CONFIG_SMP */ 2360 + static inline void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags, struct rq *src_rq, struct rq *dst_rq) { WARN_ON_ONCE(1); } 2367 2361 static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, bool trigger_error) { return false; } 2368 2362 static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } 2369 2363 #endif /* CONFIG_SMP */ ··· 2404 2396 return false; 2405 2397 } 2406 2398 2399 + static bool consume_global_dsq(struct rq *rq) 2400 + { 2401 + int node = cpu_to_node(cpu_of(rq)); 2402 + 2403 + return consume_dispatch_q(rq, global_dsqs[node]); 2404 + } 2405 + 2407 2406 /** 2408 2407 * dispatch_to_local_dsq - Dispatch a task to a local dsq 2409 2408 * @rq: current rq which is locked ··· 2444 2429 
2445 2430 #ifdef CONFIG_SMP 2446 2431 if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) { 2447 - dispatch_enqueue(&scx_dsq_global, p, enq_flags | SCX_ENQ_CLEAR_OPSS); 2432 + dispatch_enqueue(find_global_dsq(p), p, 2433 + enq_flags | SCX_ENQ_CLEAR_OPSS); 2448 2434 return; 2449 2435 } 2450 2436 ··· 2645 2629 if (rq->scx.local_dsq.nr) 2646 2630 goto has_tasks; 2647 2631 2648 - if (consume_dispatch_q(rq, &scx_dsq_global)) 2632 + if (consume_global_dsq(rq)) 2649 2633 goto has_tasks; 2650 2634 2651 2635 if (!SCX_HAS_OP(dispatch) || scx_rq_bypassing(rq) || !scx_rq_online(rq)) ··· 2670 2654 2671 2655 if (rq->scx.local_dsq.nr) 2672 2656 goto has_tasks; 2673 - if (consume_dispatch_q(rq, &scx_dsq_global)) 2657 + if (consume_global_dsq(rq)) 2674 2658 goto has_tasks; 2675 2659 2676 2660 /* ··· 3074 3058 * there is an idle core elsewhere on the system. 3075 3059 */ 3076 3060 cpu = smp_processor_id(); 3077 - if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && 3061 + if ((wake_flags & SCX_WAKE_SYNC) && 3078 3062 !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && 3079 3063 cpu_rq(cpu)->scx.local_dsq.nr == 0) { 3080 3064 if (cpumask_test_cpu(cpu, p->cpus_ptr)) 3081 3065 goto cpu_found; 3082 - } 3083 - 3084 - if (p->nr_cpus_allowed == 1) { 3085 - if (test_and_clear_cpu_idle(prev_cpu)) { 3086 - cpu = prev_cpu; 3087 - goto cpu_found; 3088 - } else { 3089 - return prev_cpu; 3090 - } 3091 3066 } 3092 3067 3093 3068 /* ··· 3557 3550 { 3558 3551 percpu_rwsem_assert_held(&scx_fork_rwsem); 3559 3552 3560 - if (scx_enabled()) 3553 + if (scx_ops_init_task_enabled) 3561 3554 return scx_ops_init_task(p, task_group(p), true); 3562 3555 else 3563 3556 return 0; ··· 3565 3558 3566 3559 void scx_post_fork(struct task_struct *p) 3567 3560 { 3568 - if (scx_enabled()) { 3561 + if (scx_ops_init_task_enabled) { 3569 3562 scx_set_task_state(p, SCX_TASK_READY); 3570 3563 3571 3564 /* ··· 3697 3690 #ifdef CONFIG_EXT_GROUP_SCHED 3698 3691 3699 3692 
DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); 3693 + static bool scx_cgroup_enabled; 3700 3694 static bool cgroup_warned_missing_weight; 3701 3695 static bool cgroup_warned_missing_idle; 3702 3696 ··· 3717 3709 3718 3710 static void scx_cgroup_warn_missing_idle(struct task_group *tg) 3719 3711 { 3720 - if (scx_ops_enable_state() == SCX_OPS_DISABLED || 3721 - cgroup_warned_missing_idle) 3712 + if (!scx_cgroup_enabled || cgroup_warned_missing_idle) 3722 3713 return; 3723 3714 3724 3715 if (!tg->idle) ··· 3738 3731 3739 3732 scx_cgroup_warn_missing_weight(tg); 3740 3733 3741 - if (SCX_HAS_OP(cgroup_init)) { 3742 - struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; 3734 + if (scx_cgroup_enabled) { 3735 + if (SCX_HAS_OP(cgroup_init)) { 3736 + struct scx_cgroup_init_args args = 3737 + { .weight = tg->scx_weight }; 3743 3738 3744 - ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3745 - tg->css.cgroup, &args); 3746 - if (!ret) 3739 + ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, cgroup_init, 3740 + tg->css.cgroup, &args); 3741 + if (ret) 3742 + ret = ops_sanitize_err("cgroup_init", ret); 3743 + } 3744 + if (ret == 0) 3747 3745 tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; 3748 - else 3749 - ret = ops_sanitize_err("cgroup_init", ret); 3750 3746 } else { 3751 3747 tg->scx_flags |= SCX_TG_ONLINE; 3752 3748 } ··· 3780 3770 /* released in scx_finish/cancel_attach() */ 3781 3771 percpu_down_read(&scx_cgroup_rwsem); 3782 3772 3783 - if (!scx_enabled()) 3773 + if (!scx_cgroup_enabled) 3784 3774 return 0; 3785 3775 3786 3776 cgroup_taskset_for_each(p, css, tset) { ··· 3823 3813 3824 3814 void scx_move_task(struct task_struct *p) 3825 3815 { 3826 - if (!scx_enabled()) 3816 + if (!scx_cgroup_enabled) 3827 3817 return; 3828 3818 3829 3819 /* ··· 3859 3849 struct cgroup_subsys_state *css; 3860 3850 struct task_struct *p; 3861 3851 3862 - if (!scx_enabled()) 3852 + if (!scx_cgroup_enabled) 3863 3853 goto out_unlock; 3864 3854 3865 3855 cgroup_taskset_for_each(p, css, tset) 
{ ··· 3876 3866 { 3877 3867 percpu_down_read(&scx_cgroup_rwsem); 3878 3868 3879 - if (tg->scx_weight != weight) { 3869 + if (scx_cgroup_enabled && tg->scx_weight != weight) { 3880 3870 if (SCX_HAS_OP(cgroup_set_weight)) 3881 3871 SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_set_weight, 3882 3872 tg_cgrp(tg), weight); ··· 4048 4038 4049 4039 percpu_rwsem_assert_held(&scx_cgroup_rwsem); 4050 4040 4041 + WARN_ON_ONCE(!scx_cgroup_enabled); 4042 + scx_cgroup_enabled = false; 4043 + 4051 4044 /* 4052 4045 * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk 4053 4046 * cgroups and exit all the inited ones, all online cgroups are exited. ··· 4125 4112 css_put(css); 4126 4113 } 4127 4114 rcu_read_unlock(); 4115 + 4116 + WARN_ON_ONCE(scx_cgroup_enabled); 4117 + scx_cgroup_enabled = true; 4128 4118 4129 4119 return 0; 4130 4120 } ··· 4447 4431 WRITE_ONCE(scx_switching_all, false); 4448 4432 4449 4433 /* 4450 - * Avoid racing against fork and cgroup changes. See scx_ops_enable() 4451 - * for explanation on the locking order. 4434 + * Shut down cgroup support before tasks so that the cgroup attach path 4435 + * doesn't race against scx_ops_exit_task(). 4452 4436 */ 4453 - percpu_down_write(&scx_fork_rwsem); 4454 - cpus_read_lock(); 4455 4437 scx_cgroup_lock(); 4438 + scx_cgroup_exit(); 4439 + scx_cgroup_unlock(); 4456 4440 4457 - spin_lock_irq(&scx_tasks_lock); 4458 - scx_task_iter_init(&sti); 4459 4441 /* 4460 4442 * The BPF scheduler is going away. All tasks including %TASK_DEAD ones 4461 4443 * must be switched out and exited synchronously. 
4462 4444 */ 4445 + percpu_down_write(&scx_fork_rwsem); 4446 + 4447 + scx_ops_init_task_enabled = false; 4448 + 4449 + spin_lock_irq(&scx_tasks_lock); 4450 + scx_task_iter_init(&sti); 4463 4451 while ((p = scx_task_iter_next_locked(&sti))) { 4464 4452 const struct sched_class *old_class = p->sched_class; 4465 4453 struct sched_enq_and_set_ctx ctx; ··· 4481 4461 } 4482 4462 scx_task_iter_exit(&sti); 4483 4463 spin_unlock_irq(&scx_tasks_lock); 4464 + percpu_up_write(&scx_fork_rwsem); 4484 4465 4485 4466 /* no task is on scx, turn off all the switches and flush in-progress calls */ 4486 - static_branch_disable_cpuslocked(&__scx_ops_enabled); 4467 + static_branch_disable(&__scx_ops_enabled); 4487 4468 for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) 4488 - static_branch_disable_cpuslocked(&scx_has_op[i]); 4489 - static_branch_disable_cpuslocked(&scx_ops_enq_last); 4490 - static_branch_disable_cpuslocked(&scx_ops_enq_exiting); 4491 - static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); 4492 - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 4469 + static_branch_disable(&scx_has_op[i]); 4470 + static_branch_disable(&scx_ops_enq_last); 4471 + static_branch_disable(&scx_ops_enq_exiting); 4472 + static_branch_disable(&scx_ops_cpu_preempt); 4473 + static_branch_disable(&scx_builtin_idle_enabled); 4493 4474 synchronize_rcu(); 4494 - 4495 - scx_cgroup_exit(); 4496 - 4497 - scx_cgroup_unlock(); 4498 - cpus_read_unlock(); 4499 - percpu_up_write(&scx_fork_rwsem); 4500 4475 4501 4476 if (ei->kind >= SCX_EXIT_ERROR) { 4502 4477 pr_err("sched_ext: BPF scheduler \"%s\" disabled (%s)\n", ··· 4944 4929 struct scx_task_iter sti; 4945 4930 struct task_struct *p; 4946 4931 unsigned long timeout; 4947 - int i, cpu, ret; 4932 + int i, cpu, node, ret; 4948 4933 4949 4934 if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 4950 4935 cpu_possible_mask)) { ··· 4961 4946 ret = -ENOMEM; 4962 4947 goto err_unlock; 4963 4948 } 4949 + } 4950 + 4951 + if (!global_dsqs) { 4952 + 
struct scx_dispatch_q **dsqs; 4953 + 4954 + dsqs = kcalloc(nr_node_ids, sizeof(dsqs[0]), GFP_KERNEL); 4955 + if (!dsqs) { 4956 + ret = -ENOMEM; 4957 + goto err_unlock; 4958 + } 4959 + 4960 + for_each_node_state(node, N_POSSIBLE) { 4961 + struct scx_dispatch_q *dsq; 4962 + 4963 + dsq = kzalloc_node(sizeof(*dsq), GFP_KERNEL, node); 4964 + if (!dsq) { 4965 + for_each_node_state(node, N_POSSIBLE) 4966 + kfree(dsqs[node]); 4967 + kfree(dsqs); 4968 + ret = -ENOMEM; 4969 + goto err_unlock; 4970 + } 4971 + 4972 + init_dsq(dsq, SCX_DSQ_GLOBAL); 4973 + dsqs[node] = dsq; 4974 + } 4975 + 4976 + global_dsqs = dsqs; 4964 4977 } 4965 4978 4966 4979 if (scx_ops_enable_state() != SCX_OPS_DISABLED) { ··· 5014 4971 } 5015 4972 5016 4973 /* 5017 - * Set scx_ops, transition to PREPPING and clear exit info to arm the 4974 + * Set scx_ops, transition to ENABLING and clear exit info to arm the 5018 4975 * disable path. Failure triggers full disabling from here on. 5019 4976 */ 5020 4977 scx_ops = *ops; 5021 4978 5022 - WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != 4979 + WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_ENABLING) != 5023 4980 SCX_OPS_DISABLED); 5024 4981 5025 4982 atomic_set(&scx_exit_kind, SCX_EXIT_NONE); ··· 5040 4997 ret = SCX_CALL_OP_RET(SCX_KF_UNLOCKED, init); 5041 4998 if (ret) { 5042 4999 ret = ops_sanitize_err("init", ret); 5043 - goto err_disable_unlock_cpus; 5000 + cpus_read_unlock(); 5001 + goto err_disable; 5044 5002 } 5045 5003 } 5046 5004 ··· 5049 5005 if (((void (**)(void))ops)[i]) 5050 5006 static_branch_enable_cpuslocked(&scx_has_op[i]); 5051 5007 5008 + check_hotplug_seq(ops); 5052 5009 cpus_read_unlock(); 5053 5010 5054 5011 ret = validate_ops(ops); ··· 5077 5032 scx_watchdog_timeout / 2); 5078 5033 5079 5034 /* 5080 - * Lock out forks, cgroup on/offlining and moves before opening the 5081 - * floodgate so that they don't wander into the operations prematurely. 
5082 - * 5083 - * We don't need to keep the CPUs stable but static_branch_*() requires 5084 - * cpus_read_lock() and scx_cgroup_rwsem must nest inside 5085 - * cpu_hotplug_lock because of the following dependency chain: 5086 - * 5087 - * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem 5088 - * 5089 - * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use 5090 - * static_branch_*_cpuslocked(). 5091 - * 5092 - * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the 5093 - * following dependency chain: 5094 - * 5095 - * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock 5035 + * Once __scx_ops_enabled is set, %current can be switched to SCX 5036 + * anytime. This can lead to stalls as some BPF schedulers (e.g. 5037 + * userspace scheduling) may not function correctly before all tasks are 5038 + * switched. Init in bypass mode to guarantee forward progress. 5096 5039 */ 5097 - percpu_down_write(&scx_fork_rwsem); 5098 - cpus_read_lock(); 5099 - scx_cgroup_lock(); 5100 - 5101 - check_hotplug_seq(ops); 5040 + scx_ops_bypass(true); 5102 5041 5103 5042 for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) 5104 5043 if (((void (**)(void))ops)[i]) 5105 - static_branch_enable_cpuslocked(&scx_has_op[i]); 5044 + static_branch_enable(&scx_has_op[i]); 5106 5045 5107 5046 if (ops->flags & SCX_OPS_ENQ_LAST) 5108 - static_branch_enable_cpuslocked(&scx_ops_enq_last); 5047 + static_branch_enable(&scx_ops_enq_last); 5109 5048 5110 5049 if (ops->flags & SCX_OPS_ENQ_EXITING) 5111 - static_branch_enable_cpuslocked(&scx_ops_enq_exiting); 5050 + static_branch_enable(&scx_ops_enq_exiting); 5112 5051 if (scx_ops.cpu_acquire || scx_ops.cpu_release) 5113 - static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); 5052 + static_branch_enable(&scx_ops_cpu_preempt); 5114 5053 5115 5054 if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { 5116 5055 reset_idle_masks(); 5117 - 
static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); 5056 + static_branch_enable(&scx_builtin_idle_enabled); 5118 5057 } else { 5119 - static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); 5058 + static_branch_disable(&scx_builtin_idle_enabled); 5120 5059 } 5121 5060 5122 5061 /* 5123 - * All cgroups should be initialized before letting in tasks. cgroup 5124 - * on/offlining and task migrations are already locked out. 5062 + * Lock out forks, cgroup on/offlining and moves before opening the 5063 + * floodgate so that they don't wander into the operations prematurely. 5125 5064 */ 5126 - ret = scx_cgroup_init(); 5127 - if (ret) 5128 - goto err_disable_unlock_all; 5065 + percpu_down_write(&scx_fork_rwsem); 5129 5066 5130 - static_branch_enable_cpuslocked(&__scx_ops_enabled); 5067 + WARN_ON_ONCE(scx_ops_init_task_enabled); 5068 + scx_ops_init_task_enabled = true; 5131 5069 5132 5070 /* 5133 5071 * Enable ops for every task. Fork is excluded by scx_fork_rwsem ··· 5118 5090 * leaving as sched_ext_free() can handle both prepped and enabled 5119 5091 * tasks. Prep all tasks first and then enable them with preemption 5120 5092 * disabled. 5093 + * 5094 + * All cgroups should be initialized before scx_ops_init_task() so that 5095 + * the BPF scheduler can reliably track each task's cgroup membership 5096 + * from scx_ops_init_task(). Lock out cgroup on/offlining and task 5097 + * migrations while tasks are being initialized so that 5098 + * scx_cgroup_can_attach() never sees uninitialized tasks. 
5121 5099 */ 5122 - spin_lock_irq(&scx_tasks_lock); 5100 + scx_cgroup_lock(); 5101 + ret = scx_cgroup_init(); 5102 + if (ret) 5103 + goto err_disable_unlock_all; 5123 5104 5105 + spin_lock_irq(&scx_tasks_lock); 5124 5106 scx_task_iter_init(&sti); 5125 5107 while ((p = scx_task_iter_next_locked(&sti))) { 5126 5108 /* ··· 5155 5117 goto err_disable_unlock_all; 5156 5118 } 5157 5119 5120 + scx_set_task_state(p, SCX_TASK_READY); 5121 + 5158 5122 put_task_struct(p); 5159 5123 spin_lock_irq(&scx_tasks_lock); 5160 5124 } 5161 5125 scx_task_iter_exit(&sti); 5126 + spin_unlock_irq(&scx_tasks_lock); 5127 + scx_cgroup_unlock(); 5128 + percpu_up_write(&scx_fork_rwsem); 5162 5129 5163 5130 /* 5164 - * All tasks are prepped but are still ops-disabled. Ensure that 5165 - * %current can't be scheduled out and switch everyone. 5166 - * preempt_disable() is necessary because we can't guarantee that 5167 - * %current won't be starved if scheduled out while switching. 5131 + * All tasks are READY. It's safe to turn on scx_enabled() and switch 5132 + * all eligible tasks. 5168 5133 */ 5169 - preempt_disable(); 5134 + WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 5135 + static_branch_enable(&__scx_ops_enabled); 5170 5136 5171 5137 /* 5172 - * From here on, the disable path must assume that tasks have ops 5173 - * enabled and need to be recovered. 5174 - * 5175 - * Transition to ENABLING fails iff the BPF scheduler has already 5176 - * triggered scx_bpf_error(). Returning an error code here would lose 5177 - * the recorded error information. Exit indicating success so that the 5178 - * error is notified through ops.exit() with all the details. 
5179 - */ 5180 - if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { 5181 - preempt_enable(); 5182 - spin_unlock_irq(&scx_tasks_lock); 5183 - WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 5184 - ret = 0; 5185 - goto err_disable_unlock_all; 5186 - } 5187 - 5188 - /* 5189 - * We're fully committed and can't fail. The PREPPED -> ENABLED 5138 + * We're fully committed and can't fail. The task READY -> ENABLED 5190 5139 * transitions here are synchronized against sched_ext_free() through 5191 5140 * scx_tasks_lock. 5192 5141 */ 5193 - WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); 5194 - 5142 + percpu_down_write(&scx_fork_rwsem); 5143 + spin_lock_irq(&scx_tasks_lock); 5195 5144 scx_task_iter_init(&sti); 5196 5145 while ((p = scx_task_iter_next_locked(&sti))) { 5197 5146 const struct sched_class *old_class = p->sched_class; ··· 5186 5161 5187 5162 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 5188 5163 5189 - scx_set_task_state(p, SCX_TASK_READY); 5190 5164 __setscheduler_prio(p, p->prio); 5191 5165 check_class_changing(task_rq(p), p, old_class); 5192 5166 ··· 5194 5170 check_class_changed(task_rq(p), p, old_class, p->prio); 5195 5171 } 5196 5172 scx_task_iter_exit(&sti); 5197 - 5198 5173 spin_unlock_irq(&scx_tasks_lock); 5199 - preempt_enable(); 5200 - scx_cgroup_unlock(); 5201 - cpus_read_unlock(); 5202 5174 percpu_up_write(&scx_fork_rwsem); 5203 5175 5204 - /* see above ENABLING transition for the explanation on exiting with 0 */ 5176 + scx_ops_bypass(false); 5177 + 5178 + /* 5179 + * Returning an error code here would lose the recorded error 5180 + * information. Exit indicating success so that the error is notified 5181 + * through ops.exit() with all the details. 
5182 + */ 5205 5183 if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { 5206 5184 WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); 5207 5185 ret = 0; ··· 5238 5212 err_disable_unlock_all: 5239 5213 scx_cgroup_unlock(); 5240 5214 percpu_up_write(&scx_fork_rwsem); 5241 - err_disable_unlock_cpus: 5242 - cpus_read_unlock(); 5215 + scx_ops_bypass(false); 5243 5216 err_disable: 5244 5217 mutex_unlock(&scx_ops_enable_mutex); 5245 5218 /* must be fully disabled before returning */ ··· 5807 5782 SCX_TG_ONLINE); 5808 5783 5809 5784 BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); 5810 - init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); 5811 5785 #ifdef CONFIG_SMP 5812 5786 BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); 5813 5787 BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); ··· 6082 6058 if (dst_dsq->id == SCX_DSQ_LOCAL) { 6083 6059 dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); 6084 6060 if (!task_can_run_on_remote_rq(p, dst_rq, true)) { 6085 - dst_dsq = &scx_dsq_global; 6061 + dst_dsq = find_global_dsq(p); 6086 6062 dst_rq = src_rq; 6087 6063 } 6088 6064 } else { ··· 6199 6175 6200 6176 flush_dispatch_buf(dspc->rq); 6201 6177 6202 - dsq = find_non_local_dsq(dsq_id); 6178 + dsq = find_user_dsq(dsq_id); 6203 6179 if (unlikely(!dsq)) { 6204 6180 scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); 6205 6181 return false; ··· 6520 6496 goto out; 6521 6497 } 6522 6498 } else { 6523 - dsq = find_non_local_dsq(dsq_id); 6499 + dsq = find_user_dsq(dsq_id); 6524 6500 if (dsq) { 6525 6501 ret = READ_ONCE(dsq->nr); 6526 6502 goto out; ··· 6569 6545 if (flags & ~__SCX_DSQ_ITER_USER_FLAGS) 6570 6546 return -EINVAL; 6571 6547 6572 - kit->dsq = find_non_local_dsq(dsq_id); 6548 + kit->dsq = find_user_dsq(dsq_id); 6573 6549 if (!kit->dsq) 6574 6550 return -ENOENT; 6575 6551
+15
tools/sched_ext/include/scx/common.bpf.h
··· 7 7 #ifndef __SCX_COMMON_BPF_H 8 8 #define __SCX_COMMON_BPF_H 9 9 10 + #ifdef LSP 11 + #define __bpf__ 12 + #include "../vmlinux/vmlinux.h" 13 + #else 10 14 #include "vmlinux.h" 15 + #endif 16 + 11 17 #include <bpf/bpf_helpers.h> 12 18 #include <bpf/bpf_tracing.h> 13 19 #include <asm-generic/errno.h> ··· 315 309 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; 316 310 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, 317 311 const struct cpumask *src2) __ksym; 312 + u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; 313 + 314 + /* 315 + * Access a cpumask in read-only mode (typically to check bits). 316 + */ 317 + const struct cpumask *cast_mask(struct bpf_cpumask *mask) 318 + { 319 + return (const struct cpumask *)mask; 320 + } 318 321 319 322 /* rcu */ 320 323 void bpf_rcu_read_lock(void) __ksym;
+19
tools/sched_ext/include/scx/compat.bpf.h
··· 15 15 __ret; \ 16 16 }) 17 17 18 + /* v6.12: 819513666966 ("sched_ext: Add cgroup support") */ 19 + #define __COMPAT_scx_bpf_task_cgroup(p) \ 20 + (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ 21 + scx_bpf_task_cgroup((p)) : NULL) 22 + 23 + /* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ 24 + #define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ 25 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ 26 + scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) 27 + #define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ 28 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ 29 + scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) 30 + #define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ 31 + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ 32 + scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) 33 + #define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ 34 + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ 35 + scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) 36 + 18 37 /* 19 38 * Define sched_ext_ops. This may be expanded to define multiple variants for 20 39 * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
+4
tools/sched_ext/include/scx/user_exit_info.h
··· 25 25 26 26 #ifdef __bpf__ 27 27 28 + #ifdef LSP 29 + #include "../vmlinux/vmlinux.h" 30 + #else 28 31 #include "vmlinux.h" 32 + #endif 29 33 #include <bpf/bpf_core_read.h> 30 34 31 35 #define UEI_DEFINE(__name) \
+20 -12
tools/sched_ext/scx_flatcg.bpf.c
··· 49 49 /* 50 50 * Maximum amount of retries to find a valid cgroup. 51 51 */ 52 - #define CGROUP_MAX_RETRIES 1024 52 + enum { 53 + FALLBACK_DSQ = 0, 54 + CGROUP_MAX_RETRIES = 1024, 55 + }; 53 56 54 57 char _license[] SEC("license") = "GPL"; 55 58 ··· 228 225 break; 229 226 230 227 /* 231 - * We can be oppotunistic here and not grab the 228 + * We can be opportunistic here and not grab the 232 229 * cgv_tree_lock and deal with the occasional races. 233 230 * However, hweight updates are already cached and 234 231 * relatively low-frequency. Let's just do the ··· 261 258 * and thus can't be updated and repositioned. Instead, we collect the 262 259 * vtime deltas separately and apply it asynchronously here. 263 260 */ 264 - delta = cgc->cvtime_delta; 265 - __sync_fetch_and_sub(&cgc->cvtime_delta, delta); 261 + delta = __sync_fetch_and_sub(&cgc->cvtime_delta, cgc->cvtime_delta); 266 262 cvtime = cgv_node->cvtime + delta; 267 263 268 264 /* ··· 380 378 scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); 381 379 } else { 382 380 stat_inc(FCG_STAT_GLOBAL); 383 - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 381 + scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); 384 382 } 385 383 return; 386 384 } 387 385 388 - cgrp = scx_bpf_task_cgroup(p); 386 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 389 387 cgc = find_cgrp_ctx(cgrp); 390 388 if (!cgc) 391 389 goto out_release; ··· 511 509 { 512 510 struct cgroup *cgrp; 513 511 514 - cgrp = scx_bpf_task_cgroup(p); 512 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 515 513 update_active_weight_sums(cgrp, true); 516 514 bpf_cgroup_release(cgrp); 517 515 } ··· 524 522 if (fifo_sched) 525 523 return; 526 524 527 - cgrp = scx_bpf_task_cgroup(p); 525 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 528 526 cgc = find_cgrp_ctx(cgrp); 529 527 if (cgc) { 530 528 /* ··· 567 565 if (!taskc->bypassed_at) 568 566 return; 569 567 570 - cgrp = scx_bpf_task_cgroup(p); 568 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 571 569 
cgc = find_cgrp_ctx(cgrp); 572 570 if (cgc) { 573 571 __sync_fetch_and_add(&cgc->cvtime_delta, ··· 581 579 { 582 580 struct cgroup *cgrp; 583 581 584 - cgrp = scx_bpf_task_cgroup(p); 582 + cgrp = __COMPAT_scx_bpf_task_cgroup(p); 585 583 update_active_weight_sums(cgrp, false); 586 584 bpf_cgroup_release(cgrp); 587 585 } ··· 783 781 pick_next_cgroup: 784 782 cpuc->cur_at = now; 785 783 786 - if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { 784 + if (scx_bpf_consume(FALLBACK_DSQ)) { 787 785 cpuc->cur_cgid = 0; 788 786 return; 789 787 } ··· 840 838 int ret; 841 839 842 840 /* 843 - * Technically incorrect as cgroup ID is full 64bit while dq ID is 841 + * Technically incorrect as cgroup ID is full 64bit while dsq ID is 844 842 * 63bit. Should not be a problem in practice and easy to spot in the 845 843 * unlikely case that it breaks. 846 844 */ ··· 928 926 p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; 929 927 } 930 928 929 + s32 BPF_STRUCT_OPS_SLEEPABLE(fcg_init) 930 + { 931 + return scx_bpf_create_dsq(FALLBACK_DSQ, -1); 932 + } 933 + 931 934 void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) 932 935 { 933 936 UEI_RECORD(uei, ei); ··· 951 944 .cgroup_init = (void *)fcg_cgroup_init, 952 945 .cgroup_exit = (void *)fcg_cgroup_exit, 953 946 .cgroup_move = (void *)fcg_cgroup_move, 947 + .init = (void *)fcg_init, 954 948 .exit = (void *)fcg_exit, 955 949 .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, 956 950 .name = "flatcg");
+6 -6
tools/sched_ext/scx_qmap.bpf.c
··· 318 318 319 319 if (tctx->highpri) { 320 320 /* exercise the set_*() and vtime interface too */ 321 - scx_bpf_dispatch_from_dsq_set_slice( 321 + __COMPAT_scx_bpf_dispatch_from_dsq_set_slice( 322 322 BPF_FOR_EACH_ITER, slice_ns * 2); 323 - scx_bpf_dispatch_from_dsq_set_vtime( 323 + __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime( 324 324 BPF_FOR_EACH_ITER, highpri_seq++); 325 - scx_bpf_dispatch_vtime_from_dsq( 325 + __COMPAT_scx_bpf_dispatch_vtime_from_dsq( 326 326 BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0); 327 327 } 328 328 } ··· 340 340 else 341 341 cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0); 342 342 343 - if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, 344 - SCX_DSQ_LOCAL_ON | cpu, 345 - SCX_ENQ_PREEMPT)) { 343 + if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p, 344 + SCX_DSQ_LOCAL_ON | cpu, 345 + SCX_ENQ_PREEMPT)) { 346 346 if (cpu == this_cpu) { 347 347 dispatched = true; 348 348 __sync_fetch_and_add(&nr_expedited_local, 1);