Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched_ext-for-6.18-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:

- Fix scx_kick_pseqs corruption when multiple schedulers are loaded
concurrently

- Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc() to handle
systems with large CPU counts

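- Defer queue_balance_callback() until after ops.dispatch() to fix
callback ordering issues: dispatch can drop the rq lock, which would
expose a pending balance callback to unrelated rq_pin_lock() callers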

- Sync error_irq_work before freeing scx_sched to prevent
use-after-free

- Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU for proper RCU
protection

- Fix flag check for deferred callbacks

* tag 'sched_ext-for-6.18-rc3-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
sched_ext: fix flag check for deferred callbacks
sched_ext: Fix scx_kick_pseqs corruption on concurrent scheduler loads
sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()
sched_ext: defer queue_balance_callback() until after ops.dispatch
sched_ext: Sync error_irq_work before freeing scx_sched
sched_ext: Mark scx_bpf_dsq_move_set_[slice|vtime]() with KF_RCU

+112 -15
+111 -15
kernel/sched/ext.c
··· 67 67 68 68 static struct delayed_work scx_watchdog_work; 69 69 70 - /* for %SCX_KICK_WAIT */ 71 - static unsigned long __percpu *scx_kick_cpus_pnt_seqs; 70 + /* 71 + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence 72 + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 73 + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 74 + * lazily when enabling and freed when disabling to avoid waste when sched_ext 75 + * isn't active. 76 + */ 77 + struct scx_kick_pseqs { 78 + struct rcu_head rcu; 79 + unsigned long seqs[]; 80 + }; 81 + 82 + static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); 72 83 73 84 /* 74 85 * Direct dispatch marker. ··· 791 780 if (rq->scx.flags & SCX_RQ_IN_WAKEUP) 792 781 return; 793 782 783 + /* Don't do anything if there already is a deferred operation. */ 784 + if (rq->scx.flags & SCX_RQ_BAL_CB_PENDING) 785 + return; 786 + 794 787 /* 795 788 * If in balance, the balance callbacks will be called before rq lock is 796 789 * released. Schedule one. 790 + * 791 + * 792 + * We can't directly insert the callback into the 793 + * rq's list: The call can drop its lock and make the pending balance 794 + * callback visible to unrelated code paths that call rq_pin_lock(). 795 + * 796 + * Just let balance_one() know that it must do it itself. 
797 797 */ 798 798 if (rq->scx.flags & SCX_RQ_IN_BALANCE) { 799 - queue_balance_callback(rq, &rq->scx.deferred_bal_cb, 800 - deferred_bal_cb_workfn); 799 + rq->scx.flags |= SCX_RQ_BAL_CB_PENDING; 801 800 return; 802 801 } 803 802 ··· 2024 2003 dspc->cursor = 0; 2025 2004 } 2026 2005 2006 + static inline void maybe_queue_balance_callback(struct rq *rq) 2007 + { 2008 + lockdep_assert_rq_held(rq); 2009 + 2010 + if (!(rq->scx.flags & SCX_RQ_BAL_CB_PENDING)) 2011 + return; 2012 + 2013 + queue_balance_callback(rq, &rq->scx.deferred_bal_cb, 2014 + deferred_bal_cb_workfn); 2015 + 2016 + rq->scx.flags &= ~SCX_RQ_BAL_CB_PENDING; 2017 + } 2018 + 2027 2019 static int balance_one(struct rq *rq, struct task_struct *prev) 2028 2020 { 2029 2021 struct scx_sched *sch = scx_root; ··· 2183 2149 } 2184 2150 #endif 2185 2151 rq_repin_lock(rq, rf); 2152 + 2153 + maybe_queue_balance_callback(rq); 2186 2154 2187 2155 return ret; 2188 2156 } ··· 3507 3471 struct scx_dispatch_q *dsq; 3508 3472 int node; 3509 3473 3474 + irq_work_sync(&sch->error_irq_work); 3510 3475 kthread_stop(sch->helper->task); 3476 + 3511 3477 free_percpu(sch->pcpu); 3512 3478 3513 3479 for_each_node_state(node, N_POSSIBLE) ··· 3888 3850 } 3889 3851 } 3890 3852 3853 + static void free_kick_pseqs_rcu(struct rcu_head *rcu) 3854 + { 3855 + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); 3856 + 3857 + kvfree(pseqs); 3858 + } 3859 + 3860 + static void free_kick_pseqs(void) 3861 + { 3862 + int cpu; 3863 + 3864 + for_each_possible_cpu(cpu) { 3865 + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 3866 + struct scx_kick_pseqs *to_free; 3867 + 3868 + to_free = rcu_replace_pointer(*pseqs, NULL, true); 3869 + if (to_free) 3870 + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); 3871 + } 3872 + } 3873 + 3891 3874 static void scx_disable_workfn(struct kthread_work *work) 3892 3875 { 3893 3876 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); ··· 4045 3986 
free_percpu(scx_dsp_ctx); 4046 3987 scx_dsp_ctx = NULL; 4047 3988 scx_dsp_max_batch = 0; 3989 + free_kick_pseqs(); 4048 3990 4049 3991 mutex_unlock(&scx_enable_mutex); 4050 3992 ··· 4408 4348 irq_work_queue(&sch->error_irq_work); 4409 4349 } 4410 4350 4351 + static int alloc_kick_pseqs(void) 4352 + { 4353 + int cpu; 4354 + 4355 + /* 4356 + * Allocate per-CPU arrays sized by nr_cpu_ids. Use kvzalloc as size 4357 + * can exceed percpu allocator limits on large machines. 4358 + */ 4359 + for_each_possible_cpu(cpu) { 4360 + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 4361 + struct scx_kick_pseqs *new_pseqs; 4362 + 4363 + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); 4364 + 4365 + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), 4366 + GFP_KERNEL, cpu_to_node(cpu)); 4367 + if (!new_pseqs) { 4368 + free_kick_pseqs(); 4369 + return -ENOMEM; 4370 + } 4371 + 4372 + rcu_assign_pointer(*pseqs, new_pseqs); 4373 + } 4374 + 4375 + return 0; 4376 + } 4377 + 4411 4378 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) 4412 4379 { 4413 4380 struct scx_sched *sch; ··· 4582 4495 goto err_unlock; 4583 4496 } 4584 4497 4498 + ret = alloc_kick_pseqs(); 4499 + if (ret) 4500 + goto err_unlock; 4501 + 4585 4502 sch = scx_alloc_and_add_sched(ops); 4586 4503 if (IS_ERR(sch)) { 4587 4504 ret = PTR_ERR(sch); 4588 - goto err_unlock; 4505 + goto err_free_pseqs; 4589 4506 } 4590 4507 4591 4508 /* ··· 4792 4701 4793 4702 return 0; 4794 4703 4704 + err_free_pseqs: 4705 + free_kick_pseqs(); 4795 4706 err_unlock: 4796 4707 mutex_unlock(&scx_enable_mutex); 4797 4708 return ret; ··· 5175 5082 { 5176 5083 struct rq *this_rq = this_rq(); 5177 5084 struct scx_rq *this_scx = &this_rq->scx; 5178 - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); 5085 + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); 5179 5086 bool should_wait = false; 5087 + unsigned long *pseqs; 5180 5088 s32 cpu; 5089 + 5090 + if 
(unlikely(!pseqs_pcpu)) { 5091 + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); 5092 + return; 5093 + } 5094 + 5095 + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; 5181 5096 5182 5097 for_each_cpu(cpu, this_scx->cpus_to_kick) { 5183 5098 should_wait |= kick_one_cpu(cpu, this_rq, pseqs); ··· 5308 5207 SCX_TG_ONLINE); 5309 5208 5310 5209 scx_idle_init_masks(); 5311 - 5312 - scx_kick_cpus_pnt_seqs = 5313 - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, 5314 - __alignof__(scx_kick_cpus_pnt_seqs[0])); 5315 - BUG_ON(!scx_kick_cpus_pnt_seqs); 5316 5210 5317 5211 for_each_possible_cpu(cpu) { 5318 5212 struct rq *rq = cpu_rq(cpu); ··· 5784 5688 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) 5785 5689 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) 5786 5690 BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) 5787 - BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) 5788 - BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) 5691 + BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 5692 + BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 5789 5693 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 5790 5694 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 5791 5695 BTF_KFUNCS_END(scx_kfunc_ids_dispatch) ··· 5916 5820 5917 5821 BTF_KFUNCS_START(scx_kfunc_ids_unlocked) 5918 5822 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) 5919 - BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) 5920 - BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) 5823 + BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice, KF_RCU) 5824 + BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime, KF_RCU) 5921 5825 BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) 5922 5826 BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) 5923 5827 BTF_KFUNCS_END(scx_kfunc_ids_unlocked)
+1
kernel/sched/sched.h
··· 784 784 SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ 785 785 SCX_RQ_BYPASSING = 1 << 4, 786 786 SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ 787 + SCX_RQ_BAL_CB_PENDING = 1 << 6, /* must queue a cb after dispatching */ 787 788 788 789 SCX_RQ_IN_WAKEUP = 1 << 16, 789 790 SCX_RQ_IN_BALANCE = 1 << 17,