Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched_ext: Fix SCX_KICK_WAIT deadlock by deferring wait to balance callback

SCX_KICK_WAIT busy-waits in kick_cpus_irq_workfn() using
smp_cond_load_acquire() until the target CPU's kick_sync advances. Because
the irq_work runs in hardirq context, the waiting CPU cannot reschedule and
its own kick_sync never advances. If multiple CPUs form a wait cycle, every
CPU in that cycle deadlocks.

Replace the busy-wait in kick_cpus_irq_workfn() with resched_curr() to
force the CPU through do_pick_task_scx(), which queues a balance callback
to perform the wait. The balance callback drops the rq lock and enables
IRQs following the sched_core_balance() pattern, so the CPU can process
IPIs while waiting. The local CPU's kick_sync is advanced on entry to
do_pick_task_scx() and continuously during the wait, ensuring any CPU that
starts waiting for us sees the advancement and cannot form cyclic
dependencies.

Fixes: 90e55164dad4 ("sched_ext: Implement SCX_KICK_WAIT")
Cc: stable@vger.kernel.org # v6.12+
Reported-by: Christian Loehle <christian.loehle@arm.com>
Link: https://lore.kernel.org/r/20260316100249.1651641-1-christian.loehle@arm.com
Signed-off-by: Tejun Heo <tj@kernel.org>
Tested-by: Christian Loehle <christian.loehle@arm.com>

Tejun Heo 415cb193 db08b194

+73 -25
+70 -25
kernel/sched/ext.c
··· 2404 2404 { 2405 2405 struct scx_sched *sch = scx_root; 2406 2406 2407 - /* see kick_cpus_irq_workfn() */ 2407 + /* see kick_sync_wait_bal_cb() */ 2408 2408 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2409 2409 2410 2410 update_curr_scx(rq); ··· 2447 2447 switch_class(rq, next); 2448 2448 } 2449 2449 2450 + static void kick_sync_wait_bal_cb(struct rq *rq) 2451 + { 2452 + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 2453 + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 2454 + bool waited; 2455 + s32 cpu; 2456 + 2457 + /* 2458 + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 2459 + * — a target CPU may be waiting for us to process an IPI (e.g. TLB 2460 + * flush) while we wait for its kick_sync to advance. 2461 + * 2462 + * Also, keep advancing our own kick_sync so that new kick_sync waits 2463 + * targeting us, which can start after we drop the lock, cannot form 2464 + * cyclic dependencies. 2465 + */ 2466 + retry: 2467 + waited = false; 2468 + for_each_cpu(cpu, rq->scx.cpus_to_sync) { 2469 + /* 2470 + * smp_load_acquire() pairs with smp_store_release() on 2471 + * kick_sync updates on the target CPUs. 
2472 + */ 2473 + if (cpu == cpu_of(rq) || 2474 + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 2475 + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 2476 + continue; 2477 + } 2478 + 2479 + raw_spin_rq_unlock_irq(rq); 2480 + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 2481 + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2482 + cpu_relax(); 2483 + } 2484 + raw_spin_rq_lock_irq(rq); 2485 + waited = true; 2486 + } 2487 + 2488 + if (waited) 2489 + goto retry; 2490 + } 2491 + 2450 2492 static struct task_struct *first_local_task(struct rq *rq) 2451 2493 { 2452 2494 return list_first_entry_or_null(&rq->scx.local_dsq.list, ··· 2502 2460 bool keep_prev; 2503 2461 struct task_struct *p; 2504 2462 2505 - /* see kick_cpus_irq_workfn() */ 2463 + /* see kick_sync_wait_bal_cb() */ 2506 2464 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2507 2465 2508 2466 rq_modified_begin(rq, &ext_sched_class); ··· 2511 2469 balance_one(rq, prev); 2512 2470 rq_repin_lock(rq, rf); 2513 2471 maybe_queue_balance_callback(rq); 2472 + 2473 + /* 2474 + * Defer to a balance callback which can drop rq lock and enable 2475 + * IRQs. Waiting directly in the pick path would deadlock against 2476 + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 
2477 + */ 2478 + if (unlikely(rq->scx.kick_sync_pending)) { 2479 + rq->scx.kick_sync_pending = false; 2480 + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 2481 + kick_sync_wait_bal_cb); 2482 + } 2514 2483 2515 2484 /* 2516 2485 * If any higher-priority sched class enqueued a runnable task on ··· 4766 4713 if (!cpumask_empty(rq->scx.cpus_to_wait)) 4767 4714 dump_line(&ns, " cpus_to_wait : %*pb", 4768 4715 cpumask_pr_args(rq->scx.cpus_to_wait)); 4716 + if (!cpumask_empty(rq->scx.cpus_to_sync)) 4717 + dump_line(&ns, " cpus_to_sync : %*pb", 4718 + cpumask_pr_args(rq->scx.cpus_to_sync)); 4769 4719 4770 4720 used = seq_buf_used(&ns); 4771 4721 if (SCX_HAS_OP(sch, dump_cpu)) { ··· 5666 5610 5667 5611 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 5668 5612 if (cur_class == &ext_sched_class) { 5613 + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 5669 5614 ksyncs[cpu] = rq->scx.kick_sync; 5670 5615 should_wait = true; 5671 - } else { 5672 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5673 5616 } 5617 + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5674 5618 } 5675 5619 5676 5620 resched_curr(rq); ··· 5725 5669 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 5726 5670 } 5727 5671 5728 - if (!should_wait) 5729 - return; 5730 - 5731 - for_each_cpu(cpu, this_scx->cpus_to_wait) { 5732 - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; 5733 - 5734 - /* 5735 - * Busy-wait until the task running at the time of kicking is no 5736 - * longer running. This can be used to implement e.g. core 5737 - * scheduling. 5738 - * 5739 - * smp_cond_load_acquire() pairs with store_releases in 5740 - * pick_task_scx() and put_prev_task_scx(). The former breaks 5741 - * the wait if SCX's scheduling path is entered even if the same 5742 - * task is picked subsequently. The latter is necessary to break 5743 - * the wait when $cpu is taken by a higher sched class. 
5744 - */ 5745 - if (cpu != cpu_of(this_rq)) 5746 - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); 5747 - 5748 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5672 + /* 5673 + * Can't wait in hardirq — kick_sync can't advance, deadlocking if 5674 + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 5675 + */ 5676 + if (should_wait) { 5677 + raw_spin_rq_lock(this_rq); 5678 + this_scx->kick_sync_pending = true; 5679 + resched_curr(this_rq); 5680 + raw_spin_rq_unlock(this_rq); 5749 5681 } 5750 5682 } 5751 5683 ··· 5838 5794 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 5839 5795 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 5840 5796 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 5797 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 5841 5798 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 5842 5799 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 5843 5800
+3
kernel/sched/sched.h
··· 805 805 cpumask_var_t cpus_to_kick_if_idle; 806 806 cpumask_var_t cpus_to_preempt; 807 807 cpumask_var_t cpus_to_wait; 808 + cpumask_var_t cpus_to_sync; 809 + bool kick_sync_pending; 808 810 unsigned long kick_sync; 809 811 local_t reenq_local_deferred; 810 812 struct balance_callback deferred_bal_cb; 813 + struct balance_callback kick_sync_bal_cb; 811 814 struct irq_work deferred_irq_work; 812 815 struct irq_work kick_cpus_irq_work; 813 816 struct scx_dispatch_q bypass_dsq;