Merge tag 'sched_ext-for-7.0-rc6-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+70 -25

kernel/sched/ext.c

··· 2404 2404 { 2405 2405 struct scx_sched *sch = scx_root; 2406 2406 2407 - /* see kick_cpus_irq_workfn() */ 2407 + /* see kick_sync_wait_bal_cb() */ 2408 2408 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2409 2409 2410 2410 update_curr_scx(rq); ··· 2447 2447 switch_class(rq, next); 2448 2448 } 2449 2449 2450 + static void kick_sync_wait_bal_cb(struct rq *rq) 2451 + { 2452 + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 2453 + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 2454 + bool waited; 2455 + s32 cpu; 2456 + 2457 + /* 2458 + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 2459 + * — a target CPU may be waiting for us to process an IPI (e.g. TLB 2460 + * flush) while we wait for its kick_sync to advance. 2461 + * 2462 + * Also, keep advancing our own kick_sync so that new kick_sync waits 2463 + * targeting us, which can start after we drop the lock, cannot form 2464 + * cyclic dependencies. 2465 + */ 2466 + retry: 2467 + waited = false; 2468 + for_each_cpu(cpu, rq->scx.cpus_to_sync) { 2469 + /* 2470 + * smp_load_acquire() pairs with smp_store_release() on 2471 + * kick_sync updates on the target CPUs. 2472 + */ 2473 + if (cpu == cpu_of(rq) || 2474 + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 2475 + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 2476 + continue; 2477 + } 2478 + 2479 + raw_spin_rq_unlock_irq(rq); 2480 + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 2481 + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2482 + cpu_relax(); 2483 + } 2484 + raw_spin_rq_lock_irq(rq); 2485 + waited = true; 2486 + } 2487 + 2488 + if (waited) 2489 + goto retry; 2490 + } 2491 + 2450 2492 static struct task_struct *first_local_task(struct rq *rq) 2451 2493 { 2452 2494 return list_first_entry_or_null(&rq->scx.local_dsq.list, ··· 2502 2460 bool keep_prev; 2503 2461 struct task_struct *p; 2504 2462 2505 - /* see kick_cpus_irq_workfn() */ 2463 + /* see kick_sync_wait_bal_cb() */ 2506 2464 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 2507 2465 2508 2466 rq_modified_begin(rq, &ext_sched_class); ··· 2511 2469 balance_one(rq, prev); 2512 2470 rq_repin_lock(rq, rf); 2513 2471 maybe_queue_balance_callback(rq); 2472 + 2473 + /* 2474 + * Defer to a balance callback which can drop rq lock and enable 2475 + * IRQs. Waiting directly in the pick path would deadlock against 2476 + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 2477 + */ 2478 + if (unlikely(rq->scx.kick_sync_pending)) { 2479 + rq->scx.kick_sync_pending = false; 2480 + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 2481 + kick_sync_wait_bal_cb); 2482 + } 2514 2483 2515 2484 /* 2516 2485 * If any higher-priority sched class enqueued a runnable task on ··· 4766 4713 if (!cpumask_empty(rq->scx.cpus_to_wait)) 4767 4714 dump_line(&ns, " cpus_to_wait : %*pb", 4768 4715 cpumask_pr_args(rq->scx.cpus_to_wait)); 4716 + if (!cpumask_empty(rq->scx.cpus_to_sync)) 4717 + dump_line(&ns, " cpus_to_sync : %*pb", 4718 + cpumask_pr_args(rq->scx.cpus_to_sync)); 4769 4719 4770 4720 used = seq_buf_used(&ns); 4771 4721 if (SCX_HAS_OP(sch, dump_cpu)) { ··· 5666 5610 5667 5611 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 5668 5612 if (cur_class == &ext_sched_class) { 5613 + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 5669 5614 ksyncs[cpu] = rq->scx.kick_sync; 5670 5615 should_wait = true; 5671 - } else { 5672 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5673 5616 } 5617 + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5674 5618 } 5675 5619 5676 5620 resched_curr(rq); ··· 5725 5669 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 5726 5670 } 5727 5671 5728 - if (!should_wait) 5729 - return; 5730 - 5731 - for_each_cpu(cpu, this_scx->cpus_to_wait) { 5732 - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; 5733 - 5734 - /* 5735 - * Busy-wait until the task running at the time of kicking is no 5736 - * longer running. This can be used to implement e.g. core 5737 - * scheduling. 5738 - * 5739 - * smp_cond_load_acquire() pairs with store_releases in 5740 - * pick_task_scx() and put_prev_task_scx(). The former breaks 5741 - * the wait if SCX's scheduling path is entered even if the same 5742 - * task is picked subsequently. The latter is necessary to break 5743 - * the wait when $cpu is taken by a higher sched class. 5744 - */ 5745 - if (cpu != cpu_of(this_rq)) 5746 - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); 5747 - 5748 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 5672 + /* 5673 + * Can't wait in hardirq — kick_sync can't advance, deadlocking if 5674 + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 5675 + */ 5676 + if (should_wait) { 5677 + raw_spin_rq_lock(this_rq); 5678 + this_scx->kick_sync_pending = true; 5679 + resched_curr(this_rq); 5680 + raw_spin_rq_unlock(this_rq); 5749 5681 } 5750 5682 } 5751 5683 ··· 5838 5794 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 5839 5795 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 5840 5796 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 5797 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 5841 5798 rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); 5842 5799 rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); 5843 5800

+1 -1

kernel/sched/ext_idle.c

··· 543 543 * piled up on it even if there is an idle core elsewhere on 544 544 * the system. 545 545 */ 546 - waker_node = cpu_to_node(cpu); 546 + waker_node = scx_cpu_node_if_enabled(cpu); 547 547 if (!(current->flags & PF_EXITING) && 548 548 cpu_rq(cpu)->scx.local_dsq.nr == 0 && 549 549 (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&

+3

kernel/sched/sched.h

··· 805 805 cpumask_var_t cpus_to_kick_if_idle; 806 806 cpumask_var_t cpus_to_preempt; 807 807 cpumask_var_t cpus_to_wait; 808 + cpumask_var_t cpus_to_sync; 809 + bool kick_sync_pending; 808 810 unsigned long kick_sync; 809 811 local_t reenq_local_deferred; 810 812 struct balance_callback deferred_bal_cb; 813 + struct balance_callback kick_sync_bal_cb; 811 814 struct irq_work deferred_irq_work; 812 815 struct irq_work kick_cpus_irq_work; 813 816 struct scx_dispatch_q bypass_dsq;

+1

tools/testing/selftests/sched_ext/Makefile

··· 188 188 rt_stall \ 189 189 test_example \ 190 190 total_bw \ 191 + cyclic_kick_wait \ 191 192 192 193 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) 193 194

+68

tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock. 4 + * 5 + * Three CPUs are designated from userspace. Every enqueue from one of the 6 + * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a 7 + * persistent A -> B -> C -> A wait cycle pressure. 8 + */ 9 + #include <scx/common.bpf.h> 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + const volatile s32 test_cpu_a; 14 + const volatile s32 test_cpu_b; 15 + const volatile s32 test_cpu_c; 16 + 17 + u64 nr_enqueues; 18 + u64 nr_wait_kicks; 19 + 20 + UEI_DEFINE(uei); 21 + 22 + static s32 target_cpu(s32 cpu) 23 + { 24 + if (cpu == test_cpu_a) 25 + return test_cpu_b; 26 + if (cpu == test_cpu_b) 27 + return test_cpu_c; 28 + if (cpu == test_cpu_c) 29 + return test_cpu_a; 30 + return -1; 31 + } 32 + 33 + void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p, 34 + u64 enq_flags) 35 + { 36 + s32 this_cpu = bpf_get_smp_processor_id(); 37 + s32 tgt; 38 + 39 + __sync_fetch_and_add(&nr_enqueues, 1); 40 + 41 + if (p->flags & PF_KTHREAD) { 42 + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, 43 + enq_flags | SCX_ENQ_PREEMPT); 44 + return; 45 + } 46 + 47 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 48 + 49 + tgt = target_cpu(this_cpu); 50 + if (tgt < 0 || tgt == this_cpu) 51 + return; 52 + 53 + __sync_fetch_and_add(&nr_wait_kicks, 1); 54 + scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT); 55 + } 56 + 57 + void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei) 58 + { 59 + UEI_RECORD(uei, ei); 60 + } 61 + 62 + SEC(".struct_ops.link") 63 + struct sched_ext_ops cyclic_kick_wait_ops = { 64 + .enqueue = cyclic_kick_wait_enqueue, 65 + .exit = cyclic_kick_wait_exit, 66 + .name = "cyclic_kick_wait", 67 + .timeout_ms = 1000U, 68 + };

+194

tools/testing/selftests/sched_ext/cyclic_kick_wait.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Test SCX_KICK_WAIT forward progress under cyclic wait pressure. 4 + * 5 + * SCX_KICK_WAIT busy-waits until the target CPU enters the scheduling path. 6 + * If multiple CPUs form a wait cycle (A waits for B, B waits for C, C waits 7 + * for A), all CPUs deadlock unless the implementation breaks the cycle. 8 + * 9 + * This test creates that scenario: three CPUs are arranged in a ring. The BPF 10 + * scheduler's ops.enqueue() kicks the next CPU in the ring with SCX_KICK_WAIT 11 + * on every enqueue. Userspace pins 4 worker threads per CPU that loop calling 12 + * sched_yield(), generating a steady stream of enqueues and thus sustained 13 + * A->B->C->A kick_wait cycle pressure. The test passes if the system remains 14 + * responsive for 5 seconds without the scheduler being killed by the watchdog. 15 + */ 16 + #define _GNU_SOURCE 17 + 18 + #include <bpf/bpf.h> 19 + #include <errno.h> 20 + #include <pthread.h> 21 + #include <sched.h> 22 + #include <scx/common.h> 23 + #include <stdint.h> 24 + #include <string.h> 25 + #include <time.h> 26 + #include <unistd.h> 27 + 28 + #include "scx_test.h" 29 + #include "cyclic_kick_wait.bpf.skel.h" 30 + 31 + #define WORKERS_PER_CPU 4 32 + #define NR_TEST_CPUS 3 33 + #define NR_WORKERS (NR_TEST_CPUS * WORKERS_PER_CPU) 34 + 35 + struct worker_ctx { 36 + pthread_t tid; 37 + int cpu; 38 + volatile bool stop; 39 + volatile __u64 iters; 40 + bool started; 41 + }; 42 + 43 + static void *worker_fn(void *arg) 44 + { 45 + struct worker_ctx *worker = arg; 46 + cpu_set_t mask; 47 + 48 + CPU_ZERO(&mask); 49 + CPU_SET(worker->cpu, &mask); 50 + 51 + if (sched_setaffinity(0, sizeof(mask), &mask)) 52 + return (void *)(uintptr_t)errno; 53 + 54 + while (!worker->stop) { 55 + sched_yield(); 56 + worker->iters++; 57 + } 58 + 59 + return NULL; 60 + } 61 + 62 + static int join_worker(struct worker_ctx *worker) 63 + { 64 + void *ret; 65 + struct timespec ts; 66 + int err; 67 + 68 + if (!worker->started) 69 + return 0; 70 + 71 + if (clock_gettime(CLOCK_REALTIME, &ts)) 72 + return -errno; 73 + 74 + ts.tv_sec += 2; 75 + err = pthread_timedjoin_np(worker->tid, &ret, &ts); 76 + if (err == ETIMEDOUT) 77 + pthread_detach(worker->tid); 78 + if (err) 79 + return -err; 80 + 81 + if ((uintptr_t)ret) 82 + return -(int)(uintptr_t)ret; 83 + 84 + return 0; 85 + } 86 + 87 + static enum scx_test_status setup(void **ctx) 88 + { 89 + struct cyclic_kick_wait *skel; 90 + 91 + skel = cyclic_kick_wait__open(); 92 + SCX_FAIL_IF(!skel, "Failed to open skel"); 93 + SCX_ENUM_INIT(skel); 94 + 95 + *ctx = skel; 96 + return SCX_TEST_PASS; 97 + } 98 + 99 + static enum scx_test_status run(void *ctx) 100 + { 101 + struct cyclic_kick_wait *skel = ctx; 102 + struct worker_ctx workers[NR_WORKERS] = {}; 103 + struct bpf_link *link = NULL; 104 + enum scx_test_status status = SCX_TEST_PASS; 105 + int test_cpus[NR_TEST_CPUS]; 106 + int nr_cpus = 0; 107 + cpu_set_t mask; 108 + int ret, i; 109 + 110 + if (sched_getaffinity(0, sizeof(mask), &mask)) { 111 + SCX_ERR("Failed to get affinity (%d)", errno); 112 + return SCX_TEST_FAIL; 113 + } 114 + 115 + for (i = 0; i < CPU_SETSIZE; i++) { 116 + if (CPU_ISSET(i, &mask)) 117 + test_cpus[nr_cpus++] = i; 118 + if (nr_cpus == NR_TEST_CPUS) 119 + break; 120 + } 121 + 122 + if (nr_cpus < NR_TEST_CPUS) 123 + return SCX_TEST_SKIP; 124 + 125 + skel->rodata->test_cpu_a = test_cpus[0]; 126 + skel->rodata->test_cpu_b = test_cpus[1]; 127 + skel->rodata->test_cpu_c = test_cpus[2]; 128 + 129 + if (cyclic_kick_wait__load(skel)) { 130 + SCX_ERR("Failed to load skel"); 131 + return SCX_TEST_FAIL; 132 + } 133 + 134 + link = bpf_map__attach_struct_ops(skel->maps.cyclic_kick_wait_ops); 135 + if (!link) { 136 + SCX_ERR("Failed to attach scheduler"); 137 + return SCX_TEST_FAIL; 138 + } 139 + 140 + for (i = 0; i < NR_WORKERS; i++) 141 + workers[i].cpu = test_cpus[i / WORKERS_PER_CPU]; 142 + 143 + for (i = 0; i < NR_WORKERS; i++) { 144 + ret = pthread_create(&workers[i].tid, NULL, worker_fn, &workers[i]); 145 + if (ret) { 146 + SCX_ERR("Failed to create worker thread %d (%d)", i, ret); 147 + status = SCX_TEST_FAIL; 148 + goto out; 149 + } 150 + workers[i].started = true; 151 + } 152 + 153 + sleep(5); 154 + 155 + if (skel->data->uei.kind != EXIT_KIND(SCX_EXIT_NONE)) { 156 + SCX_ERR("Scheduler exited unexpectedly (kind=%llu code=%lld)", 157 + (unsigned long long)skel->data->uei.kind, 158 + (long long)skel->data->uei.exit_code); 159 + status = SCX_TEST_FAIL; 160 + } 161 + 162 + out: 163 + for (i = 0; i < NR_WORKERS; i++) 164 + workers[i].stop = true; 165 + 166 + for (i = 0; i < NR_WORKERS; i++) { 167 + ret = join_worker(&workers[i]); 168 + if (ret && status == SCX_TEST_PASS) { 169 + SCX_ERR("Failed to join worker thread %d (%d)", i, ret); 170 + status = SCX_TEST_FAIL; 171 + } 172 + } 173 + 174 + if (link) 175 + bpf_link__destroy(link); 176 + 177 + return status; 178 + } 179 + 180 + static void cleanup(void *ctx) 181 + { 182 + struct cyclic_kick_wait *skel = ctx; 183 + 184 + cyclic_kick_wait__destroy(skel); 185 + } 186 + 187 + struct scx_test cyclic_kick_wait = { 188 + .name = "cyclic_kick_wait", 189 + .description = "Verify SCX_KICK_WAIT forward progress under a 3-CPU wait cycle", 190 + .setup = setup, 191 + .run = run, 192 + .cleanup = cleanup, 193 + }; 194 + REGISTER_SCX_TEST(&cyclic_kick_wait)

Configure Feed

Configure Feed