Merge branch 'for-7.0-fixes' into for-7.1

+70 -25

kernel/sched/ext.c

··· 3018 3018 { 3019 3019 struct scx_sched *sch = scx_task_sched(p); 3020 3020 3021 - /* see kick_cpus_irq_workfn() */ 3021 + /* see kick_sync_wait_bal_cb() */ 3022 3022 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3023 3023 3024 3024 update_curr_scx(rq); ··· 3067 3067 switch_class(rq, next); 3068 3068 } 3069 3069 3070 + static void kick_sync_wait_bal_cb(struct rq *rq) 3071 + { 3072 + struct scx_kick_syncs __rcu *ks = __this_cpu_read(scx_kick_syncs); 3073 + unsigned long *ksyncs = rcu_dereference_sched(ks)->syncs; 3074 + bool waited; 3075 + s32 cpu; 3076 + 3077 + /* 3078 + * Drop rq lock and enable IRQs while waiting. IRQs must be enabled 3079 + * — a target CPU may be waiting for us to process an IPI (e.g. TLB 3080 + * flush) while we wait for its kick_sync to advance. 3081 + * 3082 + * Also, keep advancing our own kick_sync so that new kick_sync waits 3083 + * targeting us, which can start after we drop the lock, cannot form 3084 + * cyclic dependencies. 3085 + */ 3086 + retry: 3087 + waited = false; 3088 + for_each_cpu(cpu, rq->scx.cpus_to_sync) { 3089 + /* 3090 + * smp_load_acquire() pairs with smp_store_release() on 3091 + * kick_sync updates on the target CPUs. 3092 + */ 3093 + if (cpu == cpu_of(rq) || 3094 + smp_load_acquire(&cpu_rq(cpu)->scx.kick_sync) != ksyncs[cpu]) { 3095 + cpumask_clear_cpu(cpu, rq->scx.cpus_to_sync); 3096 + continue; 3097 + } 3098 + 3099 + raw_spin_rq_unlock_irq(rq); 3100 + while (READ_ONCE(cpu_rq(cpu)->scx.kick_sync) == ksyncs[cpu]) { 3101 + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3102 + cpu_relax(); 3103 + } 3104 + raw_spin_rq_lock_irq(rq); 3105 + waited = true; 3106 + } 3107 + 3108 + if (waited) 3109 + goto retry; 3110 + } 3111 + 3070 3112 static struct task_struct *first_local_task(struct rq *rq) 3071 3113 { 3072 3114 return list_first_entry_or_null(&rq->scx.local_dsq.list, ··· 3122 3080 bool keep_prev; 3123 3081 struct task_struct *p; 3124 3082 3125 - /* see kick_cpus_irq_workfn() */ 3083 + /* see kick_sync_wait_bal_cb() */ 3126 3084 smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); 3127 3085 3128 3086 rq_modified_begin(rq, &ext_sched_class); ··· 3131 3089 balance_one(rq, prev); 3132 3090 rq_repin_lock(rq, rf); 3133 3091 maybe_queue_balance_callback(rq); 3092 + 3093 + /* 3094 + * Defer to a balance callback which can drop rq lock and enable 3095 + * IRQs. Waiting directly in the pick path would deadlock against 3096 + * CPUs sending us IPIs (e.g. TLB flushes) while we wait for them. 3097 + */ 3098 + if (unlikely(rq->scx.kick_sync_pending)) { 3099 + rq->scx.kick_sync_pending = false; 3100 + queue_balance_callback(rq, &rq->scx.kick_sync_bal_cb, 3101 + kick_sync_wait_bal_cb); 3102 + } 3134 3103 3135 3104 /* 3136 3105 * If any higher-priority sched class enqueued a runnable task on ··· 6272 6219 if (!cpumask_empty(rq->scx.cpus_to_wait)) 6273 6220 dump_line(&ns, " cpus_to_wait : %*pb", 6274 6221 cpumask_pr_args(rq->scx.cpus_to_wait)); 6222 + if (!cpumask_empty(rq->scx.cpus_to_sync)) 6223 + dump_line(&ns, " cpus_to_sync : %*pb", 6224 + cpumask_pr_args(rq->scx.cpus_to_sync)); 6275 6225 6276 6226 used = seq_buf_used(&ns); 6277 6227 if (SCX_HAS_OP(sch, dump_cpu)) { ··· 7639 7583 7640 7584 if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { 7641 7585 if (cur_class == &ext_sched_class) { 7586 + cpumask_set_cpu(cpu, this_scx->cpus_to_sync); 7642 7587 ksyncs[cpu] = rq->scx.kick_sync; 7643 7588 should_wait = true; 7644 - } else { 7645 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7646 7589 } 7590 + cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7647 7591 } 7648 7592 7649 7593 resched_curr(rq); ··· 7698 7642 cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); 7699 7643 } 7700 7644 7701 - if (!should_wait) 7702 - return; 7703 - 7704 - for_each_cpu(cpu, this_scx->cpus_to_wait) { 7705 - unsigned long *wait_kick_sync = &cpu_rq(cpu)->scx.kick_sync; 7706 - 7707 - /* 7708 - * Busy-wait until the task running at the time of kicking is no 7709 - * longer running. This can be used to implement e.g. core 7710 - * scheduling. 7711 - * 7712 - * smp_cond_load_acquire() pairs with store_releases in 7713 - * pick_task_scx() and put_prev_task_scx(). The former breaks 7714 - * the wait if SCX's scheduling path is entered even if the same 7715 - * task is picked subsequently. The latter is necessary to break 7716 - * the wait when $cpu is taken by a higher sched class. 7717 - */ 7718 - if (cpu != cpu_of(this_rq)) 7719 - smp_cond_load_acquire(wait_kick_sync, VAL != ksyncs[cpu]); 7720 - 7721 - cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); 7645 + /* 7646 + * Can't wait in hardirq — kick_sync can't advance, deadlocking if 7647 + * CPUs wait for each other. Defer to kick_sync_wait_bal_cb(). 7648 + */ 7649 + if (should_wait) { 7650 + raw_spin_rq_lock(this_rq); 7651 + this_scx->kick_sync_pending = true; 7652 + resched_curr(this_rq); 7653 + raw_spin_rq_unlock(this_rq); 7722 7654 } 7723 7655 } 7724 7656 ··· 7824 7780 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); 7825 7781 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); 7826 7782 BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); 7783 + BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_sync, GFP_KERNEL, n)); 7827 7784 raw_spin_lock_init(&rq->scx.deferred_reenq_lock); 7828 7785 INIT_LIST_HEAD(&rq->scx.deferred_reenq_locals); 7829 7786 INIT_LIST_HEAD(&rq->scx.deferred_reenq_users);

+1 -1

kernel/sched/ext_idle.c

··· 549 549 * piled up on it even if there is an idle core elsewhere on 550 550 * the system. 551 551 */ 552 - waker_node = cpu_to_node(cpu); 552 + waker_node = scx_cpu_node_if_enabled(cpu); 553 553 if (!(current->flags & PF_EXITING) && 554 554 cpu_rq(cpu)->scx.local_dsq.nr == 0 && 555 555 (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&

+3

kernel/sched/sched.h

··· 806 806 cpumask_var_t cpus_to_kick_if_idle; 807 807 cpumask_var_t cpus_to_preempt; 808 808 cpumask_var_t cpus_to_wait; 809 + cpumask_var_t cpus_to_sync; 810 + bool kick_sync_pending; 809 811 unsigned long kick_sync; 810 812 811 813 struct task_struct *sub_dispatch_prev; ··· 817 815 struct list_head deferred_reenq_locals; /* scheds requesting reenq of local DSQ */ 818 816 struct list_head deferred_reenq_users; /* user DSQs requesting reenq */ 819 817 struct balance_callback deferred_bal_cb; 818 + struct balance_callback kick_sync_bal_cb; 820 819 struct irq_work deferred_irq_work; 821 820 struct irq_work kick_cpus_irq_work; 822 821 };

+1

tools/testing/selftests/sched_ext/Makefile

··· 189 189 rt_stall \ 190 190 test_example \ 191 191 total_bw \ 192 + cyclic_kick_wait \ 192 193 193 194 testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) 194 195

+68

tools/testing/selftests/sched_ext/cyclic_kick_wait.bpf.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Stress concurrent SCX_KICK_WAIT calls to reproduce wait-cycle deadlock. 4 + * 5 + * Three CPUs are designated from userspace. Every enqueue from one of the 6 + * three CPUs kicks the next CPU in the ring with SCX_KICK_WAIT, creating a 7 + * persistent A -> B -> C -> A wait cycle pressure. 8 + */ 9 + #include <scx/common.bpf.h> 10 + 11 + char _license[] SEC("license") = "GPL"; 12 + 13 + const volatile s32 test_cpu_a; 14 + const volatile s32 test_cpu_b; 15 + const volatile s32 test_cpu_c; 16 + 17 + u64 nr_enqueues; 18 + u64 nr_wait_kicks; 19 + 20 + UEI_DEFINE(uei); 21 + 22 + static s32 target_cpu(s32 cpu) 23 + { 24 + if (cpu == test_cpu_a) 25 + return test_cpu_b; 26 + if (cpu == test_cpu_b) 27 + return test_cpu_c; 28 + if (cpu == test_cpu_c) 29 + return test_cpu_a; 30 + return -1; 31 + } 32 + 33 + void BPF_STRUCT_OPS(cyclic_kick_wait_enqueue, struct task_struct *p, 34 + u64 enq_flags) 35 + { 36 + s32 this_cpu = bpf_get_smp_processor_id(); 37 + s32 tgt; 38 + 39 + __sync_fetch_and_add(&nr_enqueues, 1); 40 + 41 + if (p->flags & PF_KTHREAD) { 42 + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, 43 + enq_flags | SCX_ENQ_PREEMPT); 44 + return; 45 + } 46 + 47 + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); 48 + 49 + tgt = target_cpu(this_cpu); 50 + if (tgt < 0 || tgt == this_cpu) 51 + return; 52 + 53 + __sync_fetch_and_add(&nr_wait_kicks, 1); 54 + scx_bpf_kick_cpu(tgt, SCX_KICK_WAIT); 55 + } 56 + 57 + void BPF_STRUCT_OPS(cyclic_kick_wait_exit, struct scx_exit_info *ei) 58 + { 59 + UEI_RECORD(uei, ei); 60 + } 61 + 62 + SEC(".struct_ops.link") 63 + struct sched_ext_ops cyclic_kick_wait_ops = { 64 + .enqueue = cyclic_kick_wait_enqueue, 65 + .exit = cyclic_kick_wait_exit, 66 + .name = "cyclic_kick_wait", 67 + .timeout_ms = 1000U, 68 + };

+194

tools/testing/selftests/sched_ext/cyclic_kick_wait.c

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * Test SCX_KICK_WAIT forward progress under cyclic wait pressure. 4 + * 5 + * SCX_KICK_WAIT busy-waits until the target CPU enters the scheduling path. 6 + * If multiple CPUs form a wait cycle (A waits for B, B waits for C, C waits 7 + * for A), all CPUs deadlock unless the implementation breaks the cycle. 8 + * 9 + * This test creates that scenario: three CPUs are arranged in a ring. The BPF 10 + * scheduler's ops.enqueue() kicks the next CPU in the ring with SCX_KICK_WAIT 11 + * on every enqueue. Userspace pins 4 worker threads per CPU that loop calling 12 + * sched_yield(), generating a steady stream of enqueues and thus sustained 13 + * A->B->C->A kick_wait cycle pressure. The test passes if the system remains 14 + * responsive for 5 seconds without the scheduler being killed by the watchdog. 15 + */ 16 + #define _GNU_SOURCE 17 + 18 + #include <bpf/bpf.h> 19 + #include <errno.h> 20 + #include <pthread.h> 21 + #include <sched.h> 22 + #include <scx/common.h> 23 + #include <stdint.h> 24 + #include <string.h> 25 + #include <time.h> 26 + #include <unistd.h> 27 + 28 + #include "scx_test.h" 29 + #include "cyclic_kick_wait.bpf.skel.h" 30 + 31 + #define WORKERS_PER_CPU 4 32 + #define NR_TEST_CPUS 3 33 + #define NR_WORKERS (NR_TEST_CPUS * WORKERS_PER_CPU) 34 + 35 + struct worker_ctx { 36 + pthread_t tid; 37 + int cpu; 38 + volatile bool stop; 39 + volatile __u64 iters; 40 + bool started; 41 + }; 42 + 43 + static void *worker_fn(void *arg) 44 + { 45 + struct worker_ctx *worker = arg; 46 + cpu_set_t mask; 47 + 48 + CPU_ZERO(&mask); 49 + CPU_SET(worker->cpu, &mask); 50 + 51 + if (sched_setaffinity(0, sizeof(mask), &mask)) 52 + return (void *)(uintptr_t)errno; 53 + 54 + while (!worker->stop) { 55 + sched_yield(); 56 + worker->iters++; 57 + } 58 + 59 + return NULL; 60 + } 61 + 62 + static int join_worker(struct worker_ctx *worker) 63 + { 64 + void *ret; 65 + struct timespec ts; 66 + int err; 67 + 68 + if (!worker->started) 69 + return 0; 70 + 71 + if (clock_gettime(CLOCK_REALTIME, &ts)) 72 + return -errno; 73 + 74 + ts.tv_sec += 2; 75 + err = pthread_timedjoin_np(worker->tid, &ret, &ts); 76 + if (err == ETIMEDOUT) 77 + pthread_detach(worker->tid); 78 + if (err) 79 + return -err; 80 + 81 + if ((uintptr_t)ret) 82 + return -(int)(uintptr_t)ret; 83 + 84 + return 0; 85 + } 86 + 87 + static enum scx_test_status setup(void **ctx) 88 + { 89 + struct cyclic_kick_wait *skel; 90 + 91 + skel = cyclic_kick_wait__open(); 92 + SCX_FAIL_IF(!skel, "Failed to open skel"); 93 + SCX_ENUM_INIT(skel); 94 + 95 + *ctx = skel; 96 + return SCX_TEST_PASS; 97 + } 98 + 99 + static enum scx_test_status run(void *ctx) 100 + { 101 + struct cyclic_kick_wait *skel = ctx; 102 + struct worker_ctx workers[NR_WORKERS] = {}; 103 + struct bpf_link *link = NULL; 104 + enum scx_test_status status = SCX_TEST_PASS; 105 + int test_cpus[NR_TEST_CPUS]; 106 + int nr_cpus = 0; 107 + cpu_set_t mask; 108 + int ret, i; 109 + 110 + if (sched_getaffinity(0, sizeof(mask), &mask)) { 111 + SCX_ERR("Failed to get affinity (%d)", errno); 112 + return SCX_TEST_FAIL; 113 + } 114 + 115 + for (i = 0; i < CPU_SETSIZE; i++) { 116 + if (CPU_ISSET(i, &mask)) 117 + test_cpus[nr_cpus++] = i; 118 + if (nr_cpus == NR_TEST_CPUS) 119 + break; 120 + } 121 + 122 + if (nr_cpus < NR_TEST_CPUS) 123 + return SCX_TEST_SKIP; 124 + 125 + skel->rodata->test_cpu_a = test_cpus[0]; 126 + skel->rodata->test_cpu_b = test_cpus[1]; 127 + skel->rodata->test_cpu_c = test_cpus[2]; 128 + 129 + if (cyclic_kick_wait__load(skel)) { 130 + SCX_ERR("Failed to load skel"); 131 + return SCX_TEST_FAIL; 132 + } 133 + 134 + link = bpf_map__attach_struct_ops(skel->maps.cyclic_kick_wait_ops); 135 + if (!link) { 136 + SCX_ERR("Failed to attach scheduler"); 137 + return SCX_TEST_FAIL; 138 + } 139 + 140 + for (i = 0; i < NR_WORKERS; i++) 141 + workers[i].cpu = test_cpus[i / WORKERS_PER_CPU]; 142 + 143 + for (i = 0; i < NR_WORKERS; i++) { 144 + ret = pthread_create(&workers[i].tid, NULL, worker_fn, &workers[i]); 145 + if (ret) { 146 + SCX_ERR("Failed to create worker thread %d (%d)", i, ret); 147 + status = SCX_TEST_FAIL; 148 + goto out; 149 + } 150 + workers[i].started = true; 151 + } 152 + 153 + sleep(5); 154 + 155 + if (skel->data->uei.kind != EXIT_KIND(SCX_EXIT_NONE)) { 156 + SCX_ERR("Scheduler exited unexpectedly (kind=%llu code=%lld)", 157 + (unsigned long long)skel->data->uei.kind, 158 + (long long)skel->data->uei.exit_code); 159 + status = SCX_TEST_FAIL; 160 + } 161 + 162 + out: 163 + for (i = 0; i < NR_WORKERS; i++) 164 + workers[i].stop = true; 165 + 166 + for (i = 0; i < NR_WORKERS; i++) { 167 + ret = join_worker(&workers[i]); 168 + if (ret && status == SCX_TEST_PASS) { 169 + SCX_ERR("Failed to join worker thread %d (%d)", i, ret); 170 + status = SCX_TEST_FAIL; 171 + } 172 + } 173 + 174 + if (link) 175 + bpf_link__destroy(link); 176 + 177 + return status; 178 + } 179 + 180 + static void cleanup(void *ctx) 181 + { 182 + struct cyclic_kick_wait *skel = ctx; 183 + 184 + cyclic_kick_wait__destroy(skel); 185 + } 186 + 187 + struct scx_test cyclic_kick_wait = { 188 + .name = "cyclic_kick_wait", 189 + .description = "Verify SCX_KICK_WAIT forward progress under a 3-CPU wait cycle", 190 + .setup = setup, 191 + .run = run, 192 + .cleanup = cleanup, 193 + }; 194 + REGISTER_SCX_TEST(&cyclic_kick_wait)

Configure Feed

Configure Feed