Merge tag 'sched_ext-for-7.0-rc2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

+83 -18

kernel/sched/ext.c

··· 976 976 977 977 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) 978 978 { 979 - /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ 980 - WRITE_ONCE(dsq->nr, dsq->nr + delta); 979 + /* 980 + * scx_bpf_dsq_nr_queued() reads ->nr without locking. Use READ_ONCE() 981 + * on the read side and WRITE_ONCE() on the write side to properly 982 + * annotate the concurrent lockless access and avoid KCSAN warnings. 983 + */ 984 + WRITE_ONCE(dsq->nr, READ_ONCE(dsq->nr) + delta); 981 985 } 982 986 983 987 static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) ··· 2739 2735 unsigned long last_runnable = p->scx.runnable_at; 2740 2736 2741 2737 if (unlikely(time_after(jiffies, 2742 - last_runnable + scx_watchdog_timeout))) { 2738 + last_runnable + READ_ONCE(scx_watchdog_timeout)))) { 2743 2739 u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); 2744 2740 2745 2741 scx_exit(sch, SCX_EXIT_ERROR_STALL, 0, ··· 2767 2763 cond_resched(); 2768 2764 } 2769 2765 queue_delayed_work(system_unbound_wq, to_delayed_work(work), 2770 - scx_watchdog_timeout / 2); 2766 + READ_ONCE(scx_watchdog_timeout) / 2); 2771 2767 } 2772 2768 2773 2769 void scx_tick(struct rq *rq) ··· 3589 3585 ret = SCX_CALL_OP_RET(sch, SCX_KF_UNLOCKED, cgroup_init, NULL, 3590 3586 css->cgroup, &args); 3591 3587 if (ret) { 3592 - css_put(css); 3593 3588 scx_error(sch, "ops.cgroup_init() failed (%d)", ret); 3594 3589 return ret; 3595 3590 } ··· 3711 3708 static ssize_t scx_attr_ops_show(struct kobject *kobj, 3712 3709 struct kobj_attribute *ka, char *buf) 3713 3710 { 3714 - return sysfs_emit(buf, "%s\n", scx_root->ops.name); 3711 + struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 3712 + 3713 + return sysfs_emit(buf, "%s\n", sch->ops.name); 3715 3714 } 3716 3715 SCX_ATTR(ops); 3717 3716 ··· 3757 3752 3758 3753 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) 3759 3754 { 3760 - return add_uevent_var(env, "SCXOPS=%s", scx_root->ops.name); 3755 + const struct scx_sched *sch = container_of(kobj, struct scx_sched, kobj); 3756 + 3757 + return add_uevent_var(env, "SCXOPS=%s", sch->ops.name); 3761 3758 } 3762 3759 3763 3760 static const struct kset_uevent_ops scx_uevent_ops = { ··· 4430 4423 scx_bypass(false); 4431 4424 } 4432 4425 4426 + /* 4427 + * Claim the exit on @sch. The caller must ensure that the helper kthread work 4428 + * is kicked before the current task can be preempted. Once exit_kind is 4429 + * claimed, scx_error() can no longer trigger, so if the current task gets 4430 + * preempted and the BPF scheduler fails to schedule it back, the helper work 4431 + * will never be kicked and the whole system can wedge. 4432 + */ 4433 4433 static bool scx_claim_exit(struct scx_sched *sch, enum scx_exit_kind kind) 4434 4434 { 4435 4435 int none = SCX_EXIT_NONE; 4436 + 4437 + lockdep_assert_preemption_disabled(); 4436 4438 4437 4439 if (!atomic_try_cmpxchg(&sch->exit_kind, &none, kind)) 4438 4440 return false; ··· 4465 4449 rcu_read_lock(); 4466 4450 sch = rcu_dereference(scx_root); 4467 4451 if (sch) { 4452 + guard(preempt)(); 4468 4453 scx_claim_exit(sch, kind); 4469 4454 kthread_queue_work(sch->helper, &sch->disable_work); 4470 4455 } ··· 4788 4771 { 4789 4772 struct scx_exit_info *ei = sch->exit_info; 4790 4773 4774 + guard(preempt)(); 4775 + 4791 4776 if (!scx_claim_exit(sch, kind)) 4792 4777 return false; 4793 4778 ··· 4974 4955 return 0; 4975 4956 } 4976 4957 4977 - static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 4958 + /* 4959 + * scx_enable() is offloaded to a dedicated system-wide RT kthread to avoid 4960 + * starvation. During the READY -> ENABLED task switching loop, the calling 4961 + * thread's sched_class gets switched from fair to ext. As fair has higher 4962 + * priority than ext, the calling thread can be indefinitely starved under 4963 + * fair-class saturation, leading to a system hang. 4964 + */ 4965 + struct scx_enable_cmd { 4966 + struct kthread_work work; 4967 + struct sched_ext_ops *ops; 4968 + int ret; 4969 + }; 4970 + 4971 + static void scx_enable_workfn(struct kthread_work *work) 4978 4972 { 4973 + struct scx_enable_cmd *cmd = 4974 + container_of(work, struct scx_enable_cmd, work); 4975 + struct sched_ext_ops *ops = cmd->ops; 4979 4976 struct scx_sched *sch; 4980 4977 struct scx_task_iter sti; 4981 4978 struct task_struct *p; 4982 4979 unsigned long timeout; 4983 4980 int i, cpu, ret; 4984 - 4985 - if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 4986 - cpu_possible_mask)) { 4987 - pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 4988 - return -EINVAL; 4989 - } 4990 4981 4991 4982 mutex_lock(&scx_enable_mutex); 4992 4983 ··· 5089 5060 WRITE_ONCE(scx_watchdog_timeout, timeout); 5090 5061 WRITE_ONCE(scx_watchdog_timestamp, jiffies); 5091 5062 queue_delayed_work(system_unbound_wq, &scx_watchdog_work, 5092 - scx_watchdog_timeout / 2); 5063 + READ_ONCE(scx_watchdog_timeout) / 2); 5093 5064 5094 5065 /* 5095 5066 * Once __scx_enabled is set, %current can be switched to SCX anytime. ··· 5214 5185 5215 5186 atomic_long_inc(&scx_enable_seq); 5216 5187 5217 - return 0; 5188 + cmd->ret = 0; 5189 + return; 5218 5190 5219 5191 err_free_ksyncs: 5220 5192 free_kick_syncs(); 5221 5193 err_unlock: 5222 5194 mutex_unlock(&scx_enable_mutex); 5223 - return ret; 5195 + cmd->ret = ret; 5196 + return; 5224 5197 5225 5198 err_disable_unlock_all: 5226 5199 scx_cgroup_unlock(); ··· 5241 5210 */ 5242 5211 scx_error(sch, "scx_enable() failed (%d)", ret); 5243 5212 kthread_flush_work(&sch->disable_work); 5244 - return 0; 5213 + cmd->ret = 0; 5214 + } 5215 + 5216 + static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) 5217 + { 5218 + static struct kthread_worker *helper; 5219 + static DEFINE_MUTEX(helper_mutex); 5220 + struct scx_enable_cmd cmd; 5221 + 5222 + if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN), 5223 + cpu_possible_mask)) { 5224 + pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n"); 5225 + return -EINVAL; 5226 + } 5227 + 5228 + if (!READ_ONCE(helper)) { 5229 + mutex_lock(&helper_mutex); 5230 + if (!helper) { 5231 + helper = kthread_run_worker(0, "scx_enable_helper"); 5232 + if (IS_ERR_OR_NULL(helper)) { 5233 + helper = NULL; 5234 + mutex_unlock(&helper_mutex); 5235 + return -ENOMEM; 5236 + } 5237 + sched_set_fifo(helper->task); 5238 + } 5239 + mutex_unlock(&helper_mutex); 5240 + } 5241 + 5242 + kthread_init_work(&cmd.work, scx_enable_workfn); 5243 + cmd.ops = ops; 5244 + 5245 + kthread_queue_work(READ_ONCE(helper), &cmd.work); 5246 + kthread_flush_work(&cmd.work); 5247 + return cmd.ret; 5245 5248 } 5246 5249 5247 5250

+2 -3

kernel/sched/ext_idle.c

··· 663 663 BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); 664 664 BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL)); 665 665 666 - /* Allocate per-node idle cpumasks */ 667 - scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, 668 - num_possible_nodes()); 666 + /* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */ 667 + scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids); 669 668 BUG_ON(!scx_idle_node_masks); 670 669 671 670 for_each_node(i) {

+1 -1

kernel/sched/ext_internal.h

··· 74 74 * info communication. The following flag indicates whether ops.init() 75 75 * finished successfully. 76 76 */ 77 - SCX_EFLAG_INITIALIZED, 77 + SCX_EFLAG_INITIALIZED = 1LLU << 0, 78 78 }; 79 79 80 80 /*

+61

tools/sched_ext/Kconfig

··· 1 + # sched-ext mandatory options 2 + # 3 + CONFIG_BPF=y 4 + CONFIG_BPF_SYSCALL=y 5 + CONFIG_BPF_JIT=y 6 + CONFIG_DEBUG_INFO_BTF=y 7 + CONFIG_BPF_JIT_ALWAYS_ON=y 8 + CONFIG_BPF_JIT_DEFAULT_ON=y 9 + CONFIG_SCHED_CLASS_EXT=y 10 + 11 + # Required by some rust schedulers (e.g. scx_p2dq) 12 + # 13 + CONFIG_KALLSYMS_ALL=y 14 + 15 + # Required on arm64 16 + # 17 + # CONFIG_DEBUG_INFO_REDUCED is not set 18 + 19 + # LAVD tracks futex to give an additional time slice for futex holder 20 + # (i.e., avoiding lock holder preemption) for better system-wide progress. 21 + # LAVD first tries to use ftrace to trace futex function calls. 22 + # If that is not available, it tries to use a tracepoint. 23 + CONFIG_FUNCTION_TRACER=y 24 + 25 + # Enable scheduling debugging 26 + # 27 + CONFIG_SCHED_DEBUG=y 28 + 29 + # Enable extra scheduling features (for a better code coverage while testing 30 + # the schedulers) 31 + # 32 + CONFIG_SCHED_AUTOGROUP=y 33 + CONFIG_SCHED_CORE=y 34 + CONFIG_SCHED_MC=y 35 + 36 + # Enable fully preemptible kernel for a better test coverage of the schedulers 37 + # 38 + # CONFIG_PREEMPT_NONE is not set 39 + # CONFIG_PREEMPT_VOLUNTARY is not set 40 + CONFIG_PREEMPT=y 41 + CONFIG_PREEMPT_DYNAMIC=y 42 + 43 + # Additional debugging information (useful to catch potential locking issues) 44 + CONFIG_DEBUG_LOCKDEP=y 45 + CONFIG_DEBUG_ATOMIC_SLEEP=y 46 + CONFIG_PROVE_LOCKING=y 47 + 48 + # Bpftrace headers (for additional debug info) 49 + CONFIG_BPF_EVENTS=y 50 + CONFIG_FTRACE_SYSCALLS=y 51 + CONFIG_DYNAMIC_FTRACE=y 52 + CONFIG_KPROBES=y 53 + CONFIG_KPROBE_EVENTS=y 54 + CONFIG_UPROBES=y 55 + CONFIG_UPROBE_EVENTS=y 56 + CONFIG_DEBUG_FS=y 57 + 58 + # Enable access to kernel configuration and headers at runtime 59 + CONFIG_IKHEADERS=y 60 + CONFIG_IKCONFIG_PROC=y 61 + CONFIG_IKCONFIG=y

+2

tools/sched_ext/Makefile

··· 122 122 -I../../include \ 123 123 $(call get_sys_includes,$(CLANG)) \ 124 124 -Wall -Wno-compare-distinct-pointer-types \ 125 + -Wno-microsoft-anon-tag \ 126 + -fms-extensions \ 125 127 -O2 -mcpu=v3 126 128 127 129 # sort removes libbpf duplicates when not cross-building

-6

tools/sched_ext/README.md

··· 58 58 CONFIG_BPF_SYSCALL=y 59 59 CONFIG_BPF_JIT=y 60 60 CONFIG_DEBUG_INFO_BTF=y 61 - ``` 62 - 63 - It's also recommended that you also include the following Kconfig options: 64 - 65 - ``` 66 61 CONFIG_BPF_JIT_ALWAYS_ON=y 67 62 CONFIG_BPF_JIT_DEFAULT_ON=y 68 - CONFIG_PAHOLE_HAS_BTF_TAG=y 69 63 ``` 70 64 71 65 There is a `Kconfig` file in this directory whose contents you can append to

+5 -2

tools/sched_ext/include/scx/compat.h

··· 125 125 { 126 126 int fd; 127 127 char buf[32]; 128 + char *endptr; 128 129 ssize_t len; 129 130 long val; 130 131 ··· 138 137 buf[len] = 0; 139 138 close(fd); 140 139 141 - val = strtoul(buf, NULL, 10); 142 - SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); 140 + errno = 0; 141 + val = strtoul(buf, &endptr, 10); 142 + SCX_BUG_ON(errno == ERANGE || endptr == buf || 143 + (*endptr != '\n' && *endptr != '\0'), "invalid num hotplug events: %ld", val); 143 144 144 145 return val; 145 146 }

+1 -1

tools/sched_ext/scx_central.c

··· 66 66 assert(skel->rodata->nr_cpu_ids > 0); 67 67 assert(skel->rodata->nr_cpu_ids <= INT32_MAX); 68 68 69 - while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { 69 + while ((opt = getopt(argc, argv, "s:c:vh")) != -1) { 70 70 switch (opt) { 71 71 case 's': 72 72 skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;

+1 -1

tools/sched_ext/scx_sdt.c

··· 54 54 optind = 1; 55 55 skel = SCX_OPS_OPEN(sdt_ops, scx_sdt); 56 56 57 - while ((opt = getopt(argc, argv, "fvh")) != -1) { 57 + while ((opt = getopt(argc, argv, "vh")) != -1) { 58 58 switch (opt) { 59 59 case 'v': 60 60 verbose = true;

+2

tools/testing/selftests/sched_ext/Makefile

··· 93 93 $(CLANG_SYS_INCLUDES) \ 94 94 -Wall -Wno-compare-distinct-pointer-types \ 95 95 -Wno-incompatible-function-pointer-types \ 96 + -Wno-microsoft-anon-tag \ 97 + -fms-extensions \ 96 98 -O2 -mcpu=v3 97 99 98 100 # sort removes libbpf duplicates when not cross-building

+2 -1

tools/testing/selftests/sched_ext/init_enable_count.c

··· 57 57 char buf; 58 58 59 59 close(pipe_fds[1]); 60 - read(pipe_fds[0], &buf, 1); 60 + if (read(pipe_fds[0], &buf, 1) < 0) 61 + exit(1); 61 62 close(pipe_fds[0]); 62 63 exit(0); 63 64 }

+2 -2

tools/testing/selftests/sched_ext/peek_dsq.bpf.c

··· 58 58 { 59 59 u32 slot_key; 60 60 long *slot_pid_ptr; 61 - int ix; 61 + u32 ix; 62 62 63 63 if (pid <= 0) 64 64 return; 65 65 66 66 /* Find an empty slot or one with the same PID */ 67 67 bpf_for(ix, 0, 10) { 68 - slot_key = (pid + ix) % MAX_SAMPLES; 68 + slot_key = ((u64)pid + ix) % MAX_SAMPLES; 69 69 slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); 70 70 if (!slot_pid_ptr) 71 71 continue;

-1

tools/testing/selftests/sched_ext/rt_stall.c

··· 15 15 #include <signal.h> 16 16 #include <bpf/bpf.h> 17 17 #include <scx/common.h> 18 - #include <unistd.h> 19 18 #include "rt_stall.bpf.skel.h" 20 19 #include "scx_test.h" 21 20 #include "../kselftest.h"

+3

tools/testing/selftests/sched_ext/runner.c

··· 166 166 enum scx_test_status status; 167 167 struct scx_test *test = &__scx_tests[i]; 168 168 169 + if (exit_req) 170 + break; 171 + 169 172 if (list) { 170 173 printf("%s\n", test->name); 171 174 if (i == (__scx_num_tests - 1))

Configure Feed

Configure Feed