sched_ext: Allocate scx_kick_cpus_pnt_seqs lazily using kvzalloc()

On systems with >4096 CPUs, scx_kick_cpus_pnt_seqs allocation fails during
boot because it exceeds the 32,768 byte percpu allocator limit.

Restructure to use DEFINE_PER_CPU() for the per-CPU pointers, with each CPU
pointing to its own kvzalloc'd array. Move allocation from boot time to
scx_enable() and free in scx_disable(), so the O(nr_cpu_ids^2) memory is only
consumed when sched_ext is active.

Use RCU to guard against racing with free. Arrays are freed via call_rcu()
and kick_cpus_irq_workfn() uses rcu_dereference_bh() with a NULL check.

While at it, rename to scx_kick_pseqs for brevity and update comments to
clarify these are pick_task sequence numbers.

v2: RCU protect scx_kick_pseqs to manage kick_cpus_irq_workfn() racing
against disable as per Andrea.

v3: Fix bugs noticed by Andrea.

Reported-by: Phil Auld <pauld@redhat.com>
Link: http://lkml.kernel.org/r/20251007133523.GA93086@pauld.westford.csb
Cc: Andrea Righi <arighi@nvidia.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Reviewed-by: Phil Auld <pauld@redhat.com>
Reviewed-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo 8 months ago 14c1da38 a8ad8731

+79 -10

1 changed file

expand all

kernel

sched

ext.c

+79 -10

kernel/sched/ext.c

··· 67 67 68 68 static struct delayed_work scx_watchdog_work; 69 69 70 - /* for %SCX_KICK_WAIT */ 71 - static unsigned long __percpu *scx_kick_cpus_pnt_seqs; 70 + /* 71 + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence 72 + * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu 73 + * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated 74 + * lazily when enabling and freed when disabling to avoid waste when sched_ext 75 + * isn't active. 76 + */ 77 + struct scx_kick_pseqs { 78 + struct rcu_head rcu; 79 + unsigned long seqs[]; 80 + }; 81 + 82 + static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); 72 83 73 84 /* 74 85 * Direct dispatch marker. ··· 3888 3877 } 3889 3878 } 3890 3879 3880 + static void free_kick_pseqs_rcu(struct rcu_head *rcu) 3881 + { 3882 + struct scx_kick_pseqs *pseqs = container_of(rcu, struct scx_kick_pseqs, rcu); 3883 + 3884 + kvfree(pseqs); 3885 + } 3886 + 3887 + static void free_kick_pseqs(void) 3888 + { 3889 + int cpu; 3890 + 3891 + for_each_possible_cpu(cpu) { 3892 + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 3893 + struct scx_kick_pseqs *to_free; 3894 + 3895 + to_free = rcu_replace_pointer(*pseqs, NULL, true); 3896 + if (to_free) 3897 + call_rcu(&to_free->rcu, free_kick_pseqs_rcu); 3898 + } 3899 + } 3900 + 3891 3901 static void scx_disable_workfn(struct kthread_work *work) 3892 3902 { 3893 3903 struct scx_sched *sch = container_of(work, struct scx_sched, disable_work); ··· 4045 4013 free_percpu(scx_dsp_ctx); 4046 4014 scx_dsp_ctx = NULL; 4047 4015 scx_dsp_max_batch = 0; 4016 + free_kick_pseqs(); 4048 4017 4049 4018 mutex_unlock(&scx_enable_mutex); 4050 4019 ··· 4408 4375 irq_work_queue(&sch->error_irq_work); 4409 4376 } 4410 4377 4378 + static int alloc_kick_pseqs(void) 4379 + { 4380 + int cpu; 4381 + 4382 + /* 4383 + * Allocate per-CPU arrays sized by nr_cpu_ids. 
Use kvzalloc as size 4384 + * can exceed percpu allocator limits on large machines. 4385 + */ 4386 + for_each_possible_cpu(cpu) { 4387 + struct scx_kick_pseqs **pseqs = per_cpu_ptr(&scx_kick_pseqs, cpu); 4388 + struct scx_kick_pseqs *new_pseqs; 4389 + 4390 + WARN_ON_ONCE(rcu_access_pointer(*pseqs)); 4391 + 4392 + new_pseqs = kvzalloc_node(struct_size(new_pseqs, seqs, nr_cpu_ids), 4393 + GFP_KERNEL, cpu_to_node(cpu)); 4394 + if (!new_pseqs) { 4395 + free_kick_pseqs(); 4396 + return -ENOMEM; 4397 + } 4398 + 4399 + rcu_assign_pointer(*pseqs, new_pseqs); 4400 + } 4401 + 4402 + return 0; 4403 + } 4404 + 4411 4405 static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) 4412 4406 { 4413 4407 struct scx_sched *sch; ··· 4577 4517 4578 4518 mutex_lock(&scx_enable_mutex); 4579 4519 4520 + ret = alloc_kick_pseqs(); 4521 + if (ret) 4522 + goto err_unlock; 4523 + 4580 4524 if (scx_enable_state() != SCX_DISABLED) { 4581 4525 ret = -EBUSY; 4582 - goto err_unlock; 4526 + goto err_free_pseqs; 4583 4527 } 4584 4528 4585 4529 sch = scx_alloc_and_add_sched(ops); 4586 4530 if (IS_ERR(sch)) { 4587 4531 ret = PTR_ERR(sch); 4588 - goto err_unlock; 4532 + goto err_free_pseqs; 4589 4533 } 4590 4534 4591 4535 /* ··· 4792 4728 4793 4729 return 0; 4794 4730 4731 + err_free_pseqs: 4732 + free_kick_pseqs(); 4795 4733 err_unlock: 4796 4734 mutex_unlock(&scx_enable_mutex); 4797 4735 return ret; ··· 5175 5109 { 5176 5110 struct rq *this_rq = this_rq(); 5177 5111 struct scx_rq *this_scx = &this_rq->scx; 5178 - unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); 5112 + struct scx_kick_pseqs __rcu *pseqs_pcpu = __this_cpu_read(scx_kick_pseqs); 5179 5113 bool should_wait = false; 5114 + unsigned long *pseqs; 5180 5115 s32 cpu; 5116 + 5117 + if (unlikely(!pseqs_pcpu)) { 5118 + pr_warn_once("kick_cpus_irq_workfn() called with NULL scx_kick_pseqs"); 5119 + return; 5120 + } 5121 + 5122 + pseqs = rcu_dereference_bh(pseqs_pcpu)->seqs; 5181 5123 5182 5124 for_each_cpu(cpu, 
this_scx->cpus_to_kick) { 5183 5125 should_wait |= kick_one_cpu(cpu, this_rq, pseqs); ··· 5308 5234 SCX_TG_ONLINE); 5309 5235 5310 5236 scx_idle_init_masks(); 5311 - 5312 - scx_kick_cpus_pnt_seqs = 5313 - __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, 5314 - __alignof__(scx_kick_cpus_pnt_seqs[0])); 5315 - BUG_ON(!scx_kick_cpus_pnt_seqs); 5316 5237 5317 5238 for_each_possible_cpu(cpu) { 5318 5239 struct rq *rq = cpu_rq(cpu);

Configure Feed

Configure Feed