sched_ext: idle: Extend topology optimizations to all tasks

The built-in idle selection policy, scx_select_cpu_dfl(), prioritizes
picking idle CPUs within the same LLC or NUMA node, but these
optimizations are currently applied only when a task has no CPU
affinity constraints.

This restriction exists primarily for efficiency, as it avoids the
overhead of updating a cpumask every time we need to select an idle CPU
(which can be costly in large SMP systems).

However, this approach limits the effectiveness of the built-in idle
policy and results in inconsistent behavior, as affinity-restricted
tasks don't benefit from topology-aware optimizations.

To address this, modify the policy to apply LLC and NUMA-aware
optimizations even when a task is constrained to a subset of CPUs.

We can still avoid updating the cpumasks by checking if the subset of
LLC and node CPUs is contained in the subset of allowed CPUs usable by
the task (which is true in most cases, i.e. for tasks that don't have
affinity constraints).

Moreover, use temporary local per-CPU cpumasks to determine the LLC and
node subsets, minimizing potential overhead even on large SMP systems.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

authored by

Andrea Righi and committed by

Tejun Heo 1 year ago 29f512f5 29b49be6

+51 -22

1 changed file

expand all

kernel

sched

ext_idle.c

+51 -22

kernel/sched/ext_idle.c

··· 47 47 static struct scx_idle_cpus **scx_idle_node_masks; 48 48 49 49 /* 50 + * Local per-CPU cpumasks (used to generate temporary idle cpumasks). 51 + */ 52 + static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask); 53 + static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask); 54 + 55 + /* 50 56 * Return the idle masks associated to a target @node. 51 57 * 52 58 * NUMA_NO_NODE identifies the global idle cpumask. ··· 398 392 } 399 393 400 394 /* 395 + * Return true if @p can run on all possible CPUs, false otherwise. 396 + */ 397 + static inline bool task_affinity_all(const struct task_struct *p) 398 + { 399 + return p->nr_cpus_allowed >= num_possible_cpus(); 400 + } 401 + 402 + /* 401 403 * Built-in CPU idle selection policy: 402 404 * 403 405 * 1. Prioritize full-idle cores: ··· 440 426 */ 441 427 s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, u64 flags) 442 428 { 443 - const struct cpumask *llc_cpus = NULL; 444 - const struct cpumask *numa_cpus = NULL; 429 + const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL; 445 430 int node = scx_cpu_node_if_enabled(prev_cpu); 446 431 s32 cpu; 447 432 ··· 450 437 rcu_read_lock(); 451 438 452 439 /* 453 - * Determine the scheduling domain only if the task is allowed to run 454 - * on all CPUs. 440 + * Determine the subset of CPUs that the task can use in its 441 + * current LLC and node. 455 442 * 456 - * This is done primarily for efficiency, as it avoids the overhead of 457 - * updating a cpumask every time we need to select an idle CPU (which 458 - * can be costly in large SMP systems), but it also aligns logically: 459 - * if a task's scheduling domain is restricted by user-space (through 460 - * CPU affinity), the task will simply use the flat scheduling domain 461 - * defined by user-space. 443 + * If the task can run on all CPUs, use the node and LLC cpumasks 444 + * directly. 
462 445 */ 463 - if (p->nr_cpus_allowed >= num_possible_cpus()) { 464 - if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) 465 - numa_cpus = numa_span(prev_cpu); 446 + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) { 447 + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask); 448 + const struct cpumask *cpus = numa_span(prev_cpu); 466 449 467 - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) 468 - llc_cpus = llc_span(prev_cpu); 450 + if (task_affinity_all(p)) 451 + numa_cpus = cpus; 452 + else if (cpus && cpumask_and(local_cpus, p->cpus_ptr, cpus)) 453 + numa_cpus = local_cpus; 454 + } 455 + 456 + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { 457 + struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask); 458 + const struct cpumask *cpus = llc_span(prev_cpu); 459 + 460 + if (task_affinity_all(p)) 461 + llc_cpus = cpus; 462 + else if (cpus && cpumask_and(local_cpus, p->cpus_ptr, cpus)) 463 + llc_cpus = local_cpus; 469 464 } 470 465 471 466 /* ··· 617 596 */ 618 597 void scx_idle_init_masks(void) 619 598 { 620 - int node; 599 + int i; 621 600 622 601 /* Allocate global idle cpumasks */ 623 602 BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL)); ··· 628 607 sizeof(*scx_idle_node_masks), GFP_KERNEL); 629 608 BUG_ON(!scx_idle_node_masks); 630 609 631 - for_each_node(node) { 632 - scx_idle_node_masks[node] = kzalloc_node(sizeof(**scx_idle_node_masks), 633 - GFP_KERNEL, node); 634 - BUG_ON(!scx_idle_node_masks[node]); 610 + for_each_node(i) { 611 + scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks), 612 + GFP_KERNEL, i); 613 + BUG_ON(!scx_idle_node_masks[i]); 635 614 636 - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->cpu, GFP_KERNEL, node)); 637 - BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[node]->smt, GFP_KERNEL, node)); 615 + BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i)); 616 + 
BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i)); 617 + } 618 + 619 + /* Allocate local per-cpu idle cpumasks */ 620 + for_each_possible_cpu(i) { 621 + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i), 622 + GFP_KERNEL, cpu_to_node(i))); 623 + BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i), 624 + GFP_KERNEL, cpu_to_node(i))); 638 625 } 639 626 } 640 627

Configure Feed

Configure Feed