Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+1 -2

arch/s390/mm/pfault.c

··· 199 199 * return to userspace schedule() to block. 200 200 */ 201 201 __set_current_state(TASK_UNINTERRUPTIBLE); 202 - set_tsk_need_resched(tsk); 203 - set_preempt_need_resched(); 202 + set_need_resched_current(); 204 203 } 205 204 } 206 205 out:

+2

arch/x86/include/asm/topology.h

··· 325 325 extern void arch_scale_freq_tick(void); 326 326 #define arch_scale_freq_tick arch_scale_freq_tick 327 327 328 + extern int arch_sched_node_distance(int from, int to); 329 + 328 330 #endif /* _ASM_X86_TOPOLOGY_H */

+70

arch/x86/kernel/smpboot.c

··· 515 515 set_sched_topology(topology); 516 516 } 517 517 518 + #ifdef CONFIG_NUMA 519 + static int sched_avg_remote_distance; 520 + static int avg_remote_numa_distance(void) 521 + { 522 + int i, j; 523 + int distance, nr_remote, total_distance; 524 + 525 + if (sched_avg_remote_distance > 0) 526 + return sched_avg_remote_distance; 527 + 528 + nr_remote = 0; 529 + total_distance = 0; 530 + for_each_node_state(i, N_CPU) { 531 + for_each_node_state(j, N_CPU) { 532 + distance = node_distance(i, j); 533 + 534 + if (distance >= REMOTE_DISTANCE) { 535 + nr_remote++; 536 + total_distance += distance; 537 + } 538 + } 539 + } 540 + if (nr_remote) 541 + sched_avg_remote_distance = total_distance / nr_remote; 542 + else 543 + sched_avg_remote_distance = REMOTE_DISTANCE; 544 + 545 + return sched_avg_remote_distance; 546 + } 547 + 548 + int arch_sched_node_distance(int from, int to) 549 + { 550 + int d = node_distance(from, to); 551 + 552 + switch (boot_cpu_data.x86_vfm) { 553 + case INTEL_GRANITERAPIDS_X: 554 + case INTEL_ATOM_DARKMONT_X: 555 + 556 + if (!x86_has_numa_in_package || topology_max_packages() == 1 || 557 + d < REMOTE_DISTANCE) 558 + return d; 559 + 560 + /* 561 + * With SNC enabled, there could be too many levels of remote 562 + * NUMA node distances, creating NUMA domain levels 563 + * including local nodes and partial remote nodes. 564 + * 565 + * Trim finer distance tuning for NUMA nodes in remote package 566 + * for the purpose of building sched domains. Group NUMA nodes 567 + * in the remote package in the same sched group. 568 + * Simplify NUMA domains and avoid extra NUMA levels including 569 + * different remote NUMA nodes and local nodes. 570 + * 571 + * GNR and CWF don't expect systems with more than 2 packages 572 + * and more than 2 hops between packages. Single average remote 573 + * distance won't be appropriate if there are more than 2 574 + * packages as average distance to different remote packages 575 + * could be different. 
576 + */ 577 + WARN_ONCE(topology_max_packages() > 2, 578 + "sched: Expect only up to 2 packages for GNR or CWF, " 579 + "but saw %d packages when building sched domains.", 580 + topology_max_packages()); 581 + 582 + d = avg_remote_numa_distance(); 583 + } 584 + return d; 585 + } 586 + #endif /* CONFIG_NUMA */ 587 + 518 588 void set_cpu_sibling_map(int cpu) 519 589 { 520 590 bool has_smt = __max_threads_per_core > 1;

+5

include/linux/cleanup.h

··· 348 348 #define __DEFINE_CLASS_IS_CONDITIONAL(_name, _is_cond) \ 349 349 static __maybe_unused const bool class_##_name##_is_conditional = _is_cond 350 350 351 + #define DEFINE_CLASS_IS_UNCONDITIONAL(_name) \ 352 + __DEFINE_CLASS_IS_CONDITIONAL(_name, false); \ 353 + static inline void * class_##_name##_lock_ptr(class_##_name##_t *_T) \ 354 + { return (void *)1; } 355 + 351 356 #define __GUARD_IS_ERR(_ptr) \ 352 357 ({ \ 353 358 unsigned long _rc = (__force unsigned long)(_ptr); \

+20 -13

include/linux/sched.h

··· 637 637 #endif 638 638 } __randomize_layout; 639 639 640 - typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); 641 - typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); 640 + struct rq_flags; 641 + typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *, struct rq_flags *rf); 642 642 643 643 struct sched_dl_entity { 644 644 struct rb_node rb_node; ··· 685 685 * 686 686 * @dl_server tells if this is a server entity. 687 687 * 688 - * @dl_defer tells if this is a deferred or regular server. For 689 - * now only defer server exists. 690 - * 691 - * @dl_defer_armed tells if the deferrable server is waiting 692 - * for the replenishment timer to activate it. 693 - * 694 688 * @dl_server_active tells if the dlserver is active(started). 695 689 * dlserver is started on first cfs enqueue on an idle runqueue 696 690 * and is stopped when a dequeue results in 0 cfs tasks on the 697 691 * runqueue. In other words, dlserver is active only when cpu's 698 692 * runqueue has atleast one cfs task. 699 693 * 694 + * @dl_defer tells if this is a deferred or regular server. For 695 + * now only defer server exists. 696 + * 697 + * @dl_defer_armed tells if the deferrable server is waiting 698 + * for the replenishment timer to activate it. 699 + * 700 700 * @dl_defer_running tells if the deferrable server is actually 701 701 * running, skipping the defer phase. 702 + * 703 + * @dl_defer_idle tracks idle state 702 704 */ 703 705 unsigned int dl_throttled : 1; 704 706 unsigned int dl_yielded : 1; ··· 711 709 unsigned int dl_defer : 1; 712 710 unsigned int dl_defer_armed : 1; 713 711 unsigned int dl_defer_running : 1; 712 + unsigned int dl_defer_idle : 1; 714 713 715 714 /* 716 715 * Bandwidth enforcement timer. Each -deadline task has its ··· 733 730 * dl_server_update(). 734 731 * 735 732 * @rq the runqueue this server is for 736 - * 737 - * @server_has_tasks() returns true if @server_pick return a 738 - * runnable task. 
739 733 */ 740 734 struct rq *rq; 741 735 dl_server_pick_f server_pick_task; ··· 1861 1861 extern int dl_bw_alloc(int cpu, u64 dl_bw); 1862 1862 extern void dl_bw_free(int cpu, u64 dl_bw); 1863 1863 1864 - /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ 1865 - extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); 1864 + /* set_cpus_allowed_force() - consider using set_cpus_allowed_ptr() instead */ 1865 + extern void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask); 1866 1866 1867 1867 /** 1868 1868 * set_cpus_allowed_ptr - set CPU affinity mask of a task ··· 2056 2056 static inline int test_tsk_need_resched(struct task_struct *tsk) 2057 2057 { 2058 2058 return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); 2059 + } 2060 + 2061 + static inline void set_need_resched_current(void) 2062 + { 2063 + lockdep_assert_irqs_disabled(); 2064 + set_tsk_need_resched(current); 2065 + set_preempt_need_resched(); 2059 2066 } 2060 2067 2061 2068 /*

+3

include/linux/sched/topology.h

··· 92 92 unsigned int nr_balance_failed; /* initialise to 0 */ 93 93 94 94 /* idle_balance() stats */ 95 + unsigned int newidle_call; 96 + unsigned int newidle_success; 97 + unsigned int newidle_ratio; 95 98 u64 max_newidle_lb_cost; 96 99 unsigned long last_decay_max_lb_cost; 97 100

+1 -1

kernel/cgroup/cpuset.c

··· 4180 4180 rcu_read_lock(); 4181 4181 cs_mask = task_cs(tsk)->cpus_allowed; 4182 4182 if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { 4183 - do_set_cpus_allowed(tsk, cs_mask); 4183 + set_cpus_allowed_force(tsk, cs_mask); 4184 4184 changed = true; 4185 4185 } 4186 4186 rcu_read_unlock();

+5 -10

kernel/kthread.c

··· 593 593 594 594 static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) 595 595 { 596 - unsigned long flags; 597 - 598 596 if (!wait_task_inactive(p, state)) { 599 597 WARN_ON(1); 600 598 return; 601 599 } 602 600 601 + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) 602 + set_cpus_allowed_force(p, mask); 603 + 603 604 /* It's safe because the task is inactive. */ 604 - raw_spin_lock_irqsave(&p->pi_lock, flags); 605 - do_set_cpus_allowed(p, mask); 606 605 p->flags |= PF_NO_SETAFFINITY; 607 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 608 606 } 609 607 610 608 static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) ··· 855 857 { 856 858 struct kthread *kthread = to_kthread(p); 857 859 cpumask_var_t affinity; 858 - unsigned long flags; 859 860 int ret = 0; 860 861 861 862 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { ··· 879 882 list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); 880 883 kthread_fetch_affinity(kthread, affinity); 881 884 882 - /* It's safe because the task is inactive. */ 883 - raw_spin_lock_irqsave(&p->pi_lock, flags); 884 - do_set_cpus_allowed(p, affinity); 885 - raw_spin_unlock_irqrestore(&p->pi_lock, flags); 885 + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) 886 + set_cpus_allowed_force(p, affinity); 886 887 887 888 mutex_unlock(&kthreads_hotplug_lock); 888 889 out:

+3 -5

kernel/rcu/tiny.c

··· 70 70 */ 71 71 void rcu_sched_clock_irq(int user) 72 72 { 73 - if (user) { 73 + if (user) 74 74 rcu_qs(); 75 - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { 76 - set_tsk_need_resched(current); 77 - set_preempt_need_resched(); 78 - } 75 + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) 76 + set_need_resched_current(); 79 77 } 80 78 81 79 /*

+5 -9

kernel/rcu/tree.c

··· 2696 2696 /* The load-acquire pairs with the store-release setting to true. */ 2697 2697 if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { 2698 2698 /* Idle and userspace execution already are quiescent states. */ 2699 - if (!rcu_is_cpu_rrupt_from_idle() && !user) { 2700 - set_tsk_need_resched(current); 2701 - set_preempt_need_resched(); 2702 - } 2699 + if (!rcu_is_cpu_rrupt_from_idle() && !user) 2700 + set_need_resched_current(); 2703 2701 __this_cpu_write(rcu_data.rcu_urgent_qs, false); 2704 2702 } 2705 2703 rcu_flavor_sched_clock_irq(user); ··· 2822 2824 /* Perform RCU core processing work for the current CPU. */ 2823 2825 static __latent_entropy void rcu_core(void) 2824 2826 { 2825 - unsigned long flags; 2826 2827 struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); 2827 2828 struct rcu_node *rnp = rdp->mynode; 2828 2829 ··· 2834 2837 if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { 2835 2838 rcu_preempt_deferred_qs(current); 2836 2839 } else if (rcu_preempt_need_deferred_qs(current)) { 2837 - set_tsk_need_resched(current); 2838 - set_preempt_need_resched(); 2840 + guard(irqsave)(); 2841 + set_need_resched_current(); 2839 2842 } 2840 2843 2841 2844 /* Update RCU state based on any recent quiescent states. */ ··· 2844 2847 /* No grace period and unregistered callbacks? */ 2845 2848 if (!rcu_gp_in_progress() && 2846 2849 rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { 2847 - local_irq_save(flags); 2850 + guard(irqsave)(); 2848 2851 if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) 2849 2852 rcu_accelerate_cbs_unlocked(rnp, rdp); 2850 - local_irq_restore(flags); 2851 2853 } 2852 2854 2853 2855 rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check());

+1 -2

kernel/rcu/tree_exp.h

··· 729 729 __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); 730 730 /* Store .exp before .rcu_urgent_qs. */ 731 731 smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); 732 - set_tsk_need_resched(current); 733 - set_preempt_need_resched(); 732 + set_need_resched_current(); 734 733 } 735 734 736 735 #ifdef CONFIG_PREEMPT_RCU

+3 -6

kernel/rcu/tree_plugin.h

··· 753 753 // Also if no expediting and no possible deboosting, 754 754 // slow is OK. Plus nohz_full CPUs eventually get 755 755 // tick enabled. 756 - set_tsk_need_resched(current); 757 - set_preempt_need_resched(); 756 + set_need_resched_current(); 758 757 if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && 759 758 needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && 760 759 cpu_online(rdp->cpu)) { ··· 812 813 if (rcu_preempt_depth() > 0 || 813 814 (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { 814 815 /* No QS, force context switch if deferred. */ 815 - if (rcu_preempt_need_deferred_qs(t)) { 816 - set_tsk_need_resched(t); 817 - set_preempt_need_resched(); 818 - } 816 + if (rcu_preempt_need_deferred_qs(t)) 817 + set_need_resched_current(); 819 818 } else if (rcu_preempt_need_deferred_qs(t)) { 820 819 rcu_preempt_deferred_qs(t); /* Report deferred QS. */ 821 820 return;

+1 -2

kernel/rcu/tree_stall.h

··· 763 763 * progress and it could be we're stuck in kernel space without context 764 764 * switches for an entirely unreasonable amount of time. 765 765 */ 766 - set_tsk_need_resched(current); 767 - set_preempt_need_resched(); 766 + set_need_resched_current(); 768 767 } 769 768 770 769 static bool csd_lock_suppress_rcu_stall;

+175 -251

kernel/sched/core.c

··· 121 121 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); 122 122 123 123 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 124 + DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); 124 125 125 126 #ifdef CONFIG_SCHED_PROXY_EXEC 126 127 DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); ··· 584 583 * 585 584 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: 586 585 * 587 - * is set by activate_task() and cleared by deactivate_task(), under 588 - * rq->lock. Non-zero indicates the task is runnable, the special 586 + * is set by activate_task() and cleared by deactivate_task()/block_task(), 587 + * under rq->lock. Non-zero indicates the task is runnable, the special 589 588 * ON_RQ_MIGRATING state is used for migration without holding both 590 589 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). 591 590 * ··· 2090 2089 */ 2091 2090 uclamp_rq_inc(rq, p, flags); 2092 2091 2092 + rq->queue_mask |= p->sched_class->queue_mask; 2093 2093 p->sched_class->enqueue_task(rq, p, flags); 2094 2094 2095 2095 psi_enqueue(p, flags); ··· 2123 2121 * and mark the task ->sched_delayed. 2124 2122 */ 2125 2123 uclamp_rq_dec(rq, p); 2124 + rq->queue_mask |= p->sched_class->queue_mask; 2126 2125 return p->sched_class->dequeue_task(rq, p, flags); 2127 2126 } 2128 2127 ··· 2170 2167 inline int task_curr(const struct task_struct *p) 2171 2168 { 2172 2169 return cpu_curr(task_cpu(p)) == p; 2173 - } 2174 - 2175 - /* 2176 - * ->switching_to() is called with the pi_lock and rq_lock held and must not 2177 - * mess with locking. 2178 - */ 2179 - void check_class_changing(struct rq *rq, struct task_struct *p, 2180 - const struct sched_class *prev_class) 2181 - { 2182 - if (prev_class != p->sched_class && p->sched_class->switching_to) 2183 - p->sched_class->switching_to(rq, p); 2184 - } 2185 - 2186 - /* 2187 - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, 2188 - * use the balance_callback list if you want balancing. 
2189 - * 2190 - * this means any call to check_class_changed() must be followed by a call to 2191 - * balance_callback(). 2192 - */ 2193 - void check_class_changed(struct rq *rq, struct task_struct *p, 2194 - const struct sched_class *prev_class, 2195 - int oldprio) 2196 - { 2197 - if (prev_class != p->sched_class) { 2198 - if (prev_class->switched_from) 2199 - prev_class->switched_from(rq, p); 2200 - 2201 - p->sched_class->switched_to(rq, p); 2202 - } else if (oldprio != p->prio || dl_task(p)) 2203 - p->sched_class->prio_changed(rq, p, oldprio); 2204 2170 } 2205 2171 2206 2172 void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) ··· 2334 2362 } 2335 2363 2336 2364 static void 2337 - __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); 2365 + do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); 2338 2366 2339 2367 static void migrate_disable_switch(struct rq *rq, struct task_struct *p) 2340 2368 { ··· 2349 2377 if (p->cpus_ptr != &p->cpus_mask) 2350 2378 return; 2351 2379 2352 - /* 2353 - * Violates locking rules! See comment in __do_set_cpus_allowed(). 
2354 - */ 2355 - __do_set_cpus_allowed(p, &ac); 2380 + scoped_guard (task_rq_lock, p) 2381 + do_set_cpus_allowed(p, &ac); 2356 2382 } 2357 2383 2358 2384 void ___migrate_enable(void) ··· 2583 2613 */ 2584 2614 WARN_ON_ONCE(!pending->stop_pending); 2585 2615 preempt_disable(); 2586 - task_rq_unlock(rq, p, &rf); 2616 + rq_unlock(rq, &rf); 2617 + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2587 2618 stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, 2588 2619 &pending->arg, &pending->stop_work); 2589 2620 preempt_enable(); ··· 2593 2622 out: 2594 2623 if (pending) 2595 2624 pending->stop_pending = false; 2596 - task_rq_unlock(rq, p, &rf); 2625 + rq_unlock(rq, &rf); 2626 + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 2597 2627 2598 2628 if (complete) 2599 2629 complete_all(&pending->done); ··· 2665 2693 } 2666 2694 2667 2695 static void 2668 - __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2696 + do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2669 2697 { 2670 - struct rq *rq = task_rq(p); 2671 - bool queued, running; 2672 - 2673 - /* 2674 - * This here violates the locking rules for affinity, since we're only 2675 - * supposed to change these variables while holding both rq->lock and 2676 - * p->pi_lock. 2677 - * 2678 - * HOWEVER, it magically works, because ttwu() is the only code that 2679 - * accesses these variables under p->pi_lock and only does so after 2680 - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() 2681 - * before finish_task(). 2682 - * 2683 - * XXX do further audits, this smells like something putrid. 2684 - */ 2685 - if (ctx->flags & SCA_MIGRATE_DISABLE) 2686 - WARN_ON_ONCE(!p->on_cpu); 2687 - else 2688 - lockdep_assert_held(&p->pi_lock); 2689 - 2690 - queued = task_on_rq_queued(p); 2691 - running = task_current_donor(rq, p); 2692 - 2693 - if (queued) { 2694 - /* 2695 - * Because __kthread_bind() calls this on blocked tasks without 2696 - * holding rq->lock. 
2697 - */ 2698 - lockdep_assert_rq_held(rq); 2699 - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 2698 + scoped_guard (sched_change, p, DEQUEUE_SAVE) { 2699 + p->sched_class->set_cpus_allowed(p, ctx); 2700 + mm_set_cpus_allowed(p->mm, ctx->new_mask); 2700 2701 } 2701 - if (running) 2702 - put_prev_task(rq, p); 2703 - 2704 - p->sched_class->set_cpus_allowed(p, ctx); 2705 - mm_set_cpus_allowed(p->mm, ctx->new_mask); 2706 - 2707 - if (queued) 2708 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 2709 - if (running) 2710 - set_next_task(rq, p); 2711 2702 } 2712 2703 2713 2704 /* 2714 2705 * Used for kthread_bind() and select_fallback_rq(), in both cases the user 2715 2706 * affinity (if any) should be destroyed too. 2716 2707 */ 2717 - void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 2708 + void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) 2718 2709 { 2719 2710 struct affinity_context ac = { 2720 2711 .new_mask = new_mask, ··· 2689 2754 struct rcu_head rcu; 2690 2755 }; 2691 2756 2692 - __do_set_cpus_allowed(p, &ac); 2757 + scoped_guard (__task_rq_lock, p) 2758 + do_set_cpus_allowed(p, &ac); 2693 2759 2694 2760 /* 2695 2761 * Because this is called with p->pi_lock held, it is not possible ··· 2728 2792 * Use pi_lock to protect content of user_cpus_ptr 2729 2793 * 2730 2794 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent 2731 - * do_set_cpus_allowed(). 2795 + * set_cpus_allowed_force(). 
2732 2796 */ 2733 2797 raw_spin_lock_irqsave(&src->pi_lock, flags); 2734 2798 if (src->user_cpus_ptr) { ··· 3000 3064 unsigned int dest_cpu; 3001 3065 int ret = 0; 3002 3066 3003 - update_rq_clock(rq); 3004 - 3005 3067 if (kthread || is_migration_disabled(p)) { 3006 3068 /* 3007 3069 * Kernel threads are allowed on online && !active CPUs, ··· 3054 3120 goto out; 3055 3121 } 3056 3122 3057 - __do_set_cpus_allowed(p, ctx); 3123 + do_set_cpus_allowed(p, ctx); 3058 3124 3059 3125 return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); 3060 3126 ··· 3463 3529 } 3464 3530 fallthrough; 3465 3531 case possible: 3466 - /* 3467 - * XXX When called from select_task_rq() we only 3468 - * hold p->pi_lock and again violate locking order. 3469 - * 3470 - * More yuck to audit. 3471 - */ 3472 - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); 3532 + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); 3473 3533 state = fail; 3474 3534 break; 3475 3535 case fail: ··· 3705 3777 ttwu_do_wakeup(p); 3706 3778 ret = 1; 3707 3779 } 3708 - __task_rq_unlock(rq, &rf); 3780 + __task_rq_unlock(rq, p, &rf); 3709 3781 3710 3782 return ret; 3711 3783 } ··· 4159 4231 * __schedule(). See the comment for smp_mb__after_spinlock(). 4160 4232 * 4161 4233 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure 4162 - * schedule()'s deactivate_task() has 'happened' and p will no longer 4234 + * schedule()'s block_task() has 'happened' and p will no longer 4163 4235 * care about it's own p->state. See the comment in __schedule(). 4164 4236 */ 4165 4237 smp_acquire__after_ctrl_dep(); ··· 4298 4370 ret = func(p, arg); 4299 4371 4300 4372 if (rq) 4301 - rq_unlock(rq, &rf); 4373 + __task_rq_unlock(rq, p, &rf); 4302 4374 4303 4375 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); 4304 4376 return ret; ··· 5620 5692 * reasonable amount of time. 
5621 5693 */ 5622 5694 u64 delta = rq_clock_task(rq) - curr->se.exec_start; 5623 - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); 5695 + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); 5624 5696 } 5625 5697 curr->sched_class->task_tick(rq, curr, 0); 5626 5698 ··· 5844 5916 const struct sched_class *start_class = prev->sched_class; 5845 5917 const struct sched_class *class; 5846 5918 5847 - #ifdef CONFIG_SCHED_CLASS_EXT 5848 - /* 5849 - * SCX requires a balance() call before every pick_task() including when 5850 - * waking up from SCHED_IDLE. If @start_class is below SCX, start from 5851 - * SCX instead. Also, set a flag to detect missing balance() call. 5852 - */ 5853 - if (scx_enabled()) { 5854 - rq->scx.flags |= SCX_RQ_BAL_PENDING; 5855 - if (sched_class_above(&ext_sched_class, start_class)) 5856 - start_class = &ext_sched_class; 5857 - } 5858 - #endif 5859 - 5860 5919 /* 5861 5920 * We must do the balancing pass before put_prev_task(), such 5862 5921 * that when we release the rq->lock the task is in the same ··· 5887 5972 5888 5973 /* Assume the next prioritized class is idle_sched_class */ 5889 5974 if (!p) { 5890 - p = pick_task_idle(rq); 5975 + p = pick_task_idle(rq, rf); 5891 5976 put_prev_set_next_task(rq, prev, p); 5892 5977 } 5893 5978 ··· 5899 5984 5900 5985 for_each_active_class(class) { 5901 5986 if (class->pick_next_task) { 5902 - p = class->pick_next_task(rq, prev); 5987 + p = class->pick_next_task(rq, prev, rf); 5988 + if (unlikely(p == RETRY_TASK)) 5989 + goto restart; 5903 5990 if (p) 5904 5991 return p; 5905 5992 } else { 5906 - p = class->pick_task(rq); 5993 + p = class->pick_task(rq, rf); 5994 + if (unlikely(p == RETRY_TASK)) 5995 + goto restart; 5907 5996 if (p) { 5908 5997 put_prev_set_next_task(rq, prev, p); 5909 5998 return p; ··· 5937 6018 return a->core_cookie == b->core_cookie; 5938 6019 } 5939 6020 5940 - static inline struct task_struct *pick_task(struct rq *rq) 6021 + /* 6022 + * Careful; this can return RETRY_TASK, it does not include 
the retry-loop 6023 + * itself due to the whole SMT pick retry thing below. 6024 + */ 6025 + static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) 5941 6026 { 5942 6027 const struct sched_class *class; 5943 6028 struct task_struct *p; ··· 5949 6026 rq->dl_server = NULL; 5950 6027 5951 6028 for_each_active_class(class) { 5952 - p = class->pick_task(rq); 6029 + p = class->pick_task(rq, rf); 5953 6030 if (p) 5954 6031 return p; 5955 6032 } ··· 5964 6041 static struct task_struct * 5965 6042 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 5966 6043 { 5967 - struct task_struct *next, *p, *max = NULL; 6044 + struct task_struct *next, *p, *max; 5968 6045 const struct cpumask *smt_mask; 5969 6046 bool fi_before = false; 5970 6047 bool core_clock_updated = (rq == rq->core); ··· 6049 6126 * and there are no cookied tasks running on siblings. 6050 6127 */ 6051 6128 if (!need_sync) { 6052 - next = pick_task(rq); 6129 + restart_single: 6130 + next = pick_task(rq, rf); 6131 + if (unlikely(next == RETRY_TASK)) 6132 + goto restart_single; 6053 6133 if (!next->core_cookie) { 6054 6134 rq->core_pick = NULL; 6055 6135 rq->core_dl_server = NULL; ··· 6072 6146 * 6073 6147 * Tie-break prio towards the current CPU 6074 6148 */ 6149 + restart_multi: 6150 + max = NULL; 6075 6151 for_each_cpu_wrap(i, smt_mask, cpu) { 6076 6152 rq_i = cpu_rq(i); 6077 6153 ··· 6085 6157 if (i != cpu && (rq_i != rq->core || !core_clock_updated)) 6086 6158 update_rq_clock(rq_i); 6087 6159 6088 - rq_i->core_pick = p = pick_task(rq_i); 6160 + p = pick_task(rq_i, rf); 6161 + if (unlikely(p == RETRY_TASK)) 6162 + goto restart_multi; 6163 + 6164 + rq_i->core_pick = p; 6089 6165 rq_i->core_dl_server = rq_i->dl_server; 6090 6166 6091 6167 if (!max || prio_less(max, p, fi_before)) ··· 6111 6179 if (cookie) 6112 6180 p = sched_core_find(rq_i, cookie); 6113 6181 if (!p) 6114 - p = idle_sched_class.pick_task(rq_i); 6182 + p = idle_sched_class.pick_task(rq_i, rf); 6115 
6183 } 6116 6184 6117 6185 rq_i->core_pick = p; ··· 6744 6812 6745 6813 local_irq_disable(); 6746 6814 rcu_note_context_switch(preempt); 6815 + migrate_disable_switch(rq, prev); 6747 6816 6748 6817 /* 6749 6818 * Make sure that signal_pending_state()->signal_pending() below ··· 6851 6918 */ 6852 6919 ++*switch_count; 6853 6920 6854 - migrate_disable_switch(rq, prev); 6855 6921 psi_account_irqtime(rq, prev, next); 6856 6922 psi_sched_switch(prev, next, !task_on_rq_queued(prev) || 6857 6923 prev->se.sched_delayed); ··· 7258 7326 */ 7259 7327 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) 7260 7328 { 7261 - int prio, oldprio, queued, running, queue_flag = 7329 + int prio, oldprio, queue_flag = 7262 7330 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7263 7331 const struct sched_class *prev_class, *next_class; 7264 7332 struct rq_flags rf; ··· 7320 7388 prev_class = p->sched_class; 7321 7389 next_class = __setscheduler_class(p->policy, prio); 7322 7390 7323 - if (prev_class != next_class && p->se.sched_delayed) 7324 - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 7391 + if (prev_class != next_class) 7392 + queue_flag |= DEQUEUE_CLASS; 7325 7393 7326 - queued = task_on_rq_queued(p); 7327 - running = task_current_donor(rq, p); 7328 - if (queued) 7329 - dequeue_task(rq, p, queue_flag); 7330 - if (running) 7331 - put_prev_task(rq, p); 7332 - 7333 - /* 7334 - * Boosting condition are: 7335 - * 1. -rt task is running and holds mutex A 7336 - * --> -dl task blocks on mutex A 7337 - * 7338 - * 2. 
-dl task is running and holds mutex A 7339 - * --> -dl task blocks on mutex A and could preempt the 7340 - * running task 7341 - */ 7342 - if (dl_prio(prio)) { 7343 - if (!dl_prio(p->normal_prio) || 7344 - (pi_task && dl_prio(pi_task->prio) && 7345 - dl_entity_preempt(&pi_task->dl, &p->dl))) { 7346 - p->dl.pi_se = pi_task->dl.pi_se; 7347 - queue_flag |= ENQUEUE_REPLENISH; 7394 + scoped_guard (sched_change, p, queue_flag) { 7395 + /* 7396 + * Boosting condition are: 7397 + * 1. -rt task is running and holds mutex A 7398 + * --> -dl task blocks on mutex A 7399 + * 7400 + * 2. -dl task is running and holds mutex A 7401 + * --> -dl task blocks on mutex A and could preempt the 7402 + * running task 7403 + */ 7404 + if (dl_prio(prio)) { 7405 + if (!dl_prio(p->normal_prio) || 7406 + (pi_task && dl_prio(pi_task->prio) && 7407 + dl_entity_preempt(&pi_task->dl, &p->dl))) { 7408 + p->dl.pi_se = pi_task->dl.pi_se; 7409 + scope->flags |= ENQUEUE_REPLENISH; 7410 + } else { 7411 + p->dl.pi_se = &p->dl; 7412 + } 7413 + } else if (rt_prio(prio)) { 7414 + if (dl_prio(oldprio)) 7415 + p->dl.pi_se = &p->dl; 7416 + if (oldprio < prio) 7417 + scope->flags |= ENQUEUE_HEAD; 7348 7418 } else { 7349 - p->dl.pi_se = &p->dl; 7419 + if (dl_prio(oldprio)) 7420 + p->dl.pi_se = &p->dl; 7421 + if (rt_prio(oldprio)) 7422 + p->rt.timeout = 0; 7350 7423 } 7351 - } else if (rt_prio(prio)) { 7352 - if (dl_prio(oldprio)) 7353 - p->dl.pi_se = &p->dl; 7354 - if (oldprio < prio) 7355 - queue_flag |= ENQUEUE_HEAD; 7356 - } else { 7357 - if (dl_prio(oldprio)) 7358 - p->dl.pi_se = &p->dl; 7359 - if (rt_prio(oldprio)) 7360 - p->rt.timeout = 0; 7424 + 7425 + p->sched_class = next_class; 7426 + p->prio = prio; 7361 7427 } 7362 - 7363 - p->sched_class = next_class; 7364 - p->prio = prio; 7365 - 7366 - check_class_changing(rq, p, prev_class); 7367 - 7368 - if (queued) 7369 - enqueue_task(rq, p, queue_flag); 7370 - if (running) 7371 - set_next_task(rq, p); 7372 - 7373 - check_class_changed(rq, p, prev_class, 
oldprio); 7374 7428 out_unlock: 7375 7429 /* Avoid rq from going away on us: */ 7376 7430 preempt_disable(); 7377 7431 7378 7432 rq_unpin_lock(rq, &rf); 7379 7433 __balance_callbacks(rq); 7380 - raw_spin_rq_unlock(rq); 7434 + rq_repin_lock(rq, &rf); 7435 + __task_rq_unlock(rq, p, &rf); 7381 7436 7382 7437 preempt_enable(); 7383 7438 } ··· 8003 8084 */ 8004 8085 void sched_setnuma(struct task_struct *p, int nid) 8005 8086 { 8006 - bool queued, running; 8007 - struct rq_flags rf; 8008 - struct rq *rq; 8009 - 8010 - rq = task_rq_lock(p, &rf); 8011 - queued = task_on_rq_queued(p); 8012 - running = task_current_donor(rq, p); 8013 - 8014 - if (queued) 8015 - dequeue_task(rq, p, DEQUEUE_SAVE); 8016 - if (running) 8017 - put_prev_task(rq, p); 8018 - 8019 - p->numa_preferred_nid = nid; 8020 - 8021 - if (queued) 8022 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 8023 - if (running) 8024 - set_next_task(rq, p); 8025 - task_rq_unlock(rq, p, &rf); 8087 + guard(task_rq_lock)(p); 8088 + scoped_guard (sched_change, p, DEQUEUE_SAVE) 8089 + p->numa_preferred_nid = nid; 8026 8090 } 8027 8091 #endif /* CONFIG_NUMA_BALANCING */ 8028 8092 ··· 8043 8141 struct rq_flags rf; 8044 8142 int cpu; 8045 8143 8046 - raw_spin_lock_irq(&p->pi_lock); 8047 - rq_lock(rq, &rf); 8048 - 8049 - update_rq_clock(rq); 8050 - 8051 - if (task_rq(p) == rq && task_on_rq_queued(p)) { 8144 + scoped_guard (raw_spinlock_irq, &p->pi_lock) { 8052 8145 cpu = select_fallback_rq(rq->cpu, p); 8053 - rq = __migrate_task(rq, &rf, p, cpu); 8054 - } 8055 8146 8056 - rq_unlock(rq, &rf); 8057 - raw_spin_unlock_irq(&p->pi_lock); 8147 + rq_lock(rq, &rf); 8148 + update_rq_clock(rq); 8149 + if (task_rq(p) == rq && task_on_rq_queued(p)) 8150 + rq = __migrate_task(rq, &rf, p, cpu); 8151 + rq_unlock(rq, &rf); 8152 + } 8058 8153 8059 8154 put_task_struct(p); 8060 8155 ··· 8489 8590 void __init sched_init_smp(void) 8490 8591 { 8491 8592 sched_init_numa(NUMA_NO_NODE); 8593 + 8594 + prandom_init_once(&sched_rnd_state); 8492 
8595 8493 8596 /* 8494 8597 * There's no userspace yet to cause hotplug operations; hence all the ··· 9108 9207 */ 9109 9208 void sched_move_task(struct task_struct *tsk, bool for_autogroup) 9110 9209 { 9111 - int queued, running, queue_flags = 9112 - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 9210 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 9211 + bool resched = false; 9113 9212 struct rq *rq; 9114 9213 9115 9214 CLASS(task_rq_lock, rq_guard)(tsk); 9116 9215 rq = rq_guard.rq; 9117 9216 9118 - update_rq_clock(rq); 9119 - 9120 - running = task_current_donor(rq, tsk); 9121 - queued = task_on_rq_queued(tsk); 9122 - 9123 - if (queued) 9124 - dequeue_task(rq, tsk, queue_flags); 9125 - if (running) 9126 - put_prev_task(rq, tsk); 9127 - 9128 - sched_change_group(tsk); 9129 - if (!for_autogroup) 9130 - scx_cgroup_move_task(tsk); 9131 - 9132 - if (queued) 9133 - enqueue_task(rq, tsk, queue_flags); 9134 - if (running) { 9135 - set_next_task(rq, tsk); 9136 - /* 9137 - * After changing group, the running task may have joined a 9138 - * throttled one but it's still the running task. Trigger a 9139 - * resched to make sure that task can still run. 
9140 - */ 9141 - resched_curr(rq); 9217 + scoped_guard (sched_change, tsk, queue_flags) { 9218 + sched_change_group(tsk); 9219 + if (!for_autogroup) 9220 + scx_cgroup_move_task(tsk); 9221 + if (scope->running) 9222 + resched = true; 9142 9223 } 9224 + 9225 + if (resched) 9226 + resched_curr(rq); 9143 9227 } 9144 9228 9145 9229 static struct cgroup_subsys_state * ··· 10780 10894 } 10781 10895 #endif /* CONFIG_SCHED_MM_CID */ 10782 10896 10783 - #ifdef CONFIG_SCHED_CLASS_EXT 10784 - void sched_deq_and_put_task(struct task_struct *p, int queue_flags, 10785 - struct sched_enq_and_set_ctx *ctx) 10897 + static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); 10898 + 10899 + struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) 10786 10900 { 10901 + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); 10902 + struct rq *rq = task_rq(p); 10903 + 10904 + /* 10905 + * Must exclusively use matched flags since this is both dequeue and 10906 + * enqueue. 
10907 + */ 10908 + WARN_ON_ONCE(flags & 0xFFFF0000); 10909 + 10910 + lockdep_assert_rq_held(rq); 10911 + 10912 + if (!(flags & DEQUEUE_NOCLOCK)) { 10913 + update_rq_clock(rq); 10914 + flags |= DEQUEUE_NOCLOCK; 10915 + } 10916 + 10917 + if (flags & DEQUEUE_CLASS) { 10918 + if (p->sched_class->switching_from) 10919 + p->sched_class->switching_from(rq, p); 10920 + } 10921 + 10922 + *ctx = (struct sched_change_ctx){ 10923 + .p = p, 10924 + .flags = flags, 10925 + .queued = task_on_rq_queued(p), 10926 + .running = task_current_donor(rq, p), 10927 + }; 10928 + 10929 + if (!(flags & DEQUEUE_CLASS)) { 10930 + if (p->sched_class->get_prio) 10931 + ctx->prio = p->sched_class->get_prio(rq, p); 10932 + else 10933 + ctx->prio = p->prio; 10934 + } 10935 + 10936 + if (ctx->queued) 10937 + dequeue_task(rq, p, flags); 10938 + if (ctx->running) 10939 + put_prev_task(rq, p); 10940 + 10941 + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) 10942 + p->sched_class->switched_from(rq, p); 10943 + 10944 + return ctx; 10945 + } 10946 + 10947 + void sched_change_end(struct sched_change_ctx *ctx) 10948 + { 10949 + struct task_struct *p = ctx->p; 10787 10950 struct rq *rq = task_rq(p); 10788 10951 10789 10952 lockdep_assert_rq_held(rq); 10790 10953 10791 - *ctx = (struct sched_enq_and_set_ctx){ 10792 - .p = p, 10793 - .queue_flags = queue_flags, 10794 - .queued = task_on_rq_queued(p), 10795 - .running = task_current(rq, p), 10796 - }; 10797 - 10798 - update_rq_clock(rq); 10799 - if (ctx->queued) 10800 - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); 10801 - if (ctx->running) 10802 - put_prev_task(rq, p); 10803 - } 10804 - 10805 - void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) 10806 - { 10807 - struct rq *rq = task_rq(ctx->p); 10808 - 10809 - lockdep_assert_rq_held(rq); 10954 + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) 10955 + p->sched_class->switching_to(rq, p); 10810 10956 10811 10957 if (ctx->queued) 10812 - enqueue_task(rq, ctx->p, 
ctx->queue_flags | ENQUEUE_NOCLOCK); 10958 + enqueue_task(rq, p, ctx->flags); 10813 10959 if (ctx->running) 10814 - set_next_task(rq, ctx->p); 10960 + set_next_task(rq, p); 10961 + 10962 + if (ctx->flags & ENQUEUE_CLASS) { 10963 + if (p->sched_class->switched_to) 10964 + p->sched_class->switched_to(rq, p); 10965 + } else { 10966 + p->sched_class->prio_changed(rq, p, ctx->prio); 10967 + } 10815 10968 } 10816 - #endif /* CONFIG_SCHED_CLASS_EXT */

+9 -25

kernel/sched/cpudeadline.c

··· 166 166 * cpudl_clear - remove a CPU from the cpudl max-heap 167 167 * @cp: the cpudl max-heap context 168 168 * @cpu: the target CPU 169 + * @online: the online state of the deadline runqueue 169 170 * 170 171 * Notes: assumes cpu_rq(cpu)->lock is locked 171 172 * 172 173 * Returns: (void) 173 174 */ 174 - void cpudl_clear(struct cpudl *cp, int cpu) 175 + void cpudl_clear(struct cpudl *cp, int cpu, bool online) 175 176 { 176 177 int old_idx, new_cpu; 177 178 unsigned long flags; ··· 185 184 if (old_idx == IDX_INVALID) { 186 185 /* 187 186 * Nothing to remove if old_idx was invalid. 188 - * This could happen if a rq_offline_dl is 187 + * This could happen if rq_online_dl or rq_offline_dl is 189 188 * called for a CPU without -dl tasks running. 190 189 */ 191 190 } else { ··· 196 195 cp->elements[new_cpu].idx = old_idx; 197 196 cp->elements[cpu].idx = IDX_INVALID; 198 197 cpudl_heapify(cp, old_idx); 199 - 200 - cpumask_set_cpu(cpu, cp->free_cpus); 201 198 } 199 + if (likely(online)) 200 + __cpumask_set_cpu(cpu, cp->free_cpus); 201 + else 202 + __cpumask_clear_cpu(cpu, cp->free_cpus); 203 + 202 204 raw_spin_unlock_irqrestore(&cp->lock, flags); 203 205 } 204 206 ··· 232 228 cp->elements[new_idx].cpu = cpu; 233 229 cp->elements[cpu].idx = new_idx; 234 230 cpudl_heapify_up(cp, new_idx); 235 - cpumask_clear_cpu(cpu, cp->free_cpus); 231 + __cpumask_clear_cpu(cpu, cp->free_cpus); 236 232 } else { 237 233 cp->elements[old_idx].dl = dl; 238 234 cpudl_heapify(cp, old_idx); 239 235 } 240 236 241 237 raw_spin_unlock_irqrestore(&cp->lock, flags); 242 - } 243 - 244 - /* 245 - * cpudl_set_freecpu - Set the cpudl.free_cpus 246 - * @cp: the cpudl max-heap context 247 - * @cpu: rd attached CPU 248 - */ 249 - void cpudl_set_freecpu(struct cpudl *cp, int cpu) 250 - { 251 - cpumask_set_cpu(cpu, cp->free_cpus); 252 - } 253 - 254 - /* 255 - * cpudl_clear_freecpu - Clear the cpudl.free_cpus 256 - * @cp: the cpudl max-heap context 257 - * @cpu: rd attached CPU 258 - */ 259 - void 
cpudl_clear_freecpu(struct cpudl *cp, int cpu) 260 - { 261 - cpumask_clear_cpu(cpu, cp->free_cpus); 262 238 } 263 239 264 240 /*

+1 -3

kernel/sched/cpudeadline.h

··· 19 19 20 20 int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); 21 21 void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 22 - void cpudl_clear(struct cpudl *cp, int cpu); 22 + void cpudl_clear(struct cpudl *cp, int cpu, bool online); 23 23 int cpudl_init(struct cpudl *cp); 24 - void cpudl_set_freecpu(struct cpudl *cp, int cpu); 25 - void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 26 24 void cpudl_cleanup(struct cpudl *cp);

+275 -61

kernel/sched/deadline.c

··· 125 125 static inline int dl_bw_cpus(int i) 126 126 { 127 127 struct root_domain *rd = cpu_rq(i)->rd; 128 - int cpus; 129 128 130 129 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 131 130 "sched RCU must be held"); 132 131 133 - if (cpumask_subset(rd->span, cpu_active_mask)) 134 - return cpumask_weight(rd->span); 135 - 136 - cpus = 0; 137 - 138 - for_each_cpu_and(i, rd->span, cpu_active_mask) 139 - cpus++; 140 - 141 - return cpus; 132 + return cpumask_weight_and(rd->span, cpu_active_mask); 142 133 } 143 134 144 135 static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) ··· 396 405 * up, and checks if the task is still in the "ACTIVE non contending" 397 406 * state or not (in the second case, it updates running_bw). 398 407 */ 399 - static void task_non_contending(struct sched_dl_entity *dl_se) 408 + static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) 400 409 { 401 410 struct hrtimer *timer = &dl_se->inactive_timer; 402 411 struct rq *rq = rq_of_dl_se(dl_se); ··· 435 444 } else { 436 445 struct task_struct *p = dl_task_of(dl_se); 437 446 438 - if (dl_task(p)) 447 + if (dl_task) 439 448 sub_running_bw(dl_se, dl_rq); 440 449 441 - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { 450 + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { 442 451 struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); 443 452 444 453 if (READ_ONCE(p->__state) == TASK_DEAD) ··· 1157 1166 sched_clock_tick(); 1158 1167 update_rq_clock(rq); 1159 1168 1160 - if (!dl_se->dl_runtime) 1169 + /* 1170 + * Make sure current has propagated its pending runtime into 1171 + * any relevant server through calling dl_server_update() and 1172 + * friends. 
1173 + */ 1174 + rq->donor->sched_class->update_curr(rq); 1175 + 1176 + if (dl_se->dl_defer_idle) { 1177 + dl_server_stop(dl_se); 1161 1178 return HRTIMER_NORESTART; 1179 + } 1162 1180 1163 1181 if (dl_se->dl_defer_armed) { 1164 1182 /* ··· 1416 1416 } 1417 1417 1418 1418 static inline void 1419 - update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1420 - int flags); 1419 + update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); 1420 + 1421 1421 static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1422 1422 { 1423 + bool idle = rq->curr == rq->idle; 1423 1424 s64 scaled_delta_exec; 1424 1425 1425 1426 if (unlikely(delta_exec <= 0)) { ··· 1441 1440 1442 1441 dl_se->runtime -= scaled_delta_exec; 1443 1442 1443 + if (dl_se->dl_defer_idle && !idle) 1444 + dl_se->dl_defer_idle = 0; 1445 + 1444 1446 /* 1445 1447 * The fair server can consume its runtime while throttled (not queued/ 1446 1448 * running as regular CFS). ··· 1453 1449 * starting a new period, pushing the activation. 1454 1450 */ 1455 1451 if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { 1452 + /* 1453 + * Non-servers would never get time accounted while throttled. 1454 + */ 1455 + WARN_ON_ONCE(!dl_server(dl_se)); 1456 + 1457 + /* 1458 + * While the server is marked idle, do not push out the 1459 + * activation further, instead wait for the period timer 1460 + * to lapse and stop the server. 1461 + */ 1462 + if (dl_se->dl_defer_idle && idle) { 1463 + /* 1464 + * The timer is at the zero-laxity point, this means 1465 + * dl_server_stop() / dl_server_start() can happen 1466 + * while now < deadline. This means update_dl_entity() 1467 + * will not replenish. Additionally start_dl_timer() 1468 + * will be set for 'deadline - runtime'. Negative 1469 + * runtime will not do. 
1470 + */ 1471 + dl_se->runtime = 0; 1472 + return; 1473 + } 1474 + 1456 1475 /* 1457 1476 * If the server was previously activated - the starving condition 1458 1477 * took place, at this point it went away because the fair scheduler ··· 1487 1460 hrtimer_try_to_cancel(&dl_se->dl_timer); 1488 1461 1489 1462 replenish_dl_new_period(dl_se, dl_se->rq); 1463 + 1464 + if (idle) 1465 + dl_se->dl_defer_idle = 1; 1490 1466 1491 1467 /* 1492 1468 * Not being able to start the timer seems problematic. If it could not ··· 1573 1543 * as time available for the fair server, avoiding a penalty for the 1574 1544 * rt scheduler that did not consume that time. 1575 1545 */ 1576 - void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) 1546 + void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) 1577 1547 { 1578 - s64 delta_exec; 1579 - 1580 - if (!rq->fair_server.dl_defer) 1581 - return; 1582 - 1583 - /* no need to discount more */ 1584 - if (rq->fair_server.runtime < 0) 1585 - return; 1586 - 1587 - delta_exec = rq_clock_task(rq) - p->se.exec_start; 1588 - if (delta_exec < 0) 1589 - return; 1590 - 1591 - rq->fair_server.runtime -= delta_exec; 1592 - 1593 - if (rq->fair_server.runtime < 0) { 1594 - rq->fair_server.dl_defer_running = 0; 1595 - rq->fair_server.runtime = 0; 1596 - } 1597 - 1598 - p->se.exec_start = rq_clock_task(rq); 1548 + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) 1549 + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1599 1550 } 1600 1551 1601 1552 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) 1602 1553 { 1603 1554 /* 0 runtime = fair server disabled */ 1604 - if (dl_se->dl_runtime) 1555 + if (dl_se->dl_server_active && dl_se->dl_runtime) 1605 1556 update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1606 1557 } 1607 1558 1559 + /* 1560 + * dl_server && dl_defer: 1561 + * 1562 + * 6 1563 + * +--------------------+ 1564 + * v | 1565 + * +-------------+ 4 +-----------+ 5 
+------------------+ 1566 + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | 1567 + * | +-------------+ +-----------+ +------------------+ 1568 + * | | | 1 ^ ^ | 1569 + * | | 1 +----------+ | 3 | 1570 + * | v | | 1571 + * | +--------------------------------+ 2 | 1572 + * | | | ----+ | 1573 + * | 8 | B:zero_laxity-wait | | | 1574 + * | | | <---+ | 1575 + * | +--------------------------------+ | 1576 + * | | ^ ^ 2 | 1577 + * | | 7 | 2 +--------------------+ 1578 + * | v | 1579 + * | +-------------+ | 1580 + * +-- | C:idle-wait | -+ 1581 + * +-------------+ 1582 + * ^ 7 | 1583 + * +---------+ 1584 + * 1585 + * 1586 + * [A] - init 1587 + * dl_server_active = 0 1588 + * dl_throttled = 0 1589 + * dl_defer_armed = 0 1590 + * dl_defer_running = 0/1 1591 + * dl_defer_idle = 0 1592 + * 1593 + * [B] - zero_laxity-wait 1594 + * dl_server_active = 1 1595 + * dl_throttled = 1 1596 + * dl_defer_armed = 1 1597 + * dl_defer_running = 0 1598 + * dl_defer_idle = 0 1599 + * 1600 + * [C] - idle-wait 1601 + * dl_server_active = 1 1602 + * dl_throttled = 1 1603 + * dl_defer_armed = 1 1604 + * dl_defer_running = 0 1605 + * dl_defer_idle = 1 1606 + * 1607 + * [D] - running 1608 + * dl_server_active = 1 1609 + * dl_throttled = 0 1610 + * dl_defer_armed = 0 1611 + * dl_defer_running = 1 1612 + * dl_defer_idle = 0 1613 + * 1614 + * [E] - replenish-wait 1615 + * dl_server_active = 1 1616 + * dl_throttled = 1 1617 + * dl_defer_armed = 0 1618 + * dl_defer_running = 1 1619 + * dl_defer_idle = 0 1620 + * 1621 + * 1622 + * [1] A->B, A->D 1623 + * dl_server_start() 1624 + * dl_server_active = 1; 1625 + * enqueue_dl_entity() 1626 + * update_dl_entity(WAKEUP) 1627 + * if (!dl_defer_running) 1628 + * dl_defer_armed = 1; 1629 + * dl_throttled = 1; 1630 + * if (dl_throttled && start_dl_timer()) 1631 + * return; // [B] 1632 + * __enqueue_dl_entity(); 1633 + * // [D] 1634 + * 1635 + * // deplete server runtime from client-class 1636 + * [2] B->B, C->B, E->B 1637 + * dl_server_update() 1638 + 
* update_curr_dl_se() // idle = false 1639 + * if (dl_defer_idle) 1640 + * dl_defer_idle = 0; 1641 + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) 1642 + * dl_defer_running = 0; 1643 + * hrtimer_try_to_cancel(); // stop timer 1644 + * replenish_dl_new_period() 1645 + * // fwd period 1646 + * dl_throttled = 1; 1647 + * dl_defer_armed = 1; 1648 + * start_dl_timer(); // restart timer 1649 + * // [B] 1650 + * 1651 + * // timer actually fires means we have runtime 1652 + * [3] B->D 1653 + * dl_server_timer() 1654 + * if (dl_defer_armed) 1655 + * dl_defer_running = 1; 1656 + * enqueue_dl_entity(REPLENISH) 1657 + * replenish_dl_entity() 1658 + * // fwd period 1659 + * if (dl_throttled) 1660 + * dl_throttled = 0; 1661 + * if (dl_defer_armed) 1662 + * dl_defer_armed = 0; 1663 + * __enqueue_dl_entity(); 1664 + * // [D] 1665 + * 1666 + * // schedule server 1667 + * [4] D->A 1668 + * pick_task_dl() 1669 + * p = server_pick_task(); 1670 + * if (!p) 1671 + * dl_server_stop() 1672 + * dequeue_dl_entity(); 1673 + * hrtimer_try_to_cancel(); 1674 + * dl_defer_armed = 0; 1675 + * dl_throttled = 0; 1676 + * dl_server_active = 0; 1677 + * // [A] 1678 + * return p; 1679 + * 1680 + * // server running 1681 + * [5] D->E 1682 + * update_curr_dl_se() 1683 + * if (dl_runtime_exceeded()) 1684 + * dl_throttled = 1; 1685 + * dequeue_dl_entity(); 1686 + * start_dl_timer(); 1687 + * // [E] 1688 + * 1689 + * // server replenished 1690 + * [6] E->D 1691 + * dl_server_timer() 1692 + * enqueue_dl_entity(REPLENISH) 1693 + * replenish_dl_entity() 1694 + * fwd-period 1695 + * if (dl_throttled) 1696 + * dl_throttled = 0; 1697 + * __enqueue_dl_entity(); 1698 + * // [D] 1699 + * 1700 + * // deplete server runtime from idle 1701 + * [7] B->C, C->C 1702 + * dl_server_update_idle() 1703 + * update_curr_dl_se() // idle = true 1704 + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) 1705 + * if (dl_defer_idle) 1706 + * return; 1707 + * dl_defer_running = 0; 1708 + * hrtimer_try_to_cancel(); 
1709 + * replenish_dl_new_period() 1710 + * // fwd period 1711 + * dl_throttled = 1; 1712 + * dl_defer_armed = 1; 1713 + * dl_defer_idle = 1; 1714 + * start_dl_timer(); // restart timer 1715 + * // [C] 1716 + * 1717 + * // stop idle server 1718 + * [8] C->A 1719 + * dl_server_timer() 1720 + * if (dl_defer_idle) 1721 + * dl_server_stop(); 1722 + * // [A] 1723 + * 1724 + * 1725 + * digraph dl_server { 1726 + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] 1727 + * "A:init" -> "D:running" [label="1:dl_server_start"] 1728 + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] 1729 + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] 1730 + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] 1731 + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] 1732 + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] 1733 + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] 1734 + * "D:running" -> "A:init" [label="4:pick_task_dl"] 1735 + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] 1736 + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] 1737 + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] 1738 + * } 1739 + * 1740 + * 1741 + * Notes: 1742 + * 1743 + * - When there are fair tasks running the most likely loop is [2]->[2]. 1744 + * The dl_server never actually runs, the timer never fires. 1745 + * 1746 + * - When there is actual fair starvation; the timer fires and starts the 1747 + * dl_server. This will then throttle and replenish like a normal DL 1748 + * task. Notably it will not 'defer' again. 1749 + * 1750 + * - When idle it will push the activation forward once, and then wait 1751 + * for the timer to hit or a non-idle update to restart things. 
1752 + */ 1608 1753 void dl_server_start(struct sched_dl_entity *dl_se) 1609 1754 { 1610 1755 struct rq *rq = dl_se->rq; 1611 1756 1612 1757 if (!dl_server(dl_se) || dl_se->dl_server_active) 1613 1758 return; 1759 + 1760 + /* 1761 + * Update the current task to 'now'. 1762 + */ 1763 + rq->donor->sched_class->update_curr(rq); 1614 1764 1615 1765 if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) 1616 1766 return; ··· 1810 1600 hrtimer_try_to_cancel(&dl_se->dl_timer); 1811 1601 dl_se->dl_defer_armed = 0; 1812 1602 dl_se->dl_throttled = 0; 1603 + dl_se->dl_defer_idle = 0; 1813 1604 dl_se->dl_server_active = 0; 1814 1605 } 1815 1606 ··· 2022 1811 if (!dl_rq->dl_nr_running) { 2023 1812 dl_rq->earliest_dl.curr = 0; 2024 1813 dl_rq->earliest_dl.next = 0; 2025 - cpudl_clear(&rq->rd->cpudl, rq->cpu); 1814 + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); 2026 1815 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); 2027 1816 } else { 2028 1817 struct rb_node *leftmost = rb_first_cached(&dl_rq->root); ··· 2259 2048 * or "inactive") 2260 2049 */ 2261 2050 if (flags & DEQUEUE_SLEEP) 2262 - task_non_contending(dl_se); 2051 + task_non_contending(dl_se, true); 2263 2052 } 2264 2053 2265 2054 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) ··· 2354 2143 * it and the bandwidth timer will wake it up and will give it 2355 2144 * new scheduling parameters (thanks to dl_yielded=1). 2356 2145 */ 2357 - rq->curr->dl.dl_yielded = 1; 2146 + rq->donor->dl.dl_yielded = 1; 2358 2147 2359 2148 update_rq_clock(rq); 2360 2149 update_curr_dl(rq); ··· 2384 2173 struct rq *rq; 2385 2174 2386 2175 if (!(flags & WF_TTWU)) 2387 - goto out; 2176 + return cpu; 2388 2177 2389 2178 rq = cpu_rq(cpu); 2390 2179 ··· 2422 2211 } 2423 2212 rcu_read_unlock(); 2424 2213 2425 - out: 2426 2214 return cpu; 2427 2215 } 2428 2216 ··· 2565 2355 * __pick_next_task_dl - Helper to pick the next -deadline task to run. 2566 2356 * @rq: The runqueue to pick the next task from. 
2567 2357 */ 2568 - static struct task_struct *__pick_task_dl(struct rq *rq) 2358 + static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) 2569 2359 { 2570 2360 struct sched_dl_entity *dl_se; 2571 2361 struct dl_rq *dl_rq = &rq->dl; ··· 2579 2369 WARN_ON_ONCE(!dl_se); 2580 2370 2581 2371 if (dl_server(dl_se)) { 2582 - p = dl_se->server_pick_task(dl_se); 2372 + p = dl_se->server_pick_task(dl_se, rf); 2583 2373 if (!p) { 2584 2374 dl_server_stop(dl_se); 2585 2375 goto again; ··· 2592 2382 return p; 2593 2383 } 2594 2384 2595 - static struct task_struct *pick_task_dl(struct rq *rq) 2385 + static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) 2596 2386 { 2597 - return __pick_task_dl(rq); 2387 + return __pick_task_dl(rq, rf); 2598 2388 } 2599 2389 2600 2390 static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) ··· 3093 2883 if (rq->dl.overloaded) 3094 2884 dl_set_overload(rq); 3095 2885 3096 - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); 3097 2886 if (rq->dl.dl_nr_running > 0) 3098 2887 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); 2888 + else 2889 + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); 3099 2890 } 3100 2891 3101 2892 /* Assumes rq->lock is held */ ··· 3105 2894 if (rq->dl.overloaded) 3106 2895 dl_clear_overload(rq); 3107 2896 3108 - cpudl_clear(&rq->rd->cpudl, rq->cpu); 3109 - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 2897 + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); 3110 2898 } 3111 2899 3112 2900 void __init init_sched_dl_class(void) ··· 3183 2973 * will reset the task parameters. 
3184 2974 */ 3185 2975 if (task_on_rq_queued(p) && p->dl.dl_runtime) 3186 - task_non_contending(&p->dl); 2976 + task_non_contending(&p->dl, false); 3187 2977 3188 2978 /* 3189 2979 * In case a task is setscheduled out from SCHED_DEADLINE we need to ··· 3255 3045 } 3256 3046 } 3257 3047 3048 + static u64 get_prio_dl(struct rq *rq, struct task_struct *p) 3049 + { 3050 + return p->dl.deadline; 3051 + } 3052 + 3258 3053 /* 3259 3054 * If the scheduling parameters of a -deadline task changed, 3260 3055 * a push or pull operation might be needed. 3261 3056 */ 3262 - static void prio_changed_dl(struct rq *rq, struct task_struct *p, 3263 - int oldprio) 3057 + static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) 3264 3058 { 3265 3059 if (!task_on_rq_queued(p)) 3266 3060 return; 3267 3061 3268 - /* 3269 - * This might be too much, but unfortunately 3270 - * we don't have the old deadline value, and 3271 - * we can't argue if the task is increasing 3272 - * or lowering its prio, so... 3273 - */ 3274 - if (!rq->dl.overloaded) 3062 + if (p->dl.deadline == old_deadline) 3063 + return; 3064 + 3065 + if (dl_time_before(old_deadline, p->dl.deadline)) 3275 3066 deadline_queue_pull_task(rq); 3276 3067 3277 3068 if (task_current_donor(rq, p)) { ··· 3305 3094 3306 3095 DEFINE_SCHED_CLASS(dl) = { 3307 3096 3097 + .queue_mask = 8, 3098 + 3308 3099 .enqueue_task = enqueue_task_dl, 3309 3100 .dequeue_task = dequeue_task_dl, 3310 3101 .yield_task = yield_task_dl, ··· 3329 3116 .task_tick = task_tick_dl, 3330 3117 .task_fork = task_fork_dl, 3331 3118 3119 + .get_prio = get_prio_dl, 3332 3120 .prio_changed = prio_changed_dl, 3333 3121 .switched_from = switched_from_dl, 3334 3122 .switched_to = switched_to_dl,

+4 -4

kernel/sched/debug.c

··· 796 796 797 797 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 798 798 { 799 - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; 799 + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; 800 800 struct sched_entity *last, *first, *root; 801 801 struct rq *rq = cpu_rq(cpu); 802 802 unsigned long flags; ··· 819 819 last = __pick_last_entity(cfs_rq); 820 820 if (last) 821 821 right_vruntime = last->vruntime; 822 - min_vruntime = cfs_rq->min_vruntime; 822 + zero_vruntime = cfs_rq->zero_vruntime; 823 823 raw_spin_rq_unlock_irqrestore(rq, flags); 824 824 825 825 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", 826 826 SPLIT_NS(left_deadline)); 827 827 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", 828 828 SPLIT_NS(left_vruntime)); 829 - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 830 - SPLIT_NS(min_vruntime)); 829 + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", 830 + SPLIT_NS(zero_vruntime)); 831 831 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", 832 832 SPLIT_NS(avg_vruntime(cfs_rq))); 833 833 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",

+36 -96

kernel/sched/ext.c

··· 1474 1474 static void yield_task_scx(struct rq *rq) 1475 1475 { 1476 1476 struct scx_sched *sch = scx_root; 1477 - struct task_struct *p = rq->curr; 1477 + struct task_struct *p = rq->donor; 1478 1478 1479 1479 if (SCX_HAS_OP(sch, yield)) 1480 1480 SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); ··· 1485 1485 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) 1486 1486 { 1487 1487 struct scx_sched *sch = scx_root; 1488 - struct task_struct *from = rq->curr; 1488 + struct task_struct *from = rq->donor; 1489 1489 1490 1490 if (SCX_HAS_OP(sch, yield)) 1491 1491 return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, ··· 2047 2047 2048 2048 lockdep_assert_rq_held(rq); 2049 2049 rq->scx.flags |= SCX_RQ_IN_BALANCE; 2050 - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); 2050 + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; 2051 2051 2052 2052 if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && 2053 2053 unlikely(rq->scx.cpu_released)) { ··· 2151 2151 has_tasks: 2152 2152 rq->scx.flags &= ~SCX_RQ_IN_BALANCE; 2153 2153 return true; 2154 - } 2155 - 2156 - static int balance_scx(struct rq *rq, struct task_struct *prev, 2157 - struct rq_flags *rf) 2158 - { 2159 - int ret; 2160 - 2161 - rq_unpin_lock(rq, rf); 2162 - 2163 - ret = balance_one(rq, prev); 2164 - 2165 - #ifdef CONFIG_SCHED_SMT 2166 - /* 2167 - * When core-sched is enabled, this ops.balance() call will be followed 2168 - * by pick_task_scx() on this CPU and the SMT siblings. Balance the 2169 - * siblings too. 
2170 - */ 2171 - if (sched_core_enabled(rq)) { 2172 - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); 2173 - int scpu; 2174 - 2175 - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { 2176 - struct rq *srq = cpu_rq(scpu); 2177 - struct task_struct *sprev = srq->curr; 2178 - 2179 - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); 2180 - update_rq_clock(srq); 2181 - balance_one(srq, sprev); 2182 - } 2183 - } 2184 - #endif 2185 - rq_repin_lock(rq, rf); 2186 - 2187 - maybe_queue_balance_callback(rq); 2188 - 2189 - return ret; 2190 2154 } 2191 2155 2192 2156 static void process_ddsp_deferred_locals(struct rq *rq) ··· 2332 2368 struct task_struct, scx.dsq_list.node); 2333 2369 } 2334 2370 2335 - static struct task_struct *pick_task_scx(struct rq *rq) 2371 + static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) 2336 2372 { 2337 2373 struct task_struct *prev = rq->curr; 2374 + bool keep_prev, kick_idle = false; 2338 2375 struct task_struct *p; 2339 - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 2340 - bool kick_idle = false; 2341 2376 2342 - /* 2343 - * WORKAROUND: 2344 - * 2345 - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just 2346 - * have gone through balance_scx(). Unfortunately, there currently is a 2347 - * bug where fair could say yes on balance() but no on pick_task(), 2348 - * which then ends up calling pick_task_scx() without preceding 2349 - * balance_scx(). 2350 - * 2351 - * Keep running @prev if possible and avoid stalling from entering idle 2352 - * without balancing. 2353 - * 2354 - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() 2355 - * if pick_task_scx() is called without preceding balance_scx(). 
2356 - */ 2357 - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { 2358 - if (prev->scx.flags & SCX_TASK_QUEUED) { 2359 - keep_prev = true; 2360 - } else { 2361 - keep_prev = false; 2362 - kick_idle = true; 2363 - } 2364 - } else if (unlikely(keep_prev && 2365 - prev->sched_class != &ext_sched_class)) { 2366 - /* 2367 - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is 2368 - * conditional on scx_enabled() and may have been skipped. 2369 - */ 2377 + rq_modified_clear(rq); 2378 + rq_unpin_lock(rq, rf); 2379 + balance_one(rq, prev); 2380 + rq_repin_lock(rq, rf); 2381 + maybe_queue_balance_callback(rq); 2382 + if (rq_modified_above(rq, &ext_sched_class)) 2383 + return RETRY_TASK; 2384 + 2385 + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; 2386 + if (unlikely(keep_prev && 2387 + prev->sched_class != &ext_sched_class)) { 2370 2388 WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); 2371 2389 keep_prev = false; 2372 2390 } ··· 2943 2997 p, p->scx.weight); 2944 2998 } 2945 2999 2946 - static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) 3000 + static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) 2947 3001 { 2948 3002 } 2949 3003 ··· 3216 3270 * their current sched_class. Call them directly from sched core instead. 
3217 3271 */ 3218 3272 DEFINE_SCHED_CLASS(ext) = { 3273 + .queue_mask = 1, 3274 + 3219 3275 .enqueue_task = enqueue_task_scx, 3220 3276 .dequeue_task = dequeue_task_scx, 3221 3277 .yield_task = yield_task_scx, ··· 3225 3277 3226 3278 .wakeup_preempt = wakeup_preempt_scx, 3227 3279 3228 - .balance = balance_scx, 3229 3280 .pick_task = pick_task_scx, 3230 3281 3231 3282 .put_prev_task = put_prev_task_scx, ··· 3765 3818 */ 3766 3819 list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, 3767 3820 scx.runnable_node) { 3768 - struct sched_enq_and_set_ctx ctx; 3769 - 3770 3821 /* cycling deq/enq is enough, see the function comment */ 3771 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3772 - sched_enq_and_set_task(&ctx); 3822 + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { 3823 + /* nothing */ ; 3824 + } 3773 3825 } 3774 3826 3775 3827 /* resched to restore ticks and idle state */ ··· 3918 3972 3919 3973 scx_task_iter_start(&sti); 3920 3974 while ((p = scx_task_iter_next_locked(&sti))) { 3975 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 3921 3976 const struct sched_class *old_class = p->sched_class; 3922 3977 const struct sched_class *new_class = 3923 3978 __setscheduler_class(p->policy, p->prio); 3924 - struct sched_enq_and_set_ctx ctx; 3925 3979 3926 - if (old_class != new_class && p->se.sched_delayed) 3927 - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 3980 + update_rq_clock(task_rq(p)); 3928 3981 3929 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 3982 + if (old_class != new_class) 3983 + queue_flags |= DEQUEUE_CLASS; 3930 3984 3931 - p->sched_class = new_class; 3932 - check_class_changing(task_rq(p), p, old_class); 3985 + scoped_guard (sched_change, p, queue_flags) { 3986 + p->sched_class = new_class; 3987 + } 3933 3988 3934 - sched_enq_and_set_task(&ctx); 3935 - 3936 - check_class_changed(task_rq(p), p, old_class, p->prio); 3937 3989 scx_exit_task(p); 3938 3990 
} 3939 3991 scx_task_iter_stop(&sti); ··· 4695 4751 percpu_down_write(&scx_fork_rwsem); 4696 4752 scx_task_iter_start(&sti); 4697 4753 while ((p = scx_task_iter_next_locked(&sti))) { 4754 + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; 4698 4755 const struct sched_class *old_class = p->sched_class; 4699 4756 const struct sched_class *new_class = 4700 4757 __setscheduler_class(p->policy, p->prio); 4701 - struct sched_enq_and_set_ctx ctx; 4702 4758 4703 4759 if (!tryget_task_struct(p)) 4704 4760 continue; 4705 4761 4706 - if (old_class != new_class && p->se.sched_delayed) 4707 - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); 4762 + if (old_class != new_class) 4763 + queue_flags |= DEQUEUE_CLASS; 4708 4764 4709 - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); 4765 + scoped_guard (sched_change, p, queue_flags) { 4766 + p->scx.slice = SCX_SLICE_DFL; 4767 + p->sched_class = new_class; 4768 + } 4710 4769 4711 - p->scx.slice = SCX_SLICE_DFL; 4712 - p->sched_class = new_class; 4713 - check_class_changing(task_rq(p), p, old_class); 4714 - 4715 - sched_enq_and_set_task(&ctx); 4716 - 4717 - check_class_changed(task_rq(p), p, old_class, p->prio); 4718 4770 put_task_struct(p); 4719 4771 } 4720 4772 scx_task_iter_stop(&sti);

+431 -169

kernel/sched/fair.c

··· 554 554 555 555 static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) 556 556 { 557 - return (s64)(se->vruntime - cfs_rq->min_vruntime); 557 + return (s64)(se->vruntime - cfs_rq->zero_vruntime); 558 558 } 559 559 560 560 #define __node_2_se(node) \ ··· 606 606 * 607 607 * Which we track using: 608 608 * 609 - * v0 := cfs_rq->min_vruntime 609 + * v0 := cfs_rq->zero_vruntime 610 610 * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime 611 611 * \Sum w_i := cfs_rq->avg_load 612 612 * 613 - * Since min_vruntime is a monotonic increasing variable that closely tracks 614 - * the per-task service, these deltas: (v_i - v), will be in the order of the 615 - * maximal (virtual) lag induced in the system due to quantisation. 613 + * Since zero_vruntime closely tracks the per-task service, these 614 + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag 615 + * induced in the system due to quantisation. 616 616 * 617 617 * Also, we use scale_load_down() to reduce the size. 
618 618 * ··· 671 671 avg = div_s64(avg, load); 672 672 } 673 673 674 - return cfs_rq->min_vruntime + avg; 674 + return cfs_rq->zero_vruntime + avg; 675 675 } 676 676 677 677 /* ··· 732 732 load += weight; 733 733 } 734 734 735 - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; 735 + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; 736 736 } 737 737 738 738 int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) ··· 740 740 return vruntime_eligible(cfs_rq, se->vruntime); 741 741 } 742 742 743 - static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) 743 + static void update_zero_vruntime(struct cfs_rq *cfs_rq) 744 744 { 745 - u64 min_vruntime = cfs_rq->min_vruntime; 746 - /* 747 - * open coded max_vruntime() to allow updating avg_vruntime 748 - */ 749 - s64 delta = (s64)(vruntime - min_vruntime); 750 - if (delta > 0) { 751 - avg_vruntime_update(cfs_rq, delta); 752 - min_vruntime = vruntime; 753 - } 754 - return min_vruntime; 755 - } 745 + u64 vruntime = avg_vruntime(cfs_rq); 746 + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); 756 747 757 - static void update_min_vruntime(struct cfs_rq *cfs_rq) 758 - { 759 - struct sched_entity *se = __pick_root_entity(cfs_rq); 760 - struct sched_entity *curr = cfs_rq->curr; 761 - u64 vruntime = cfs_rq->min_vruntime; 748 + avg_vruntime_update(cfs_rq, delta); 762 749 763 - if (curr) { 764 - if (curr->on_rq) 765 - vruntime = curr->vruntime; 766 - else 767 - curr = NULL; 768 - } 769 - 770 - if (se) { 771 - if (!curr) 772 - vruntime = se->min_vruntime; 773 - else 774 - vruntime = min_vruntime(vruntime, se->min_vruntime); 775 - } 776 - 777 - /* ensure we never gain time by being placed backwards. 
*/ 778 - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); 750 + cfs_rq->zero_vruntime = vruntime; 779 751 } 780 752 781 753 static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) ··· 820 848 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) 821 849 { 822 850 avg_vruntime_add(cfs_rq, se); 851 + update_zero_vruntime(cfs_rq); 823 852 se->min_vruntime = se->vruntime; 824 853 se->min_slice = se->slice; 825 854 rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ··· 832 859 rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, 833 860 &min_vruntime_cb); 834 861 avg_vruntime_sub(cfs_rq, se); 862 + update_zero_vruntime(cfs_rq); 835 863 } 836 864 837 865 struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) ··· 928 954 */ 929 955 if (cfs_rq->nr_queued == 1) 930 956 return curr && curr->on_rq ? curr : se; 957 + 958 + /* 959 + * Picking the ->next buddy will affect latency but not fairness. 960 + */ 961 + if (sched_feat(PICK_BUDDY) && 962 + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 963 + /* ->next will never be delayed */ 964 + WARN_ON_ONCE(cfs_rq->next->sched_delayed); 965 + return cfs_rq->next; 966 + } 931 967 932 968 if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) 933 969 curr = NULL; ··· 1177 1193 return delta_exec; 1178 1194 } 1179 1195 1196 + static void set_next_buddy(struct sched_entity *se); 1197 + 1180 1198 /* 1181 1199 * Used by other classes to account runtime. 1182 1200 */ ··· 1212 1226 1213 1227 curr->vruntime += calc_delta_fair(delta_exec, curr); 1214 1228 resched = update_deadline(cfs_rq, curr); 1215 - update_min_vruntime(cfs_rq); 1216 1229 1217 1230 if (entity_is_task(curr)) { 1218 1231 /* ··· 1224 1239 * against fair_server such that it can account for this time 1225 1240 * and possibly avoid running this period. 
1226 1241 */ 1227 - if (dl_server_active(&rq->fair_server)) 1228 - dl_server_update(&rq->fair_server, delta_exec); 1242 + dl_server_update(&rq->fair_server, delta_exec); 1229 1243 } 1230 1244 1231 1245 account_cfs_rq_runtime(cfs_rq, delta_exec); ··· 3792 3808 if (!curr) 3793 3809 __enqueue_entity(cfs_rq, se); 3794 3810 cfs_rq->nr_queued++; 3795 - 3796 - /* 3797 - * The entity's vruntime has been adjusted, so let's check 3798 - * whether the rq-wide min_vruntime needs updated too. Since 3799 - * the calculations above require stable min_vruntime rather 3800 - * than up-to-date one, we do the update at the end of the 3801 - * reweight process. 3802 - */ 3803 - update_min_vruntime(cfs_rq); 3804 3811 } 3805 3812 } 3806 3813 ··· 5404 5429 5405 5430 update_cfs_group(se); 5406 5431 5407 - /* 5408 - * Now advance min_vruntime if @se was the entity holding it back, 5409 - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be 5410 - * put back on, and if we advance min_vruntime, we'll be placed back 5411 - * further than we started -- i.e. we'll be penalized. 5412 - */ 5413 - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) 5414 - update_min_vruntime(cfs_rq); 5415 - 5416 5432 if (flags & DEQUEUE_DELAYED) 5417 5433 finish_delayed_dequeue_entity(se); 5418 5434 ··· 5477 5511 pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) 5478 5512 { 5479 5513 struct sched_entity *se; 5480 - 5481 - /* 5482 - * Picking the ->next buddy will affect latency but not fairness. 
5483 - */ 5484 - if (sched_feat(PICK_BUDDY) && 5485 - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { 5486 - /* ->next will never be delayed */ 5487 - WARN_ON_ONCE(cfs_rq->next->sched_delayed); 5488 - return cfs_rq->next; 5489 - } 5490 5514 5491 5515 se = pick_eevdf(cfs_rq); 5492 5516 if (se->sched_delayed) { ··· 6959 7003 h_nr_idle = 1; 6960 7004 } 6961 7005 6962 - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { 6963 - /* Account for idle runtime */ 6964 - if (!rq->nr_running) 6965 - dl_server_update_idle_time(rq, rq->curr); 7006 + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) 6966 7007 dl_server_start(&rq->fair_server); 6967 - } 6968 7008 6969 7009 /* At this point se is NULL and we are at root level*/ 6970 7010 add_nr_running(rq, 1); ··· 6986 7034 6987 7035 hrtick_update(rq); 6988 7036 } 6989 - 6990 - static void set_next_buddy(struct sched_entity *se); 6991 7037 6992 7038 /* 6993 7039 * Basically dequeue_task_fair(), except it can deal with dequeue_entity() ··· 8662 8712 set_task_max_allowed_capacity(p); 8663 8713 } 8664 8714 8665 - static int 8666 - balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 8667 - { 8668 - if (sched_fair_runnable(rq)) 8669 - return 1; 8670 - 8671 - return sched_balance_newidle(rq, rf) != 0; 8672 - } 8673 - 8674 8715 static void set_next_buddy(struct sched_entity *se) 8675 8716 { 8676 8717 for_each_sched_entity(se) { ··· 8673 8732 } 8674 8733 } 8675 8734 8735 + enum preempt_wakeup_action { 8736 + PREEMPT_WAKEUP_NONE, /* No preemption. */ 8737 + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ 8738 + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ 8739 + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ 8740 + }; 8741 + 8742 + static inline bool 8743 + set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, 8744 + struct sched_entity *pse, struct sched_entity *se) 8745 + { 8746 + /* 8747 + * Keep existing buddy if the deadline is sooner than pse. 
8748 + * The older buddy may be cache cold and completely unrelated 8749 + * to the current wakeup but that is unpredictable whereas 8750 + * obeying the deadline is more in line with EEVDF objectives. 8751 + */ 8752 + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) 8753 + return false; 8754 + 8755 + set_next_buddy(pse); 8756 + return true; 8757 + } 8758 + 8759 + /* 8760 + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not 8761 + * strictly enforced because the hint is either misunderstood or 8762 + * multiple tasks must be woken up. 8763 + */ 8764 + static inline enum preempt_wakeup_action 8765 + preempt_sync(struct rq *rq, int wake_flags, 8766 + struct sched_entity *pse, struct sched_entity *se) 8767 + { 8768 + u64 threshold, delta; 8769 + 8770 + /* 8771 + * WF_SYNC without WF_TTWU is not expected so warn if it happens even 8772 + * though it is likely harmless. 8773 + */ 8774 + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); 8775 + 8776 + threshold = sysctl_sched_migration_cost; 8777 + delta = rq_clock_task(rq) - se->exec_start; 8778 + if ((s64)delta < 0) 8779 + delta = 0; 8780 + 8781 + /* 8782 + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they 8783 + * could run on other CPUs. Reduce the threshold before preemption is 8784 + * allowed to an arbitrary lower value as it is more likely (but not 8785 + * guaranteed) the waker requires the wakee to finish. 8786 + */ 8787 + if (wake_flags & WF_RQ_SELECTED) 8788 + threshold >>= 2; 8789 + 8790 + /* 8791 + * As WF_SYNC is not strictly obeyed, allow some runtime for batch 8792 + * wakeups to be issued. 
8793 + */ 8794 + if (entity_before(pse, se) && delta >= threshold) 8795 + return PREEMPT_WAKEUP_RESCHED; 8796 + 8797 + return PREEMPT_WAKEUP_NONE; 8798 + } 8799 + 8676 8800 /* 8677 8801 * Preempt the current task with a newly woken task if needed: 8678 8802 */ 8679 8803 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) 8680 8804 { 8805 + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; 8681 8806 struct task_struct *donor = rq->donor; 8682 8807 struct sched_entity *se = &donor->se, *pse = &p->se; 8683 8808 struct cfs_rq *cfs_rq = task_cfs_rq(donor); 8684 8809 int cse_is_idle, pse_is_idle; 8685 - bool do_preempt_short = false; 8686 8810 8687 8811 if (unlikely(se == pse)) 8688 8812 return; ··· 8760 8754 */ 8761 8755 if (task_is_throttled(p)) 8762 8756 return; 8763 - 8764 - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { 8765 - set_next_buddy(pse); 8766 - } 8767 8757 8768 8758 /* 8769 8759 * We can come here with TIF_NEED_RESCHED already set from new task ··· 8792 8790 * When non-idle entity preempt an idle entity, 8793 8791 * don't give idle entity slice protection. 8794 8792 */ 8795 - do_preempt_short = true; 8793 + preempt_action = PREEMPT_WAKEUP_SHORT; 8796 8794 goto preempt; 8797 8795 } 8798 8796 ··· 8811 8809 * If @p has a shorter slice than current and @p is eligible, override 8812 8810 * current's slice protection in order to allow preemption. 8813 8811 */ 8814 - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); 8812 + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { 8813 + preempt_action = PREEMPT_WAKEUP_SHORT; 8814 + goto pick; 8815 + } 8815 8816 8817 + /* 8818 + * Ignore wakee preemption on WF_FORK as it is less likely that 8819 + * there is shared data as exec often follows fork. Do not 8820 + * preempt for tasks that are sched_delayed as it would violate 8821 + * EEVDF to forcibly queue an ineligible task. 
8822 + */ 8823 + if ((wake_flags & WF_FORK) || pse->sched_delayed) 8824 + return; 8825 + 8826 + /* 8827 + * If @p potentially is completing work required by current then 8828 + * consider preemption. 8829 + * 8830 + * Reschedule if waker is no longer eligible. */ 8831 + if (in_task() && !entity_eligible(cfs_rq, se)) { 8832 + preempt_action = PREEMPT_WAKEUP_RESCHED; 8833 + goto preempt; 8834 + } 8835 + 8836 + /* Prefer picking wakee soon if appropriate. */ 8837 + if (sched_feat(NEXT_BUDDY) && 8838 + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { 8839 + 8840 + /* 8841 + * Decide whether to obey WF_SYNC hint for a new buddy. Old 8842 + * buddies are ignored as they may not be relevant to the 8843 + * waker and less likely to be cache hot. 8844 + */ 8845 + if (wake_flags & WF_SYNC) 8846 + preempt_action = preempt_sync(rq, wake_flags, pse, se); 8847 + } 8848 + 8849 + switch (preempt_action) { 8850 + case PREEMPT_WAKEUP_NONE: 8851 + return; 8852 + case PREEMPT_WAKEUP_RESCHED: 8853 + goto preempt; 8854 + case PREEMPT_WAKEUP_SHORT: 8855 + fallthrough; 8856 + case PREEMPT_WAKEUP_PICK: 8857 + break; 8858 + } 8859 + 8860 + pick: 8816 8861 /* 8817 8862 * If @p has become the most eligible task, force preemption. 
8818 8863 */ 8819 - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) 8864 + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) 8820 8865 goto preempt; 8821 8866 8822 - if (sched_feat(RUN_TO_PARITY) && do_preempt_short) 8867 + if (sched_feat(RUN_TO_PARITY)) 8823 8868 update_protect_slice(cfs_rq, se); 8824 8869 8825 8870 return; 8826 8871 8827 8872 preempt: 8828 - if (do_preempt_short) 8873 + if (preempt_action == PREEMPT_WAKEUP_SHORT) 8829 8874 cancel_protect_slice(se); 8830 8875 8831 8876 resched_curr_lazy(rq); 8832 8877 } 8833 8878 8834 - static struct task_struct *pick_task_fair(struct rq *rq) 8879 + static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) 8835 8880 { 8836 8881 struct sched_entity *se; 8837 8882 struct cfs_rq *cfs_rq; ··· 8922 8873 int new_tasks; 8923 8874 8924 8875 again: 8925 - p = pick_task_fair(rq); 8876 + p = pick_task_fair(rq, rf); 8926 8877 if (!p) 8927 8878 goto idle; 8928 8879 se = &p->se; ··· 9001 8952 return NULL; 9002 8953 } 9003 8954 9004 - static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) 8955 + static struct task_struct * 8956 + fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) 9005 8957 { 9006 - return pick_next_task_fair(rq, prev, NULL); 9007 - } 9008 - 9009 - static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) 9010 - { 9011 - return pick_task_fair(dl_se->rq); 8958 + return pick_task_fair(dl_se->rq, rf); 9012 8959 } 9013 8960 9014 8961 void fair_server_init(struct rq *rq) ··· 9035 8990 */ 9036 8991 static void yield_task_fair(struct rq *rq) 9037 8992 { 9038 - struct task_struct *curr = rq->curr; 8993 + struct task_struct *curr = rq->donor; 9039 8994 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 9040 8995 struct sched_entity *se = &curr->se; 9041 8996 ··· 9059 9014 */ 9060 9015 rq_clock_skip_update(rq); 9061 9016 9062 - se->deadline += calc_delta_fair(se->slice, se); 9017 + /* 9018 + * Forfeit the 
remaining vruntime, only if the entity is eligible. This 9019 + * condition is necessary because in core scheduling we prefer to run 9020 + * ineligible tasks rather than force idling. If this happens we may 9021 + * end up in a loop where the core scheduler picks the yielding task, 9022 + * which yields immediately again; without the condition the vruntime 9023 + * ends up quickly running away. 9024 + */ 9025 + if (entity_eligible(cfs_rq, se)) { 9026 + se->vruntime = se->deadline; 9027 + se->deadline += calc_delta_fair(se->slice, se); 9028 + } 9063 9029 } 9064 9030 9065 9031 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) ··· 10734 10678 if (sd->flags & SD_ASYM_CPUCAPACITY) 10735 10679 sgs->group_misfit_task_load = 1; 10736 10680 10737 - for_each_cpu(i, sched_group_span(group)) { 10681 + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { 10738 10682 struct rq *rq = cpu_rq(i); 10739 10683 unsigned int local; 10740 10684 ··· 11786 11730 } 11787 11731 11788 11732 /* 11733 + * This flag serializes load-balancing passes over large domains 11734 + * (above the NODE topology level) - only one load-balancing instance 11735 + * may run at a time, to reduce overhead on very large systems with 11736 + * lots of CPUs and large NUMA distances. 11737 + * 11738 + * - Note that load-balancing passes triggered while another one 11739 + * is executing are skipped and not re-tried. 11740 + * 11741 + * - Also note that this does not serialize rebalance_domains() 11742 + * execution, as non-SD_SERIALIZE domains will still be 11743 + * load-balanced in parallel. 11744 + */ 11745 + static atomic_t sched_balance_running = ATOMIC_INIT(0); 11746 + 11747 + /* 11789 11748 * Check this_cpu to ensure it is balanced within domain. Attempt to move 11790 11749 * tasks if there is an imbalance. 
11791 11750 */ ··· 11825 11754 .fbq_type = all, 11826 11755 .tasks = LIST_HEAD_INIT(env.tasks), 11827 11756 }; 11757 + bool need_unlock = false; 11828 11758 11829 11759 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); 11830 11760 ··· 11835 11763 if (!should_we_balance(&env)) { 11836 11764 *continue_balancing = 0; 11837 11765 goto out_balanced; 11766 + } 11767 + 11768 + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { 11769 + int zero = 0; 11770 + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) 11771 + goto out_balanced; 11772 + 11773 + need_unlock = true; 11838 11774 } 11839 11775 11840 11776 group = sched_balance_find_src_group(&env); ··· 12085 12005 sd->balance_interval < sd->max_interval) 12086 12006 sd->balance_interval *= 2; 12087 12007 out: 12008 + if (need_unlock) 12009 + atomic_set_release(&sched_balance_running, 0); 12010 + 12088 12011 return ld_moved; 12089 12012 } 12090 12013 ··· 12213 12130 } 12214 12131 12215 12132 /* 12216 - * This flag serializes load-balancing passes over large domains 12217 - * (above the NODE topology level) - only one load-balancing instance 12218 - * may run at a time, to reduce overhead on very large systems with 12219 - * lots of CPUs and large NUMA distances. 12220 - * 12221 - * - Note that load-balancing passes triggered while another one 12222 - * is executing are skipped and not re-tried. 12223 - * 12224 - * - Also note that this does not serialize rebalance_domains() 12225 - * execution, as non-SD_SERIALIZE domains will still be 12226 - * load-balanced in parallel. 12227 - */ 12228 - static atomic_t sched_balance_running = ATOMIC_INIT(0); 12229 - 12230 - /* 12231 12133 * Scale the max sched_balance_rq interval with the number of CPUs in the system. 12232 12134 * This trades load-balance latency on larger machines for less cross talk. 
12233 12135 */ ··· 12221 12153 max_load_balance_interval = HZ*num_online_cpus()/10; 12222 12154 } 12223 12155 12224 - static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) 12156 + static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) 12225 12157 { 12158 + sd->newidle_call++; 12159 + sd->newidle_success += success; 12160 + 12161 + if (sd->newidle_call >= 1024) { 12162 + sd->newidle_ratio = sd->newidle_success; 12163 + sd->newidle_call /= 2; 12164 + sd->newidle_success /= 2; 12165 + } 12166 + } 12167 + 12168 + static inline bool 12169 + update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) 12170 + { 12171 + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; 12172 + unsigned long now = jiffies; 12173 + 12174 + if (cost) 12175 + update_newidle_stats(sd, success); 12176 + 12226 12177 if (cost > sd->max_newidle_lb_cost) { 12227 12178 /* 12228 12179 * Track max cost of a domain to make sure to not delay the 12229 12180 * next wakeup on the CPU. 12230 - * 12231 - * sched_balance_newidle() bumps the cost whenever newidle 12232 - * balance fails, and we don't want things to grow out of 12233 - * control. Use the sysctl_sched_migration_cost as the upper 12234 - * limit, plus a litle extra to avoid off by ones. 12235 12181 */ 12236 - sd->max_newidle_lb_cost = 12237 - min(cost, sysctl_sched_migration_cost + 200); 12238 - sd->last_decay_max_lb_cost = jiffies; 12239 - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { 12182 + sd->max_newidle_lb_cost = cost; 12183 + sd->last_decay_max_lb_cost = now; 12184 + 12185 + } else if (time_after(now, next_decay)) { 12240 12186 /* 12241 12187 * Decay the newidle max times by ~1% per second to ensure that 12242 12188 * it is not outdated and the current max cost is actually 12243 12189 * shorter. 
12244 12190 */ 12245 12191 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; 12246 - sd->last_decay_max_lb_cost = jiffies; 12247 - 12192 + sd->last_decay_max_lb_cost = now; 12248 12193 return true; 12249 12194 } 12250 12195 ··· 12280 12199 /* Earliest time when we have to do rebalance again */ 12281 12200 unsigned long next_balance = jiffies + 60*HZ; 12282 12201 int update_next_balance = 0; 12283 - int need_serialize, need_decay = 0; 12202 + int need_decay = 0; 12284 12203 u64 max_cost = 0; 12285 12204 12286 12205 rcu_read_lock(); ··· 12289 12208 * Decay the newidle max times here because this is a regular 12290 12209 * visit to all the domains. 12291 12210 */ 12292 - need_decay = update_newidle_cost(sd, 0); 12211 + need_decay = update_newidle_cost(sd, 0, 0); 12293 12212 max_cost += sd->max_newidle_lb_cost; 12294 12213 12295 12214 /* ··· 12304 12223 } 12305 12224 12306 12225 interval = get_sd_balance_interval(sd, busy); 12307 - 12308 - need_serialize = sd->flags & SD_SERIALIZE; 12309 - if (need_serialize) { 12310 - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) 12311 - goto out; 12312 - } 12313 - 12314 12226 if (time_after_eq(jiffies, sd->last_balance + interval)) { 12315 12227 if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { 12316 12228 /* ··· 12317 12243 sd->last_balance = jiffies; 12318 12244 interval = get_sd_balance_interval(sd, busy); 12319 12245 } 12320 - if (need_serialize) 12321 - atomic_set_release(&sched_balance_running, 0); 12322 - out: 12323 12246 if (time_after(next_balance, sd->last_balance + interval)) { 12324 12247 next_balance = sd->last_balance + interval; 12325 12248 update_next_balance = 1; ··· 12895 12824 12896 12825 rcu_read_lock(); 12897 12826 sd = rcu_dereference_check_sched_domain(this_rq->sd); 12827 + if (!sd) { 12828 + rcu_read_unlock(); 12829 + goto out; 12830 + } 12898 12831 12899 12832 if (!get_rd_overloaded(this_rq->rd) || 12900 - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { 12833 + 
this_rq->avg_idle < sd->max_newidle_lb_cost) { 12901 12834 12902 - if (sd) 12903 - update_next_balance(sd, &next_balance); 12835 + update_next_balance(sd, &next_balance); 12904 12836 rcu_read_unlock(); 12905 - 12906 12837 goto out; 12907 12838 } 12908 12839 rcu_read_unlock(); 12909 12840 12841 + rq_modified_clear(this_rq); 12910 12842 raw_spin_rq_unlock(this_rq); 12911 12843 12912 12844 t0 = sched_clock_cpu(this_cpu); ··· 12925 12851 break; 12926 12852 12927 12853 if (sd->flags & SD_BALANCE_NEWIDLE) { 12854 + unsigned int weight = 1; 12855 + 12856 + if (sched_feat(NI_RANDOM)) { 12857 + /* 12858 + * Throw a 1k sided dice; and only run 12859 + * newidle_balance according to the success 12860 + * rate. 12861 + */ 12862 + u32 d1k = sched_rng() % 1024; 12863 + weight = 1 + sd->newidle_ratio; 12864 + if (d1k > weight) { 12865 + update_newidle_stats(sd, 0); 12866 + continue; 12867 + } 12868 + weight = (1024 + weight/2) / weight; 12869 + } 12928 12870 12929 12871 pulled_task = sched_balance_rq(this_cpu, this_rq, 12930 12872 sd, CPU_NEWLY_IDLE, ··· 12952 12862 t0 = t1; 12953 12863 12954 12864 /* 12955 - * Failing newidle means it is not effective; 12956 - * bump the cost so we end up doing less of it. 12865 + * Track max cost of a domain to make sure to not delay the 12866 + * next wakeup on the CPU. 12957 12867 */ 12958 - if (!pulled_task) 12959 - domain_cost = (3 * sd->max_newidle_lb_cost) / 2; 12960 - 12961 - update_newidle_cost(sd, domain_cost); 12868 + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); 12962 12869 } 12963 12870 12964 12871 /* ··· 12980 12893 if (this_rq->cfs.h_nr_queued && !pulled_task) 12981 12894 pulled_task = 1; 12982 12895 12983 - /* Is there a task of a high priority class? 
*/ 12984 - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) 12896 + /* If a higher prio class was modified, restart the pick */ 12897 + if (rq_modified_above(this_rq, &fair_sched_class)) 12898 + pulled_task = -1; 12986 12899 12987 12900 out: ··· 13099 13012 } 13100 13013 13101 13014 /* 13102 - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. 13015 + * Consider any infeasible weight scenario. Take for instance two tasks, 13016 + * each bound to their respective sibling, one with weight 1 and one with 13017 + * weight 2. Then the lower weight task will run ahead of the higher weight 13018 + * task without bound. 13019 + * 13020 + * This utterly destroys the concept of a shared time base. 13021 + * 13022 + * Remember; all this is about a proportionally fair scheduling, where each 13023 + * task receives: 13024 + * 13025 + * w_i 13026 + * dt_i = ---------- dt (1) 13027 + * \Sum_j w_j 13028 + * 13029 + * which we do by tracking a virtual time, s_i: 13030 + * 13031 + * 1 13032 + * s_i = --- d[t]_i (2) 13033 + * w_i 13034 + * 13035 + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. 13036 + * The immediate corollary is that the ideal schedule S, where (2) to use 13037 + * an infinitesimal delta, is: 13038 + * 13039 + * 1 13040 + * S = ---------- dt (3) 13041 + * \Sum_i w_i 13042 + * 13043 + * From which we can define the lag, or deviation from the ideal, as: 13044 + * 13045 + * lag(i) = S - s_i (4) 13046 + * 13047 + * And since the one and only purpose is to approximate S, we get that: 13048 + * 13049 + * \Sum_i w_i lag(i) := 0 (5) 13050 + * 13051 + * If this were not so, we no longer converge to S, and we can no longer 13052 + * claim our scheduler has any of the properties we derive from S. This is 13053 + * exactly what you did above, you broke it! 13054 + * 13055 + * 13056 + * Let's continue for a while though; to see if there is anything useful to 13057 + * be learned. 
We can combine (1)-(3) or (4)-(5) and express S in s_i: 13058 + * 13059 + * \Sum_i w_i s_i 13060 + * S = -------------- (6) 13061 + * \Sum_i w_i 13062 + * 13063 + * Which gives us a way to compute S, given our s_i. Now, if you've read 13064 + * our code, you know that we do not in fact do this, the reason for this 13065 + * is two-fold. Firstly, computing S in that way requires a 64bit division 13066 + * for every time we'd use it (see 12), and secondly, this only describes 13067 + * the steady-state, it doesn't handle dynamics. 13068 + * 13069 + * Anyway, in (6): s_i -> x + (s_i - x), to get: 13070 + * 13071 + * \Sum_i w_i (s_i - x) 13072 + * S - x = -------------------- (7) 13073 + * \Sum_i w_i 13074 + * 13075 + * Which shows that S and s_i transform alike (which makes perfect sense 13076 + * given that S is basically the (weighted) average of s_i). 13077 + * 13078 + * So the thing to remember is that the above is strictly UP. It is 13079 + * possible to generalize to multiple runqueues -- however it gets really 13080 + * yuck when you have to add affinity support, as illustrated by our very 13081 + * first counter-example. 13082 + * 13083 + * Luckily I think we can avoid needing a full multi-queue variant for 13084 + * core-scheduling (or load-balancing). The crucial observation is that we 13085 + * only actually need this comparison in the presence of forced-idle; only 13086 + * then do we need to tell if the stalled rq has higher priority over the 13087 + * other. 13088 + * 13089 + * [XXX assumes SMT2; better consider the more general case, I suspect 13090 + * it'll work out because our comparison is always between 2 rqs and the 13091 + * answer is only interesting if one of them is forced-idle] 13092 + * 13093 + * And (under assumption of SMT2) when there is forced-idle, there is only 13094 + * a single queue, so everything works like normal. 
13095 + * 13096 + * Let, for our runqueue 'k': 13097 + * 13098 + * T_k = \Sum_i w_i s_i 13099 + * W_k = \Sum_i w_i ; for all i of k (8) 13100 + * 13101 + * Then we can write (6) like: 13102 + * 13103 + * T_k 13104 + * S_k = --- (9) 13105 + * W_k 13106 + * 13107 + * From which immediately follows that: 13108 + * 13109 + * T_k + T_l 13110 + * S_k+l = --------- (10) 13111 + * W_k + W_l 13112 + * 13113 + * On which we can define a combined lag: 13114 + * 13115 + * lag_k+l(i) := S_k+l - s_i (11) 13116 + * 13117 + * And that gives us the tools to compare tasks across a combined runqueue. 13118 + * 13119 + * 13120 + * Combined this gives the following: 13121 + * 13122 + * a) when a runqueue enters force-idle, sync it against its sibling rq(s) 13123 + * using (7); this only requires storing single 'time'-stamps. 13124 + * 13125 + * b) when comparing tasks between 2 runqueues of which one is forced-idle, 13126 + * compare the combined lag, per (11). 13127 + * 13128 + * Now, of course cgroups (I so hate them) make this more interesting in 13129 + * that a) seems to suggest we need to iterate all cgroups on a CPU at such 13130 + * boundaries, but I think we can avoid that. The force-idle is for the 13131 + * whole CPU, all its rqs. So we can mark it in the root and lazily 13132 + * propagate downward on demand. 13133 + */ 13134 + 13135 + /* 13136 + * So this sync is basically a relative reset of S to 0. 13137 + * 13138 + * So with 2 queues, when one goes idle, we drop them both to 0 and one 13139 + * then increases due to not being idle, and the idle one builds up lag to 13140 + * get re-elected. So far so simple, right? 13141 + * 13142 + * When there's 3, we can have the situation where 2 run and one is idle, 13143 + * we sync to 0 and let the idle one build up lag to get re-election. Now 13144 + * suppose another one also drops idle. At this point dropping all to 0 13145 + * again would destroy the built-up lag from the queue that was already 13146 + * idle, not good. 
13147 + * 13148 + * So instead of syncing everything, we can: 13149 + * 13150 + * less := !((s64)(s_a - s_b) <= 0) 13151 + * 13152 + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b 13153 + * == v_a - (v_b - S_a + S_b) 13154 + * 13155 + * IOW, we can recast the (lag) comparison to a one-sided difference. 13156 + * So if then, instead of syncing the whole queue, sync the idle queue 13157 + * against the active queue with S_a + S_b at the point where we sync. 13158 + * 13159 + * (XXX consider the implication of living in a cyclic group: N / 2^n N) 13160 + * 13161 + * This gives us means of syncing single queues against the active queue, 13162 + * and for already idle queues to preserve their build-up lag. 13163 + * 13164 + * Of course, then we get the situation where there's 2 active and one 13165 + * going idle, who do we pick to sync against? Theory would have us sync 13166 + * against the combined S, but as we've already demonstrated, there is no 13167 + * such thing in infeasible weight scenarios. 13168 + * 13169 + * One thing I've considered; and this is where that core_active rudiment 13170 + * came from, is having active queues sync up between themselves after 13171 + * every tick. This limits the observed divergence due to the work 13172 + * conservancy. 13173 + * 13174 + * On top of that, we can improve upon things by employing (10) here. 13175 + */ 13176 + 13177 + /* 13178 + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. 
13103 13179 */ 13104 13180 static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, 13105 13181 bool forceidle) ··· 13276 13026 cfs_rq->forceidle_seq = fi_seq; 13277 13027 } 13278 13028 13279 - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; 13029 + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; 13280 13030 } 13281 13031 } 13282 13032 ··· 13329 13079 13330 13080 /* 13331 13081 * Find delta after normalizing se's vruntime with its cfs_rq's 13332 - * min_vruntime_fi, which would have been updated in prior calls 13082 + * zero_vruntime_fi, which would have been updated in prior calls 13333 13083 * to se_fi_update(). 13334 13084 */ 13335 13085 delta = (s64)(sea->vruntime - seb->vruntime) + 13336 - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); 13086 + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); 13337 13087 13338 13088 return delta > 0; 13339 13089 } ··· 13395 13145 * the current task. 13396 13146 */ 13397 13147 static void 13398 - prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 13148 + prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) 13399 13149 { 13400 13150 if (!task_on_rq_queued(p)) 13151 + return; 13152 + 13153 + if (p->prio == oldprio) 13401 13154 return; 13402 13155 13403 13156 if (rq->cfs.nr_queued == 1) ··· 13414 13161 if (task_current_donor(rq, p)) { 13415 13162 if (p->prio > oldprio) 13416 13163 resched_curr(rq); 13417 - } else 13164 + } else { 13418 13165 wakeup_preempt(rq, p, 0); 13166 + } 13419 13167 } 13420 13168 13421 13169 #ifdef CONFIG_FAIR_GROUP_SCHED ··· 13500 13246 attach_entity_cfs_rq(se); 13501 13247 } 13502 13248 13249 + static void switching_from_fair(struct rq *rq, struct task_struct *p) 13250 + { 13251 + if (p->se.sched_delayed) 13252 + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 13253 + } 13254 + 13503 13255 static void switched_from_fair(struct rq *rq, struct task_struct *p) 13504 13256 { 13505 13257 
detach_task_cfs_rq(p); ··· 13579 13319 void init_cfs_rq(struct cfs_rq *cfs_rq) 13580 13320 { 13581 13321 cfs_rq->tasks_timeline = RB_ROOT_CACHED; 13582 - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 13322 + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); 13583 13323 raw_spin_lock_init(&cfs_rq->removed.lock); 13584 13324 } 13585 13325 ··· 13880 13620 */ 13881 13621 DEFINE_SCHED_CLASS(fair) = { 13882 13622 13623 + .queue_mask = 2, 13624 + 13883 13625 .enqueue_task = enqueue_task_fair, 13884 13626 .dequeue_task = dequeue_task_fair, 13885 13627 .yield_task = yield_task_fair, ··· 13890 13628 .wakeup_preempt = check_preempt_wakeup_fair, 13891 13629 13892 13630 .pick_task = pick_task_fair, 13893 - .pick_next_task = __pick_next_task_fair, 13631 + .pick_next_task = pick_next_task_fair, 13894 13632 .put_prev_task = put_prev_task_fair, 13895 13633 .set_next_task = set_next_task_fair, 13896 13634 13897 - .balance = balance_fair, 13898 13635 .select_task_rq = select_task_rq_fair, 13899 13636 .migrate_task_rq = migrate_task_rq_fair, 13900 13637 ··· 13908 13647 13909 13648 .reweight_task = reweight_task_fair, 13910 13649 .prio_changed = prio_changed_fair, 13650 + .switching_from = switching_from_fair, 13911 13651 .switched_from = switched_from_fair, 13912 13652 .switched_to = switched_to_fair, 13913 13653

+6 -1

kernel/sched/features.h

··· 29 29 * wakeup-preemption), since it's likely going to consume data we 30 30 * touched, increases cache locality. 31 31 */ 32 - SCHED_FEAT(NEXT_BUDDY, false) 32 + SCHED_FEAT(NEXT_BUDDY, true) 33 33 34 34 /* 35 35 * Allow completely ignoring cfs_rq->next; which can be set from various ··· 121 121 SCHED_FEAT(UTIL_EST, true) 122 122 123 123 SCHED_FEAT(LATENCY_WARN, false) 124 + 125 + /* 126 + * Do newidle balancing proportional to its success rate using randomization. 127 + */ 128 + SCHED_FEAT(NI_RANDOM, true)

+24 -5

kernel/sched/idle.c

··· 452 452 resched_curr(rq); 453 453 } 454 454 455 + static void update_curr_idle(struct rq *rq); 456 + 455 457 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) 456 458 { 457 - dl_server_update_idle_time(rq, prev); 459 + update_curr_idle(rq); 458 460 scx_update_idle(rq, false, true); 459 461 } 460 462 ··· 468 466 next->se.exec_start = rq_clock_task(rq); 469 467 } 470 468 471 - struct task_struct *pick_task_idle(struct rq *rq) 469 + struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) 472 470 { 473 471 scx_update_idle(rq, true, false); 474 472 return rq->idle; ··· 498 496 */ 499 497 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 500 498 { 499 + update_curr_idle(rq); 501 500 } 502 501 503 - static void switched_to_idle(struct rq *rq, struct task_struct *p) 502 + static void switching_to_idle(struct rq *rq, struct task_struct *p) 504 503 { 505 504 BUG(); 506 505 } 507 506 508 507 static void 509 - prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) 508 + prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) 510 509 { 510 + if (p->prio == oldprio) 511 + return; 512 + 511 513 BUG(); 512 514 } 513 515 514 516 static void update_curr_idle(struct rq *rq) 515 517 { 518 + struct sched_entity *se = &rq->idle->se; 519 + u64 now = rq_clock_task(rq); 520 + s64 delta_exec; 521 + 522 + delta_exec = now - se->exec_start; 523 + if (unlikely(delta_exec <= 0)) 524 + return; 525 + 526 + se->exec_start = now; 527 + 528 + dl_server_update_idle(&rq->fair_server, delta_exec); 516 529 } 517 530 518 531 /* 519 532 * Simple, special scheduling class for the per-CPU idle tasks: 520 533 */ 521 534 DEFINE_SCHED_CLASS(idle) = { 535 + 536 + .queue_mask = 0, 522 537 523 538 /* no enqueue/yield_task for idle tasks */ 524 539 ··· 555 536 .task_tick = task_tick_idle, 556 537 557 538 .prio_changed = prio_changed_idle, 558 - .switched_to = switched_to_idle, 539 + 
.switching_to = switching_to_idle, 559 540 .update_curr = update_curr_idle, 560 541 };

+9 -4

kernel/sched/rt.c

··· 1490 1490 1491 1491 static void yield_task_rt(struct rq *rq) 1492 1492 { 1493 - requeue_task_rt(rq, rq->curr, 0); 1493 + requeue_task_rt(rq, rq->donor, 0); 1494 1494 } 1495 1495 1496 1496 static int find_lowest_rq(struct task_struct *task); ··· 1695 1695 return rt_task_of(rt_se); 1696 1696 } 1697 1697 1698 - static struct task_struct *pick_task_rt(struct rq *rq) 1698 + static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) 1699 1699 { 1700 1700 struct task_struct *p; 1701 1701 ··· 2437 2437 * us to initiate a push or pull. 2438 2438 */ 2439 2439 static void 2440 - prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 2440 + prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) 2441 2441 { 2442 2442 if (!task_on_rq_queued(p)) 2443 + return; 2444 + 2445 + if (p->prio == oldprio) 2443 2446 return; 2444 2447 2445 2448 if (task_current_donor(rq, p)) { ··· 2569 2566 2570 2567 DEFINE_SCHED_CLASS(rt) = { 2571 2568 2569 + .queue_mask = 4, 2570 + 2572 2571 .enqueue_task = enqueue_task_rt, 2573 2572 .dequeue_task = dequeue_task_rt, 2574 2573 .yield_task = yield_task_rt, ··· 2594 2589 2595 2590 .get_rr_interval = get_rr_interval_rt, 2596 2591 2597 - .prio_changed = prio_changed_rt, 2598 2592 .switched_to = switched_to_rt, 2593 + .prio_changed = prio_changed_rt, 2599 2594 2600 2595 .update_curr = update_curr_rt, 2601 2596

+208 -59

kernel/sched/sched.h

··· 5 5 #ifndef _KERNEL_SCHED_SCHED_H 6 6 #define _KERNEL_SCHED_SCHED_H 7 7 8 + #include <linux/prandom.h> 8 9 #include <linux/sched/affinity.h> 9 10 #include <linux/sched/autogroup.h> 10 11 #include <linux/sched/cpufreq.h> ··· 21 20 #include <linux/sched/task_flags.h> 22 21 #include <linux/sched/task.h> 23 22 #include <linux/sched/topology.h> 24 - 25 23 #include <linux/atomic.h> 26 24 #include <linux/bitmap.h> 27 25 #include <linux/bug.h> ··· 405 405 * naturally throttled to once per period, avoiding high context switch 406 406 * workloads from spamming the hrtimer program/cancel paths. 407 407 */ 408 + extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); 408 409 extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); 409 410 extern void dl_server_start(struct sched_dl_entity *dl_se); 410 411 extern void dl_server_stop(struct sched_dl_entity *dl_se); ··· 413 412 dl_server_pick_f pick_task); 414 413 extern void sched_init_dl_servers(void); 415 414 416 - extern void dl_server_update_idle_time(struct rq *rq, 417 - struct task_struct *p); 418 415 extern void fair_server_init(struct rq *rq); 419 416 extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); 420 417 extern int dl_server_apply_params(struct sched_dl_entity *dl_se, ··· 681 682 s64 avg_vruntime; 682 683 u64 avg_load; 683 684 684 - u64 min_vruntime; 685 + u64 zero_vruntime; 685 686 #ifdef CONFIG_SCHED_CORE 686 687 unsigned int forceidle_seq; 687 - u64 min_vruntime_fi; 688 + u64 zero_vruntime_fi; 688 689 #endif 689 690 690 691 struct rb_root_cached tasks_timeline; ··· 779 780 */ 780 781 SCX_RQ_ONLINE = 1 << 0, 781 782 SCX_RQ_CAN_STOP_TICK = 1 << 1, 782 - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ 783 783 SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ 784 784 SCX_RQ_BYPASSING = 1 << 4, 785 785 SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ ··· 1118 1120 /* runqueue lock: */ 1119 1121 
raw_spinlock_t __lock; 1120 1122 1123 + /* Per class runqueue modification mask; bits in class order. */ 1124 + unsigned int queue_mask; 1121 1125 unsigned int nr_running; 1122 1126 #ifdef CONFIG_NUMA_BALANCING 1123 1127 unsigned int nr_numa_running; ··· 1349 1349 } 1350 1350 1351 1351 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 1352 + DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); 1353 + 1354 + static inline u32 sched_rng(void) 1355 + { 1356 + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); 1357 + } 1352 1358 1353 1359 #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 1354 1360 #define this_rq() this_cpu_ptr(&runqueues) ··· 1438 1432 if (!sched_core_enabled(rq)) 1439 1433 return true; 1440 1434 1435 + if (rq->core->core_cookie == p->core_cookie) 1436 + return true; 1437 + 1441 1438 for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { 1442 1439 if (!available_idle_cpu(cpu)) { 1443 1440 idle_core = false; ··· 1452 1443 * A CPU in an idle core is always the best choice for tasks with 1453 1444 * cookies. 
1454 1445 */ 1455 - return idle_core || rq->core->core_cookie == p->core_cookie; 1446 + return idle_core; 1456 1447 } 1457 1448 1458 1449 static inline bool sched_group_cookie_match(struct rq *rq, ··· 1836 1827 __acquires(p->pi_lock) 1837 1828 __acquires(rq->lock); 1838 1829 1839 - static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) 1830 + static inline void 1831 + __task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) 1840 1832 __releases(rq->lock) 1841 1833 { 1842 1834 rq_unpin_lock(rq, rf); ··· 1849 1839 __releases(rq->lock) 1850 1840 __releases(p->pi_lock) 1851 1841 { 1852 - rq_unpin_lock(rq, rf); 1853 - raw_spin_rq_unlock(rq); 1842 + __task_rq_unlock(rq, p, rf); 1854 1843 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); 1855 1844 } 1856 1845 1857 1846 DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, 1858 1847 _T->rq = task_rq_lock(_T->lock, &_T->rf), 1859 1848 task_rq_unlock(_T->rq, _T->lock, &_T->rf), 1849 + struct rq *rq; struct rq_flags rf) 1850 + 1851 + DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, 1852 + _T->rq = __task_rq_lock(_T->lock, &_T->rf), 1853 + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), 1860 1854 struct rq *rq; struct rq_flags rf) 1861 1855 1862 1856 static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ··· 2356 2342 /* 2357 2343 * {de,en}queue flags: 2358 2344 * 2359 - * DEQUEUE_SLEEP - task is no longer runnable 2360 - * ENQUEUE_WAKEUP - task just became runnable 2345 + * SLEEP/WAKEUP - task is no-longer/just-became runnable 2361 2346 * 2362 2347 * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks 2363 2348 * are in a known state which allows modification. Such pairs ··· 2369 2356 * 2370 2357 * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) 2371 2358 * 2359 + * DELAYED - de/re-queue a sched_delayed task 2360 + * 2361 + * CLASS - going to update p->sched_class; makes sched_change call the 2362 + * various switch methods. 
2363 + * 2372 2364 * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) 2373 2365 * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) 2374 2366 * ENQUEUE_MIGRATED - the task was migrated during wakeup 2375 2367 * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called 2376 2368 * 2369 + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but 2370 + * SCHED_DEADLINE seems to rely on this for now. 2377 2371 */ 2378 2372 2379 - #define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ 2380 - #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ 2381 - #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ 2382 - #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ 2383 - #define DEQUEUE_SPECIAL 0x10 2384 - #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2385 - #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2386 - #define DEQUEUE_THROTTLE 0x800 2373 + #define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ 2374 + #define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ 2375 + #define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ 2376 + #define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ 2387 2377 2388 - #define ENQUEUE_WAKEUP 0x01 2389 - #define ENQUEUE_RESTORE 0x02 2390 - #define ENQUEUE_MOVE 0x04 2391 - #define ENQUEUE_NOCLOCK 0x08 2378 + #define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ 2379 + #define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ 2380 + #define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ 2392 2381 2393 - #define ENQUEUE_HEAD 0x10 2394 - #define ENQUEUE_REPLENISH 0x20 2395 - #define ENQUEUE_MIGRATED 0x40 2396 - #define ENQUEUE_INITIAL 0x80 2397 - #define ENQUEUE_MIGRATING 0x100 2398 - #define ENQUEUE_DELAYED 0x200 2399 - #define ENQUEUE_RQ_SELECTED 0x400 2382 + #define DEQUEUE_SPECIAL 0x00010000 2383 + #define DEQUEUE_THROTTLE 0x00020000 2384 + 2385 + #define ENQUEUE_WAKEUP 0x0001 2386 + #define ENQUEUE_RESTORE 0x0002 2387 + #define 
ENQUEUE_MOVE 0x0004 2388 + #define ENQUEUE_NOCLOCK 0x0008 2389 + 2390 + #define ENQUEUE_MIGRATING 0x0010 2391 + #define ENQUEUE_DELAYED 0x0020 2392 + #define ENQUEUE_CLASS 0x0040 2393 + 2394 + #define ENQUEUE_HEAD 0x00010000 2395 + #define ENQUEUE_REPLENISH 0x00020000 2396 + #define ENQUEUE_MIGRATED 0x00040000 2397 + #define ENQUEUE_INITIAL 0x00080000 2398 + #define ENQUEUE_RQ_SELECTED 0x00100000 2400 2399 2401 2400 #define RETRY_TASK ((void *)-1UL) 2402 2401 ··· 2425 2400 #ifdef CONFIG_UCLAMP_TASK 2426 2401 int uclamp_enabled; 2427 2402 #endif 2403 + /* 2404 + * idle: 0 2405 + * ext: 1 2406 + * fair: 2 2407 + * rt: 4 2408 + * dl: 8 2409 + * stop: 16 2410 + */ 2411 + unsigned int queue_mask; 2428 2412 2413 + /* 2414 + * move_queued_task/activate_task/enqueue_task: rq->lock 2415 + * ttwu_do_activate/activate_task/enqueue_task: rq->lock 2416 + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock 2417 + * ttwu_runnable/enqueue_task: task_rq_lock 2418 + * proxy_task_current: rq->lock 2419 + * sched_change_end 2420 + */ 2429 2421 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 2422 + /* 2423 + * move_queued_task/deactivate_task/dequeue_task: rq->lock 2424 + * __schedule/block_task/dequeue_task: rq->lock 2425 + * proxy_task_current: rq->lock 2426 + * wait_task_inactive: task_rq_lock 2427 + * sched_change_begin 2428 + */ 2430 2429 bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 2430 + 2431 + /* 2432 + * do_sched_yield: rq->lock 2433 + */ 2431 2434 void (*yield_task) (struct rq *rq); 2435 + /* 2436 + * yield_to: rq->lock (double) 2437 + */ 2432 2438 bool (*yield_to_task)(struct rq *rq, struct task_struct *p); 2433 2439 2440 + /* 2441 + * move_queued_task: rq->lock 2442 + * __migrate_swap_task: rq->lock 2443 + * ttwu_do_activate: rq->lock 2444 + * ttwu_runnable: task_rq_lock 2445 + * wake_up_new_task: task_rq_lock 2446 + */ 2434 2447 void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); 2435 2448 2449 
+ /* 2450 + * schedule/pick_next_task/prev_balance: rq->lock 2451 + */ 2436 2452 int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2437 - struct task_struct *(*pick_task)(struct rq *rq); 2453 + 2454 + /* 2455 + * schedule/pick_next_task: rq->lock 2456 + */ 2457 + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); 2438 2458 /* 2439 2459 * Optional! When implemented pick_next_task() should be equivalent to: 2440 2460 * ··· 2489 2419 * set_next_task_first(next); 2490 2420 * } 2491 2421 */ 2492 - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); 2422 + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, 2423 + struct rq_flags *rf); 2493 2424 2425 + /* 2426 + * sched_change: 2427 + * __schedule: rq->lock 2428 + */ 2494 2429 void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); 2495 2430 void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); 2496 2431 2432 + /* 2433 + * select_task_rq: p->pi_lock 2434 + * sched_exec: p->pi_lock 2435 + */ 2497 2436 int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); 2498 2437 2438 + /* 2439 + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) 2440 + */ 2499 2441 void (*migrate_task_rq)(struct task_struct *p, int new_cpu); 2500 2442 2443 + /* 2444 + * ttwu_do_activate: rq->lock 2445 + * wake_up_new_task: task_rq_lock 2446 + */ 2501 2447 void (*task_woken)(struct rq *this_rq, struct task_struct *task); 2502 2448 2449 + /* 2450 + * do_set_cpus_allowed: task_rq_lock + sched_change 2451 + */ 2503 2452 void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); 2504 2453 2454 + /* 2455 + * sched_set_rq_{on,off}line: rq->lock 2456 + */ 2505 2457 void (*rq_online)(struct rq *rq); 2506 2458 void (*rq_offline)(struct rq *rq); 2507 2459 2460 + /* 2461 + * push_cpu_stop: p->pi_lock && rq->lock 2462 + */ 2508 2463 struct rq *(*find_lock_rq)(struct task_struct 
*p, struct rq *rq); 2509 2464 2465 + /* 2466 + * hrtick: rq->lock 2467 + * sched_tick: rq->lock 2468 + * sched_tick_remote: rq->lock 2469 + */ 2510 2470 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); 2471 + /* 2472 + * sched_cgroup_fork: p->pi_lock 2473 + */ 2511 2474 void (*task_fork)(struct task_struct *p); 2475 + /* 2476 + * finish_task_switch: no locks 2477 + */ 2512 2478 void (*task_dead)(struct task_struct *p); 2513 2479 2514 2480 /* 2515 - * The switched_from() call is allowed to drop rq->lock, therefore we 2516 - * cannot assume the switched_from/switched_to pair is serialized by 2517 - * rq->lock. They are however serialized by p->pi_lock. 2481 + * sched_change 2518 2482 */ 2519 - void (*switching_to) (struct rq *this_rq, struct task_struct *task); 2520 - void (*switched_from)(struct rq *this_rq, struct task_struct *task); 2521 - void (*switched_to) (struct rq *this_rq, struct task_struct *task); 2483 + void (*switching_from)(struct rq *this_rq, struct task_struct *task); 2484 + void (*switched_from) (struct rq *this_rq, struct task_struct *task); 2485 + void (*switching_to) (struct rq *this_rq, struct task_struct *task); 2486 + void (*switched_to) (struct rq *this_rq, struct task_struct *task); 2487 + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); 2488 + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 2489 + u64 oldprio); 2490 + 2491 + /* 2492 + * set_load_weight: task_rq_lock + sched_change 2493 + * __setscheduler_parms: task_rq_lock + sched_change 2494 + */ 2522 2495 void (*reweight_task)(struct rq *this_rq, struct task_struct *task, 2523 2496 const struct load_weight *lw); 2524 - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 2525 - int oldprio); 2526 2497 2498 + /* 2499 + * sched_rr_get_interval: task_rq_lock 2500 + */ 2527 2501 unsigned int (*get_rr_interval)(struct rq *rq, 2528 2502 struct task_struct *task); 2529 2503 2504 + /* 2505 + * task_sched_runtime: task_rq_lock 
2506 + */ 2530 2507 void (*update_curr)(struct rq *rq); 2531 2508 2532 2509 #ifdef CONFIG_FAIR_GROUP_SCHED 2510 + /* 2511 + * sched_change_group: task_rq_lock + sched_change 2512 + */ 2533 2513 void (*task_change_group)(struct task_struct *p); 2534 2514 #endif 2535 2515 2536 2516 #ifdef CONFIG_SCHED_CORE 2517 + /* 2518 + * pick_next_task: rq->lock 2519 + * try_steal_cookie: rq->lock (double) 2520 + */ 2537 2521 int (*task_is_throttled)(struct task_struct *p, int cpu); 2538 2522 #endif 2539 2523 }; 2524 + 2525 + /* 2526 + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. 2527 + */ 2528 + static inline void rq_modified_clear(struct rq *rq) 2529 + { 2530 + rq->queue_mask = 0; 2531 + } 2532 + 2533 + static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) 2534 + { 2535 + unsigned int mask = class->queue_mask; 2536 + return rq->queue_mask & ~((mask << 1) - 1); 2537 + } 2540 2538 2541 2539 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 2542 2540 { ··· 2717 2579 return rq->cfs.nr_queued > 0; 2718 2580 } 2719 2581 2720 - extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); 2721 - extern struct task_struct *pick_task_idle(struct rq *rq); 2582 + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, 2583 + struct rq_flags *rf); 2584 + extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); 2722 2585 2723 2586 #define SCA_CHECK 0x01 2724 2587 #define SCA_MIGRATE_DISABLE 0x02 ··· 2749 2610 static inline cpumask_t *alloc_user_cpus_ptr(int node) 2750 2611 { 2751 2612 /* 2752 - * See do_set_cpus_allowed() above for the rcu_head usage. 2613 + * See set_cpus_allowed_force() above for the rcu_head usage. 
2753 2614 */ 2754 2615 int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); 2755 2616 ··· 4014 3875 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); 4015 3876 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); 4016 3877 4017 - extern void check_class_changing(struct rq *rq, struct task_struct *p, 4018 - const struct sched_class *prev_class); 4019 - extern void check_class_changed(struct rq *rq, struct task_struct *p, 4020 - const struct sched_class *prev_class, 4021 - int oldprio); 4022 - 4023 3878 extern struct balance_callback *splice_balance_callbacks(struct rq *rq); 4024 3879 extern void balance_callbacks(struct rq *rq, struct balance_callback *head); 4025 3880 4026 - #ifdef CONFIG_SCHED_CLASS_EXT 4027 3881 /* 4028 - * Used by SCX in the enable/disable paths to move tasks between sched_classes 4029 - * and establish invariants. 3882 + * The 'sched_change' pattern is the safe, easy and slow way of changing a 3883 + * task's scheduling properties. It dequeues a task, such that the scheduler 3884 + * is fully unaware of it; at which point its properties can be modified; 3885 + * after which it is enqueued again. 3886 + * 3887 + * Typically this must be called while holding task_rq_lock, since most/all 3888 + * properties are serialized under those locks. There is currently one 3889 + * exception to this rule in sched/ext which only holds rq->lock. 4030 3890 */ 4031 - struct sched_enq_and_set_ctx { 3891 + 3892 + /* 3893 + * This structure is a temporary, used to preserve/convey the queueing state 3894 + * of the task between sched_change_begin() and sched_change_end(). Ensuring 3895 + * the task's queueing state is idempotent across the operation. 
3896 + */ 3897 + struct sched_change_ctx { 3898 + u64 prio; 4032 3899 struct task_struct *p; 4033 - int queue_flags; 3900 + int flags; 4034 3901 bool queued; 4035 3902 bool running; 4036 3903 }; 4037 3904 4038 - void sched_deq_and_put_task(struct task_struct *p, int queue_flags, 4039 - struct sched_enq_and_set_ctx *ctx); 4040 - void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); 3905 + struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); 3906 + void sched_change_end(struct sched_change_ctx *ctx); 4041 3907 4042 - #endif /* CONFIG_SCHED_CLASS_EXT */ 3908 + DEFINE_CLASS(sched_change, struct sched_change_ctx *, 3909 + sched_change_end(_T), 3910 + sched_change_begin(p, flags), 3911 + struct task_struct *p, unsigned int flags) 3912 + 3913 + DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) 4043 3914 4044 3915 #include "ext.h" 4045 3916

+1 -1

kernel/sched/stats.h

··· 206 206 207 207 rq = __task_rq_lock(p, &rf); 208 208 psi_task_change(p, p->psi_flags, 0); 209 - __task_rq_unlock(rq, &rf); 209 + __task_rq_unlock(rq, p, &rf); 210 210 } 211 211 } 212 212

+9 -4

kernel/sched/stop_task.c

··· 32 32 stop->se.exec_start = rq_clock_task(rq); 33 33 } 34 34 35 - static struct task_struct *pick_task_stop(struct rq *rq) 35 + static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) 36 36 { 37 37 if (!sched_stop_runnable(rq)) 38 38 return NULL; ··· 75 75 { 76 76 } 77 77 78 - static void switched_to_stop(struct rq *rq, struct task_struct *p) 78 + static void switching_to_stop(struct rq *rq, struct task_struct *p) 79 79 { 80 80 BUG(); /* it's impossible to change to this class */ 81 81 } 82 82 83 83 static void 84 - prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) 84 + prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) 85 85 { 86 + if (p->prio == oldprio) 87 + return; 88 + 86 89 BUG(); /* how!?, what priority? */ 87 90 } 88 91 ··· 97 94 * Simple, special scheduling class for the per-CPU stop tasks: 98 95 */ 99 96 DEFINE_SCHED_CLASS(stop) = { 97 + 98 + .queue_mask = 16, 100 99 101 100 .enqueue_task = enqueue_task_stop, 102 101 .dequeue_task = dequeue_task_stop, ··· 117 112 .task_tick = task_tick_stop, 118 113 119 114 .prio_changed = prio_changed_stop, 120 - .switched_to = switched_to_stop, 115 + .switching_to = switching_to_stop, 121 116 .update_curr = update_curr_stop, 122 117 };

+29 -60

kernel/sched/syscalls.c

··· 64 64 65 65 void set_user_nice(struct task_struct *p, long nice) 66 66 { 67 - bool queued, running; 68 - struct rq *rq; 69 67 int old_prio; 70 68 71 69 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ··· 72 74 * We have to be careful, if called from sys_setpriority(), 73 75 * the task might be in the middle of scheduling on another CPU. 74 76 */ 75 - CLASS(task_rq_lock, rq_guard)(p); 76 - rq = rq_guard.rq; 77 - 78 - update_rq_clock(rq); 77 + guard(task_rq_lock)(p); 79 78 80 79 /* 81 80 * The RT priorities are set via sched_setscheduler(), but we still ··· 85 90 return; 86 91 } 87 92 88 - queued = task_on_rq_queued(p); 89 - running = task_current_donor(rq, p); 90 - if (queued) 91 - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 92 - if (running) 93 - put_prev_task(rq, p); 94 - 95 - p->static_prio = NICE_TO_PRIO(nice); 96 - set_load_weight(p, true); 97 - old_prio = p->prio; 98 - p->prio = effective_prio(p); 99 - 100 - if (queued) 101 - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 102 - if (running) 103 - set_next_task(rq, p); 104 - 105 - /* 106 - * If the task increased its priority or is running and 107 - * lowered its priority, then reschedule its CPU: 108 - */ 109 - p->sched_class->prio_changed(rq, p, old_prio); 93 + scoped_guard (sched_change, p, DEQUEUE_SAVE) { 94 + p->static_prio = NICE_TO_PRIO(nice); 95 + set_load_weight(p, true); 96 + old_prio = p->prio; 97 + p->prio = effective_prio(p); 98 + } 110 99 } 111 100 EXPORT_SYMBOL(set_user_nice); 112 101 ··· 494 515 bool user, bool pi) 495 516 { 496 517 int oldpolicy = -1, policy = attr->sched_policy; 497 - int retval, oldprio, newprio, queued, running; 518 + int retval, oldprio, newprio; 498 519 const struct sched_class *prev_class, *next_class; 499 520 struct balance_callback *head; 500 521 struct rq_flags rf; ··· 674 695 prev_class = p->sched_class; 675 696 next_class = __setscheduler_class(policy, newprio); 676 697 677 - if (prev_class != next_class && p->se.sched_delayed) 
678 - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 698 + if (prev_class != next_class) 699 + queue_flags |= DEQUEUE_CLASS; 679 700 680 - queued = task_on_rq_queued(p); 681 - running = task_current_donor(rq, p); 682 - if (queued) 683 - dequeue_task(rq, p, queue_flags); 684 - if (running) 685 - put_prev_task(rq, p); 701 + scoped_guard (sched_change, p, queue_flags) { 686 702 687 - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 688 - __setscheduler_params(p, attr); 689 - p->sched_class = next_class; 690 - p->prio = newprio; 703 + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { 704 + __setscheduler_params(p, attr); 705 + p->sched_class = next_class; 706 + p->prio = newprio; 707 + } 708 + __setscheduler_uclamp(p, attr); 709 + 710 + if (scope->queued) { 711 + /* 712 + * We enqueue to tail when the priority of a task is 713 + * increased (user space view). 714 + */ 715 + if (oldprio < p->prio) 716 + scope->flags |= ENQUEUE_HEAD; 717 + } 691 718 } 692 - __setscheduler_uclamp(p, attr); 693 - check_class_changing(rq, p, prev_class); 694 - 695 - if (queued) { 696 - /* 697 - * We enqueue to tail when the priority of a task is 698 - * increased (user space view). 
699 - */ 700 - if (oldprio < p->prio) 701 - queue_flags |= ENQUEUE_HEAD; 702 - 703 - enqueue_task(rq, p, queue_flags); 704 - } 705 - if (running) 706 - set_next_task(rq, p); 707 - 708 - check_class_changed(rq, p, prev_class, oldprio); 709 719 710 720 /* Avoid rq from going away on us: */ 711 721 preempt_disable(); ··· 1319 1351 rq = this_rq_lock_irq(&rf); 1320 1352 1321 1353 schedstat_inc(rq->yld_count); 1322 - current->sched_class->yield_task(rq); 1354 + rq->donor->sched_class->yield_task(rq); 1323 1355 1324 1356 preempt_disable(); 1325 1357 rq_unlock_irq(rq, &rf); ··· 1388 1420 */ 1389 1421 int __sched yield_to(struct task_struct *p, bool preempt) 1390 1422 { 1391 - struct task_struct *curr = current; 1423 + struct task_struct *curr; 1392 1424 struct rq *rq, *p_rq; 1393 1425 int yielded = 0; 1394 1426 1395 1427 scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { 1396 1428 rq = this_rq(); 1429 + curr = rq->donor; 1397 1430 1398 1431 again: 1399 1432 p_rq = task_rq(p);

+92 -22

kernel/sched/topology.c

··· 1590 1590 #ifdef CONFIG_NUMA 1591 1591 enum numa_topology_type sched_numa_topology_type; 1592 1592 1593 + /* 1594 + * sched_domains_numa_distance is derived from sched_numa_node_distance 1595 + * and provides a simplified view of NUMA distances used specifically 1596 + * for building NUMA scheduling domains. 1597 + */ 1593 1598 static int sched_domains_numa_levels; 1599 + static int sched_numa_node_levels; 1594 1600 1595 1601 int sched_max_numa_distance; 1596 1602 static int *sched_domains_numa_distance; 1603 + static int *sched_numa_node_distance; 1597 1604 static struct cpumask ***sched_domains_numa_masks; 1598 1605 #endif /* CONFIG_NUMA */ 1599 1606 ··· 1669 1662 1670 1663 .last_balance = jiffies, 1671 1664 .balance_interval = sd_weight, 1665 + 1666 + /* 50% success rate */ 1667 + .newidle_call = 512, 1668 + .newidle_success = 256, 1669 + .newidle_ratio = 512, 1670 + 1672 1671 .max_newidle_lb_cost = 0, 1673 1672 .last_decay_max_lb_cost = jiffies, 1674 1673 .child = child, ··· 1858 1845 return true; 1859 1846 1860 1847 rcu_read_lock(); 1861 - distances = rcu_dereference(sched_domains_numa_distance); 1848 + distances = rcu_dereference(sched_numa_node_distance); 1862 1849 if (!distances) 1863 1850 goto unlock; 1864 - for (i = 0; i < sched_domains_numa_levels; i++) { 1851 + for (i = 0; i < sched_numa_node_levels; i++) { 1865 1852 if (distances[i] == distance) { 1866 1853 found = true; 1867 1854 break; ··· 1937 1924 1938 1925 #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) 1939 1926 1940 - void sched_init_numa(int offline_node) 1927 + /* 1928 + * An architecture could modify its NUMA distance, to change 1929 + * grouping of NUMA nodes and number of NUMA levels when creating 1930 + * NUMA level sched domains. 1931 + * 1932 + * A NUMA level is created for each unique 1933 + * arch_sched_node_distance. 
1934 + */ 1935 + static int numa_node_dist(int i, int j) 1941 1936 { 1942 - struct sched_domain_topology_level *tl; 1943 - unsigned long *distance_map; 1937 + return node_distance(i, j); 1938 + } 1939 + 1940 + int arch_sched_node_distance(int from, int to) 1941 + __weak __alias(numa_node_dist); 1942 + 1943 + static bool modified_sched_node_distance(void) 1944 + { 1945 + return numa_node_dist != arch_sched_node_distance; 1946 + } 1947 + 1948 + static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), 1949 + int **dist, int *levels) 1950 + { 1951 + unsigned long *distance_map __free(bitmap) = NULL; 1944 1952 int nr_levels = 0; 1945 1953 int i, j; 1946 1954 int *distances; 1947 - struct cpumask ***masks; 1948 1955 1949 1956 /* 1950 1957 * O(nr_nodes^2) de-duplicating selection sort -- in order to find the ··· 1972 1939 */ 1973 1940 distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); 1974 1941 if (!distance_map) 1975 - return; 1942 + return -ENOMEM; 1976 1943 1977 1944 bitmap_zero(distance_map, NR_DISTANCE_VALUES); 1978 1945 for_each_cpu_node_but(i, offline_node) { 1979 1946 for_each_cpu_node_but(j, offline_node) { 1980 - int distance = node_distance(i, j); 1947 + int distance = n_dist(i, j); 1981 1948 1982 1949 if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { 1983 1950 sched_numa_warn("Invalid distance value range"); 1984 - bitmap_free(distance_map); 1985 - return; 1951 + return -EINVAL; 1986 1952 } 1987 1953 1988 1954 bitmap_set(distance_map, distance, 1); ··· 1994 1962 nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); 1995 1963 1996 1964 distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); 1997 - if (!distances) { 1998 - bitmap_free(distance_map); 1999 - return; 2000 - } 1965 + if (!distances) 1966 + return -ENOMEM; 2001 1967 2002 1968 for (i = 0, j = 0; i < nr_levels; i++, j++) { 2003 1969 j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); 2004 1970 distances[i] = j; 2005 1971 } 2006 - 
rcu_assign_pointer(sched_domains_numa_distance, distances); 1972 + *dist = distances; 1973 + *levels = nr_levels; 2007 1974 2008 - bitmap_free(distance_map); 1975 + return 0; 1976 + } 1977 + 1978 + void sched_init_numa(int offline_node) 1979 + { 1980 + struct sched_domain_topology_level *tl; 1981 + int nr_levels, nr_node_levels; 1982 + int i, j; 1983 + int *distances, *domain_distances; 1984 + struct cpumask ***masks; 1985 + 1986 + /* Record the NUMA distances from SLIT table */ 1987 + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, 1988 + &nr_node_levels)) 1989 + return; 1990 + 1991 + /* Record modified NUMA distances for building sched domains */ 1992 + if (modified_sched_node_distance()) { 1993 + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, 1994 + &domain_distances, &nr_levels)) { 1995 + kfree(distances); 1996 + return; 1997 + } 1998 + } else { 1999 + domain_distances = distances; 2000 + nr_levels = nr_node_levels; 2001 + } 2002 + rcu_assign_pointer(sched_numa_node_distance, distances); 2003 + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); 2004 + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); 2009 2005 2010 2006 /* 2011 2007 * 'nr_levels' contains the number of unique distances ··· 2051 1991 * 2052 1992 * We reset it to 'nr_levels' at the end of this function. 
2053 1993 */ 1994 + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); 1995 + 2054 1996 sched_domains_numa_levels = 0; 2055 1997 2056 1998 masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); ··· 2078 2016 masks[i][j] = mask; 2079 2017 2080 2018 for_each_cpu_node_but(k, offline_node) { 2081 - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) 2019 + if (sched_debug() && 2020 + (arch_sched_node_distance(j, k) != 2021 + arch_sched_node_distance(k, j))) 2082 2022 sched_numa_warn("Node-distance not symmetric"); 2083 2023 2084 - if (node_distance(j, k) > sched_domains_numa_distance[i]) 2024 + if (arch_sched_node_distance(j, k) > 2025 + sched_domains_numa_distance[i]) 2085 2026 continue; 2086 2027 2087 2028 cpumask_or(mask, mask, cpumask_of_node(k)); ··· 2124 2059 sched_domain_topology = tl; 2125 2060 2126 2061 sched_domains_numa_levels = nr_levels; 2127 - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); 2128 2062 2129 2063 init_numa_topology_type(offline_node); 2130 2064 } ··· 2131 2067 2132 2068 static void sched_reset_numa(void) 2133 2069 { 2134 - int nr_levels, *distances; 2070 + int nr_levels, *distances, *dom_distances = NULL; 2135 2071 struct cpumask ***masks; 2136 2072 2137 2073 nr_levels = sched_domains_numa_levels; 2074 + sched_numa_node_levels = 0; 2138 2075 sched_domains_numa_levels = 0; 2139 2076 sched_max_numa_distance = 0; 2140 2077 sched_numa_topology_type = NUMA_DIRECT; 2141 - distances = sched_domains_numa_distance; 2078 + distances = sched_numa_node_distance; 2079 + if (sched_numa_node_distance != sched_domains_numa_distance) 2080 + dom_distances = sched_domains_numa_distance; 2081 + rcu_assign_pointer(sched_numa_node_distance, NULL); 2142 2082 rcu_assign_pointer(sched_domains_numa_distance, NULL); 2143 2083 masks = sched_domains_numa_masks; 2144 2084 rcu_assign_pointer(sched_domains_numa_masks, NULL); ··· 2151 2083 2152 2084 synchronize_rcu(); 2153 2085 kfree(distances); 2086 + 
kfree(dom_distances); 2154 2087 for (i = 0; i < nr_levels && masks; i++) { 2155 2088 if (!masks[i]) 2156 2089 continue; ··· 2198 2129 continue; 2199 2130 2200 2131 /* Set ourselves in the remote node's masks */ 2201 - if (node_distance(j, node) <= sched_domains_numa_distance[i]) 2132 + if (arch_sched_node_distance(j, node) <= 2133 + sched_domains_numa_distance[i]) 2202 2134 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); 2203 2135 } 2204 2136 }

Configure Feed

Configure Feed