Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-7.1

Pull sched/core to resolve conflicts between:

c2a57380df9dd ("sched: Replace use of system_unbound_wq with system_dfl_wq")

from the tip tree and commit:

cde94c032b32b ("sched_ext: Make watchdog sub-sched aware")

The latter moves around code modified by the former. Apply the changes in
the new locations.

Signed-off-by: Tejun Heo <tj@kernel.org>

Tejun Heo 0e7cd9ce bec10581

+363 -112
-3
arch/x86/include/asm/mmu_context.h
··· 136 136 } 137 137 #endif 138 138 139 - #define enter_lazy_tlb enter_lazy_tlb 140 - extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); 141 - 142 139 extern void mm_init_global_asid(struct mm_struct *mm); 143 140 extern void mm_free_global_asid(struct mm_struct *mm); 144 141
+26
arch/x86/include/asm/tlbflush.h
··· 172 172 }; 173 173 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared); 174 174 175 + /* 176 + * Please ignore the name of this function. It should be called 177 + * switch_to_kernel_thread(). 178 + * 179 + * enter_lazy_tlb() is a hint from the scheduler that we are entering a 180 + * kernel thread or other context without an mm. Acceptable implementations 181 + * include doing nothing whatsoever, switching to init_mm, or various clever 182 + * lazy tricks to try to minimize TLB flushes. 183 + * 184 + * The scheduler reserves the right to call enter_lazy_tlb() several times 185 + * in a row. It will notify us that we're going back to a real mm by 186 + * calling switch_mm_irqs_off(). 187 + */ 188 + #define enter_lazy_tlb enter_lazy_tlb 189 + static __always_inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 190 + { 191 + if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 192 + return; 193 + 194 + this_cpu_write(cpu_tlbstate_shared.is_lazy, true); 195 + } 196 + 175 197 bool nmi_uaccess_okay(void); 176 198 #define nmi_uaccess_okay nmi_uaccess_okay 177 199 ··· 502 480 { 503 481 } 504 482 #endif 483 + #else /* !MODULE */ 484 + #define enter_lazy_tlb enter_lazy_tlb 485 + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 486 + __compiletime_error("enter_lazy_tlb() should not be used in modules"); 505 487 #endif /* !MODULE */ 506 488 507 489 static inline void __native_tlb_flush_global(unsigned long cr4)
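For context (not part of this diff): the scheduler invokes this hint from its context_switch() path when the next task has no mm, so making enter_lazy_tlb() a static inline removes a function call from every switch into a kernel thread. A rough paraphrase of that call site, simplified from kernel/sched/core.c (the real code also handles membarrier synchronisation and the lazy-TLB mm refcounting):

	if (!next->mm) {				/* switching to a kernel thread */
		enter_lazy_tlb(prev->active_mm, next);	/* the hint defined above */
		next->active_mm = prev->active_mm;
	} else {					/* switching to a user task */
		switch_mm_irqs_off(prev->active_mm, next->mm, next);
	}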
-21
arch/x86/mm/tlb.c
··· 972 972 } 973 973 974 974 /* 975 - * Please ignore the name of this function. It should be called 976 - * switch_to_kernel_thread(). 977 - * 978 - * enter_lazy_tlb() is a hint from the scheduler that we are entering a 979 - * kernel thread or other context without an mm. Acceptable implementations 980 - * include doing nothing whatsoever, switching to init_mm, or various clever 981 - * lazy tricks to try to minimize TLB flushes. 982 - * 983 - * The scheduler reserves the right to call enter_lazy_tlb() several times 984 - * in a row. It will notify us that we're going back to a real mm by 985 - * calling switch_mm_irqs_off(). 986 - */ 987 - void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 988 - { 989 - if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm) 990 - return; 991 - 992 - this_cpu_write(cpu_tlbstate_shared.is_lazy, true); 993 - } 994 - 995 - /* 996 975 * Using a temporary mm allows to set temporary mappings that are not accessible 997 976 * by other CPUs. Such mappings are needed to perform sensitive memory writes 998 977 * that override the kernel memory protections (e.g., W^X), without exposing the
+1
include/linux/sched/topology.h
··· 95 95 unsigned int newidle_call; 96 96 unsigned int newidle_success; 97 97 unsigned int newidle_ratio; 98 + u64 newidle_stamp; 98 99 u64 max_newidle_lb_cost; 99 100 unsigned long last_decay_max_lb_cost; 100 101
+3
include/uapi/linux/sched.h
··· 146 146 SCHED_FLAG_KEEP_ALL | \ 147 147 SCHED_FLAG_UTIL_CLAMP) 148 148 149 + /* Only for sched_getattr() own flag param, if task is SCHED_DEADLINE */ 150 + #define SCHED_GETATTR_FLAG_DL_DYNAMIC 0x01 151 + 149 152 #endif /* _UAPI_LINUX_SCHED_H */
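A minimal userspace sketch of how the new flag could be exercised (assumptions: the calling task already runs SCHED_DEADLINE, <linux/sched/types.h> provides struct sched_attr, and a local fallback define covers UAPI headers that predate this change):

/* Query the dynamic deadline parameters of the calling SCHED_DEADLINE task. */
#include <linux/sched/types.h>	/* struct sched_attr */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#ifndef SCHED_GETATTR_FLAG_DL_DYNAMIC
#define SCHED_GETATTR_FLAG_DL_DYNAMIC	0x01	/* value from the hunk above */
#endif

int main(void)
{
	struct sched_attr attr = {};

	/* pid 0 == calling thread; the flag is rejected for non-deadline tasks */
	if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr),
		    SCHED_GETATTR_FLAG_DL_DYNAMIC)) {
		perror("sched_getattr");
		return 1;
	}

	printf("remaining runtime: %llu ns, deadline (CLOCK_MONOTONIC): %llu ns\n",
	       (unsigned long long)attr.sched_runtime,
	       (unsigned long long)attr.sched_deadline);
	return 0;
}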
+2 -7
kernel/sched/core.c
··· 687 687 } 688 688 } 689 689 690 - void raw_spin_rq_unlock(struct rq *rq) 691 - { 692 - raw_spin_unlock(rq_lockp(rq)); 693 - } 694 - 695 690 /* 696 691 * double_rq_lock - safely lock two runqueues 697 692 */ ··· 5673 5678 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); 5674 5679 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); 5675 5680 if (os == TICK_SCHED_REMOTE_RUNNING) 5676 - queue_delayed_work(system_unbound_wq, dwork, HZ); 5681 + queue_delayed_work(system_dfl_wq, dwork, HZ); 5677 5682 } 5678 5683 5679 5684 static void sched_tick_start(int cpu) ··· 5692 5697 if (os == TICK_SCHED_REMOTE_OFFLINE) { 5693 5698 twork->cpu = cpu; 5694 5699 INIT_DELAYED_WORK(&twork->work, sched_tick_remote); 5695 - queue_delayed_work(system_unbound_wq, &twork->work, HZ); 5700 + queue_delayed_work(system_dfl_wq, &twork->work, HZ); 5696 5701 } 5697 5702 } 5698 5703
+20 -3
kernel/sched/deadline.c
··· 2142 2142 int flags) 2143 2143 { 2144 2144 struct task_struct *p = dl_task_of(dl_se); 2145 + struct rq *rq = rq_of_dl_rq(dl_rq); 2145 2146 2146 2147 if (!schedstat_enabled()) 2147 2148 return; 2149 + 2150 + if (p != rq->curr) 2151 + update_stats_wait_end_dl(dl_rq, dl_se); 2148 2152 2149 2153 if ((flags & DEQUEUE_SLEEP)) { 2150 2154 unsigned int state; ··· 3617 3613 dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); 3618 3614 } 3619 3615 3620 - void __getparam_dl(struct task_struct *p, struct sched_attr *attr) 3616 + void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags) 3621 3617 { 3622 3618 struct sched_dl_entity *dl_se = &p->dl; 3619 + struct rq *rq = task_rq(p); 3620 + u64 adj_deadline; 3623 3621 3624 3622 attr->sched_priority = p->rt_priority; 3625 - attr->sched_runtime = dl_se->dl_runtime; 3626 - attr->sched_deadline = dl_se->dl_deadline; 3623 + if (flags & SCHED_GETATTR_FLAG_DL_DYNAMIC) { 3624 + guard(raw_spinlock_irq)(&rq->__lock); 3625 + update_rq_clock(rq); 3626 + if (task_current(rq, p)) 3627 + update_curr_dl(rq); 3628 + 3629 + attr->sched_runtime = dl_se->runtime; 3630 + adj_deadline = dl_se->deadline - rq_clock(rq) + ktime_get_ns(); 3631 + attr->sched_deadline = adj_deadline; 3632 + } else { 3633 + attr->sched_runtime = dl_se->dl_runtime; 3634 + attr->sched_deadline = dl_se->dl_deadline; 3635 + } 3627 3636 attr->sched_period = dl_se->dl_period; 3628 3637 attr->sched_flags &= ~SCHED_DL_FLAGS; 3629 3638 attr->sched_flags |= dl_se->flags;
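One way to read the dynamic branch above (interpretation, not stated in the commit message): dl_se->runtime and dl_se->deadline hold the live values for the current period, and the absolute deadline is kept on the rq clock, so the readout re-bases it onto a clock userspace can observe:

	sched_deadline = (dl_se->deadline - rq_clock(rq)) + ktime_get_ns()

i.e. the time remaining until the current absolute deadline, expressed as a CLOCK_MONOTONIC timestamp, while sched_runtime reports the runtime still available in this period rather than the static dl_runtime reservation.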
+13 -1
kernel/sched/debug.c
··· 8 8 */ 9 9 #include <linux/debugfs.h> 10 10 #include <linux/nmi.h> 11 + #include <linux/log2.h> 11 12 #include "sched.h" 12 13 13 14 /* ··· 902 901 903 902 void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 904 903 { 905 - s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; 904 + s64 left_vruntime = -1, right_vruntime = -1, left_deadline = -1, spread; 905 + s64 zero_vruntime = -1, sum_w_vruntime = -1; 906 906 struct sched_entity *last, *first, *root; 907 907 struct rq *rq = cpu_rq(cpu); 908 + unsigned int sum_shift; 908 909 unsigned long flags; 910 + u64 sum_weight; 909 911 910 912 #ifdef CONFIG_FAIR_GROUP_SCHED 911 913 SEQ_printf(m, "\n"); ··· 929 925 if (last) 930 926 right_vruntime = last->vruntime; 931 927 zero_vruntime = cfs_rq->zero_vruntime; 928 + sum_w_vruntime = cfs_rq->sum_w_vruntime; 929 + sum_weight = cfs_rq->sum_weight; 930 + sum_shift = cfs_rq->sum_shift; 932 931 raw_spin_rq_unlock_irqrestore(rq, flags); 933 932 934 933 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", ··· 940 933 SPLIT_NS(left_vruntime)); 941 934 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", 942 935 SPLIT_NS(zero_vruntime)); 936 + SEQ_printf(m, " .%-30s: %Ld (%d bits)\n", "sum_w_vruntime", 937 + sum_w_vruntime, ilog2(abs(sum_w_vruntime))); 938 + SEQ_printf(m, " .%-30s: %Lu\n", "sum_weight", 939 + sum_weight); 940 + SEQ_printf(m, " .%-30s: %u\n", "sum_shift", sum_shift); 943 941 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", 944 942 SPLIT_NS(avg_vruntime(cfs_rq))); 945 943 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime",
+2 -3
kernel/sched/ext.c
··· 3208 3208 3209 3209 intv = READ_ONCE(scx_watchdog_interval); 3210 3210 if (intv < ULONG_MAX) 3211 - queue_delayed_work(system_unbound_wq, to_delayed_work(work), 3212 - intv); 3211 + queue_delayed_work(system_dfl_wq, to_delayed_work(work), intv); 3213 3212 } 3214 3213 3215 3214 void scx_tick(struct rq *rq) ··· 5232 5233 WRITE_ONCE(scx_watchdog_interval, intv); 5233 5234 5234 5235 if (intv < ULONG_MAX) 5235 - mod_delayed_work(system_unbound_wq, &scx_watchdog_work, intv); 5236 + mod_delayed_work(system_dfl_wq, &scx_watchdog_work, intv); 5236 5237 else 5237 5238 cancel_delayed_work_sync(&scx_watchdog_work); 5238 5239 }
+264 -63
kernel/sched/fair.c
··· 225 225 update_sysctl(); 226 226 } 227 227 228 + #ifndef CONFIG_64BIT 228 229 #define WMULT_CONST (~0U) 229 230 #define WMULT_SHIFT 32 230 231 ··· 284 283 285 284 return mul_u64_u32_shr(delta_exec, fact, shift); 286 285 } 286 + #else 287 + static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw) 288 + { 289 + return (delta_exec * weight) / lw->weight; 290 + } 291 + #endif 287 292 288 293 /* 289 294 * delta /= w ··· 672 665 * Since zero_vruntime closely tracks the per-task service, these 673 666 * deltas: (v_i - v0), will be in the order of the maximal (virtual) lag 674 667 * induced in the system due to quantisation. 675 - * 676 - * Also, we use scale_load_down() to reduce the size. 677 - * 678 - * As measured, the max (key * weight) value was ~44 bits for a kernel build. 679 668 */ 669 + static inline unsigned long avg_vruntime_weight(struct cfs_rq *cfs_rq, unsigned long w) 670 + { 671 + #ifdef CONFIG_64BIT 672 + if (cfs_rq->sum_shift) 673 + w = max(2UL, w >> cfs_rq->sum_shift); 674 + #endif 675 + return w; 676 + } 677 + 678 + static inline void 679 + __sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 680 + { 681 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 682 + s64 w_vruntime, key = entity_key(cfs_rq, se); 683 + 684 + w_vruntime = key * weight; 685 + WARN_ON_ONCE((w_vruntime >> 63) != (w_vruntime >> 62)); 686 + 687 + cfs_rq->sum_w_vruntime += w_vruntime; 688 + cfs_rq->sum_weight += weight; 689 + } 690 + 691 + static void 692 + sum_w_vruntime_add_paranoid(struct cfs_rq *cfs_rq, struct sched_entity *se) 693 + { 694 + unsigned long weight; 695 + s64 key, tmp; 696 + 697 + again: 698 + weight = avg_vruntime_weight(cfs_rq, se->load.weight); 699 + key = entity_key(cfs_rq, se); 700 + 701 + if (check_mul_overflow(key, weight, &key)) 702 + goto overflow; 703 + 704 + if (check_add_overflow(cfs_rq->sum_w_vruntime, key, &tmp)) 705 + goto overflow; 706 + 707 + cfs_rq->sum_w_vruntime = tmp; 708 + cfs_rq->sum_weight += weight; 709 + return; 710 + 711 + overflow: 712 + /* 713 + * There's gotta be a limit -- if we're still failing at this point 714 + * there's really nothing much to be done about things. 
715 + */ 716 + BUG_ON(cfs_rq->sum_shift >= 10); 717 + cfs_rq->sum_shift++; 718 + 719 + /* 720 + * Note: \Sum (k_i * (w_i >> 1)) != (\Sum (k_i * w_i)) >> 1 721 + */ 722 + cfs_rq->sum_w_vruntime = 0; 723 + cfs_rq->sum_weight = 0; 724 + 725 + for (struct rb_node *node = cfs_rq->tasks_timeline.rb_leftmost; 726 + node; node = rb_next(node)) 727 + __sum_w_vruntime_add(cfs_rq, __node_2_se(node)); 728 + 729 + goto again; 730 + } 731 + 680 732 static void 681 733 sum_w_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 682 734 { 683 - unsigned long weight = scale_load_down(se->load.weight); 684 - s64 key = entity_key(cfs_rq, se); 735 + if (sched_feat(PARANOID_AVG)) 736 + return sum_w_vruntime_add_paranoid(cfs_rq, se); 685 737 686 - cfs_rq->sum_w_vruntime += key * weight; 687 - cfs_rq->sum_weight += weight; 738 + __sum_w_vruntime_add(cfs_rq, se); 688 739 } 689 740 690 741 static void 691 742 sum_w_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) 692 743 { 693 - unsigned long weight = scale_load_down(se->load.weight); 744 + unsigned long weight = avg_vruntime_weight(cfs_rq, se->load.weight); 694 745 s64 key = entity_key(cfs_rq, se); 695 746 696 747 cfs_rq->sum_w_vruntime -= key * weight; ··· 790 725 s64 runtime = cfs_rq->sum_w_vruntime; 791 726 792 727 if (curr) { 793 - unsigned long w = scale_load_down(curr->load.weight); 728 + unsigned long w = avg_vruntime_weight(cfs_rq, curr->load.weight); 794 729 795 730 runtime += entity_key(cfs_rq, curr) * w; 796 731 weight += w; ··· 800 735 if (runtime < 0) 801 736 runtime -= (weight - 1); 802 737 803 - delta = div_s64(runtime, weight); 738 + delta = div64_long(runtime, weight); 804 739 } else if (curr) { 805 740 /* 806 741 * When there is but one element, it is the average. ··· 829 764 * 830 765 * -r_max < lag < max(r_max, q) 831 766 */ 832 - static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 767 + static s64 entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 avruntime) 833 768 { 834 769 u64 max_slice = cfs_rq_max_slice(cfs_rq) + TICK_NSEC; 835 770 s64 vlag, limit; 836 771 837 - WARN_ON_ONCE(!se->on_rq); 838 - 839 - vlag = avg_vruntime(cfs_rq) - se->vruntime; 772 + vlag = avruntime - se->vruntime; 840 773 limit = calc_delta_fair(max_slice, se); 841 774 842 - se->vlag = clamp(vlag, -limit, limit); 775 + return clamp(vlag, -limit, limit); 776 + } 777 + 778 + static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) 779 + { 780 + WARN_ON_ONCE(!se->on_rq); 781 + 782 + se->vlag = entity_lag(cfs_rq, se, avg_vruntime(cfs_rq)); 843 783 } 844 784 845 785 /* ··· 871 801 long load = cfs_rq->sum_weight; 872 802 873 803 if (curr && curr->on_rq) { 874 - unsigned long weight = scale_load_down(curr->load.weight); 804 + unsigned long weight = avg_vruntime_weight(cfs_rq, curr->load.weight); 875 805 876 806 avg += entity_key(cfs_rq, curr) * weight; 877 807 load += weight; ··· 3910 3840 se_weight(se) * -se->avg.load_sum); 3911 3841 } 3912 3842 3913 - static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); 3843 + static void 3844 + rescale_entity(struct sched_entity *se, unsigned long weight, bool rel_vprot) 3845 + { 3846 + unsigned long old_weight = se->load.weight; 3847 + 3848 + /* 3849 + * VRUNTIME 3850 + * -------- 3851 + * 3852 + * COROLLARY #1: The virtual runtime of the entity needs to be 3853 + * adjusted if re-weight at !0-lag point. 
3854 + * 3855 + * Proof: For contradiction assume this is not true, so we can 3856 + * re-weight without changing vruntime at !0-lag point. 3857 + * 3858 + * Weight VRuntime Avg-VRuntime 3859 + * before w v V 3860 + * after w' v' V' 3861 + * 3862 + * Since lag needs to be preserved through re-weight: 3863 + * 3864 + * lag = (V - v)*w = (V'- v')*w', where v = v' 3865 + * ==> V' = (V - v)*w/w' + v (1) 3866 + * 3867 + * Let W be the total weight of the entities before reweight, 3868 + * since V' is the new weighted average of entities: 3869 + * 3870 + * V' = (WV + w'v - wv) / (W + w' - w) (2) 3871 + * 3872 + * by using (1) & (2) we obtain: 3873 + * 3874 + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v 3875 + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v 3876 + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v 3877 + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) 3878 + * 3879 + * Since we are doing at !0-lag point which means V != v, we 3880 + * can simplify (3): 3881 + * 3882 + * ==> W / (W + w' - w) = w / w' 3883 + * ==> Ww' = Ww + ww' - ww 3884 + * ==> W * (w' - w) = w * (w' - w) 3885 + * ==> W = w (re-weight indicates w' != w) 3886 + * 3887 + * So the cfs_rq contains only one entity, hence vruntime of 3888 + * the entity @v should always equal to the cfs_rq's weighted 3889 + * average vruntime @V, which means we will always re-weight 3890 + * at 0-lag point, thus breach assumption. Proof completed. 3891 + * 3892 + * 3893 + * COROLLARY #2: Re-weight does NOT affect weighted average 3894 + * vruntime of all the entities. 3895 + * 3896 + * Proof: According to corollary #1, Eq. (1) should be: 3897 + * 3898 + * (V - v)*w = (V' - v')*w' 3899 + * ==> v' = V' - (V - v)*w/w' (4) 3900 + * 3901 + * According to the weighted average formula, we have: 3902 + * 3903 + * V' = (WV - wv + w'v') / (W - w + w') 3904 + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') 3905 + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') 3906 + * = (WV + w'V' - Vw) / (W - w + w') 3907 + * 3908 + * ==> V'*(W - w + w') = WV + w'V' - Vw 3909 + * ==> V' * (W - w) = (W - w) * V (5) 3910 + * 3911 + * If the entity is the only one in the cfs_rq, then reweight 3912 + * always occurs at 0-lag point, so V won't change. Or else 3913 + * there are other entities, hence W != w, then Eq. (5) turns 3914 + * into V' = V. So V won't change in either case, proof done. 3915 + * 3916 + * 3917 + * So according to corollary #1 & #2, the effect of re-weight 3918 + * on vruntime should be: 3919 + * 3920 + * v' = V' - (V - v) * w / w' (4) 3921 + * = V - (V - v) * w / w' 3922 + * = V - vl * w / w' 3923 + * = V - vl' 3924 + */ 3925 + se->vlag = div64_long(se->vlag * old_weight, weight); 3926 + 3927 + /* 3928 + * DEADLINE 3929 + * -------- 3930 + * 3931 + * When the weight changes, the virtual time slope changes and 3932 + * we should adjust the relative virtual deadline accordingly. 
3933 + * 3934 + * d' = v' + (d - v)*w/w' 3935 + * = V' - (V - v)*w/w' + (d - v)*w/w' 3936 + * = V - (V - v)*w/w' + (d - v)*w/w' 3937 + * = V + (d - V)*w/w' 3938 + */ 3939 + if (se->rel_deadline) 3940 + se->deadline = div64_long(se->deadline * old_weight, weight); 3941 + 3942 + if (rel_vprot) 3943 + se->vprot = div64_long(se->vprot * old_weight, weight); 3944 + } 3914 3945 3915 3946 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 3916 3947 unsigned long weight) 3917 3948 { 3918 3949 bool curr = cfs_rq->curr == se; 3919 3950 bool rel_vprot = false; 3920 - u64 vprot; 3951 + u64 avruntime = 0; 3921 3952 3922 3953 if (se->on_rq) { 3923 3954 /* commit outstanding execution time */ 3924 3955 update_curr(cfs_rq); 3925 - update_entity_lag(cfs_rq, se); 3926 - se->deadline -= se->vruntime; 3956 + avruntime = avg_vruntime(cfs_rq); 3957 + se->vlag = entity_lag(cfs_rq, se, avruntime); 3958 + se->deadline -= avruntime; 3927 3959 se->rel_deadline = 1; 3928 3960 if (curr && protect_slice(se)) { 3929 - vprot = se->vprot - se->vruntime; 3961 + se->vprot -= avruntime; 3930 3962 rel_vprot = true; 3931 3963 } 3932 3964 ··· 4039 3867 } 4040 3868 dequeue_load_avg(cfs_rq, se); 4041 3869 4042 - /* 4043 - * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), 4044 - * we need to scale se->vlag when w_i changes. 4045 - */ 4046 - se->vlag = div_s64(se->vlag * se->load.weight, weight); 4047 - if (se->rel_deadline) 4048 - se->deadline = div_s64(se->deadline * se->load.weight, weight); 4049 - 4050 - if (rel_vprot) 4051 - vprot = div_s64(vprot * se->load.weight, weight); 3870 + rescale_entity(se, weight, rel_vprot); 4052 3871 4053 3872 update_load_set(&se->load, weight); 4054 3873 4055 3874 do { 4056 3875 u32 divider = get_pelt_divider(&se->avg); 4057 - 4058 3876 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider); 4059 3877 } while (0); 4060 3878 4061 3879 enqueue_load_avg(cfs_rq, se); 4062 3880 if (se->on_rq) { 4063 - place_entity(cfs_rq, se, 0); 4064 3881 if (rel_vprot) 4065 - se->vprot = se->vruntime + vprot; 3882 + se->vprot += avruntime; 3883 + se->deadline += avruntime; 3884 + se->rel_deadline = 0; 3885 + se->vruntime = avruntime - se->vlag; 3886 + 4066 3887 update_load_add(&cfs_rq->load, se->load.weight); 4067 3888 if (!curr) 4068 3889 __enqueue_entity(cfs_rq, se); ··· 5345 5180 */ 5346 5181 if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) { 5347 5182 struct sched_entity *curr = cfs_rq->curr; 5348 - unsigned long load; 5183 + long load; 5349 5184 5350 5185 lag = se->vlag; 5351 5186 ··· 5403 5238 */ 5404 5239 load = cfs_rq->sum_weight; 5405 5240 if (curr && curr->on_rq) 5406 - load += scale_load_down(curr->load.weight); 5241 + load += avg_vruntime_weight(cfs_rq, curr->load.weight); 5407 5242 5408 - lag *= load + scale_load_down(se->load.weight); 5243 + lag *= load + avg_vruntime_weight(cfs_rq, se->load.weight); 5409 5244 if (WARN_ON_ONCE(!load)) 5410 5245 load = 1; 5411 - lag = div_s64(lag, load); 5246 + lag = div64_long(lag, load); 5412 5247 } 5413 5248 5414 5249 se->vruntime = vruntime - lag; 5415 5250 5416 - if (se->rel_deadline) { 5251 + if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { 5417 5252 se->deadline += se->vruntime; 5418 5253 se->rel_deadline = 0; 5419 5254 return; ··· 7018 6853 7019 6854 static inline bool cpu_overutilized(int cpu) 7020 6855 { 7021 - unsigned long rq_util_min, rq_util_max; 6856 + unsigned long rq_util_max; 7022 6857 7023 6858 if (!sched_energy_enabled()) 7024 6859 return false; 7025 6860 7026 - rq_util_min 
= uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); 7027 6861 rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); 7028 6862 7029 6863 /* Return true only if the utilization doesn't fit CPU's capacity */ 7030 - return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); 6864 + return !util_fits_cpu(cpu_util_cfs(cpu), 0, rq_util_max, cpu); 7031 6865 } 7032 6866 7033 6867 /* ··· 7064 6900 rq->nr_running); 7065 6901 } 7066 6902 7067 - static int sched_idle_cpu(int cpu) 6903 + static int choose_sched_idle_rq(struct rq *rq, struct task_struct *p) 7068 6904 { 7069 - return sched_idle_rq(cpu_rq(cpu)); 6905 + return sched_idle_rq(rq) && !task_has_idle_policy(p); 6906 + } 6907 + 6908 + static int choose_idle_cpu(int cpu, struct task_struct *p) 6909 + { 6910 + return available_idle_cpu(cpu) || 6911 + choose_sched_idle_rq(cpu_rq(cpu), p); 7070 6912 } 7071 6913 7072 6914 static void ··· 7637 7467 if (!sched_core_cookie_match(rq, p)) 7638 7468 continue; 7639 7469 7640 - if (sched_idle_cpu(i)) 7470 + if (choose_sched_idle_rq(rq, p)) 7641 7471 return i; 7642 7472 7643 7473 if (available_idle_cpu(i)) { ··· 7728 7558 7729 7559 static inline int __select_idle_cpu(int cpu, struct task_struct *p) 7730 7560 { 7731 - if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) && 7732 - sched_cpu_cookie_match(cpu_rq(cpu), p)) 7561 + if (choose_idle_cpu(cpu, p) && sched_cpu_cookie_match(cpu_rq(cpu), p)) 7733 7562 return cpu; 7734 7563 7735 7564 return -1; ··· 7801 7632 if (!available_idle_cpu(cpu)) { 7802 7633 idle = false; 7803 7634 if (*idle_cpu == -1) { 7804 - if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, cpus)) { 7635 + if (choose_sched_idle_rq(cpu_rq(cpu), p) && 7636 + cpumask_test_cpu(cpu, cpus)) { 7805 7637 *idle_cpu = cpu; 7806 7638 break; 7807 7639 } ··· 7837 7667 */ 7838 7668 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) 7839 7669 continue; 7840 - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) 7670 + if (choose_idle_cpu(cpu, p)) 7841 7671 return cpu; 7842 7672 } 7843 7673 ··· 7959 7789 for_each_cpu_wrap(cpu, cpus, target) { 7960 7790 unsigned long cpu_cap = capacity_of(cpu); 7961 7791 7962 - if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) 7792 + if (!choose_idle_cpu(cpu, p)) 7963 7793 continue; 7964 7794 7965 7795 fits = util_fits_cpu(task_util, util_min, util_max, cpu); ··· 8030 7860 */ 8031 7861 lockdep_assert_irqs_disabled(); 8032 7862 8033 - if ((available_idle_cpu(target) || sched_idle_cpu(target)) && 7863 + if (choose_idle_cpu(target, p) && 8034 7864 asym_fits_cpu(task_util, util_min, util_max, target)) 8035 7865 return target; 8036 7866 ··· 8038 7868 * If the previous CPU is cache affine and idle, don't be stupid: 8039 7869 */ 8040 7870 if (prev != target && cpus_share_cache(prev, target) && 8041 - (available_idle_cpu(prev) || sched_idle_cpu(prev)) && 7871 + choose_idle_cpu(prev, p) && 8042 7872 asym_fits_cpu(task_util, util_min, util_max, prev)) { 8043 7873 8044 7874 if (!static_branch_unlikely(&sched_cluster_active) || ··· 8070 7900 if (recent_used_cpu != prev && 8071 7901 recent_used_cpu != target && 8072 7902 cpus_share_cache(recent_used_cpu, target) && 8073 - (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && 7903 + choose_idle_cpu(recent_used_cpu, p) && 8074 7904 cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && 8075 7905 asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { 8076 7906 ··· 10217 10047 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ 10218 10048 unsigned int group_smt_balance; /* Task on busy SMT 
be moved */ 10219 10049 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ 10050 + unsigned int group_overutilized; /* At least one CPU is overutilized in the group */ 10220 10051 #ifdef CONFIG_NUMA_BALANCING 10221 10052 unsigned int nr_numa_running; 10222 10053 unsigned int nr_preferred_running; ··· 10450 10279 static inline bool 10451 10280 group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs) 10452 10281 { 10282 + /* 10283 + * With EAS and uclamp, 1 CPU in the group must be overutilized to 10284 + * consider the group overloaded. 10285 + */ 10286 + if (sched_energy_enabled() && !sgs->group_overutilized) 10287 + return false; 10288 + 10453 10289 if (sgs->sum_nr_running <= sgs->group_weight) 10454 10290 return false; 10455 10291 ··· 10640 10462 * @group: sched_group whose statistics are to be updated. 10641 10463 * @sgs: variable to hold the statistics for this group. 10642 10464 * @sg_overloaded: sched_group is overloaded 10643 - * @sg_overutilized: sched_group is overutilized 10644 10465 */ 10645 10466 static inline void update_sg_lb_stats(struct lb_env *env, 10646 10467 struct sd_lb_stats *sds, 10647 10468 struct sched_group *group, 10648 10469 struct sg_lb_stats *sgs, 10649 - bool *sg_overloaded, 10650 - bool *sg_overutilized) 10470 + bool *sg_overloaded) 10651 10471 { 10652 10472 int i, nr_running, local_group, sd_flags = env->sd->flags; 10653 10473 bool balancing_at_rd = !env->sd->parent; ··· 10667 10491 sgs->sum_nr_running += nr_running; 10668 10492 10669 10493 if (cpu_overutilized(i)) 10670 - *sg_overutilized = 1; 10494 + sgs->group_overutilized = 1; 10671 10495 10672 10496 /* 10673 10497 * No need to call idle_cpu() if nr_running is not 0 ··· 11338 11162 update_group_capacity(env->sd, env->dst_cpu); 11339 11163 } 11340 11164 11341 - update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded, &sg_overutilized); 11165 + update_sg_lb_stats(env, sds, sg, sgs, &sg_overloaded); 11342 11166 11343 11167 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { 11344 11168 sds->busiest = sg; 11345 11169 sds->busiest_stat = *sgs; 11346 11170 } 11171 + 11172 + sg_overutilized |= sgs->group_overutilized; 11347 11173 11348 11174 /* Now, start updating sd_lb_stats */ 11349 11175 sds->total_load += sgs->group_load; ··· 12467 12289 sd->newidle_success += success; 12468 12290 12469 12291 if (sd->newidle_call >= 1024) { 12470 - sd->newidle_ratio = sd->newidle_success; 12292 + u64 now = sched_clock(); 12293 + s64 delta = now - sd->newidle_stamp; 12294 + sd->newidle_stamp = now; 12295 + int ratio = 0; 12296 + 12297 + if (delta < 0) 12298 + delta = 0; 12299 + 12300 + if (sched_feat(NI_RATE)) { 12301 + /* 12302 + * ratio delta freq 12303 + * 12304 + * 1024 - 4 s - 128 Hz 12305 + * 512 - 2 s - 256 Hz 12306 + * 256 - 1 s - 512 Hz 12307 + * 128 - .5 s - 1024 Hz 12308 + * 64 - .25 s - 2048 Hz 12309 + */ 12310 + ratio = delta >> 22; 12311 + } 12312 + 12313 + ratio += sd->newidle_success; 12314 + 12315 + sd->newidle_ratio = min(1024, ratio); 12471 12316 sd->newidle_call /= 2; 12472 12317 sd->newidle_success /= 2; 12473 12318 } ··· 12537 12336 { 12538 12337 int continue_balancing = 1; 12539 12338 int cpu = rq->cpu; 12540 - int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); 12339 + int busy = idle != CPU_IDLE && !sched_idle_rq(rq); 12541 12340 unsigned long interval; 12542 12341 struct sched_domain *sd; 12543 12342 /* Earliest time when we have to do rebalance again */ ··· 12575 12374 * state even if we migrated tasks. Update it. 
12576 12375 */ 12577 12376 idle = idle_cpu(cpu); 12578 - busy = !idle && !sched_idle_cpu(cpu); 12377 + busy = !idle && !sched_idle_rq(rq); 12579 12378 } 12580 12379 sd->last_balance = jiffies; 12581 12380 interval = get_sd_balance_interval(sd, busy); ··· 13197 12996 if (sd->flags & SD_BALANCE_NEWIDLE) { 13198 12997 unsigned int weight = 1; 13199 12998 13200 - if (sched_feat(NI_RANDOM)) { 12999 + if (sched_feat(NI_RANDOM) && sd->newidle_ratio < 1024) { 13201 13000 /* 13202 13001 * Throw a 1k sided dice; and only run 13203 13002 * newidle_balance according to the success
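A rough sanity check of the NI_RATE arithmetic above (not part of the change): delta is in nanoseconds, so delta >> 22 divides by 2^22 ≈ 4.19 ms. A window of 4 s between 1024-call resets therefore contributes

	4 * 10^9 >> 22 ≈ 954 ≈ 1024

which matches the first row of the table in the comment; busier domains, whose 1024 newidle calls accumulate in a shorter window, earn a proportionally smaller rate bonus before the sum is clamped to 1024.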
+3
kernel/sched/features.h
··· 58 58 SCHED_FEAT(DELAY_DEQUEUE, true) 59 59 SCHED_FEAT(DELAY_ZERO, true) 60 60 61 + SCHED_FEAT(PARANOID_AVG, false) 62 + 61 63 /* 62 64 * Allow wakeup-time preemption of the current task: 63 65 */ ··· 128 126 * Do newidle balancing proportional to its success rate using randomization. 129 127 */ 130 128 SCHED_FEAT(NI_RANDOM, true) 129 + SCHED_FEAT(NI_RATE, true)
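Both new bits default as shown and, like the other entries in this file, should be toggleable at runtime through the scheduler debugfs knob, e.g. by writing PARANOID_AVG or NO_NI_RATE to /sys/kernel/debug/sched/features (assuming debugfs is mounted and the kernel exposes that file).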
+6 -1
kernel/sched/rt.c
··· 1302 1302 int flags) 1303 1303 { 1304 1304 struct task_struct *p = NULL; 1305 + struct rq *rq = rq_of_rt_rq(rt_rq); 1305 1306 1306 1307 if (!schedstat_enabled()) 1307 1308 return; 1308 1309 1309 - if (rt_entity_is_task(rt_se)) 1310 + if (rt_entity_is_task(rt_se)) { 1310 1311 p = rt_task_of(rt_se); 1312 + 1313 + if (p != rq->curr) 1314 + update_stats_wait_end_rt(rt_rq, rt_se); 1315 + } 1311 1316 1312 1317 if ((flags & DEQUEUE_SLEEP) && p) { 1313 1318 unsigned int state;
+9 -5
kernel/sched/sched.h
··· 356 356 extern void sched_dl_do_global(void); 357 357 extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); 358 358 extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 359 - extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 359 + extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr, unsigned int flags); 360 360 extern bool __checkparam_dl(const struct sched_attr *attr); 361 361 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 362 362 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); ··· 684 684 685 685 s64 sum_w_vruntime; 686 686 u64 sum_weight; 687 - 688 687 u64 zero_vruntime; 688 + unsigned int sum_shift; 689 + 689 690 #ifdef CONFIG_SCHED_CORE 690 691 unsigned int forceidle_seq; 691 692 u64 zero_vruntime_fi; ··· 1610 1609 extern bool raw_spin_rq_trylock(struct rq *rq) 1611 1610 __cond_acquires(true, __rq_lockp(rq)); 1612 1611 1613 - extern void raw_spin_rq_unlock(struct rq *rq) 1614 - __releases(__rq_lockp(rq)); 1615 - 1616 1612 static inline void raw_spin_rq_lock(struct rq *rq) 1617 1613 __acquires(__rq_lockp(rq)) 1618 1614 { 1619 1615 raw_spin_rq_lock_nested(rq, 0); 1616 + } 1617 + 1618 + static inline void raw_spin_rq_unlock(struct rq *rq) 1619 + __releases(__rq_lockp(rq)) 1620 + { 1621 + raw_spin_unlock(rq_lockp(rq)); 1620 1622 } 1621 1623 1622 1624 static inline void raw_spin_rq_lock_irq(struct rq *rq)
+11 -5
kernel/sched/syscalls.c
··· 881 881 return -E2BIG; 882 882 } 883 883 884 - static void get_params(struct task_struct *p, struct sched_attr *attr) 884 + static void get_params(struct task_struct *p, struct sched_attr *attr, unsigned int flags) 885 885 { 886 886 if (task_has_dl_policy(p)) { 887 - __getparam_dl(p, attr); 887 + __getparam_dl(p, attr, flags); 888 888 } else if (task_has_rt_policy(p)) { 889 889 attr->sched_priority = p->rt_priority; 890 890 } else { ··· 950 950 return -ESRCH; 951 951 952 952 if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) 953 - get_params(p, &attr); 953 + get_params(p, &attr, 0); 954 954 955 955 return sched_setattr(p, &attr); 956 956 } ··· 1035 1035 int retval; 1036 1036 1037 1037 if (unlikely(!uattr || pid < 0 || usize > PAGE_SIZE || 1038 - usize < SCHED_ATTR_SIZE_VER0 || flags)) 1038 + usize < SCHED_ATTR_SIZE_VER0)) 1039 1039 return -EINVAL; 1040 1040 1041 1041 scoped_guard (rcu) { 1042 1042 p = find_process_by_pid(pid); 1043 1043 if (!p) 1044 1044 return -ESRCH; 1045 + 1046 + if (flags) { 1047 + if (!task_has_dl_policy(p) || 1048 + flags != SCHED_GETATTR_FLAG_DL_DYNAMIC) 1049 + return -EINVAL; 1050 + } 1045 1051 1046 1052 retval = security_task_getscheduler(p); 1047 1053 if (retval) ··· 1056 1050 kattr.sched_policy = p->policy; 1057 1051 if (p->sched_reset_on_fork) 1058 1052 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; 1059 - get_params(p, &kattr); 1053 + get_params(p, &kattr, flags); 1060 1054 kattr.sched_flags &= SCHED_FLAG_ALL; 1061 1055 1062 1056 #ifdef CONFIG_UCLAMP_TASK
+3
kernel/sched/topology.c
··· 4 4 */ 5 5 6 6 #include <linux/sched/isolation.h> 7 + #include <linux/sched/clock.h> 7 8 #include <linux/bsearch.h> 8 9 #include "sched.h" 9 10 ··· 1643 1642 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); 1644 1643 int sd_id, sd_weight, sd_flags = 0; 1645 1644 struct cpumask *sd_span; 1645 + u64 now = sched_clock(); 1646 1646 1647 1647 sd_weight = cpumask_weight(tl->mask(tl, cpu)); 1648 1648 ··· 1681 1679 .newidle_call = 512, 1682 1680 .newidle_success = 256, 1683 1681 .newidle_ratio = 512, 1682 + .newidle_stamp = now, 1684 1683 1685 1684 .max_newidle_lb_cost = 0, 1686 1685 .last_decay_max_lb_cost = jiffies,