Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

- Fix virtual runtime calculation when recomputing a sched entity's
  weights

- Fix wrongly rejected unprivileged poll requests to the cgroup psi
  pressure files

- Make sure the load balancing is done by only one CPU

* tag 'sched_urgent_for_v6.7_rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Fix the decision for load balance
  sched: psi: fix unprivileged polling against cgroups
  sched/eevdf: Fix vruntime adjustment on reweight

+135 -38
-12
kernel/cgroup/cgroup.c
··· 3885 3885 return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); 3886 3886 } 3887 3887 3888 - static int cgroup_pressure_open(struct kernfs_open_file *of) 3889 - { 3890 - if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE)) 3891 - return -EPERM; 3892 - 3893 - return 0; 3894 - } 3895 - 3896 3888 static void cgroup_pressure_release(struct kernfs_open_file *of) 3897 3889 { 3898 3890 struct cgroup_file_ctx *ctx = of->priv; ··· 5291 5299 { 5292 5300 .name = "io.pressure", 5293 5301 .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), 5294 - .open = cgroup_pressure_open, 5295 5302 .seq_show = cgroup_io_pressure_show, 5296 5303 .write = cgroup_io_pressure_write, 5297 5304 .poll = cgroup_pressure_poll, ··· 5299 5308 { 5300 5309 .name = "memory.pressure", 5301 5310 .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), 5302 - .open = cgroup_pressure_open, 5303 5311 .seq_show = cgroup_memory_pressure_show, 5304 5312 .write = cgroup_memory_pressure_write, 5305 5313 .poll = cgroup_pressure_poll, ··· 5307 5317 { 5308 5318 .name = "cpu.pressure", 5309 5319 .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), 5310 - .open = cgroup_pressure_open, 5311 5320 .seq_show = cgroup_cpu_pressure_show, 5312 5321 .write = cgroup_cpu_pressure_write, 5313 5322 .poll = cgroup_pressure_poll, ··· 5316 5327 { 5317 5328 .name = "irq.pressure", 5318 5329 .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), 5319 - .open = cgroup_pressure_open, 5320 5330 .seq_show = cgroup_irq_pressure_show, 5321 5331 .write = cgroup_irq_pressure_write, 5322 5332 .poll = cgroup_pressure_poll,
+135 -26
kernel/sched/fair.c
··· 3666 3666 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } 3667 3667 #endif 3668 3668 3669 + static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se, 3670 + unsigned long weight) 3671 + { 3672 + unsigned long old_weight = se->load.weight; 3673 + u64 avruntime = avg_vruntime(cfs_rq); 3674 + s64 vlag, vslice; 3675 + 3676 + /* 3677 + * VRUNTIME 3678 + * ======== 3679 + * 3680 + * COROLLARY #1: The virtual runtime of the entity needs to be 3681 + * adjusted if re-weight at !0-lag point. 3682 + * 3683 + * Proof: For contradiction assume this is not true, so we can 3684 + * re-weight without changing vruntime at !0-lag point. 3685 + * 3686 + * Weight VRuntime Avg-VRuntime 3687 + * before w v V 3688 + * after w' v' V' 3689 + * 3690 + * Since lag needs to be preserved through re-weight: 3691 + * 3692 + * lag = (V - v)*w = (V'- v')*w', where v = v' 3693 + * ==> V' = (V - v)*w/w' + v (1) 3694 + * 3695 + * Let W be the total weight of the entities before reweight, 3696 + * since V' is the new weighted average of entities: 3697 + * 3698 + * V' = (WV + w'v - wv) / (W + w' - w) (2) 3699 + * 3700 + * by using (1) & (2) we obtain: 3701 + * 3702 + * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v 3703 + * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v 3704 + * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v 3705 + * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3) 3706 + * 3707 + * Since we are doing at !0-lag point which means V != v, we 3708 + * can simplify (3): 3709 + * 3710 + * ==> W / (W + w' - w) = w / w' 3711 + * ==> Ww' = Ww + ww' - ww 3712 + * ==> W * (w' - w) = w * (w' - w) 3713 + * ==> W = w (re-weight indicates w' != w) 3714 + * 3715 + * So the cfs_rq contains only one entity, hence vruntime of 3716 + * the entity @v should always equal to the cfs_rq's weighted 3717 + * average vruntime @V, which means we will always re-weight 3718 + * at 0-lag point, thus breach assumption. Proof completed. 
3719 + * 3720 + * 3721 + * COROLLARY #2: Re-weight does NOT affect weighted average 3722 + * vruntime of all the entities. 3723 + * 3724 + * Proof: According to corollary #1, Eq. (1) should be: 3725 + * 3726 + * (V - v)*w = (V' - v')*w' 3727 + * ==> v' = V' - (V - v)*w/w' (4) 3728 + * 3729 + * According to the weighted average formula, we have: 3730 + * 3731 + * V' = (WV - wv + w'v') / (W - w + w') 3732 + * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w') 3733 + * = (WV - wv + w'V' - Vw + wv) / (W - w + w') 3734 + * = (WV + w'V' - Vw) / (W - w + w') 3735 + * 3736 + * ==> V'*(W - w + w') = WV + w'V' - Vw 3737 + * ==> V' * (W - w) = (W - w) * V (5) 3738 + * 3739 + * If the entity is the only one in the cfs_rq, then reweight 3740 + * always occurs at 0-lag point, so V won't change. Or else 3741 + * there are other entities, hence W != w, then Eq. (5) turns 3742 + * into V' = V. So V won't change in either case, proof done. 3743 + * 3744 + * 3745 + * So according to corollary #1 & #2, the effect of re-weight 3746 + * on vruntime should be: 3747 + * 3748 + * v' = V' - (V - v) * w / w' (4) 3749 + * = V - (V - v) * w / w' 3750 + * = V - vl * w / w' 3751 + * = V - vl' 3752 + */ 3753 + if (avruntime != se->vruntime) { 3754 + vlag = (s64)(avruntime - se->vruntime); 3755 + vlag = div_s64(vlag * old_weight, weight); 3756 + se->vruntime = avruntime - vlag; 3757 + } 3758 + 3759 + /* 3760 + * DEADLINE 3761 + * ======== 3762 + * 3763 + * When the weight changes, the virtual time slope changes and 3764 + * we should adjust the relative virtual deadline accordingly. 
3765 + * 3766 + * d' = v' + (d - v)*w/w' 3767 + * = V' - (V - v)*w/w' + (d - v)*w/w' 3768 + * = V - (V - v)*w/w' + (d - v)*w/w' 3769 + * = V + (d - V)*w/w' 3770 + */ 3771 + vslice = (s64)(se->deadline - avruntime); 3772 + vslice = div_s64(vslice * old_weight, weight); 3773 + se->deadline = avruntime + vslice; 3774 + } 3775 + 3669 3776 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, 3670 3777 unsigned long weight) 3671 3778 { 3672 - unsigned long old_weight = se->load.weight; 3779 + bool curr = cfs_rq->curr == se; 3673 3780 3674 3781 if (se->on_rq) { 3675 3782 /* commit outstanding execution time */ 3676 - if (cfs_rq->curr == se) 3783 + if (curr) 3677 3784 update_curr(cfs_rq); 3678 3785 else 3679 - avg_vruntime_sub(cfs_rq, se); 3786 + __dequeue_entity(cfs_rq, se); 3680 3787 update_load_sub(&cfs_rq->load, se->load.weight); 3681 3788 } 3682 3789 dequeue_load_avg(cfs_rq, se); 3683 - 3684 - update_load_set(&se->load, weight); 3685 3790 3686 3791 if (!se->on_rq) { 3687 3792 /* 3688 3793 * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i), 3689 3794 * we need to scale se->vlag when w_i changes. 3690 3795 */ 3691 - se->vlag = div_s64(se->vlag * old_weight, weight); 3796 + se->vlag = div_s64(se->vlag * se->load.weight, weight); 3692 3797 } else { 3693 - s64 deadline = se->deadline - se->vruntime; 3694 - /* 3695 - * When the weight changes, the virtual time slope changes and 3696 - * we should adjust the relative virtual deadline accordingly. 
3697 - */ 3698 - deadline = div_s64(deadline * old_weight, weight); 3699 - se->deadline = se->vruntime + deadline; 3700 - if (se != cfs_rq->curr) 3701 - min_deadline_cb_propagate(&se->run_node, NULL); 3798 + reweight_eevdf(cfs_rq, se, weight); 3702 3799 } 3800 + 3801 + update_load_set(&se->load, weight); 3703 3802 3704 3803 #ifdef CONFIG_SMP 3705 3804 do { ··· 3811 3712 enqueue_load_avg(cfs_rq, se); 3812 3713 if (se->on_rq) { 3813 3714 update_load_add(&cfs_rq->load, se->load.weight); 3814 - if (cfs_rq->curr != se) 3815 - avg_vruntime_add(cfs_rq, se); 3715 + if (!curr) { 3716 + /* 3717 + * The entity's vruntime has been adjusted, so let's check 3718 + * whether the rq-wide min_vruntime needs updated too. Since 3719 + * the calculations above require stable min_vruntime rather 3720 + * than up-to-date one, we do the update at the end of the 3721 + * reweight process. 3722 + */ 3723 + __enqueue_entity(cfs_rq, se); 3724 + update_min_vruntime(cfs_rq); 3725 + } 3816 3726 } 3817 3727 } 3818 3728 ··· 3965 3857 3966 3858 #ifndef CONFIG_SMP 3967 3859 shares = READ_ONCE(gcfs_rq->tg->shares); 3968 - 3969 - if (likely(se->load.weight == shares)) 3970 - return; 3971 3860 #else 3972 - shares = calc_group_shares(gcfs_rq); 3861 + shares = calc_group_shares(gcfs_rq); 3973 3862 #endif 3974 - 3975 - reweight_entity(cfs_rq_of(se), se, shares); 3863 + if (unlikely(se->load.weight != shares)) 3864 + reweight_entity(cfs_rq_of(se), se, shares); 3976 3865 } 3977 3866 3978 3867 #else /* CONFIG_FAIR_GROUP_SCHED */ ··· 11184 11079 continue; 11185 11080 } 11186 11081 11187 - /* Are we the first idle CPU? */ 11082 + /* 11083 + * Are we the first idle core in a non-SMT domain or higher, 11084 + * or the first idle CPU in a SMT domain? 11085 + */ 11188 11086 return cpu == env->dst_cpu; 11189 11087 } 11190 11088 11191 - if (idle_smt == env->dst_cpu) 11192 - return true; 11089 + /* Are we the first idle CPU with busy siblings? 
*/ 11090 + if (idle_smt != -1) 11091 + return idle_smt == env->dst_cpu; 11193 11092 11194 11093 /* Are we the first CPU of this group ? */ 11195 11094 return group_balance_cpu(sg) == env->dst_cpu;