Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched-urgent-2021-06-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar:
"Misc fixes:

- Fix performance regression caused by lack of intended batching of
RCU callbacks by over-eager NOHZ-full code.

- Fix cgroups-related corruption of load_avg and load_sum metrics.

- Three fixes for blocked load, util_sum/runnable_sum and util_est
tracking bugs"

* tag 'sched-urgent-2021-06-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
sched/pelt: Ensure that *_sum is always synced with *_avg
tick/nohz: Only check for RCU deferred wakeup on user/guest entry when needed
sched/fair: Make sure to update tg contrib for blocked load
sched/fair: Keep load_avg and load_sum synced

+41 -25
+2 -1
include/linux/entry-kvm.h
··· 3 3 #define __LINUX_ENTRYKVM_H 4 4 5 5 #include <linux/entry-common.h> 6 + #include <linux/tick.h> 6 7 7 8 /* Transfer to guest mode work */ 8 9 #ifdef CONFIG_KVM_XFER_TO_GUEST_WORK ··· 58 57 static inline void xfer_to_guest_mode_prepare(void) 59 58 { 60 59 lockdep_assert_irqs_disabled(); 61 - rcu_nocb_flush_deferred_wakeup(); 60 + tick_nohz_user_enter_prepare(); 62 61 } 63 62 64 63 /**
+8
include/linux/sched.h
··· 350 350 * Only for tasks we track a moving average of the past instantaneous 351 351 * estimated utilization. This allows to absorb sporadic drops in utilization 352 352 * of an otherwise almost periodic task. 353 + * 354 + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg 355 + * updates. When a task is dequeued, its util_est should not be updated if its 356 + * util_avg has not been updated in the meantime. 357 + * This information is mapped into the MSB bit of util_est.enqueued at dequeue 358 + * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg 359 + * for a task) it is safe to use MSB. 353 360 */ 354 361 struct util_est { 355 362 unsigned int enqueued; 356 363 unsigned int ewma; 357 364 #define UTIL_EST_WEIGHT_SHIFT 2 365 + #define UTIL_AVG_UNCHANGED 0x80000000 358 366 } __attribute__((__aligned__(sizeof(u64)))); 359 367 360 368 /*
+7
include/linux/tick.h
··· 11 11 #include <linux/context_tracking_state.h> 12 12 #include <linux/cpumask.h> 13 13 #include <linux/sched.h> 14 + #include <linux/rcupdate.h> 14 15 15 16 #ifdef CONFIG_GENERIC_CLOCKEVENTS 16 17 extern void __init tick_init(void); ··· 299 298 { 300 299 if (tick_nohz_full_enabled()) 301 300 __tick_nohz_task_switch(); 301 + } 302 + 303 + static inline void tick_nohz_user_enter_prepare(void) 304 + { 305 + if (tick_nohz_full_cpu(smp_processor_id())) 306 + rcu_nocb_flush_deferred_wakeup(); 302 307 } 303 308 304 309 #endif
+3 -2
kernel/entry/common.c
··· 5 5 #include <linux/highmem.h> 6 6 #include <linux/livepatch.h> 7 7 #include <linux/audit.h> 8 + #include <linux/tick.h> 8 9 9 10 #include "common.h" 10 11 ··· 187 186 local_irq_disable_exit_to_user(); 188 187 189 188 /* Check if any of the above work has queued a deferred wakeup */ 190 - rcu_nocb_flush_deferred_wakeup(); 189 + tick_nohz_user_enter_prepare(); 191 190 192 191 ti_work = READ_ONCE(current_thread_info()->flags); 193 192 } ··· 203 202 lockdep_assert_irqs_disabled(); 204 203 205 204 /* Flush pending rcuog wakeup before the last need_resched() check */ 206 - rcu_nocb_flush_deferred_wakeup(); 205 + tick_nohz_user_enter_prepare(); 207 206 208 207 if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) 209 208 ti_work = exit_to_user_mode_loop(regs, ti_work);
+2 -1
kernel/sched/debug.c
··· 885 885 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) 886 886 #define __P(F) __PS(#F, F) 887 887 #define P(F) __PS(#F, p->F) 888 + #define PM(F, M) __PS(#F, p->F & (M)) 888 889 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) 889 890 #define __PN(F) __PSN(#F, F) 890 891 #define PN(F) __PSN(#F, p->F) ··· 1012 1011 P(se.avg.util_avg); 1013 1012 P(se.avg.last_update_time); 1014 1013 P(se.avg.util_est.ewma); 1015 - P(se.avg.util_est.enqueued); 1014 + PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); 1016 1015 #endif 1017 1016 #ifdef CONFIG_UCLAMP_TASK 1018 1017 __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
+17 -11
kernel/sched/fair.c
··· 3499 3499 static inline void 3500 3500 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) 3501 3501 { 3502 - long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; 3502 + long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; 3503 3503 unsigned long load_avg; 3504 3504 u64 load_sum = 0; 3505 - s64 delta_sum; 3506 3505 u32 divider; 3507 3506 3508 3507 if (!runnable_sum) ··· 3548 3549 load_sum = (s64)se_weight(se) * runnable_sum; 3549 3550 load_avg = div_s64(load_sum, divider); 3550 3551 3551 - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; 3552 - delta_avg = load_avg - se->avg.load_avg; 3552 + delta = load_avg - se->avg.load_avg; 3553 3553 3554 3554 se->avg.load_sum = runnable_sum; 3555 3555 se->avg.load_avg = load_avg; 3556 - add_positive(&cfs_rq->avg.load_avg, delta_avg); 3557 - add_positive(&cfs_rq->avg.load_sum, delta_sum); 3556 + 3557 + add_positive(&cfs_rq->avg.load_avg, delta); 3558 + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; 3558 3559 } 3559 3560 3560 3561 static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) ··· 3765 3766 */ 3766 3767 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3767 3768 { 3769 + /* 3770 + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. 3771 + * See ___update_load_avg() for details. 
3772 + */ 3773 + u32 divider = get_pelt_divider(&cfs_rq->avg); 3774 + 3768 3775 dequeue_load_avg(cfs_rq, se); 3769 3776 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3770 - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3777 + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; 3771 3778 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); 3772 - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); 3779 + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; 3773 3780 3774 3781 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3775 3782 ··· 3907 3902 { 3908 3903 struct util_est ue = READ_ONCE(p->se.avg.util_est); 3909 3904 3910 - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); 3905 + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); 3911 3906 } 3912 3907 3913 3908 static inline unsigned long task_util_est(struct task_struct *p) ··· 4007 4002 * Reset EWMA on utilization increases, the moving average is used only 4008 4003 * to smooth utilization decreases. 4009 4004 */ 4010 - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); 4005 + ue.enqueued = task_util(p); 4011 4006 if (sched_feat(UTIL_EST_FASTUP)) { 4012 4007 if (ue.ewma < ue.enqueued) { 4013 4008 ue.ewma = ue.enqueued; ··· 4056 4051 ue.ewma += last_ewma_diff; 4057 4052 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; 4058 4053 done: 4054 + ue.enqueued |= UTIL_AVG_UNCHANGED; 4059 4055 WRITE_ONCE(p->se.avg.util_est, ue); 4060 4056 4061 4057 trace_sched_util_est_se_tp(&p->se); ··· 8036 8030 /* Propagate pending load changes to the parent, if any: */ 8037 8031 se = cfs_rq->tg->se[cpu]; 8038 8032 if (se && !skip_blocked_update(se)) 8039 - update_load_avg(cfs_rq_of(se), se, 0); 8033 + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); 8040 8034 8041 8035 /* 8042 8036 * There can be a lot of idle CPU cgroups. Don't let fully
+1 -10
kernel/sched/pelt.h
··· 42 42 return LOAD_AVG_MAX - 1024 + avg->period_contrib; 43 43 } 44 44 45 - /* 46 - * When a task is dequeued, its estimated utilization should not be update if 47 - * its util_avg has not been updated at least once. 48 - * This flag is used to synchronize util_avg updates with util_est updates. 49 - * We map this information into the LSB bit of the utilization saved at 50 - * dequeue time (i.e. util_est.dequeued). 51 - */ 52 - #define UTIL_AVG_UNCHANGED 0x1 53 - 54 45 static inline void cfs_se_util_change(struct sched_avg *avg) 55 46 { 56 47 unsigned int enqueued; ··· 49 58 if (!sched_feat(UTIL_EST)) 50 59 return; 51 60 52 - /* Avoid store if the flag has been already set */ 61 + /* Avoid store if the flag has been already reset */ 53 62 enqueued = avg->util_est.enqueued; 54 63 if (!(enqueued & UTIL_AVG_UNCHANGED)) 55 64 return;
+1
kernel/time/tick-sched.c
··· 230 230 231 231 #ifdef CONFIG_NO_HZ_FULL 232 232 cpumask_var_t tick_nohz_full_mask; 233 + EXPORT_SYMBOL_GPL(tick_nohz_full_mask); 233 234 bool tick_nohz_full_running; 234 235 EXPORT_SYMBOL_GPL(tick_nohz_full_running); 235 236 static atomic_t tick_dep_mask;