Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into master

Pull scheduler fixes from Thomas Gleixner:
"A set of scheduler fixes:

- Plug a load average accounting race which was introduced with a
  recent optimization, causing the load average to show bogus numbers.

- Fix the rseq CPU id initialization for new tasks. sched_fork() does
  not update the rseq CPU id, so the id is the stale id of the parent
  task, which can cause user space data corruption.

- Handle a 0 return value of task_h_load() correctly in the load
  balancer. A zero load does not decrease the imbalance, so the
  balancer keeps pulling until the maximum number of loops is reached,
  which might be all tasks just created by a fork bomb"
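
To make the last point concrete, here is a small worked example of the integer
arithmetic involved. It mirrors the general shape of task_h_load() in
kernel/sched/fair.c (a task's load_avg scaled by its cfs_rq's hierarchical
load, then integer-divided), but the variable names and the sample numbers are
made up purely for illustration and are not taken from the kernel:

/*
 * Illustrative only: shows how a scaled, integer-divided load can
 * truncate to 0. The real computation lives in task_h_load() and
 * update_cfs_rq_h_load(); every number here is invented.
 */
#include <stdio.h>

int main(void)
{
        unsigned long task_load_avg   = 88;   /* this task's load_avg      */
        unsigned long cfs_rq_h_load   = 3;    /* group's hierarchical load */
        unsigned long cfs_rq_load_avg = 2048; /* total load of the cfs_rq  */

        /* Roughly the shape of task_h_load(): the task's share of h_load. */
        unsigned long h_load = task_load_avg * cfs_rq_h_load /
                               (cfs_rq_load_avg + 1);

        printf("h_load = %lu\n", h_load);     /* 264 / 2049 == 0 */

        /* The fix clamps the value so the balancer keeps making progress,
         * mirroring max_t(unsigned long, task_h_load(p), 1). */
        unsigned long load = h_load ? h_load : 1;
        printf("load   = %lu\n", load);       /* 1 */

        return 0;
}

Clamping the result to at least 1, as the kernel/sched/fair.c hunk below does
with max_t(), ensures env->imbalance decreases for every detached task, so
detach_tasks() stops without spinning through loop_max iterations.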

* tag 'sched-urgent-2020-07-19' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: handle case of task_h_load() returning 0
sched: Fix unreliable rseq cpu_id for new tasks
sched: Fix loadavg accounting race

Diffstat: +66 -22

include/linux/sched.h (+0 -4)
···
 
 #define task_is_stopped_or_traced(task)  ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 
-#define task_contributes_to_load(task)   ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
-                                          (task->flags & PF_FROZEN) == 0 && \
-                                          (task->state & TASK_NOLOAD) == 0)
-
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
 /*

kernel/sched/core.c (+53 -16)
···
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible--;
-
         enqueue_task(rq, p, flags);
 
         p->on_rq = TASK_ON_RQ_QUEUED;
···
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
         p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
-
-        if (task_contributes_to_load(p))
-                rq->nr_uninterruptible++;
 
         dequeue_task(rq, p, flags);
 }
···
 
         lockdep_assert_held(&rq->lock);
 
-#ifdef CONFIG_SMP
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible--;
 
+#ifdef CONFIG_SMP
         if (wake_flags & WF_MIGRATED)
                 en_flags |= ENQUEUE_MIGRATED;
 #endif
···
          * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
          */
         smp_rmb();
-        if (p->on_rq && ttwu_remote(p, wake_flags))
+        if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags))
                 goto unlock;
 
         if (p->in_iowait) {
···
         }
 
 #ifdef CONFIG_SMP
-        p->sched_contributes_to_load = !!task_contributes_to_load(p);
-        p->state = TASK_WAKING;
-
         /*
          * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
          * possible to, falsely, observe p->on_cpu == 0.
···
          *
          * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
          * __schedule(). See the comment for smp_mb__after_spinlock().
+         *
+         * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
+         * schedule()'s deactivate_task() has 'happened' and p will no longer
+         * care about it's own p->state. See the comment in __schedule().
          */
-        smp_rmb();
+        smp_acquire__after_ctrl_dep();
+
+        /*
+         * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
+         * == 0), which means we need to do an enqueue, change p->state to
+         * TASK_WAKING such that we can unlock p->pi_lock before doing the
+         * enqueue, such as ttwu_queue_wakelist().
+         */
+        p->state = TASK_WAKING;
 
         /*
···
          * Silence PROVE_RCU.
          */
         raw_spin_lock_irqsave(&p->pi_lock, flags);
+        rseq_migrate(p);
         /*
          * We're setting the CPU for the first time, we don't migrate,
          * so use __set_task_cpu().
···
          * as we're not fully set-up yet.
          */
         p->recent_used_cpu = task_cpu(p);
+        rseq_migrate(p);
         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
         rq = __task_rq_lock(p, &rf);
···
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
+        unsigned long prev_state;
         struct rq_flags rf;
         struct rq *rq;
         int cpu;
···
         local_irq_disable();
         rcu_note_context_switch(preempt);
 
+        /* See deactivate_task() below. */
+        prev_state = prev->state;
+
         /*
          * Make sure that signal_pending_state()->signal_pending() below
          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
-         * done by the caller to avoid the race with signal_wake_up().
+         * done by the caller to avoid the race with signal_wake_up():
          *
-         * The membarrier system call requires a full memory barrier
+         * __set_current_state(@state)          signal_wake_up()
+         *   schedule()                           set_tsk_thread_flag(p, TIF_SIGPENDING)
+         *                                        wake_up_state(p, state)
+         *   LOCK rq->lock                          LOCK p->pi_state
+         *   smp_mb__after_spinlock()               smp_mb__after_spinlock()
+         *     if (signal_pending_state())          if (p->state & @state)
+         *
+         * Also, the membarrier system call requires a full memory barrier
          * after coming from user-space, before storing to rq->curr.
          */
         rq_lock(rq, &rf);
···
         update_rq_clock(rq);
 
         switch_count = &prev->nivcsw;
-        if (!preempt && prev->state) {
-                if (signal_pending_state(prev->state, prev)) {
+        /*
+         * We must re-load prev->state in case ttwu_remote() changed it
+         * before we acquired rq->lock.
+         */
+        if (!preempt && prev_state && prev_state == prev->state) {
+                if (signal_pending_state(prev_state, prev)) {
                         prev->state = TASK_RUNNING;
                 } else {
+                        prev->sched_contributes_to_load =
+                                (prev_state & TASK_UNINTERRUPTIBLE) &&
+                                !(prev_state & TASK_NOLOAD) &&
+                                !(prev->flags & PF_FROZEN);
+
+                        if (prev->sched_contributes_to_load)
+                                rq->nr_uninterruptible++;
+
+                        /*
+                         * __schedule()                     ttwu()
+                         *   prev_state = prev->state;        if (READ_ONCE(p->on_rq) && ...)
+                         *   LOCK rq->lock                      goto out;
+                         *   smp_mb__after_spinlock();        smp_acquire__after_ctrl_dep();
+                         *   p->on_rq = 0;                    p->state = TASK_WAKING;
+                         *
+                         * After this, schedule() must not care about p->state any more.
+                         */
                         deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
 
                         if (prev->in_iowait) {
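
One subtle piece of the kernel/sched/core.c change above is the switch from
smp_rmb() to smp_acquire__after_ctrl_dep() in the wakeup path. As a reading
aid, here is a minimal sketch of that barrier pattern, assuming the usual
kernel definitions of READ_ONCE() and smp_acquire__after_ctrl_dep(); the
ready/payload variables and the consume() helper are hypothetical and this is
not code from the patch:

/*
 * Hypothetical example of a control dependency upgraded to ACQUIRE.
 * A branch on a READ_ONCE() load by itself only orders later *stores*
 * after that load; smp_acquire__after_ctrl_dep() strengthens it so
 * later *loads* are ordered after it as well.
 */
static int ready;
static int payload;

static int consume(void)
{
        if (!READ_ONCE(ready))          /* the branch forms a control dependency */
                return -1;

        /* Upgrade to ACQUIRE: the read of payload below cannot be
         * reordered before the READ_ONCE(ready) above. */
        smp_acquire__after_ctrl_dep();

        return READ_ONCE(payload);
}

In ttwu() this is what the added comment refers to: once p->on_rq has been
observed as 0 with acquire semantics, schedule()'s deactivate_task() is known
to have happened, so ttwu() may safely set p->state = TASK_WAKING.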

kernel/sched/fair.c (+13 -2)
···
                 return;
         }
 
-        rq->misfit_task_load = task_h_load(p);
+        /*
+         * Make sure that misfit_task_load will not be null even if
+         * task_h_load() returns 0.
+         */
+        rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
 }
 
 #else /* CONFIG_SMP */
···
 
                 switch (env->migration_type) {
                 case migrate_load:
-                        load = task_h_load(p);
+                        /*
+                         * Depending of the number of CPUs and tasks and the
+                         * cgroup hierarchy, task_h_load() can return a null
+                         * value. Make sure that env->imbalance decreases
+                         * otherwise detach_tasks() will stop only after
+                         * detaching up to loop_max tasks.
+                         */
+                        load = max_t(unsigned long, task_h_load(p), 1);
 
                         if (sched_feat(LB_MIN) &&
                             load < 16 && !env->sd->nr_balance_failed)