Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
"A couple of scheduler fixes:

- force watchdog reset while processing sysrq-w

- fix a deadlock when enabling trace events in the scheduler

- fixes to the throttled next buddy logic

- fixes for the average accounting (missing serialization and
underflow handling)

- allow kernel threads for fallback to online but not active cpus"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/core: Allow kthreads to fall back to online && !active cpus
sched/fair: Do not announce throttled next buddy in dequeue_task_fair()
sched/fair: Initialize throttle_count for new task-groups lazily
sched/fair: Fix cfs_rq avg tracking underflow
kernel/sysrq, watchdog, sched/core: Reset watchdog on all CPUs while processing sysrq-w
sched/debug: Fix deadlock when enabling sched events
sched/fair: Fix post_init_entity_util_avg() serialization

+66 -21
+8 -5
kernel/sched/core.c
···
1536 1536 for (;;) {
1537 1537 /* Any allowed, online CPU? */
1538 1538 for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
1539 - if (!cpu_active(dest_cpu))
1539 + if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1540 + continue;
1541 + if (!cpu_online(dest_cpu))
1540 1542 continue;
1541 1543 goto out;
1542 1544 }
···
2537 2535 */
2538 2536 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
2539 2537 #endif
2540 - /* Post initialize new task's util average when its cfs_rq is set */
2538 + rq = __task_rq_lock(p, &rf);
2541 2539 post_init_entity_util_avg(&p->se);
2542 2540
2543 - rq = __task_rq_lock(p, &rf);
2544 2541 activate_task(rq, p, 0);
2545 2542 p->on_rq = TASK_ON_RQ_QUEUED;
2546 2543 trace_sched_wakeup_new(p);
···
5149 5148 /*
5150 5149 * reset the NMI-timeout, listing all files on a slow
5151 5150 * console might take a lot of time:
5151 + * Also, reset softlockup watchdogs on all CPUs, because
5152 + * another CPU might be blocked waiting for us to process
5153 + * an IPI.
5152 5154 */
5153 5155 touch_nmi_watchdog();
5156 + touch_all_softlockup_watchdogs();
5154 5157 if (!state_filter || (p->state & state_filter))
5155 5158 sched_show_task(p);
5156 5159 }
5157 -
5158 - touch_all_softlockup_watchdogs();
5159 5160
5160 5161 #ifdef CONFIG_SCHED_DEBUG
5161 5162 if (!state_filter)
+57 -15
kernel/sched/fair.c
···
2904 2904 }
2905 2905 }
2906 2906
2907 + /*
2908 + * Unsigned subtract and clamp on underflow.
2909 + *
2910 + * Explicitly do a load-store to ensure the intermediate value never hits
2911 + * memory. This allows lockless observations without ever seeing the negative
2912 + * values.
2913 + */
2914 + #define sub_positive(_ptr, _val) do { \
2915 + typeof(_ptr) ptr = (_ptr); \
2916 + typeof(*ptr) val = (_val); \
2917 + typeof(*ptr) res, var = READ_ONCE(*ptr); \
2918 + res = var - val; \
2919 + if (res > var) \
2920 + res = 0; \
2921 + WRITE_ONCE(*ptr, res); \
2922 + } while (0)
2923 +
2907 2924 /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2908 2925 static inline int
2909 2926 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
···
2930 2913
2931 2914 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2932 2915 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2933 - sa->load_avg = max_t(long, sa->load_avg - r, 0);
2934 - sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2916 + sub_positive(&sa->load_avg, r);
2917 + sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
2935 2918 removed_load = 1;
2936 2919 }
2937 2920
2938 2921 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2939 2922 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2940 - sa->util_avg = max_t(long, sa->util_avg - r, 0);
2941 - sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
2923 + sub_positive(&sa->util_avg, r);
2924 + sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
2942 2925 removed_util = 1;
2943 2926 }
2944 2927
···
3011 2994 &se->avg, se->on_rq * scale_load_down(se->load.weight),
3012 2995 cfs_rq->curr == se, NULL);
3013 2996
3014 - cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
3015 - cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
3016 - cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
3017 - cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
2997 + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2998 + sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
2999 + sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3000 + sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3018 3001
3019 3002 cfs_rq_util_change(cfs_rq);
3020 3003 }
···
3263 3246 trace_sched_stat_iowait_enabled() ||
3264 3247 trace_sched_stat_blocked_enabled() ||
3265 3248 trace_sched_stat_runtime_enabled()) {
3266 - pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3249 + printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
3267 3250 "stat_blocked and stat_runtime require the "
3268 3251 "kernel parameter schedstats=enabled or "
3269 3252 "kernel.sched_schedstats=1\n");
···
4202 4185 if (!cfs_bandwidth_used())
4203 4186 return;
4204 4187
4188 + /* Synchronize hierarchical throttle counter: */
4189 + if (unlikely(!cfs_rq->throttle_uptodate)) {
4190 + struct rq *rq = rq_of(cfs_rq);
4191 + struct cfs_rq *pcfs_rq;
4192 + struct task_group *tg;
4193 +
4194 + cfs_rq->throttle_uptodate = 1;
4195 +
4196 + /* Get closest up-to-date node, because leaves go first: */
4197 + for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
4198 + pcfs_rq = tg->cfs_rq[cpu_of(rq)];
4199 + if (pcfs_rq->throttle_uptodate)
4200 + break;
4201 + }
4202 + if (tg) {
4203 + cfs_rq->throttle_count = pcfs_rq->throttle_count;
4204 + cfs_rq->throttled_clock_task = rq_clock_task(rq);
4205 + }
4206 + }
4207 +
4205 4208 /* an active group must be handled by the update_curr()->put() path */
4206 4209 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4207 4210 return;
···
4537 4500
4538 4501 /* Don't dequeue parent if it has other entities besides us */
4539 4502 if (cfs_rq->load.weight) {
4503 + /* Avoid re-evaluating load for this entity: */
4504 + se = parent_entity(se);
4540 4505 /*
4541 4506 * Bias pick_next to pick a task from this cfs_rq, as
4542 4507 * p is sleeping when it is within its sched_slice.
4543 4508 */
4544 - if (task_sleep && parent_entity(se))
4545 - set_next_buddy(parent_entity(se));
4546 -
4547 - /* avoid re-evaluating load for this entity */
4548 - se = parent_entity(se);
4509 + if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4510 + set_next_buddy(se);
4549 4511 break;
4550 4512 }
4551 4513 flags |= DEQUEUE_SLEEP;
···
8532 8496
8533 8497 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8534 8498 {
8535 - struct cfs_rq *cfs_rq;
8536 8499 struct sched_entity *se;
8500 + struct cfs_rq *cfs_rq;
8501 + struct rq *rq;
8537 8502 int i;
8538 8503
8539 8504 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
···
8549 8512 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8550 8513
8551 8514 for_each_possible_cpu(i) {
8515 + rq = cpu_rq(i);
8516 +
8552 8517 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8553 8518 GFP_KERNEL, cpu_to_node(i));
8554 8519 if (!cfs_rq)
···
8564 8525 init_cfs_rq(cfs_rq);
8565 8526 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8566 8527 init_entity_runnable_average(se);
8528 +
8529 + raw_spin_lock_irq(&rq->lock);
8567 8530 post_init_entity_util_avg(se);
8531 + raw_spin_unlock_irq(&rq->lock);
8568 8532 }
8569 8533
8570 8534 return 1;
+1 -1
kernel/sched/sched.h
···
437 437
438 438 u64 throttled_clock, throttled_clock_task;
439 439 u64 throttled_clock_task_time;
440 - int throttled, throttle_count;
440 + int throttled, throttle_count, throttle_uptodate;
441 441 struct list_head throttled_list;
442 442 #endif /* CONFIG_CFS_BANDWIDTH */
443 443 #endif /* CONFIG_FAIR_GROUP_SCHED */