Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:
"From the scheduler department:

- a bunch of sched deadline related fixes which deal with various
buglets and corner cases.

- two fixes for the loadavg spikes which are caused by the delayed
NOHZ accounting"

* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/deadline: Use deadline instead of period when calculating overflow
sched/deadline: Throttle a constrained deadline task activated after the deadline
sched/deadline: Make sure the replenishment timer fires in the next period
sched/loadavg: Use {READ,WRITE}_ONCE() for sample window
sched/loadavg: Avoid loadavg spikes caused by delayed NO_HZ accounting
sched/deadline: Add missing update_rq_clock() in dl_task_timer()

+69 -14
+57 -6
kernel/sched/deadline.c
··· 445 445 * 446 446 * This function returns true if: 447 447 * 448 - * runtime / (deadline - t) > dl_runtime / dl_period , 448 + * runtime / (deadline - t) > dl_runtime / dl_deadline , 449 449 * 450 450 * IOW we can't recycle current parameters. 451 451 * 452 - * Notice that the bandwidth check is done against the period. For 452 + * Notice that the bandwidth check is done against the deadline. For 453 453 * task with deadline equal to period this is the same of using 454 - * dl_deadline instead of dl_period in the equation above. 454 + * dl_period instead of dl_deadline in the equation above. 455 455 */ 456 456 static bool dl_entity_overflow(struct sched_dl_entity *dl_se, 457 457 struct sched_dl_entity *pi_se, u64 t) ··· 476 476 * of anything below microseconds resolution is actually fiction 477 477 * (but still we want to give the user that illusion >;). 478 478 */ 479 - left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); 479 + left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); 480 480 right = ((dl_se->deadline - t) >> DL_SCALE) * 481 481 (pi_se->dl_runtime >> DL_SCALE); 482 482 ··· 505 505 } 506 506 } 507 507 508 + static inline u64 dl_next_period(struct sched_dl_entity *dl_se) 509 + { 510 + return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period; 511 + } 512 + 508 513 /* 509 514 * If the entity depleted all its runtime, and if we want it to sleep 510 515 * while waiting for some new execution time to become available, we 511 - * set the bandwidth enforcement timer to the replenishment instant 516 + * set the bandwidth replenishment timer to the replenishment instant 512 517 * and try to activate it. 513 518 * 514 519 * Notice that it is important for the caller to know if the timer ··· 535 530 * that it is actually coming from rq->clock and not from 536 531 * hrtimer's time base reading. 
537 532 */ 538 - act = ns_to_ktime(dl_se->deadline); 533 + act = ns_to_ktime(dl_next_period(dl_se)); 539 534 now = hrtimer_cb_get_time(timer); 540 535 delta = ktime_to_ns(now) - rq_clock(rq); 541 536 act = ktime_add_ns(act, delta); ··· 643 638 lockdep_unpin_lock(&rq->lock, rf.cookie); 644 639 rq = dl_task_offline_migration(rq, p); 645 640 rf.cookie = lockdep_pin_lock(&rq->lock); 641 + update_rq_clock(rq); 646 642 647 643 /* 648 644 * Now that the task has been migrated to the new RQ and we ··· 693 687 694 688 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 695 689 timer->function = dl_task_timer; 690 + } 691 + 692 + /* 693 + * During the activation, CBS checks if it can reuse the current task's 694 + * runtime and period. If the deadline of the task is in the past, CBS 695 + * cannot use the runtime, and so it replenishes the task. This rule 696 + * works fine for implicit deadline tasks (deadline == period), and the 697 + * CBS was designed for implicit deadline tasks. However, a task with 698 + * constrained deadline (deadine < period) might be awakened after the 699 + * deadline, but before the next period. In this case, replenishing the 700 + * task would allow it to run for runtime / deadline. As in this case 701 + * deadline < period, CBS enables a task to run for more than the 702 + * runtime / period. In a very loaded system, this can cause a domino 703 + * effect, making other tasks miss their deadlines. 704 + * 705 + * To avoid this problem, in the activation of a constrained deadline 706 + * task after the deadline but before the next period, throttle the 707 + * task and set the replenishing timer to the begin of the next period, 708 + * unless it is boosted. 
709 + */ 710 + static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) 711 + { 712 + struct task_struct *p = dl_task_of(dl_se); 713 + struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se)); 714 + 715 + if (dl_time_before(dl_se->deadline, rq_clock(rq)) && 716 + dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { 717 + if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) 718 + return; 719 + dl_se->dl_throttled = 1; 720 + } 696 721 } 697 722 698 723 static ··· 959 922 __dequeue_dl_entity(dl_se); 960 923 } 961 924 925 + static inline bool dl_is_constrained(struct sched_dl_entity *dl_se) 926 + { 927 + return dl_se->dl_deadline < dl_se->dl_period; 928 + } 929 + 962 930 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) 963 931 { 964 932 struct task_struct *pi_task = rt_mutex_get_top_task(p); ··· 988 946 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); 989 947 return; 990 948 } 949 + 950 + /* 951 + * Check if a constrained deadline task was activated 952 + * after the deadline but before the next period. 953 + * If that is the case, the task will be throttled and 954 + * the replenishment timer will be set to the next period. 955 + */ 956 + if (!p->dl.dl_throttled && dl_is_constrained(&p->dl)) 957 + dl_check_constrained_dl(&p->dl); 991 958 992 959 /* 993 960 * If p is throttled, we do nothing. In fact, if it exhausted
+12 -8
kernel/sched/loadavg.c
··· 169 169 * If the folding window started, make sure we start writing in the 170 170 * next idle-delta. 171 171 */ 172 - if (!time_before(jiffies, calc_load_update)) 172 + if (!time_before(jiffies, READ_ONCE(calc_load_update))) 173 173 idx++; 174 174 175 175 return idx & 1; ··· 202 202 struct rq *this_rq = this_rq(); 203 203 204 204 /* 205 - * If we're still before the sample window, we're done. 205 + * If we're still before the pending sample window, we're done. 206 206 */ 207 + this_rq->calc_load_update = READ_ONCE(calc_load_update); 207 208 if (time_before(jiffies, this_rq->calc_load_update)) 208 209 return; 209 210 ··· 213 212 * accounted through the nohz accounting, so skip the entire deal and 214 213 * sync up for the next window. 215 214 */ 216 - this_rq->calc_load_update = calc_load_update; 217 215 if (time_before(jiffies, this_rq->calc_load_update + 10)) 218 216 this_rq->calc_load_update += LOAD_FREQ; 219 217 } ··· 308 308 */ 309 309 static void calc_global_nohz(void) 310 310 { 311 + unsigned long sample_window; 311 312 long delta, active, n; 312 313 313 - if (!time_before(jiffies, calc_load_update + 10)) { 314 + sample_window = READ_ONCE(calc_load_update); 315 + if (!time_before(jiffies, sample_window + 10)) { 314 316 /* 315 317 * Catch-up, fold however many we are behind still 316 318 */ 317 - delta = jiffies - calc_load_update - 10; 319 + delta = jiffies - sample_window - 10; 318 320 n = 1 + (delta / LOAD_FREQ); 319 321 320 322 active = atomic_long_read(&calc_load_tasks); ··· 326 324 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 327 325 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 328 326 329 - calc_load_update += n * LOAD_FREQ; 327 + WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ); 330 328 } 331 329 332 330 /* ··· 354 352 */ 355 353 void calc_global_load(unsigned long ticks) 356 354 { 355 + unsigned long sample_window; 357 356 long active, delta; 358 357 359 - if (time_before(jiffies, calc_load_update + 10)) 358 + 
sample_window = READ_ONCE(calc_load_update); 359 + if (time_before(jiffies, sample_window + 10)) 360 360 return; 361 361 362 362 /* ··· 375 371 avenrun[1] = calc_load(avenrun[1], EXP_5, active); 376 372 avenrun[2] = calc_load(avenrun[2], EXP_15, active); 377 373 378 - calc_load_update += LOAD_FREQ; 374 + WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ); 379 375 380 376 /* 381 377 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.