Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/deadline: Deferrable dl server

Among the motivations for the DL servers is the real-time throttling
mechanism. This mechanism works by throttling the rt_rq after
running for a long period without leaving space for fair tasks.

The base dl server avoids this problem by boosting fair tasks instead
of throttling the rt_rq. The point is that it boosts without waiting
for potential starvation, causing some non-intuitive cases.

For example, an IRQ dispatches two tasks on an idle system, a fair
and an RT. The DL server will be activated, running the fair task
before the RT one. This problem can be avoided by deferring the
dl server activation.

By setting the defer option, the dl_server will dispatch an
SCHED_DEADLINE reservation with replenished runtime, but throttled.

The dl_timer will be set for the defer time at (period - runtime) ns
from start time, thus boosting the fair rq at defer time.

If the fair scheduler has the opportunity to run while waiting
for defer time, the dl server runtime will be consumed. If
the runtime is completely consumed before the defer time, the
server will be replenished while still in a throttled state. Then,
the dl_timer will be reset to the new defer time.

If the fair server reaches the defer time without consuming
its runtime, the server will start running, following CBS rules
(thus without breaking SCHED_DEADLINE). Then the server will
continue the running state (without deferring) until its fair
tasks are able to execute under the regular fair scheduler (end of
the starvation).

Signed-off-by: Daniel Bristot de Oliveira <bristot@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Juri Lelli <juri.lelli@redhat.com>
Link: https://lore.kernel.org/r/dd175943c72533cd9f0b87767c6499204879cc38.1716811044.git.bristot@kernel.org

authored by

Daniel Bristot de Oliveira and committed by
Peter Zijlstra
a110a81c 557a6bfc

+298 -45
+12
include/linux/sched.h
··· 641 641 * overruns. 642 642 * 643 643 * @dl_server tells if this is a server entity. 644 + * 645 + * @dl_defer tells if this is a deferred or regular server. For 646 + * now only defer server exists. 647 + * 648 + * @dl_defer_armed tells if the deferrable server is waiting 649 + * for the replenishment timer to activate it. 650 + * 651 + * @dl_defer_running tells if the deferrable server is actually 652 + * running, skipping the defer phase. 644 653 */ 645 654 unsigned int dl_throttled : 1; 646 655 unsigned int dl_yielded : 1; 647 656 unsigned int dl_non_contending : 1; 648 657 unsigned int dl_overrun : 1; 649 658 unsigned int dl_server : 1; 659 + unsigned int dl_defer : 1; 660 + unsigned int dl_defer_armed : 1; 661 + unsigned int dl_defer_running : 1; 650 662 651 663 /* 652 664 * Bandwidth enforcement timer. Each -deadline task has its
+261 -40
kernel/sched/deadline.c
··· 771 771 /* for non-boosted task, pi_of(dl_se) == dl_se */ 772 772 dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 773 773 dl_se->runtime = pi_of(dl_se)->dl_runtime; 774 + 775 + /* 776 + * If it is a deferred reservation, and the server 777 + * is not handling an starvation case, defer it. 778 + */ 779 + if (dl_se->dl_defer & !dl_se->dl_defer_running) { 780 + dl_se->dl_throttled = 1; 781 + dl_se->dl_defer_armed = 1; 782 + } 774 783 } 775 784 776 785 /* ··· 818 809 replenish_dl_new_period(dl_se, rq); 819 810 } 820 811 812 + static int start_dl_timer(struct sched_dl_entity *dl_se); 813 + static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); 814 + 821 815 /* 822 816 * Pure Earliest Deadline First (EDF) scheduling does not deal with the 823 817 * possibility of a entity lasting more than what it declared, and thus ··· 849 837 /* 850 838 * This could be the case for a !-dl task that is boosted. 851 839 * Just go with full inherited parameters. 840 + * 841 + * Or, it could be the case of a deferred reservation that 842 + * was not able to consume its runtime in background and 843 + * reached this point with current u > U. 844 + * 845 + * In both cases, set a new period. 852 846 */ 853 - if (dl_se->dl_deadline == 0) 854 - replenish_dl_new_period(dl_se, rq); 847 + if (dl_se->dl_deadline == 0 || 848 + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { 849 + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 850 + dl_se->runtime = pi_of(dl_se)->dl_runtime; 851 + } 855 852 856 853 if (dl_se->dl_yielded && dl_se->runtime > 0) 857 854 dl_se->runtime = 0; ··· 894 873 dl_se->dl_yielded = 0; 895 874 if (dl_se->dl_throttled) 896 875 dl_se->dl_throttled = 0; 876 + 877 + /* 878 + * If this is the replenishment of a deferred reservation, 879 + * clear the flag and return. 
880 + */ 881 + if (dl_se->dl_defer_armed) { 882 + dl_se->dl_defer_armed = 0; 883 + return; 884 + } 885 + 886 + /* 887 + * A this point, if the deferred server is not armed, and the deadline 888 + * is in the future, if it is not running already, throttle the server 889 + * and arm the defer timer. 890 + */ 891 + if (dl_se->dl_defer && !dl_se->dl_defer_running && 892 + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { 893 + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { 894 + 895 + /* 896 + * Set dl_se->dl_defer_armed and dl_throttled variables to 897 + * inform the start_dl_timer() that this is a deferred 898 + * activation. 899 + */ 900 + dl_se->dl_defer_armed = 1; 901 + dl_se->dl_throttled = 1; 902 + if (!start_dl_timer(dl_se)) { 903 + /* 904 + * If for whatever reason (delays), a previous timer was 905 + * queued but not serviced, cancel it and clean the 906 + * deferrable server variables intended for start_dl_timer(). 907 + */ 908 + hrtimer_try_to_cancel(&dl_se->dl_timer); 909 + dl_se->dl_defer_armed = 0; 910 + dl_se->dl_throttled = 0; 911 + } 912 + } 913 + } 897 914 } 898 915 899 916 /* ··· 1082 1023 } 1083 1024 1084 1025 replenish_dl_new_period(dl_se, rq); 1026 + } else if (dl_server(dl_se) && dl_se->dl_defer) { 1027 + /* 1028 + * The server can still use its previous deadline, so check if 1029 + * it left the dl_defer_running state. 1030 + */ 1031 + if (!dl_se->dl_defer_running) { 1032 + dl_se->dl_defer_armed = 1; 1033 + dl_se->dl_throttled = 1; 1034 + } 1085 1035 } 1086 1036 } 1087 1037 ··· 1123 1055 * We want the timer to fire at the deadline, but considering 1124 1056 * that it is actually coming from rq->clock and not from 1125 1057 * hrtimer's time base reading. 1058 + * 1059 + * The deferred reservation will have its timer set to 1060 + * (deadline - runtime). 
At that point, the CBS rule will decide 1061 + * if the current deadline can be used, or if a replenishment is 1062 + * required to avoid add too much pressure on the system 1063 + * (current u > U). 1126 1064 */ 1127 - act = ns_to_ktime(dl_next_period(dl_se)); 1065 + if (dl_se->dl_defer_armed) { 1066 + WARN_ON_ONCE(!dl_se->dl_throttled); 1067 + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); 1068 + } else { 1069 + /* act = deadline - rel-deadline + period */ 1070 + act = ns_to_ktime(dl_next_period(dl_se)); 1071 + } 1072 + 1128 1073 now = hrtimer_cb_get_time(timer); 1129 1074 delta = ktime_to_ns(now) - rq_clock(rq); 1130 1075 act = ktime_add_ns(act, delta); ··· 1187 1106 #endif 1188 1107 } 1189 1108 1109 + /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ 1110 + static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; 1111 + 1112 + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) 1113 + { 1114 + struct rq *rq = rq_of_dl_se(dl_se); 1115 + u64 fw; 1116 + 1117 + scoped_guard (rq_lock, rq) { 1118 + struct rq_flags *rf = &scope.rf; 1119 + 1120 + if (!dl_se->dl_throttled || !dl_se->dl_runtime) 1121 + return HRTIMER_NORESTART; 1122 + 1123 + sched_clock_tick(); 1124 + update_rq_clock(rq); 1125 + 1126 + if (!dl_se->dl_runtime) 1127 + return HRTIMER_NORESTART; 1128 + 1129 + if (!dl_se->server_has_tasks(dl_se)) { 1130 + replenish_dl_entity(dl_se); 1131 + return HRTIMER_NORESTART; 1132 + } 1133 + 1134 + if (dl_se->dl_defer_armed) { 1135 + /* 1136 + * First check if the server could consume runtime in background. 1137 + * If so, it is possible to push the defer timer for this amount 1138 + * of time. The dl_server_min_res serves as a limit to avoid 1139 + * forwarding the timer for a too small amount of time. 
1140 + */ 1141 + if (dl_time_before(rq_clock(dl_se->rq), 1142 + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { 1143 + 1144 + /* reset the defer timer */ 1145 + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; 1146 + 1147 + hrtimer_forward_now(timer, ns_to_ktime(fw)); 1148 + return HRTIMER_RESTART; 1149 + } 1150 + 1151 + dl_se->dl_defer_running = 1; 1152 + } 1153 + 1154 + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1155 + 1156 + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) 1157 + resched_curr(rq); 1158 + 1159 + __push_dl_task(rq, rf); 1160 + } 1161 + 1162 + return HRTIMER_NORESTART; 1163 + } 1164 + 1190 1165 /* 1191 1166 * This is the bandwidth enforcement timer callback. If here, we know 1192 1167 * a task is not on its dl_rq, since the fact that the timer was running ··· 1265 1128 struct rq_flags rf; 1266 1129 struct rq *rq; 1267 1130 1268 - if (dl_server(dl_se)) { 1269 - struct rq *rq = rq_of_dl_se(dl_se); 1270 - struct rq_flags rf; 1271 - 1272 - rq_lock(rq, &rf); 1273 - if (dl_se->dl_throttled) { 1274 - sched_clock_tick(); 1275 - update_rq_clock(rq); 1276 - 1277 - if (dl_se->server_has_tasks(dl_se)) { 1278 - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1279 - resched_curr(rq); 1280 - __push_dl_task(rq, &rf); 1281 - } else { 1282 - replenish_dl_entity(dl_se); 1283 - } 1284 - 1285 - } 1286 - rq_unlock(rq, &rf); 1287 - 1288 - return HRTIMER_NORESTART; 1289 - } 1131 + if (dl_server(dl_se)) 1132 + return dl_server_timer(timer, dl_se); 1290 1133 1291 1134 p = dl_task_of(dl_se); 1292 1135 rq = task_rq_lock(p, &rf); ··· 1436 1319 return (delta * u_act) >> BW_SHIFT; 1437 1320 } 1438 1321 1439 - static inline void 1440 - update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1441 - int flags); 1442 - static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1322 + s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1443 
1323 { 1444 1324 s64 scaled_delta_exec; 1445 - 1446 - if (unlikely(delta_exec <= 0)) { 1447 - if (unlikely(dl_se->dl_yielded)) 1448 - goto throttle; 1449 - return; 1450 - } 1451 - 1452 - if (dl_entity_is_special(dl_se)) 1453 - return; 1454 1325 1455 1326 /* 1456 1327 * For tasks that participate in GRUB, we implement GRUB-PA: the ··· 1458 1353 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1459 1354 } 1460 1355 1356 + return scaled_delta_exec; 1357 + } 1358 + 1359 + static inline void 1360 + update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1361 + int flags); 1362 + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1363 + { 1364 + s64 scaled_delta_exec; 1365 + 1366 + if (unlikely(delta_exec <= 0)) { 1367 + if (unlikely(dl_se->dl_yielded)) 1368 + goto throttle; 1369 + return; 1370 + } 1371 + 1372 + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) 1373 + return; 1374 + 1375 + if (dl_entity_is_special(dl_se)) 1376 + return; 1377 + 1378 + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); 1379 + 1461 1380 dl_se->runtime -= scaled_delta_exec; 1381 + 1382 + /* 1383 + * The fair server can consume its runtime while throttled (not queued/ 1384 + * running as regular CFS). 1385 + * 1386 + * If the server consumes its entire runtime in this state. The server 1387 + * is not required for the current period. Thus, reset the server by 1388 + * starting a new period, pushing the activation. 1389 + */ 1390 + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { 1391 + /* 1392 + * If the server was previously activated - the starving condition 1393 + * took place, it this point it went away because the fair scheduler 1394 + * was able to get runtime in background. So return to the initial 1395 + * state. 
1396 + */ 1397 + dl_se->dl_defer_running = 0; 1398 + 1399 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1400 + 1401 + replenish_dl_new_period(dl_se, dl_se->rq); 1402 + 1403 + /* 1404 + * Not being able to start the timer seems problematic. If it could not 1405 + * be started for whatever reason, we need to "unthrottle" the DL server 1406 + * and queue right away. Otherwise nothing might queue it. That's similar 1407 + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 1408 + */ 1409 + WARN_ON_ONCE(!start_dl_timer(dl_se)); 1410 + 1411 + return; 1412 + } 1462 1413 1463 1414 throttle: 1464 1415 if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { ··· 1575 1414 } 1576 1415 } 1577 1416 1417 + /* 1418 + * In the non-defer mode, the idle time is not accounted, as the 1419 + * server provides a guarantee. 1420 + * 1421 + * If the dl_server is in defer mode, the idle time is also considered 1422 + * as time available for the fair server, avoiding a penalty for the 1423 + * rt scheduler that did not consumed that time. 
1424 + */ 1425 + void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) 1426 + { 1427 + s64 delta_exec, scaled_delta_exec; 1428 + 1429 + if (!rq->fair_server.dl_defer) 1430 + return; 1431 + 1432 + /* no need to discount more */ 1433 + if (rq->fair_server.runtime < 0) 1434 + return; 1435 + 1436 + delta_exec = rq_clock_task(rq) - p->se.exec_start; 1437 + if (delta_exec < 0) 1438 + return; 1439 + 1440 + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); 1441 + 1442 + rq->fair_server.runtime -= scaled_delta_exec; 1443 + 1444 + if (rq->fair_server.runtime < 0) { 1445 + rq->fair_server.dl_defer_running = 0; 1446 + rq->fair_server.runtime = 0; 1447 + } 1448 + 1449 + p->se.exec_start = rq_clock_task(rq); 1450 + } 1451 + 1578 1452 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) 1579 1453 { 1580 - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1454 + /* 0 runtime = fair server disabled */ 1455 + if (dl_se->dl_runtime) 1456 + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1581 1457 } 1582 1458 1583 1459 void dl_server_start(struct sched_dl_entity *dl_se) ··· 1628 1430 dl_se->dl_period = 1000 * NSEC_PER_MSEC; 1629 1431 1630 1432 dl_se->dl_server = 1; 1433 + dl_se->dl_defer = 1; 1631 1434 setup_new_dl_entity(dl_se); 1632 1435 } 1633 1436 ··· 1646 1447 return; 1647 1448 1648 1449 dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); 1450 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1451 + dl_se->dl_defer_armed = 0; 1452 + dl_se->dl_throttled = 0; 1649 1453 } 1650 1454 1651 1455 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, ··· 1960 1758 * be counted in the active utilization; hence, we need to call 1961 1759 * add_running_bw(). 
1962 1760 */ 1963 - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 1761 + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 1964 1762 if (flags & ENQUEUE_WAKEUP) 1965 1763 task_contending(dl_se, flags); 1966 1764 ··· 1980 1778 } else if ((flags & ENQUEUE_RESTORE) && 1981 1779 dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { 1982 1780 setup_new_dl_entity(dl_se); 1781 + } 1782 + 1783 + /* 1784 + * If the reservation is still throttled, e.g., it got replenished but is a 1785 + * deferred task and still got to wait, don't enqueue. 1786 + */ 1787 + if (dl_se->dl_throttled && start_dl_timer(dl_se)) 1788 + return; 1789 + 1790 + /* 1791 + * We're about to enqueue, make sure we're not ->dl_throttled! 1792 + * In case the timer was not started, say because the defer time 1793 + * has passed, mark as not throttled and mark unarmed. 1794 + * Also cancel earlier timers, since letting those run is pointless. 1795 + */ 1796 + if (dl_se->dl_throttled) { 1797 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1798 + dl_se->dl_defer_armed = 0; 1799 + dl_se->dl_throttled = 0; 1983 1800 } 1984 1801 1985 1802 __enqueue_dl_entity(dl_se);
+20 -4
kernel/sched/fair.c
··· 1156 1156 static void update_curr(struct cfs_rq *cfs_rq) 1157 1157 { 1158 1158 struct sched_entity *curr = cfs_rq->curr; 1159 + struct rq *rq = rq_of(cfs_rq); 1159 1160 s64 delta_exec; 1160 1161 1161 1162 if (unlikely(!curr)) 1162 1163 return; 1163 1164 1164 - delta_exec = update_curr_se(rq_of(cfs_rq), curr); 1165 + delta_exec = update_curr_se(rq, curr); 1165 1166 if (unlikely(delta_exec <= 0)) 1166 1167 return; 1167 1168 ··· 1170 1169 update_deadline(cfs_rq, curr); 1171 1170 update_min_vruntime(cfs_rq); 1172 1171 1173 - if (entity_is_task(curr)) 1174 - update_curr_task(task_of(curr), delta_exec); 1172 + if (entity_is_task(curr)) { 1173 + struct task_struct *p = task_of(curr); 1174 + 1175 + update_curr_task(p, delta_exec); 1176 + 1177 + /* 1178 + * Any fair task that runs outside of fair_server should 1179 + * account against fair_server such that it can account for 1180 + * this time and possibly avoid running this period. 1181 + */ 1182 + if (p->dl_server != &rq->fair_server) 1183 + dl_server_update(&rq->fair_server, delta_exec); 1184 + } 1175 1185 1176 1186 account_cfs_rq_runtime(cfs_rq, delta_exec); 1177 1187 } ··· 6780 6768 */ 6781 6769 util_est_enqueue(&rq->cfs, p); 6782 6770 6783 - if (!throttled_hierarchy(task_cfs_rq(p)) && !rq->cfs.h_nr_running) 6771 + if (!throttled_hierarchy(task_cfs_rq(p)) && !rq->cfs.h_nr_running) { 6772 + /* Account for idle runtime */ 6773 + if (!rq->nr_running) 6774 + dl_server_update_idle_time(rq, rq->curr); 6784 6775 dl_server_start(&rq->fair_server); 6776 + } 6785 6777 6786 6778 /* 6787 6779 * If in_iowait is set, the code below may not trigger any cpufreq
+2
kernel/sched/idle.c
··· 452 452 453 453 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 454 454 { 455 + dl_server_update_idle_time(rq, prev); 455 456 } 456 457 457 458 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) 458 459 { 459 460 update_idle_core(rq); 460 461 schedstat_inc(rq->sched_goidle); 462 + next->se.exec_start = rq_clock_task(rq); 461 463 } 462 464 463 465 #ifdef CONFIG_SMP
+3 -1
kernel/sched/sched.h
··· 335 335 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 336 336 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); 337 337 extern int dl_bw_check_overflow(int cpu); 338 - 338 + extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); 339 339 /* 340 340 * SCHED_DEADLINE supports servers (nested scheduling) with the following 341 341 * interface: ··· 363 363 dl_server_has_tasks_f has_tasks, 364 364 dl_server_pick_f pick); 365 365 366 + extern void dl_server_update_idle_time(struct rq *rq, 367 + struct task_struct *p); 366 368 extern void fair_server_init(struct rq *rq); 367 369 368 370 #ifdef CONFIG_CGROUP_SCHED