sched/deadline: Deferrable dl server · tjh.dev/kernel@a110a81

+12

include/linux/sched.h

··· 641 641 * overruns. 642 642 * 643 643 * @dl_server tells if this is a server entity. 644 + * 645 + * @dl_defer tells if this is a deferred or regular server. For 646 + * now only defer server exists. 647 + * 648 + * @dl_defer_armed tells if the deferrable server is waiting 649 + * for the replenishment timer to activate it. 650 + * 651 + * @dl_defer_running tells if the deferrable server is actually 652 + * running, skipping the defer phase. 644 653 */ 645 654 unsigned int dl_throttled : 1; 646 655 unsigned int dl_yielded : 1; 647 656 unsigned int dl_non_contending : 1; 648 657 unsigned int dl_overrun : 1; 649 658 unsigned int dl_server : 1; 659 + unsigned int dl_defer : 1; 660 + unsigned int dl_defer_armed : 1; 661 + unsigned int dl_defer_running : 1; 650 662 651 663 /* 652 664 * Bandwidth enforcement timer. Each -deadline task has its

+261 -40

kernel/sched/deadline.c

··· 771 771 /* for non-boosted task, pi_of(dl_se) == dl_se */ 772 772 dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 773 773 dl_se->runtime = pi_of(dl_se)->dl_runtime; 774 + 775 + /* 776 + * If it is a deferred reservation, and the server 777 + * is not handling a starvation case, defer it. 778 + */ 779 + if (dl_se->dl_defer & !dl_se->dl_defer_running) { 780 + dl_se->dl_throttled = 1; 781 + dl_se->dl_defer_armed = 1; 782 + } 774 783 } 775 784 776 785 /* ··· 818 809 replenish_dl_new_period(dl_se, rq); 819 810 } 820 811 812 + static int start_dl_timer(struct sched_dl_entity *dl_se); 813 + static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); 814 + 821 815 /* 822 816 * Pure Earliest Deadline First (EDF) scheduling does not deal with the 823 817 * possibility of an entity lasting more than what it declared, and thus ··· 849 837 /* 850 838 * This could be the case for a !-dl task that is boosted. 851 839 * Just go with full inherited parameters. 840 + * 841 + * Or, it could be the case of a deferred reservation that 842 + * was not able to consume its runtime in background and 843 + * reached this point with current u > U. 844 + * 845 + * In both cases, set a new period. 852 846 */ 853 - if (dl_se->dl_deadline == 0) 854 - replenish_dl_new_period(dl_se, rq); 847 + if (dl_se->dl_deadline == 0 || 848 + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { 849 + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 850 + dl_se->runtime = pi_of(dl_se)->dl_runtime; 851 + } 855 852 856 853 if (dl_se->dl_yielded && dl_se->runtime > 0) 857 854 dl_se->runtime = 0; ··· 894 873 dl_se->dl_yielded = 0; 895 874 if (dl_se->dl_throttled) 896 875 dl_se->dl_throttled = 0; 876 + 877 + /* 878 + * If this is the replenishment of a deferred reservation, 879 + * clear the flag and return.
880 + */ 881 + if (dl_se->dl_defer_armed) { 882 + dl_se->dl_defer_armed = 0; 883 + return; 884 + } 885 + 886 + /* 887 + * At this point, if the deferred server is not armed, and the deadline 888 + * is in the future, if it is not running already, throttle the server 889 + * and arm the defer timer. 890 + */ 891 + if (dl_se->dl_defer && !dl_se->dl_defer_running && 892 + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { 893 + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { 894 + 895 + /* 896 + * Set dl_se->dl_defer_armed and dl_throttled variables to 897 + * inform the start_dl_timer() that this is a deferred 898 + * activation. 899 + */ 900 + dl_se->dl_defer_armed = 1; 901 + dl_se->dl_throttled = 1; 902 + if (!start_dl_timer(dl_se)) { 903 + /* 904 + * If for whatever reason (delays), a previous timer was 905 + * queued but not serviced, cancel it and clean the 906 + * deferrable server variables intended for start_dl_timer(). 907 + */ 908 + hrtimer_try_to_cancel(&dl_se->dl_timer); 909 + dl_se->dl_defer_armed = 0; 910 + dl_se->dl_throttled = 0; 911 + } 912 + } 913 + } 897 914 } 898 915 899 916 /* ··· 1082 1023 } 1083 1024 1084 1025 replenish_dl_new_period(dl_se, rq); 1026 + } else if (dl_server(dl_se) && dl_se->dl_defer) { 1027 + /* 1028 + * The server can still use its previous deadline, so check if 1029 + * it left the dl_defer_running state. 1030 + */ 1031 + if (!dl_se->dl_defer_running) { 1032 + dl_se->dl_defer_armed = 1; 1033 + dl_se->dl_throttled = 1; 1034 + } 1085 1035 } 1086 1036 } 1087 1037 ··· 1123 1055 * We want the timer to fire at the deadline, but considering 1124 1056 * that it is actually coming from rq->clock and not from 1125 1057 * hrtimer's time base reading. 1058 + * 1059 + * The deferred reservation will have its timer set to 1060 + * (deadline - runtime).
At that point, the CBS rule will decide 1061 + * if the current deadline can be used, or if a replenishment is 1062 + * required to avoid adding too much pressure on the system 1063 + * (current u > U). 1126 1064 */ 1127 - act = ns_to_ktime(dl_next_period(dl_se)); 1065 + if (dl_se->dl_defer_armed) { 1066 + WARN_ON_ONCE(!dl_se->dl_throttled); 1067 + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); 1068 + } else { 1069 + /* act = deadline - rel-deadline + period */ 1070 + act = ns_to_ktime(dl_next_period(dl_se)); 1071 + } 1072 + 1128 1073 now = hrtimer_cb_get_time(timer); 1129 1074 delta = ktime_to_ns(now) - rq_clock(rq); 1130 1075 act = ktime_add_ns(act, delta); ··· 1187 1106 #endif 1188 1107 } 1189 1108 1109 + /* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ 1110 + static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; 1111 + 1112 + static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) 1113 + { 1114 + struct rq *rq = rq_of_dl_se(dl_se); 1115 + u64 fw; 1116 + 1117 + scoped_guard (rq_lock, rq) { 1118 + struct rq_flags *rf = &scope.rf; 1119 + 1120 + if (!dl_se->dl_throttled || !dl_se->dl_runtime) 1121 + return HRTIMER_NORESTART; 1122 + 1123 + sched_clock_tick(); 1124 + update_rq_clock(rq); 1125 + 1126 + if (!dl_se->dl_runtime) 1127 + return HRTIMER_NORESTART; 1128 + 1129 + if (!dl_se->server_has_tasks(dl_se)) { 1130 + replenish_dl_entity(dl_se); 1131 + return HRTIMER_NORESTART; 1132 + } 1133 + 1134 + if (dl_se->dl_defer_armed) { 1135 + /* 1136 + * First check if the server could consume runtime in background. 1137 + * If so, it is possible to push the defer timer for this amount 1138 + * of time. The dl_server_min_res serves as a limit to avoid 1139 + * forwarding the timer for too small an amount of time.
1140 + */ 1141 + if (dl_time_before(rq_clock(dl_se->rq), 1142 + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { 1143 + 1144 + /* reset the defer timer */ 1145 + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; 1146 + 1147 + hrtimer_forward_now(timer, ns_to_ktime(fw)); 1148 + return HRTIMER_RESTART; 1149 + } 1150 + 1151 + dl_se->dl_defer_running = 1; 1152 + } 1153 + 1154 + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1155 + 1156 + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) 1157 + resched_curr(rq); 1158 + 1159 + __push_dl_task(rq, rf); 1160 + } 1161 + 1162 + return HRTIMER_NORESTART; 1163 + } 1164 + 1190 1165 /* 1191 1166 * This is the bandwidth enforcement timer callback. If here, we know 1192 1167 * a task is not on its dl_rq, since the fact that the timer was running ··· 1265 1128 struct rq_flags rf; 1266 1129 struct rq *rq; 1267 1130 1268 - if (dl_server(dl_se)) { 1269 - struct rq *rq = rq_of_dl_se(dl_se); 1270 - struct rq_flags rf; 1271 - 1272 - rq_lock(rq, &rf); 1273 - if (dl_se->dl_throttled) { 1274 - sched_clock_tick(); 1275 - update_rq_clock(rq); 1276 - 1277 - if (dl_se->server_has_tasks(dl_se)) { 1278 - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); 1279 - resched_curr(rq); 1280 - __push_dl_task(rq, &rf); 1281 - } else { 1282 - replenish_dl_entity(dl_se); 1283 - } 1284 - 1285 - } 1286 - rq_unlock(rq, &rf); 1287 - 1288 - return HRTIMER_NORESTART; 1289 - } 1131 + if (dl_server(dl_se)) 1132 + return dl_server_timer(timer, dl_se); 1290 1133 1291 1134 p = dl_task_of(dl_se); 1292 1135 rq = task_rq_lock(p, &rf); ··· 1436 1319 return (delta * u_act) >> BW_SHIFT; 1437 1320 } 1438 1321 1439 - static inline void 1440 - update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1441 - int flags); 1442 - static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1322 + s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1443 
1323 { 1444 1324 s64 scaled_delta_exec; 1445 - 1446 - if (unlikely(delta_exec <= 0)) { 1447 - if (unlikely(dl_se->dl_yielded)) 1448 - goto throttle; 1449 - return; 1450 - } 1451 - 1452 - if (dl_entity_is_special(dl_se)) 1453 - return; 1454 1325 1455 1326 /* 1456 1327 * For tasks that participate in GRUB, we implement GRUB-PA: the ··· 1458 1353 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1459 1354 } 1460 1355 1356 + return scaled_delta_exec; 1357 + } 1358 + 1359 + static inline void 1360 + update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, 1361 + int flags); 1362 + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) 1363 + { 1364 + s64 scaled_delta_exec; 1365 + 1366 + if (unlikely(delta_exec <= 0)) { 1367 + if (unlikely(dl_se->dl_yielded)) 1368 + goto throttle; 1369 + return; 1370 + } 1371 + 1372 + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) 1373 + return; 1374 + 1375 + if (dl_entity_is_special(dl_se)) 1376 + return; 1377 + 1378 + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); 1379 + 1461 1380 dl_se->runtime -= scaled_delta_exec; 1381 + 1382 + /* 1383 + * The fair server can consume its runtime while throttled (not queued/ 1384 + * running as regular CFS). 1385 + * 1386 + * If the server consumes its entire runtime in this state, the server 1387 + * is not required for the current period. Thus, reset the server by 1388 + * starting a new period, pushing the activation. 1389 + */ 1390 + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { 1391 + /* 1392 + * If the server was previously activated - the starving condition 1393 + * took place, at this point it went away because the fair scheduler 1394 + * was able to get runtime in background. So return to the initial 1395 + * state.
1396 + */ 1397 + dl_se->dl_defer_running = 0; 1398 + 1399 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1400 + 1401 + replenish_dl_new_period(dl_se, dl_se->rq); 1402 + 1403 + /* 1404 + * Not being able to start the timer seems problematic. If it could not 1405 + * be started for whatever reason, we need to "unthrottle" the DL server 1406 + * and queue right away. Otherwise nothing might queue it. That's similar 1407 + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 1408 + */ 1409 + WARN_ON_ONCE(!start_dl_timer(dl_se)); 1410 + 1411 + return; 1412 + } 1462 1413 1463 1414 throttle: 1464 1415 if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { ··· 1575 1414 } 1576 1415 } 1577 1416 1417 + /* 1418 + * In the non-defer mode, the idle time is not accounted, as the 1419 + * server provides a guarantee. 1420 + * 1421 + * If the dl_server is in defer mode, the idle time is also considered 1422 + * as time available for the fair server, avoiding a penalty for the 1423 + * rt scheduler that did not consume that time.
1424 + */ 1425 + void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) 1426 + { 1427 + s64 delta_exec, scaled_delta_exec; 1428 + 1429 + if (!rq->fair_server.dl_defer) 1430 + return; 1431 + 1432 + /* no need to discount more */ 1433 + if (rq->fair_server.runtime < 0) 1434 + return; 1435 + 1436 + delta_exec = rq_clock_task(rq) - p->se.exec_start; 1437 + if (delta_exec < 0) 1438 + return; 1439 + 1440 + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); 1441 + 1442 + rq->fair_server.runtime -= scaled_delta_exec; 1443 + 1444 + if (rq->fair_server.runtime < 0) { 1445 + rq->fair_server.dl_defer_running = 0; 1446 + rq->fair_server.runtime = 0; 1447 + } 1448 + 1449 + p->se.exec_start = rq_clock_task(rq); 1450 + } 1451 + 1578 1452 void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) 1579 1453 { 1580 - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1454 + /* 0 runtime = fair server disabled */ 1455 + if (dl_se->dl_runtime) 1456 + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); 1581 1457 } 1582 1458 1583 1459 void dl_server_start(struct sched_dl_entity *dl_se) ··· 1628 1430 dl_se->dl_period = 1000 * NSEC_PER_MSEC; 1629 1431 1630 1432 dl_se->dl_server = 1; 1433 + dl_se->dl_defer = 1; 1631 1434 setup_new_dl_entity(dl_se); 1632 1435 } 1633 1436 ··· 1646 1447 return; 1647 1448 1648 1449 dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); 1450 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1451 + dl_se->dl_defer_armed = 0; 1452 + dl_se->dl_throttled = 0; 1649 1453 } 1650 1454 1651 1455 void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, ··· 1960 1758 * be counted in the active utilization; hence, we need to call 1961 1759 * add_running_bw(). 
1962 1760 */ 1963 - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 1761 + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { 1964 1762 if (flags & ENQUEUE_WAKEUP) 1965 1763 task_contending(dl_se, flags); 1966 1764 ··· 1980 1778 } else if ((flags & ENQUEUE_RESTORE) && 1981 1779 dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { 1982 1780 setup_new_dl_entity(dl_se); 1781 + } 1782 + 1783 + /* 1784 + * If the reservation is still throttled, e.g., it got replenished but is a 1785 + * deferred task and still got to wait, don't enqueue. 1786 + */ 1787 + if (dl_se->dl_throttled && start_dl_timer(dl_se)) 1788 + return; 1789 + 1790 + /* 1791 + * We're about to enqueue, make sure we're not ->dl_throttled! 1792 + * In case the timer was not started, say because the defer time 1793 + * has passed, mark as not throttled and mark unarmed. 1794 + * Also cancel earlier timers, since letting those run is pointless. 1795 + */ 1796 + if (dl_se->dl_throttled) { 1797 + hrtimer_try_to_cancel(&dl_se->dl_timer); 1798 + dl_se->dl_defer_armed = 0; 1799 + dl_se->dl_throttled = 0; 1983 1800 } 1984 1801 1985 1802 __enqueue_dl_entity(dl_se);

+20 -4

kernel/sched/fair.c

··· 1156 1156 static void update_curr(struct cfs_rq *cfs_rq) 1157 1157 { 1158 1158 struct sched_entity *curr = cfs_rq->curr; 1159 + struct rq *rq = rq_of(cfs_rq); 1159 1160 s64 delta_exec; 1160 1161 1161 1162 if (unlikely(!curr)) 1162 1163 return; 1163 1164 1164 - delta_exec = update_curr_se(rq_of(cfs_rq), curr); 1165 + delta_exec = update_curr_se(rq, curr); 1165 1166 if (unlikely(delta_exec <= 0)) 1166 1167 return; 1167 1168 ··· 1170 1169 update_deadline(cfs_rq, curr); 1171 1170 update_min_vruntime(cfs_rq); 1172 1171 1173 - if (entity_is_task(curr)) 1174 - update_curr_task(task_of(curr), delta_exec); 1172 + if (entity_is_task(curr)) { 1173 + struct task_struct *p = task_of(curr); 1174 + 1175 + update_curr_task(p, delta_exec); 1176 + 1177 + /* 1178 + * Any fair task that runs outside of fair_server should 1179 + * account against fair_server such that it can account for 1180 + * this time and possibly avoid running this period. 1181 + */ 1182 + if (p->dl_server != &rq->fair_server) 1183 + dl_server_update(&rq->fair_server, delta_exec); 1184 + } 1175 1185 1176 1186 account_cfs_rq_runtime(cfs_rq, delta_exec); 1177 1187 } ··· 6780 6768 */ 6781 6769 util_est_enqueue(&rq->cfs, p); 6782 6770 6783 - if (!throttled_hierarchy(task_cfs_rq(p)) && !rq->cfs.h_nr_running) 6771 + if (!throttled_hierarchy(task_cfs_rq(p)) && !rq->cfs.h_nr_running) { 6772 + /* Account for idle runtime */ 6773 + if (!rq->nr_running) 6774 + dl_server_update_idle_time(rq, rq->curr); 6784 6775 dl_server_start(&rq->fair_server); 6776 + } 6785 6777 6786 6778 /* 6787 6779 * If in_iowait is set, the code below may not trigger any cpufreq

+2

kernel/sched/idle.c

··· 452 452 453 453 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 454 454 { 455 + dl_server_update_idle_time(rq, prev); 455 456 } 456 457 457 458 static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) 458 459 { 459 460 update_idle_core(rq); 460 461 schedstat_inc(rq->sched_goidle); 462 + next->se.exec_start = rq_clock_task(rq); 461 463 } 462 464 463 465 #ifdef CONFIG_SMP

+3 -1

kernel/sched/sched.h

··· 335 335 extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 336 336 extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); 337 337 extern int dl_bw_check_overflow(int cpu); 338 - 338 + extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); 339 339 /* 340 340 * SCHED_DEADLINE supports servers (nested scheduling) with the following 341 341 * interface: ··· 363 363 dl_server_has_tasks_f has_tasks, 364 364 dl_server_pick_f pick); 365 365 366 + extern void dl_server_update_idle_time(struct rq *rq, 367 + struct task_struct *p); 366 368 extern void fair_server_init(struct rq *rq); 367 369 368 370 #ifdef CONFIG_CGROUP_SCHED

Configure Feed

Configure Feed