Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/fair: Task based throttle time accounting

With the task based throttle model, the previous way of checking a cfs_rq's
nr_queued to decide whether throttled time should be accounted doesn't work
as expected. For example, when a cfs_rq which has a single task is throttled,
that task could later block in kernel mode instead of being dequeued to the
limbo list, and accounting this as throttled time is not accurate.

Rework throttle time accounting for a cfs_rq as follows:
- start accounting when the first task gets throttled in its hierarchy;
- stop accounting on unthrottle.

Note that there will be a time gap between when a cfs_rq is throttled
and when a task in its hierarchy is actually throttled. This accounting
mechanism only starts accounting in the latter case.

Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # accounting mechanism
Co-developed-by: K Prateek Nayak <kprateek.nayak@amd.com> # simplify implementation
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250829081120.806-5-ziqianlu@bytedance.com

authored by

Aaron Lu and committed by
Peter Zijlstra
eb962f25 e1fad12d

+32 -25
+31 -25
kernel/sched/fair.c
··· 5293 5293 check_enqueue_throttle(cfs_rq); 5294 5294 list_add_leaf_cfs_rq(cfs_rq); 5295 5295 #ifdef CONFIG_CFS_BANDWIDTH 5296 - if (throttled_hierarchy(cfs_rq)) { 5296 + if (cfs_rq->pelt_clock_throttled) { 5297 5297 struct rq *rq = rq_of(cfs_rq); 5298 5298 5299 - if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5300 - cfs_rq->throttled_clock = rq_clock(rq); 5301 - if (!cfs_rq->throttled_clock_self) 5302 - cfs_rq->throttled_clock_self = rq_clock(rq); 5303 - 5304 - if (cfs_rq->pelt_clock_throttled) { 5305 - cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5306 - cfs_rq->throttled_clock_pelt; 5307 - cfs_rq->pelt_clock_throttled = 0; 5308 - } 5299 + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5300 + cfs_rq->throttled_clock_pelt; 5301 + cfs_rq->pelt_clock_throttled = 0; 5309 5302 } 5310 5303 #endif 5311 5304 } ··· 5386 5393 * DELAY_DEQUEUE relies on spurious wakeups, special task 5387 5394 * states must not suffer spurious wakeups, excempt them. 5388 5395 */ 5389 - if (flags & DEQUEUE_SPECIAL) 5396 + if (flags & (DEQUEUE_SPECIAL | DEQUEUE_THROTTLE)) 5390 5397 delay = false; 5391 5398 5392 5399 WARN_ON_ONCE(delay && se->sched_delayed); ··· 5792 5799 rq = scope.rq; 5793 5800 update_rq_clock(rq); 5794 5801 WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); 5795 - dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); 5802 + dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_THROTTLE); 5796 5803 list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); 5797 5804 /* 5798 5805 * Must not set throttled before dequeue or dequeue will ··· 5952 5959 task_work_add(p, &p->sched_throttle_work, TWA_RESUME); 5953 5960 } 5954 5961 5962 + static void record_throttle_clock(struct cfs_rq *cfs_rq) 5963 + { 5964 + struct rq *rq = rq_of(cfs_rq); 5965 + 5966 + if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5967 + cfs_rq->throttled_clock = rq_clock(rq); 5968 + 5969 + if (!cfs_rq->throttled_clock_self) 5970 + 
cfs_rq->throttled_clock_self = rq_clock(rq); 5971 + } 5972 + 5955 5973 static int tg_throttle_down(struct task_group *tg, void *data) 5956 5974 { 5957 5975 struct rq *rq = data; ··· 5971 5967 if (cfs_rq->throttle_count++) 5972 5968 return 0; 5973 5969 5974 - 5975 - /* group is entering throttled state, stop time */ 5976 - WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5977 - if (cfs_rq->nr_queued) 5978 - cfs_rq->throttled_clock_self = rq_clock(rq); 5979 - else { 5980 - /* 5981 - * For cfs_rqs that still have entities enqueued, PELT clock 5982 - * stop happens at dequeue time when all entities are dequeued. 5983 - */ 5970 + /* 5971 + * For cfs_rqs that still have entities enqueued, PELT clock 5972 + * stop happens at dequeue time when all entities are dequeued. 5973 + */ 5974 + if (!cfs_rq->nr_queued) { 5984 5975 list_del_leaf_cfs_rq(cfs_rq); 5985 5976 cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5986 5977 cfs_rq->pelt_clock_throttled = 1; 5987 5978 } 5988 5979 5980 + WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5989 5981 WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); 5990 5982 return 0; 5991 5983 } ··· 6024 6024 */ 6025 6025 cfs_rq->throttled = 1; 6026 6026 WARN_ON_ONCE(cfs_rq->throttled_clock); 6027 - if (cfs_rq->nr_queued) 6028 - cfs_rq->throttled_clock = rq_clock(rq); 6029 6027 return true; 6030 6028 } 6031 6029 ··· 6731 6733 static bool task_is_throttled(struct task_struct *p) { return false; } 6732 6734 static void dequeue_throttled_task(struct task_struct *p, int flags) {} 6733 6735 static bool enqueue_throttled_task(struct task_struct *p) { return false; } 6736 + static void record_throttle_clock(struct cfs_rq *cfs_rq) {} 6734 6737 6735 6738 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 6736 6739 { ··· 7050 7051 bool was_sched_idle = sched_idle_rq(rq); 7051 7052 bool task_sleep = flags & DEQUEUE_SLEEP; 7052 7053 bool task_delayed = flags & DEQUEUE_DELAYED; 7054 + bool task_throttled = flags & DEQUEUE_THROTTLE; 7053 7055 struct 
task_struct *p = NULL; 7054 7056 int h_nr_idle = 0; 7055 7057 int h_nr_queued = 0; ··· 7083 7083 7084 7084 if (cfs_rq_is_idle(cfs_rq)) 7085 7085 h_nr_idle = h_nr_queued; 7086 + 7087 + if (throttled_hierarchy(cfs_rq) && task_throttled) 7088 + record_throttle_clock(cfs_rq); 7086 7089 7087 7090 /* Don't dequeue parent if it has other entities besides us */ 7088 7091 if (cfs_rq->load.weight) { ··· 7123 7120 7124 7121 if (cfs_rq_is_idle(cfs_rq)) 7125 7122 h_nr_idle = h_nr_queued; 7123 + 7124 + if (throttled_hierarchy(cfs_rq) && task_throttled) 7125 + record_throttle_clock(cfs_rq); 7126 7126 } 7127 7127 7128 7128 sub_nr_running(rq, h_nr_queued);
+1
kernel/sched/sched.h
··· 2344 2344 #define DEQUEUE_SPECIAL 0x10 2345 2345 #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ 2346 2346 #define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ 2347 + #define DEQUEUE_THROTTLE 0x800 2347 2348 2348 2349 #define ENQUEUE_WAKEUP 0x01 2349 2350 #define ENQUEUE_RESTORE 0x02