Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

sched/fair: Switch to task based throttle model

In the current throttle model, when a cfs_rq is throttled, its entity is
dequeued from the cpu's rq, making tasks attached to it unable to run,
thus achieving the throttle target.

This has a drawback though: assume a task is a reader of percpu_rwsem
and is waiting. When it gets woken, it cannot run until its task group's
next period comes, which can be a relatively long time. A waiting writer
will have to wait longer because of this, and it also makes further readers
build up and eventually trigger hung-task warnings.

To improve this situation, change the throttle model to task based, i.e.
when a cfs_rq is throttled, record its throttled status but do not remove
it from cpu's rq. Instead, for tasks that belong to this cfs_rq, when
they get picked, add a task work to them so that when they return
to user, they can be dequeued there. In this way, tasks throttled will
not hold any kernel resources. And on unthrottle, enqueue back those
tasks so they can continue to run.

A throttled cfs_rq's PELT clock is handled differently now: previously the
cfs_rq's PELT clock was stopped once it entered the throttled state, but since
tasks (in kernel mode) can now continue to run, change the behaviour to
stop the PELT clock when the throttled cfs_rq has no tasks left.

Suggested-by: Chengming Zhou <chengming.zhou@linux.dev> # tag on pick
Signed-off-by: Valentin Schneider <vschneid@redhat.com>
Signed-off-by: Aaron Lu <ziqianlu@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Valentin Schneider <vschneid@redhat.com>
Tested-by: Chen Yu <yu.c.chen@intel.com>
Tested-by: Matteo Martelli <matteo.martelli@codethink.co.uk>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lore.kernel.org/r/20250829081120.806-4-ziqianlu@bytedance.com

authored by

Valentin Schneider and committed by
Peter Zijlstra
e1fad12d 7fc2d143

+186 -172
+182 -169
kernel/sched/fair.c
··· 5291 5291 5292 5292 if (cfs_rq->nr_queued == 1) { 5293 5293 check_enqueue_throttle(cfs_rq); 5294 - if (!throttled_hierarchy(cfs_rq)) { 5295 - list_add_leaf_cfs_rq(cfs_rq); 5296 - } else { 5294 + list_add_leaf_cfs_rq(cfs_rq); 5297 5295 #ifdef CONFIG_CFS_BANDWIDTH 5296 + if (throttled_hierarchy(cfs_rq)) { 5298 5297 struct rq *rq = rq_of(cfs_rq); 5299 5298 5300 5299 if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) 5301 5300 cfs_rq->throttled_clock = rq_clock(rq); 5302 5301 if (!cfs_rq->throttled_clock_self) 5303 5302 cfs_rq->throttled_clock_self = rq_clock(rq); 5304 - #endif 5303 + 5304 + if (cfs_rq->pelt_clock_throttled) { 5305 + cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5306 + cfs_rq->throttled_clock_pelt; 5307 + cfs_rq->pelt_clock_throttled = 0; 5308 + } 5305 5309 } 5310 + #endif 5306 5311 } 5307 5312 } 5308 5313 ··· 5346 5341 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5347 5342 5348 5343 cfs_rq->h_nr_runnable--; 5349 - if (cfs_rq_throttled(cfs_rq)) 5350 - break; 5351 5344 } 5352 5345 } 5353 5346 ··· 5366 5363 struct cfs_rq *cfs_rq = cfs_rq_of(se); 5367 5364 5368 5365 cfs_rq->h_nr_runnable++; 5369 - if (cfs_rq_throttled(cfs_rq)) 5370 - break; 5371 5366 } 5372 5367 } 5373 5368 ··· 5451 5450 if (flags & DEQUEUE_DELAYED) 5452 5451 finish_delayed_dequeue_entity(se); 5453 5452 5454 - if (cfs_rq->nr_queued == 0) 5453 + if (cfs_rq->nr_queued == 0) { 5455 5454 update_idle_cfs_rq_clock_pelt(cfs_rq); 5455 + #ifdef CONFIG_CFS_BANDWIDTH 5456 + if (throttled_hierarchy(cfs_rq)) { 5457 + struct rq *rq = rq_of(cfs_rq); 5458 + 5459 + list_del_leaf_cfs_rq(cfs_rq); 5460 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5461 + cfs_rq->pelt_clock_throttled = 1; 5462 + } 5463 + #endif 5464 + } 5456 5465 5457 5466 return true; 5458 5467 } ··· 5801 5790 WARN_ON_ONCE(p->throttled || !list_empty(&p->throttle_node)); 5802 5791 dequeue_task_fair(rq, p, DEQUEUE_SLEEP | DEQUEUE_SPECIAL); 5803 5792 list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); 5793 + /* 
5794 + * Must not set throttled before dequeue or dequeue will 5795 + * mistakenly regard this task as an already throttled one. 5796 + */ 5804 5797 p->throttled = true; 5805 5798 resched_curr(rq); 5806 5799 } ··· 5818 5803 INIT_LIST_HEAD(&p->throttle_node); 5819 5804 } 5820 5805 5806 + /* 5807 + * Task is throttled and someone wants to dequeue it again: 5808 + * it could be sched/core when core needs to do things like 5809 + * task affinity change, task group change, task sched class 5810 + * change etc. and in these cases, DEQUEUE_SLEEP is not set; 5811 + * or the task is blocked after throttled due to freezer etc. 5812 + * and in these cases, DEQUEUE_SLEEP is set. 5813 + */ 5814 + static void detach_task_cfs_rq(struct task_struct *p); 5815 + static void dequeue_throttled_task(struct task_struct *p, int flags) 5816 + { 5817 + WARN_ON_ONCE(p->se.on_rq); 5818 + list_del_init(&p->throttle_node); 5819 + 5820 + /* task blocked after throttled */ 5821 + if (flags & DEQUEUE_SLEEP) { 5822 + p->throttled = false; 5823 + return; 5824 + } 5825 + 5826 + /* 5827 + * task is migrating off its old cfs_rq, detach 5828 + * the task's load from its old cfs_rq. 5829 + */ 5830 + if (task_on_rq_migrating(p)) 5831 + detach_task_cfs_rq(p); 5832 + } 5833 + 5834 + static bool enqueue_throttled_task(struct task_struct *p) 5835 + { 5836 + struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); 5837 + 5838 + /* @p should have gone through dequeue_throttled_task() first */ 5839 + WARN_ON_ONCE(!list_empty(&p->throttle_node)); 5840 + 5841 + /* 5842 + * If the throttled task @p is enqueued to a throttled cfs_rq, 5843 + * take the fast path by directly putting the task on the 5844 + * target cfs_rq's limbo list. 
5845 + * 5846 + * Do not do that when @p is current because the following race can 5847 + * cause @p's group_node to be incorrectly re-inserted in its rq's 5848 + * cfs_tasks list, despite being throttled: 5849 + * 5850 + * cpuX cpuY 5851 + * p ret2user 5852 + * throttle_cfs_rq_work() sched_move_task(p) 5853 + * LOCK task_rq_lock 5854 + * dequeue_task_fair(p) 5855 + * UNLOCK task_rq_lock 5856 + * LOCK task_rq_lock 5857 + * task_current_donor(p) == true 5858 + * task_on_rq_queued(p) == true 5859 + * dequeue_task(p) 5860 + * put_prev_task(p) 5861 + * sched_change_group() 5862 + * enqueue_task(p) -> p's new cfs_rq 5863 + * is throttled, go 5864 + * fast path and skip 5865 + * actual enqueue 5866 + * set_next_task(p) 5867 + * list_move(&se->group_node, &rq->cfs_tasks); // bug 5868 + * schedule() 5869 + * 5870 + * In the above race case, @p's current cfs_rq is in the same rq as 5871 + * its previous cfs_rq because sched_move_task() only moves a task 5872 + * to a different group from the same rq, so we can use its current 5873 + * cfs_rq to derive rq and test if the task is current. 
5874 + */ 5875 + if (throttled_hierarchy(cfs_rq) && 5876 + !task_current_donor(rq_of(cfs_rq), p)) { 5877 + list_add(&p->throttle_node, &cfs_rq->throttled_limbo_list); 5878 + return true; 5879 + } 5880 + 5881 + /* we can't take the fast path, do an actual enqueue*/ 5882 + p->throttled = false; 5883 + return false; 5884 + } 5885 + 5886 + static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); 5821 5887 static int tg_unthrottle_up(struct task_group *tg, void *data) 5822 5888 { 5823 5889 struct rq *rq = data; 5824 5890 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5891 + struct task_struct *p, *tmp; 5825 5892 5826 - cfs_rq->throttle_count--; 5827 - if (!cfs_rq->throttle_count) { 5893 + if (--cfs_rq->throttle_count) 5894 + return 0; 5895 + 5896 + if (cfs_rq->pelt_clock_throttled) { 5828 5897 cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) - 5829 5898 cfs_rq->throttled_clock_pelt; 5830 - 5831 - /* Add cfs_rq with load or one or more already running entities to the list */ 5832 - if (!cfs_rq_is_decayed(cfs_rq)) 5833 - list_add_leaf_cfs_rq(cfs_rq); 5834 - 5835 - if (cfs_rq->throttled_clock_self) { 5836 - u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; 5837 - 5838 - cfs_rq->throttled_clock_self = 0; 5839 - 5840 - if (WARN_ON_ONCE((s64)delta < 0)) 5841 - delta = 0; 5842 - 5843 - cfs_rq->throttled_clock_self_time += delta; 5844 - } 5899 + cfs_rq->pelt_clock_throttled = 0; 5845 5900 } 5901 + 5902 + if (cfs_rq->throttled_clock_self) { 5903 + u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; 5904 + 5905 + cfs_rq->throttled_clock_self = 0; 5906 + 5907 + if (WARN_ON_ONCE((s64)delta < 0)) 5908 + delta = 0; 5909 + 5910 + cfs_rq->throttled_clock_self_time += delta; 5911 + } 5912 + 5913 + /* Re-enqueue the tasks that have been throttled at this level. 
*/ 5914 + list_for_each_entry_safe(p, tmp, &cfs_rq->throttled_limbo_list, throttle_node) { 5915 + list_del_init(&p->throttle_node); 5916 + p->throttled = false; 5917 + enqueue_task_fair(rq_of(cfs_rq), p, ENQUEUE_WAKEUP); 5918 + } 5919 + 5920 + /* Add cfs_rq with load or one or more already running entities to the list */ 5921 + if (!cfs_rq_is_decayed(cfs_rq)) 5922 + list_add_leaf_cfs_rq(cfs_rq); 5846 5923 5847 5924 return 0; 5848 5925 } ··· 5964 5857 struct rq *rq = data; 5965 5858 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; 5966 5859 5860 + if (cfs_rq->throttle_count++) 5861 + return 0; 5862 + 5863 + 5967 5864 /* group is entering throttled state, stop time */ 5968 - if (!cfs_rq->throttle_count) { 5969 - cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5865 + WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5866 + if (cfs_rq->nr_queued) 5867 + cfs_rq->throttled_clock_self = rq_clock(rq); 5868 + else { 5869 + /* 5870 + * For cfs_rqs that still have entities enqueued, PELT clock 5871 + * stop happens at dequeue time when all entities are dequeued. 5872 + */ 5970 5873 list_del_leaf_cfs_rq(cfs_rq); 5971 - 5972 - WARN_ON_ONCE(cfs_rq->throttled_clock_self); 5973 - if (cfs_rq->nr_queued) 5974 - cfs_rq->throttled_clock_self = rq_clock(rq); 5874 + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); 5875 + cfs_rq->pelt_clock_throttled = 1; 5975 5876 } 5976 - cfs_rq->throttle_count++; 5977 5877 5878 + WARN_ON_ONCE(!list_empty(&cfs_rq->throttled_limbo_list)); 5978 5879 return 0; 5979 5880 } 5980 5881 ··· 5990 5875 { 5991 5876 struct rq *rq = rq_of(cfs_rq); 5992 5877 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 5993 - struct sched_entity *se; 5994 - long queued_delta, runnable_delta, idle_delta, dequeue = 1; 5878 + int dequeue = 1; 5995 5879 5996 5880 raw_spin_lock(&cfs_b->lock); 5997 5881 /* This will start the period timer if necessary */ ··· 6013 5899 if (!dequeue) 6014 5900 return false; /* Throttle no longer required. 
*/ 6015 5901 6016 - se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 6017 - 6018 5902 /* freeze hierarchy runnable averages while throttled */ 6019 5903 rcu_read_lock(); 6020 5904 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); 6021 5905 rcu_read_unlock(); 6022 5906 6023 - queued_delta = cfs_rq->h_nr_queued; 6024 - runnable_delta = cfs_rq->h_nr_runnable; 6025 - idle_delta = cfs_rq->h_nr_idle; 6026 - for_each_sched_entity(se) { 6027 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6028 - int flags; 6029 - 6030 - /* throttled entity or throttle-on-deactivate */ 6031 - if (!se->on_rq) 6032 - goto done; 6033 - 6034 - /* 6035 - * Abuse SPECIAL to avoid delayed dequeue in this instance. 6036 - * This avoids teaching dequeue_entities() about throttled 6037 - * entities and keeps things relatively simple. 6038 - */ 6039 - flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; 6040 - if (se->sched_delayed) 6041 - flags |= DEQUEUE_DELAYED; 6042 - dequeue_entity(qcfs_rq, se, flags); 6043 - 6044 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6045 - idle_delta = cfs_rq->h_nr_queued; 6046 - 6047 - qcfs_rq->h_nr_queued -= queued_delta; 6048 - qcfs_rq->h_nr_runnable -= runnable_delta; 6049 - qcfs_rq->h_nr_idle -= idle_delta; 6050 - 6051 - if (qcfs_rq->load.weight) { 6052 - /* Avoid re-evaluating load for this entity: */ 6053 - se = parent_entity(se); 6054 - break; 6055 - } 6056 - } 6057 - 6058 - for_each_sched_entity(se) { 6059 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6060 - /* throttled entity or throttle-on-deactivate */ 6061 - if (!se->on_rq) 6062 - goto done; 6063 - 6064 - update_load_avg(qcfs_rq, se, 0); 6065 - se_update_runnable(se); 6066 - 6067 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6068 - idle_delta = cfs_rq->h_nr_queued; 6069 - 6070 - qcfs_rq->h_nr_queued -= queued_delta; 6071 - qcfs_rq->h_nr_runnable -= runnable_delta; 6072 - qcfs_rq->h_nr_idle -= idle_delta; 6073 - } 6074 - 6075 - /* At this point se is NULL and we are at root level*/ 6076 - sub_nr_running(rq, queued_delta); 6077 
- done: 6078 5907 /* 6079 5908 * Note: distribution will already see us throttled via the 6080 5909 * throttled-list. rq->lock protects completion. ··· 6033 5976 { 6034 5977 struct rq *rq = rq_of(cfs_rq); 6035 5978 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 6036 - struct sched_entity *se; 6037 - long queued_delta, runnable_delta, idle_delta; 6038 - long rq_h_nr_queued = rq->cfs.h_nr_queued; 5979 + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; 5980 + 5981 + /* 5982 + * It's possible we are called with !runtime_remaining due to things 5983 + * like user changed quota setting(see tg_set_cfs_bandwidth()) or async 5984 + * unthrottled us with a positive runtime_remaining but other still 5985 + * running entities consumed those runtime before we reached here. 5986 + * 5987 + * Anyway, we can't unthrottle this cfs_rq without any runtime remaining 5988 + * because any enqueue in tg_unthrottle_up() will immediately trigger a 5989 + * throttle, which is not supposed to happen on unthrottle path. 5990 + */ 5991 + if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) 5992 + return; 6039 5993 6040 5994 se = cfs_rq->tg->se[cpu_of(rq)]; 6041 5995 ··· 6076 6008 if (list_add_leaf_cfs_rq(cfs_rq_of(se))) 6077 6009 break; 6078 6010 } 6079 - goto unthrottle_throttle; 6080 6011 } 6081 6012 6082 - queued_delta = cfs_rq->h_nr_queued; 6083 - runnable_delta = cfs_rq->h_nr_runnable; 6084 - idle_delta = cfs_rq->h_nr_idle; 6085 - for_each_sched_entity(se) { 6086 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6087 - 6088 - /* Handle any unfinished DELAY_DEQUEUE business first. 
*/ 6089 - if (se->sched_delayed) { 6090 - int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; 6091 - 6092 - dequeue_entity(qcfs_rq, se, flags); 6093 - } else if (se->on_rq) 6094 - break; 6095 - enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); 6096 - 6097 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6098 - idle_delta = cfs_rq->h_nr_queued; 6099 - 6100 - qcfs_rq->h_nr_queued += queued_delta; 6101 - qcfs_rq->h_nr_runnable += runnable_delta; 6102 - qcfs_rq->h_nr_idle += idle_delta; 6103 - 6104 - /* end evaluation on encountering a throttled cfs_rq */ 6105 - if (cfs_rq_throttled(qcfs_rq)) 6106 - goto unthrottle_throttle; 6107 - } 6108 - 6109 - for_each_sched_entity(se) { 6110 - struct cfs_rq *qcfs_rq = cfs_rq_of(se); 6111 - 6112 - update_load_avg(qcfs_rq, se, UPDATE_TG); 6113 - se_update_runnable(se); 6114 - 6115 - if (cfs_rq_is_idle(group_cfs_rq(se))) 6116 - idle_delta = cfs_rq->h_nr_queued; 6117 - 6118 - qcfs_rq->h_nr_queued += queued_delta; 6119 - qcfs_rq->h_nr_runnable += runnable_delta; 6120 - qcfs_rq->h_nr_idle += idle_delta; 6121 - 6122 - /* end evaluation on encountering a throttled cfs_rq */ 6123 - if (cfs_rq_throttled(qcfs_rq)) 6124 - goto unthrottle_throttle; 6125 - } 6126 - 6127 - /* Start the fair server if un-throttling resulted in new runnable tasks */ 6128 - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) 6129 - dl_server_start(&rq->fair_server); 6130 - 6131 - /* At this point se is NULL and we are at root level*/ 6132 - add_nr_running(rq, queued_delta); 6133 - 6134 - unthrottle_throttle: 6135 6013 assert_list_leaf_cfs_rq(rq); 6136 6014 6137 6015 /* Determine whether we need to wake up potentially idle CPU: */ ··· 6731 6717 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 6732 6718 static void task_throttle_setup_work(struct task_struct *p) {} 6733 6719 static bool task_is_throttled(struct task_struct *p) { return false; } 6720 + static void dequeue_throttled_task(struct task_struct *p, int flags) {} 6721 + static bool 
enqueue_throttled_task(struct task_struct *p) { return false; } 6734 6722 6735 6723 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 6736 6724 { ··· 6925 6909 int rq_h_nr_queued = rq->cfs.h_nr_queued; 6926 6910 u64 slice = 0; 6927 6911 6912 + if (task_is_throttled(p) && enqueue_throttled_task(p)) 6913 + return; 6914 + 6928 6915 /* 6929 6916 * The code below (indirectly) updates schedutil which looks at 6930 6917 * the cfs_rq utilization to select a frequency. ··· 6980 6961 if (cfs_rq_is_idle(cfs_rq)) 6981 6962 h_nr_idle = 1; 6982 6963 6983 - /* end evaluation on encountering a throttled cfs_rq */ 6984 - if (cfs_rq_throttled(cfs_rq)) 6985 - goto enqueue_throttle; 6986 - 6987 6964 flags = ENQUEUE_WAKEUP; 6988 6965 } 6989 6966 ··· 7001 6986 7002 6987 if (cfs_rq_is_idle(cfs_rq)) 7003 6988 h_nr_idle = 1; 7004 - 7005 - /* end evaluation on encountering a throttled cfs_rq */ 7006 - if (cfs_rq_throttled(cfs_rq)) 7007 - goto enqueue_throttle; 7008 6989 } 7009 6990 7010 6991 if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { ··· 7030 7019 if (!task_new) 7031 7020 check_update_overutilized_status(rq); 7032 7021 7033 - enqueue_throttle: 7034 7022 assert_list_leaf_cfs_rq(rq); 7035 7023 7036 7024 hrtick_update(rq); ··· 7084 7074 if (cfs_rq_is_idle(cfs_rq)) 7085 7075 h_nr_idle = h_nr_queued; 7086 7076 7087 - /* end evaluation on encountering a throttled cfs_rq */ 7088 - if (cfs_rq_throttled(cfs_rq)) 7089 - return 0; 7090 - 7091 7077 /* Don't dequeue parent if it has other entities besides us */ 7092 7078 if (cfs_rq->load.weight) { 7093 7079 slice = cfs_rq_min_slice(cfs_rq); ··· 7120 7114 7121 7115 if (cfs_rq_is_idle(cfs_rq)) 7122 7116 h_nr_idle = h_nr_queued; 7123 - 7124 - /* end evaluation on encountering a throttled cfs_rq */ 7125 - if (cfs_rq_throttled(cfs_rq)) 7126 - return 0; 7127 7117 } 7128 7118 7129 7119 sub_nr_running(rq, h_nr_queued); ··· 7153 7151 */ 7154 7152 static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) 7155 7153 { 7154 + if 
(task_is_throttled(p)) { 7155 + dequeue_throttled_task(p, flags); 7156 + return true; 7157 + } 7158 + 7156 7159 if (!p->se.sched_delayed) 7157 7160 util_est_dequeue(&rq->cfs, p); 7158 7161 ··· 8826 8819 { 8827 8820 struct sched_entity *se; 8828 8821 struct cfs_rq *cfs_rq; 8822 + struct task_struct *p; 8823 + bool throttled; 8829 8824 8830 8825 again: 8831 8826 cfs_rq = &rq->cfs; 8832 8827 if (!cfs_rq->nr_queued) 8833 8828 return NULL; 8834 8829 8830 + throttled = false; 8831 + 8835 8832 do { 8836 8833 /* Might not have done put_prev_entity() */ 8837 8834 if (cfs_rq->curr && cfs_rq->curr->on_rq) 8838 8835 update_curr(cfs_rq); 8839 8836 8840 - if (unlikely(check_cfs_rq_runtime(cfs_rq))) 8841 - goto again; 8837 + throttled |= check_cfs_rq_runtime(cfs_rq); 8842 8838 8843 8839 se = pick_next_entity(rq, cfs_rq); 8844 8840 if (!se) ··· 8849 8839 cfs_rq = group_cfs_rq(se); 8850 8840 } while (cfs_rq); 8851 8841 8852 - return task_of(se); 8842 + p = task_of(se); 8843 + if (unlikely(throttled)) 8844 + task_throttle_setup_work(p); 8845 + return p; 8853 8846 } 8854 8847 8855 8848 static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first);
+2 -2
kernel/sched/pelt.h
··· 162 162 { 163 163 u64 throttled; 164 164 165 - if (unlikely(cfs_rq->throttle_count)) 165 + if (unlikely(cfs_rq->pelt_clock_throttled)) 166 166 throttled = U64_MAX; 167 167 else 168 168 throttled = cfs_rq->throttled_clock_pelt_time; ··· 173 173 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ 174 174 static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) 175 175 { 176 - if (unlikely(cfs_rq->throttle_count)) 176 + if (unlikely(cfs_rq->pelt_clock_throttled)) 177 177 return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time; 178 178 179 179 return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
+2 -1
kernel/sched/sched.h
··· 735 735 u64 throttled_clock_pelt_time; 736 736 u64 throttled_clock_self; 737 737 u64 throttled_clock_self_time; 738 - int throttled; 738 + bool throttled:1; 739 + bool pelt_clock_throttled:1; 739 740 int throttle_count; 740 741 struct list_head throttled_list; 741 742 struct list_head throttled_csd_list;