Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
"Debuggability:

- Change most occurrences of BUG_ON() to WARN_ON_ONCE()

- Reorganize & fix TASK_ state comparisons, turn it into a bitmap

- Update/fix misc scheduler debugging facilities

Load-balancing & regular scheduling:

- Improve the behavior of the scheduler in the presence of a lot of
SCHED_IDLE tasks - in particular they should not impact other
scheduling classes.

- Optimize task load tracking, cleanups & fixes

- Clean up & simplify misc load-balancing code

Freezer:

- Rewrite the core freezer to behave better wrt thawing and be
simpler in general, by replacing PF_FROZEN with TASK_FROZEN &
fixing/adjusting all the fallout.

Deadline scheduler:

- Fix the DL capacity-aware code

- Factor out dl_task_is_earliest_deadline() &
replenish_dl_new_period()

- Relax/optimize locking in task_non_contending()

Cleanups:

- Factor out the update_current_exec_runtime() helper

- Various cleanups, simplifications"

* tag 'sched-core-2022-10-07' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
sched: Fix more TASK_state comparisons
sched: Fix TASK_state comparisons
sched/fair: Move call to list_last_entry() in detach_tasks
sched/fair: Cleanup loop_max and loop_break
sched/fair: Make sure to try to detach at least one movable task
sched: Show PF_flag holes
freezer,sched: Rewrite core freezer logic
sched: Widen TAKS_state literals
sched/wait: Add wait_event_state()
sched/completion: Add wait_for_completion_state()
sched: Add TASK_ANY for wait_task_inactive()
sched: Change wait_task_inactive()s match_state
freezer,umh: Clean up freezer/initrd interaction
freezer: Have {,un}lock_system_sleep() save/restore flags
sched: Rename task_running() to task_on_cpu()
sched/fair: Cleanup for SIS_PROP
sched/fair: Default to false in test_idle_cores()
sched/fair: Remove useless check in select_idle_core()
sched/fair: Avoid double search on same cpu
sched/fair: Remove redundant check in select_idle_smt()
...

+618 -762
+8 -4
drivers/acpi/x86/s2idle.c
··· 654 654 655 655 int acpi_register_lps0_dev(struct acpi_s2idle_dev_ops *arg) 656 656 { 657 + unsigned int sleep_flags; 658 + 657 659 if (!lps0_device_handle || sleep_no_lps0) 658 660 return -ENODEV; 659 661 660 - lock_system_sleep(); 662 + sleep_flags = lock_system_sleep(); 661 663 list_add(&arg->list_node, &lps0_s2idle_devops_head); 662 - unlock_system_sleep(); 664 + unlock_system_sleep(sleep_flags); 663 665 664 666 return 0; 665 667 } ··· 669 667 670 668 void acpi_unregister_lps0_dev(struct acpi_s2idle_dev_ops *arg) 671 669 { 670 + unsigned int sleep_flags; 671 + 672 672 if (!lps0_device_handle || sleep_no_lps0) 673 673 return; 674 674 675 - lock_system_sleep(); 675 + sleep_flags = lock_system_sleep(); 676 676 list_del(&arg->list_node); 677 - unlock_system_sleep(); 677 + unlock_system_sleep(sleep_flags); 678 678 } 679 679 EXPORT_SYMBOL_GPL(acpi_unregister_lps0_dev); 680 680
+1 -3
drivers/android/binder.c
··· 4259 4259 struct binder_proc *proc = thread->proc; 4260 4260 int ret = 0; 4261 4261 4262 - freezer_do_not_count(); 4263 4262 binder_inner_proc_lock(proc); 4264 4263 for (;;) { 4265 - prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE); 4264 + prepare_to_wait(&thread->wait, &wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE); 4266 4265 if (binder_has_work_ilocked(thread, do_proc_work)) 4267 4266 break; 4268 4267 if (do_proc_work) ··· 4278 4279 } 4279 4280 finish_wait(&thread->wait, &wait); 4280 4281 binder_inner_proc_unlock(proc); 4281 - freezer_count(); 4282 4282 4283 4283 return ret; 4284 4284 }
+2 -2
drivers/media/pci/pt3/pt3.c
··· 445 445 pt3_proc_dma(adap); 446 446 447 447 delay = ktime_set(0, PT3_FETCH_DELAY * NSEC_PER_MSEC); 448 - set_current_state(TASK_UNINTERRUPTIBLE); 449 - freezable_schedule_hrtimeout_range(&delay, 448 + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); 449 + schedule_hrtimeout_range(&delay, 450 450 PT3_FETCH_DELAY_DELTA * NSEC_PER_MSEC, 451 451 HRTIMER_MODE_REL); 452 452 }
+1 -1
drivers/powercap/idle_inject.c
··· 254 254 iit = per_cpu_ptr(&idle_inject_thread, cpu); 255 255 iit->should_run = 0; 256 256 257 - wait_task_inactive(iit->tsk, 0); 257 + wait_task_inactive(iit->tsk, TASK_ANY); 258 258 } 259 259 260 260 cpu_hotplug_enable();
+4 -3
drivers/scsi/scsi_transport_spi.c
··· 998 998 spi_dv_device(struct scsi_device *sdev) 999 999 { 1000 1000 struct scsi_target *starget = sdev->sdev_target; 1001 - u8 *buffer; 1002 1001 const int len = SPI_MAX_ECHO_BUFFER_SIZE*2; 1002 + unsigned int sleep_flags; 1003 + u8 *buffer; 1003 1004 1004 1005 /* 1005 1006 * Because this function and the power management code both call ··· 1008 1007 * while suspend or resume is in progress. Hence the 1009 1008 * lock/unlock_system_sleep() calls. 1010 1009 */ 1011 - lock_system_sleep(); 1010 + sleep_flags = lock_system_sleep(); 1012 1011 1013 1012 if (scsi_autopm_get_device(sdev)) 1014 1013 goto unlock_system_sleep; ··· 1059 1058 scsi_autopm_put_device(sdev); 1060 1059 1061 1060 unlock_system_sleep: 1062 - unlock_system_sleep(); 1061 + unlock_system_sleep(sleep_flags); 1063 1062 } 1064 1063 EXPORT_SYMBOL(spi_dv_device); 1065 1064
+2 -2
fs/cifs/inode.c
··· 2327 2327 static int 2328 2328 cifs_wait_bit_killable(struct wait_bit_key *key, int mode) 2329 2329 { 2330 - freezable_schedule_unsafe(); 2330 + schedule(); 2331 2331 if (signal_pending_state(mode, current)) 2332 2332 return -ERESTARTSYS; 2333 2333 return 0; ··· 2345 2345 return 0; 2346 2346 2347 2347 rc = wait_on_bit_lock_action(flags, CIFS_INO_LOCK, cifs_wait_bit_killable, 2348 - TASK_KILLABLE); 2348 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 2349 2349 if (rc) 2350 2350 return rc; 2351 2351
+3 -2
fs/cifs/transport.c
··· 753 753 { 754 754 int error; 755 755 756 - error = wait_event_freezekillable_unsafe(server->response_q, 757 - midQ->mid_state != MID_REQUEST_SUBMITTED); 756 + error = wait_event_state(server->response_q, 757 + midQ->mid_state != MID_REQUEST_SUBMITTED, 758 + (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); 758 759 if (error < 0) 759 760 return -ERESTARTSYS; 760 761
+3 -4
fs/coredump.c
··· 402 402 if (core_waiters > 0) { 403 403 struct core_thread *ptr; 404 404 405 - freezer_do_not_count(); 406 - wait_for_completion(&core_state->startup); 407 - freezer_count(); 405 + wait_for_completion_state(&core_state->startup, 406 + TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); 408 407 /* 409 408 * Wait for all the threads to become inactive, so that 410 409 * all the thread context (extended register state, like ··· 411 412 */ 412 413 ptr = core_state->dumper.next; 413 414 while (ptr != NULL) { 414 - wait_task_inactive(ptr->task, 0); 415 + wait_task_inactive(ptr->task, TASK_ANY); 415 416 ptr = ptr->next; 416 417 } 417 418 }
+2 -1
fs/nfs/file.c
··· 567 567 } 568 568 569 569 wait_on_bit_action(&NFS_I(inode)->flags, NFS_INO_INVALIDATING, 570 - nfs_wait_bit_killable, TASK_KILLABLE); 570 + nfs_wait_bit_killable, 571 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 571 572 572 573 lock_page(page); 573 574 mapping = page_file_mapping(page);
+4 -8
fs/nfs/inode.c
··· 72 72 return nfs_fileid_to_ino_t(fattr->fileid); 73 73 } 74 74 75 - static int nfs_wait_killable(int mode) 75 + int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) 76 76 { 77 - freezable_schedule_unsafe(); 77 + schedule(); 78 78 if (signal_pending_state(mode, current)) 79 79 return -ERESTARTSYS; 80 80 return 0; 81 - } 82 - 83 - int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) 84 - { 85 - return nfs_wait_killable(mode); 86 81 } 87 82 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); 88 83 ··· 1327 1332 */ 1328 1333 for (;;) { 1329 1334 ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, 1330 - nfs_wait_bit_killable, TASK_KILLABLE); 1335 + nfs_wait_bit_killable, 1336 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 1331 1337 if (ret) 1332 1338 goto out; 1333 1339 spin_lock(&inode->i_lock);
+2 -1
fs/nfs/nfs3proc.c
··· 36 36 res = rpc_call_sync(clnt, msg, flags); 37 37 if (res != -EJUKEBOX) 38 38 break; 39 - freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME); 39 + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 40 + schedule_timeout(NFS_JUKEBOX_RETRY_TIME); 40 41 res = -ERESTARTSYS; 41 42 } while (!fatal_signal_pending(current)); 42 43 return res;
+7 -7
fs/nfs/nfs4proc.c
··· 416 416 { 417 417 might_sleep(); 418 418 419 - freezable_schedule_timeout_killable_unsafe( 420 - nfs4_update_delay(timeout)); 419 + __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 420 + schedule_timeout(nfs4_update_delay(timeout)); 421 421 if (!__fatal_signal_pending(current)) 422 422 return 0; 423 423 return -EINTR; ··· 427 427 { 428 428 might_sleep(); 429 429 430 - freezable_schedule_timeout_interruptible_unsafe(nfs4_update_delay(timeout)); 430 + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); 431 + schedule_timeout(nfs4_update_delay(timeout)); 431 432 if (!signal_pending(current)) 432 433 return 0; 433 434 return __fatal_signal_pending(current) ? -EINTR :-ERESTARTSYS; ··· 7407 7406 status = nfs4_proc_setlk(state, cmd, request); 7408 7407 if ((status != -EAGAIN) || IS_SETLK(cmd)) 7409 7408 break; 7410 - freezable_schedule_timeout_interruptible(timeout); 7409 + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 7410 + schedule_timeout(timeout); 7411 7411 timeout *= 2; 7412 7412 timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout); 7413 7413 status = -ERESTARTSYS; ··· 7476 7474 break; 7477 7475 7478 7476 status = -ERESTARTSYS; 7479 - freezer_do_not_count(); 7480 - wait_woken(&waiter.wait, TASK_INTERRUPTIBLE, 7477 + wait_woken(&waiter.wait, TASK_INTERRUPTIBLE|TASK_FREEZABLE, 7481 7478 NFS4_LOCK_MAXTIMEOUT); 7482 - freezer_count(); 7483 7479 } while (!signalled()); 7484 7480 7485 7481 remove_wait_queue(q, &waiter.wait);
+2 -1
fs/nfs/nfs4state.c
··· 1314 1314 1315 1315 refcount_inc(&clp->cl_count); 1316 1316 res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, 1317 - nfs_wait_bit_killable, TASK_KILLABLE); 1317 + nfs_wait_bit_killable, 1318 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 1318 1319 if (res) 1319 1320 goto out; 1320 1321 if (clp->cl_cons_state < 0)
+2 -2
fs/nfs/pnfs.c
··· 1908 1908 pnfs_layoutcommit_inode(lo->plh_inode, false); 1909 1909 return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, 1910 1910 nfs_wait_bit_killable, 1911 - TASK_KILLABLE); 1911 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 1912 1912 } 1913 1913 1914 1914 static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo) ··· 3192 3192 status = wait_on_bit_lock_action(&nfsi->flags, 3193 3193 NFS_INO_LAYOUTCOMMITTING, 3194 3194 nfs_wait_bit_killable, 3195 - TASK_KILLABLE); 3195 + TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 3196 3196 if (status) 3197 3197 goto out; 3198 3198 }
+4 -4
fs/xfs/xfs_trans_ail.c
··· 602 602 603 603 while (1) { 604 604 if (tout && tout <= 20) 605 - set_current_state(TASK_KILLABLE); 605 + set_current_state(TASK_KILLABLE|TASK_FREEZABLE); 606 606 else 607 - set_current_state(TASK_INTERRUPTIBLE); 607 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 608 608 609 609 /* 610 610 * Check kthread_should_stop() after we set the task state to ··· 653 653 ailp->ail_target == ailp->ail_target_prev && 654 654 list_empty(&ailp->ail_buf_list)) { 655 655 spin_unlock(&ailp->ail_lock); 656 - freezable_schedule(); 656 + schedule(); 657 657 tout = 0; 658 658 continue; 659 659 } 660 660 spin_unlock(&ailp->ail_lock); 661 661 662 662 if (tout) 663 - freezable_schedule_timeout(msecs_to_jiffies(tout)); 663 + schedule_timeout(msecs_to_jiffies(tout)); 664 664 665 665 __set_current_state(TASK_RUNNING); 666 666
+1
include/linux/completion.h
··· 103 103 extern void wait_for_completion_io(struct completion *); 104 104 extern int wait_for_completion_interruptible(struct completion *x); 105 105 extern int wait_for_completion_killable(struct completion *x); 106 + extern int wait_for_completion_state(struct completion *x, unsigned int state); 106 107 extern unsigned long wait_for_completion_timeout(struct completion *x, 107 108 unsigned long timeout); 108 109 extern unsigned long wait_for_completion_io_timeout(struct completion *x,
+10 -235
include/linux/freezer.h
··· 8 8 #include <linux/sched.h> 9 9 #include <linux/wait.h> 10 10 #include <linux/atomic.h> 11 + #include <linux/jump_label.h> 11 12 12 13 #ifdef CONFIG_FREEZER 13 - extern atomic_t system_freezing_cnt; /* nr of freezing conds in effect */ 14 + DECLARE_STATIC_KEY_FALSE(freezer_active); 15 + 14 16 extern bool pm_freezing; /* PM freezing in effect */ 15 17 extern bool pm_nosig_freezing; /* PM nosig freezing in effect */ 16 18 ··· 24 22 /* 25 23 * Check if a process has been frozen 26 24 */ 27 - static inline bool frozen(struct task_struct *p) 28 - { 29 - return p->flags & PF_FROZEN; 30 - } 25 + extern bool frozen(struct task_struct *p); 31 26 32 27 extern bool freezing_slow_path(struct task_struct *p); 33 28 ··· 33 34 */ 34 35 static inline bool freezing(struct task_struct *p) 35 36 { 36 - if (likely(!atomic_read(&system_freezing_cnt))) 37 - return false; 38 - return freezing_slow_path(p); 37 + if (static_branch_unlikely(&freezer_active)) 38 + return freezing_slow_path(p); 39 + 40 + return false; 39 41 } 40 42 41 43 /* Takes and releases task alloc lock using task_lock() */ ··· 48 48 extern void thaw_processes(void); 49 49 extern void thaw_kernel_threads(void); 50 50 51 - /* 52 - * DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION 53 - * If try_to_freeze causes a lockdep warning it means the caller may deadlock 54 - */ 55 - static inline bool try_to_freeze_unsafe(void) 51 + static inline bool try_to_freeze(void) 56 52 { 57 53 might_sleep(); 58 54 if (likely(!freezing(current))) 59 55 return false; 60 - return __refrigerator(false); 61 - } 62 - 63 - static inline bool try_to_freeze(void) 64 - { 65 56 if (!(current->flags & PF_NOFREEZE)) 66 57 debug_check_no_locks_held(); 67 - return try_to_freeze_unsafe(); 58 + return __refrigerator(false); 68 59 } 69 60 70 61 extern bool freeze_task(struct task_struct *p); ··· 70 79 } 71 80 #endif /* !CONFIG_CGROUP_FREEZER */ 72 81 73 - /* 74 - * The PF_FREEZER_SKIP flag should be set by a vfork parent right before it 75 - * calls 
wait_for_completion(&vfork) and reset right after it returns from this 76 - * function. Next, the parent should call try_to_freeze() to freeze itself 77 - * appropriately in case the child has exited before the freezing of tasks is 78 - * complete. However, we don't want kernel threads to be frozen in unexpected 79 - * places, so we allow them to block freeze_processes() instead or to set 80 - * PF_NOFREEZE if needed. Fortunately, in the ____call_usermodehelper() case the 81 - * parent won't really block freeze_processes(), since ____call_usermodehelper() 82 - * (the child) does a little before exec/exit and it can't be frozen before 83 - * waking up the parent. 84 - */ 85 - 86 - 87 - /** 88 - * freezer_do_not_count - tell freezer to ignore %current 89 - * 90 - * Tell freezers to ignore the current task when determining whether the 91 - * target frozen state is reached. IOW, the current task will be 92 - * considered frozen enough by freezers. 93 - * 94 - * The caller shouldn't do anything which isn't allowed for a frozen task 95 - * until freezer_cont() is called. Usually, freezer[_do_not]_count() pair 96 - * wrap a scheduling operation and nothing much else. 97 - */ 98 - static inline void freezer_do_not_count(void) 99 - { 100 - current->flags |= PF_FREEZER_SKIP; 101 - } 102 - 103 - /** 104 - * freezer_count - tell freezer to stop ignoring %current 105 - * 106 - * Undo freezer_do_not_count(). It tells freezers that %current should be 107 - * considered again and tries to freeze if freezing condition is already in 108 - * effect. 109 - */ 110 - static inline void freezer_count(void) 111 - { 112 - current->flags &= ~PF_FREEZER_SKIP; 113 - /* 114 - * If freezing is in progress, the following paired with smp_mb() 115 - * in freezer_should_skip() ensures that either we see %true 116 - * freezing() or freezer_should_skip() sees !PF_FREEZER_SKIP. 
117 - */ 118 - smp_mb(); 119 - try_to_freeze(); 120 - } 121 - 122 - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 123 - static inline void freezer_count_unsafe(void) 124 - { 125 - current->flags &= ~PF_FREEZER_SKIP; 126 - smp_mb(); 127 - try_to_freeze_unsafe(); 128 - } 129 - 130 - /** 131 - * freezer_should_skip - whether to skip a task when determining frozen 132 - * state is reached 133 - * @p: task in quesion 134 - * 135 - * This function is used by freezers after establishing %true freezing() to 136 - * test whether a task should be skipped when determining the target frozen 137 - * state is reached. IOW, if this function returns %true, @p is considered 138 - * frozen enough. 139 - */ 140 - static inline bool freezer_should_skip(struct task_struct *p) 141 - { 142 - /* 143 - * The following smp_mb() paired with the one in freezer_count() 144 - * ensures that either freezer_count() sees %true freezing() or we 145 - * see cleared %PF_FREEZER_SKIP and return %false. This makes it 146 - * impossible for a task to slip frozen state testing after 147 - * clearing %PF_FREEZER_SKIP. 148 - */ 149 - smp_mb(); 150 - return p->flags & PF_FREEZER_SKIP; 151 - } 152 - 153 - /* 154 - * These functions are intended to be used whenever you want allow a sleeping 155 - * task to be frozen. Note that neither return any clear indication of 156 - * whether a freeze event happened while in this function. 157 - */ 158 - 159 - /* Like schedule(), but should not block the freezer. */ 160 - static inline void freezable_schedule(void) 161 - { 162 - freezer_do_not_count(); 163 - schedule(); 164 - freezer_count(); 165 - } 166 - 167 - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 168 - static inline void freezable_schedule_unsafe(void) 169 - { 170 - freezer_do_not_count(); 171 - schedule(); 172 - freezer_count_unsafe(); 173 - } 174 - 175 - /* 176 - * Like schedule_timeout(), but should not block the freezer. Do not 177 - * call this with locks held. 
178 - */ 179 - static inline long freezable_schedule_timeout(long timeout) 180 - { 181 - long __retval; 182 - freezer_do_not_count(); 183 - __retval = schedule_timeout(timeout); 184 - freezer_count(); 185 - return __retval; 186 - } 187 - 188 - /* 189 - * Like schedule_timeout_interruptible(), but should not block the freezer. Do not 190 - * call this with locks held. 191 - */ 192 - static inline long freezable_schedule_timeout_interruptible(long timeout) 193 - { 194 - long __retval; 195 - freezer_do_not_count(); 196 - __retval = schedule_timeout_interruptible(timeout); 197 - freezer_count(); 198 - return __retval; 199 - } 200 - 201 - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 202 - static inline long freezable_schedule_timeout_interruptible_unsafe(long timeout) 203 - { 204 - long __retval; 205 - 206 - freezer_do_not_count(); 207 - __retval = schedule_timeout_interruptible(timeout); 208 - freezer_count_unsafe(); 209 - return __retval; 210 - } 211 - 212 - /* Like schedule_timeout_killable(), but should not block the freezer. */ 213 - static inline long freezable_schedule_timeout_killable(long timeout) 214 - { 215 - long __retval; 216 - freezer_do_not_count(); 217 - __retval = schedule_timeout_killable(timeout); 218 - freezer_count(); 219 - return __retval; 220 - } 221 - 222 - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 223 - static inline long freezable_schedule_timeout_killable_unsafe(long timeout) 224 - { 225 - long __retval; 226 - freezer_do_not_count(); 227 - __retval = schedule_timeout_killable(timeout); 228 - freezer_count_unsafe(); 229 - return __retval; 230 - } 231 - 232 - /* 233 - * Like schedule_hrtimeout_range(), but should not block the freezer. Do not 234 - * call this with locks held. 
235 - */ 236 - static inline int freezable_schedule_hrtimeout_range(ktime_t *expires, 237 - u64 delta, const enum hrtimer_mode mode) 238 - { 239 - int __retval; 240 - freezer_do_not_count(); 241 - __retval = schedule_hrtimeout_range(expires, delta, mode); 242 - freezer_count(); 243 - return __retval; 244 - } 245 - 246 - /* 247 - * Freezer-friendly wrappers around wait_event_interruptible(), 248 - * wait_event_killable() and wait_event_interruptible_timeout(), originally 249 - * defined in <linux/wait.h> 250 - */ 251 - 252 - /* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */ 253 - #define wait_event_freezekillable_unsafe(wq, condition) \ 254 - ({ \ 255 - int __retval; \ 256 - freezer_do_not_count(); \ 257 - __retval = wait_event_killable(wq, (condition)); \ 258 - freezer_count_unsafe(); \ 259 - __retval; \ 260 - }) 261 - 262 82 #else /* !CONFIG_FREEZER */ 263 83 static inline bool frozen(struct task_struct *p) { return false; } 264 84 static inline bool freezing(struct task_struct *p) { return false; } ··· 83 281 84 282 static inline bool try_to_freeze(void) { return false; } 85 283 86 - static inline void freezer_do_not_count(void) {} 87 - static inline void freezer_count(void) {} 88 - static inline int freezer_should_skip(struct task_struct *p) { return 0; } 89 284 static inline void set_freezable(void) {} 90 - 91 - #define freezable_schedule() schedule() 92 - 93 - #define freezable_schedule_unsafe() schedule() 94 - 95 - #define freezable_schedule_timeout(timeout) schedule_timeout(timeout) 96 - 97 - #define freezable_schedule_timeout_interruptible(timeout) \ 98 - schedule_timeout_interruptible(timeout) 99 - 100 - #define freezable_schedule_timeout_interruptible_unsafe(timeout) \ 101 - schedule_timeout_interruptible(timeout) 102 - 103 - #define freezable_schedule_timeout_killable(timeout) \ 104 - schedule_timeout_killable(timeout) 105 - 106 - #define freezable_schedule_timeout_killable_unsafe(timeout) \ 107 - schedule_timeout_killable(timeout) 108 - 109 - #define 
freezable_schedule_hrtimeout_range(expires, delta, mode) \ 110 - schedule_hrtimeout_range(expires, delta, mode) 111 - 112 - #define wait_event_freezekillable_unsafe(wq, condition) \ 113 - wait_event_killable(wq, condition) 114 285 115 286 #endif /* !CONFIG_FREEZER */ 116 287
+32 -18
include/linux/sched.h
··· 81 81 */ 82 82 83 83 /* Used in tsk->state: */ 84 - #define TASK_RUNNING 0x0000 85 - #define TASK_INTERRUPTIBLE 0x0001 86 - #define TASK_UNINTERRUPTIBLE 0x0002 87 - #define __TASK_STOPPED 0x0004 88 - #define __TASK_TRACED 0x0008 84 + #define TASK_RUNNING 0x00000000 85 + #define TASK_INTERRUPTIBLE 0x00000001 86 + #define TASK_UNINTERRUPTIBLE 0x00000002 87 + #define __TASK_STOPPED 0x00000004 88 + #define __TASK_TRACED 0x00000008 89 89 /* Used in tsk->exit_state: */ 90 - #define EXIT_DEAD 0x0010 91 - #define EXIT_ZOMBIE 0x0020 90 + #define EXIT_DEAD 0x00000010 91 + #define EXIT_ZOMBIE 0x00000020 92 92 #define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) 93 93 /* Used in tsk->state again: */ 94 - #define TASK_PARKED 0x0040 95 - #define TASK_DEAD 0x0080 96 - #define TASK_WAKEKILL 0x0100 97 - #define TASK_WAKING 0x0200 98 - #define TASK_NOLOAD 0x0400 99 - #define TASK_NEW 0x0800 100 - /* RT specific auxilliary flag to mark RT lock waiters */ 101 - #define TASK_RTLOCK_WAIT 0x1000 102 - #define TASK_STATE_MAX 0x2000 94 + #define TASK_PARKED 0x00000040 95 + #define TASK_DEAD 0x00000080 96 + #define TASK_WAKEKILL 0x00000100 97 + #define TASK_WAKING 0x00000200 98 + #define TASK_NOLOAD 0x00000400 99 + #define TASK_NEW 0x00000800 100 + #define TASK_RTLOCK_WAIT 0x00001000 101 + #define TASK_FREEZABLE 0x00002000 102 + #define __TASK_FREEZABLE_UNSAFE (0x00004000 * IS_ENABLED(CONFIG_LOCKDEP)) 103 + #define TASK_FROZEN 0x00008000 104 + #define TASK_STATE_MAX 0x00010000 105 + 106 + #define TASK_ANY (TASK_STATE_MAX-1) 107 + 108 + /* 109 + * DO NOT ADD ANY NEW USERS ! 
110 + */ 111 + #define TASK_FREEZABLE_UNSAFE (TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE) 103 112 104 113 /* Convenience macros for the sake of set_current_state: */ 105 114 #define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE) ··· 1722 1713 #define PF_MEMALLOC 0x00000800 /* Allocating memory */ 1723 1714 #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ 1724 1715 #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ 1716 + #define PF__HOLE__00004000 0x00004000 1725 1717 #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ 1726 - #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ 1718 + #define PF__HOLE__00010000 0x00010000 1727 1719 #define PF_KSWAPD 0x00020000 /* I am kswapd */ 1728 1720 #define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ 1729 1721 #define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ ··· 1732 1722 * I am cleaning dirty pages from some other bdi. */ 1733 1723 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 1734 1724 #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ 1725 + #define PF__HOLE__00800000 0x00800000 1726 + #define PF__HOLE__01000000 0x01000000 1727 + #define PF__HOLE__02000000 0x02000000 1735 1728 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ 1736 1729 #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1737 1730 #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ 1738 - #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1731 + #define PF__HOLE__20000000 0x20000000 1732 + #define PF__HOLE__40000000 0x40000000 1739 1733 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ 1740 1734 1741 1735 /*
+1 -6
include/linux/sunrpc/sched.h
··· 252 252 void rpc_free(struct rpc_task *); 253 253 int rpciod_up(void); 254 254 void rpciod_down(void); 255 - int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *); 255 + int rpc_wait_for_completion_task(struct rpc_task *task); 256 256 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) 257 257 struct net; 258 258 void rpc_show_tasks(struct net *); ··· 263 263 extern struct workqueue_struct *xprtiod_workqueue; 264 264 void rpc_prepare_task(struct rpc_task *task); 265 265 gfp_t rpc_task_gfp_mask(void); 266 - 267 - static inline int rpc_wait_for_completion_task(struct rpc_task *task) 268 - { 269 - return __rpc_wait_for_completion_task(task, NULL); 270 - } 271 266 272 267 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) || IS_ENABLED(CONFIG_TRACEPOINTS) 273 268 static inline const char * rpc_qname(const struct rpc_wait_queue *q)
+4 -4
include/linux/suspend.h
··· 511 511 extern void pm_wakep_autosleep_enabled(bool set); 512 512 extern void pm_print_active_wakeup_sources(void); 513 513 514 - extern void lock_system_sleep(void); 515 - extern void unlock_system_sleep(void); 514 + extern unsigned int lock_system_sleep(void); 515 + extern void unlock_system_sleep(unsigned int); 516 516 517 517 #else /* !CONFIG_PM_SLEEP */ 518 518 ··· 535 535 static inline void pm_wakeup_clear(bool reset) {} 536 536 static inline void pm_system_irq_wakeup(unsigned int irq_number) {} 537 537 538 - static inline void lock_system_sleep(void) {} 539 - static inline void unlock_system_sleep(void) {} 538 + static inline unsigned int lock_system_sleep(void) { return 0; } 539 + static inline void unlock_system_sleep(unsigned int flags) {} 540 540 541 541 #endif /* !CONFIG_PM_SLEEP */ 542 542
+5 -4
include/linux/umh.h
··· 11 11 struct cred; 12 12 struct file; 13 13 14 - #define UMH_NO_WAIT 0 /* don't wait at all */ 15 - #define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ 16 - #define UMH_WAIT_PROC 2 /* wait for the process to complete */ 17 - #define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ 14 + #define UMH_NO_WAIT 0x00 /* don't wait at all */ 15 + #define UMH_WAIT_EXEC 0x01 /* wait for the exec, but not the process */ 16 + #define UMH_WAIT_PROC 0x02 /* wait for the process to complete */ 17 + #define UMH_KILLABLE 0x04 /* wait for EXEC/PROC killable */ 18 + #define UMH_FREEZABLE 0x08 /* wait for EXEC/PROC freezable */ 18 19 19 20 struct subprocess_info { 20 21 struct work_struct work;
+35 -7
include/linux/wait.h
··· 281 281 282 282 #define ___wait_is_interruptible(state) \ 283 283 (!__builtin_constant_p(state) || \ 284 - state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ 284 + (state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) 285 285 286 286 extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags); 287 287 ··· 361 361 } while (0) 362 362 363 363 #define __wait_event_freezable(wq_head, condition) \ 364 - ___wait_event(wq_head, condition, TASK_INTERRUPTIBLE, 0, 0, \ 365 - freezable_schedule()) 364 + ___wait_event(wq_head, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), \ 365 + 0, 0, schedule()) 366 366 367 367 /** 368 368 * wait_event_freezable - sleep (or freeze) until a condition gets true ··· 420 420 421 421 #define __wait_event_freezable_timeout(wq_head, condition, timeout) \ 422 422 ___wait_event(wq_head, ___wait_cond_timeout(condition), \ 423 - TASK_INTERRUPTIBLE, 0, timeout, \ 424 - __ret = freezable_schedule_timeout(__ret)) 423 + (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 0, timeout, \ 424 + __ret = schedule_timeout(__ret)) 425 425 426 426 /* 427 427 * like wait_event_timeout() -- except it uses TASK_INTERRUPTIBLE to avoid ··· 642 642 643 643 644 644 #define __wait_event_freezable_exclusive(wq, condition) \ 645 - ___wait_event(wq, condition, TASK_INTERRUPTIBLE, 1, 0, \ 646 - freezable_schedule()) 645 + ___wait_event(wq, condition, (TASK_INTERRUPTIBLE|TASK_FREEZABLE), 1, 0,\ 646 + schedule()) 647 647 648 648 #define wait_event_freezable_exclusive(wq, condition) \ 649 649 ({ \ ··· 929 929 might_sleep(); \ 930 930 if (!(condition)) \ 931 931 __ret = __wait_event_killable(wq_head, condition); \ 932 + __ret; \ 933 + }) 934 + 935 + #define __wait_event_state(wq, condition, state) \ 936 + ___wait_event(wq, condition, state, 0, 0, schedule()) 937 + 938 + /** 939 + * wait_event_state - sleep until a condition gets true 940 + * @wq_head: the waitqueue to wait on 941 + * @condition: a C expression for the event to wait for 942 + * @state: state to sleep in 943 + 
* 944 + * The process is put to sleep (@state) until the @condition evaluates to true 945 + * or a signal is received (when allowed by @state). The @condition is checked 946 + * each time the waitqueue @wq_head is woken up. 947 + * 948 + * wake_up() has to be called after changing any variable that could 949 + * change the result of the wait condition. 950 + * 951 + * The function will return -ERESTARTSYS if it was interrupted by a signal 952 + * (when allowed by @state) and 0 if @condition evaluated to true. 953 + */ 954 + #define wait_event_state(wq_head, condition, state) \ 955 + ({ \ 956 + int __ret = 0; \ 957 + might_sleep(); \ 958 + if (!(condition)) \ 959 + __ret = __wait_event_state(wq_head, condition, state); \ 932 960 __ret; \ 933 961 }) 934 962
+1 -9
init/do_mounts_initrd.c
··· 99 99 init_mkdir("/old", 0700); 100 100 init_chdir("/old"); 101 101 102 - /* 103 - * In case that a resume from disk is carried out by linuxrc or one of 104 - * its children, we need to tell the freezer not to wait for us. 105 - */ 106 - current->flags |= PF_FREEZER_SKIP; 107 - 108 102 info = call_usermodehelper_setup("/linuxrc", argv, envp_init, 109 103 GFP_KERNEL, init_linuxrc, NULL, NULL); 110 104 if (!info) 111 105 return; 112 - call_usermodehelper_exec(info, UMH_WAIT_PROC); 113 - 114 - current->flags &= ~PF_FREEZER_SKIP; 106 + call_usermodehelper_exec(info, UMH_WAIT_PROC|UMH_FREEZABLE); 115 107 116 108 /* move initrd to rootfs' /old */ 117 109 init_mount("..", ".", NULL, MS_MOVE, NULL);
+8 -15
kernel/cgroup/legacy_freezer.c
··· 113 113 114 114 if (parent && (parent->state & CGROUP_FREEZING)) { 115 115 freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; 116 - atomic_inc(&system_freezing_cnt); 116 + static_branch_inc(&freezer_active); 117 117 } 118 118 119 119 mutex_unlock(&freezer_mutex); ··· 134 134 mutex_lock(&freezer_mutex); 135 135 136 136 if (freezer->state & CGROUP_FREEZING) 137 - atomic_dec(&system_freezing_cnt); 137 + static_branch_dec(&freezer_active); 138 138 139 139 freezer->state = 0; 140 140 ··· 179 179 __thaw_task(task); 180 180 } else { 181 181 freeze_task(task); 182 + 182 183 /* clear FROZEN and propagate upwards */ 183 184 while (freezer && (freezer->state & CGROUP_FROZEN)) { 184 185 freezer->state &= ~CGROUP_FROZEN; ··· 272 271 css_task_iter_start(css, 0, &it); 273 272 274 273 while ((task = css_task_iter_next(&it))) { 275 - if (freezing(task)) { 276 - /* 277 - * freezer_should_skip() indicates that the task 278 - * should be skipped when determining freezing 279 - * completion. Consider it frozen in addition to 280 - * the usual frozen condition. 281 - */ 282 - if (!frozen(task) && !freezer_should_skip(task)) 283 - goto out_iter_end; 284 - } 274 + if (freezing(task) && !frozen(task)) 275 + goto out_iter_end; 285 276 } 286 277 287 278 freezer->state |= CGROUP_FROZEN; ··· 350 357 351 358 if (freeze) { 352 359 if (!(freezer->state & CGROUP_FREEZING)) 353 - atomic_inc(&system_freezing_cnt); 360 + static_branch_inc(&freezer_active); 354 361 freezer->state |= state; 355 362 freeze_cgroup(freezer); 356 363 } else { ··· 359 366 freezer->state &= ~state; 360 367 361 368 if (!(freezer->state & CGROUP_FREEZING)) { 362 - if (was_freezing) 363 - atomic_dec(&system_freezing_cnt); 364 369 freezer->state &= ~CGROUP_FROZEN; 370 + if (was_freezing) 371 + static_branch_dec(&freezer_active); 365 372 unfreeze_cgroup(freezer); 366 373 } 367 374 }
+2 -2
kernel/exit.c
··· 374 374 complete(&core_state->startup); 375 375 376 376 for (;;) { 377 - set_current_state(TASK_UNINTERRUPTIBLE); 377 + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); 378 378 if (!self.task) /* see coredump_finish() */ 379 379 break; 380 - freezable_schedule(); 380 + schedule(); 381 381 } 382 382 __set_current_state(TASK_RUNNING); 383 383 }
+2 -3
kernel/fork.c
··· 1421 1421 static int wait_for_vfork_done(struct task_struct *child, 1422 1422 struct completion *vfork) 1423 1423 { 1424 + unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; 1424 1425 int killed; 1425 1426 1426 - freezer_do_not_count(); 1427 1427 cgroup_enter_frozen(); 1428 - killed = wait_for_completion_killable(vfork); 1428 + killed = wait_for_completion_state(vfork, state); 1429 1429 cgroup_leave_frozen(false); 1430 - freezer_count(); 1431 1430 1432 1431 if (killed) { 1433 1432 task_lock(child);
+96 -37
kernel/freezer.c
··· 13 13 #include <linux/kthread.h> 14 14 15 15 /* total number of freezing conditions in effect */ 16 - atomic_t system_freezing_cnt = ATOMIC_INIT(0); 17 - EXPORT_SYMBOL(system_freezing_cnt); 16 + DEFINE_STATIC_KEY_FALSE(freezer_active); 17 + EXPORT_SYMBOL(freezer_active); 18 18 19 - /* indicate whether PM freezing is in effect, protected by 19 + /* 20 + * indicate whether PM freezing is in effect, protected by 20 21 * system_transition_mutex 21 22 */ 22 23 bool pm_freezing; ··· 30 29 * freezing_slow_path - slow path for testing whether a task needs to be frozen 31 30 * @p: task to be tested 32 31 * 33 - * This function is called by freezing() if system_freezing_cnt isn't zero 32 + * This function is called by freezing() if freezer_active isn't zero 34 33 * and tests whether @p needs to enter and stay in frozen state. Can be 35 34 * called under any context. The freezers are responsible for ensuring the 36 35 * target tasks see the updated state. ··· 53 52 } 54 53 EXPORT_SYMBOL(freezing_slow_path); 55 54 55 + bool frozen(struct task_struct *p) 56 + { 57 + return READ_ONCE(p->__state) & TASK_FROZEN; 58 + } 59 + 56 60 /* Refrigerator is place where frozen processes are stored :-). */ 57 61 bool __refrigerator(bool check_kthr_stop) 58 62 { 59 - /* Hmm, should we be allowed to suspend when there are realtime 60 - processes around? 
*/ 63 + unsigned int state = get_current_state(); 61 64 bool was_frozen = false; 62 - unsigned int save = get_current_state(); 63 65 64 66 pr_debug("%s entered refrigerator\n", current->comm); 65 67 68 + WARN_ON_ONCE(state && !(state & TASK_NORMAL)); 69 + 66 70 for (;;) { 67 - set_current_state(TASK_UNINTERRUPTIBLE); 71 + bool freeze; 72 + 73 + set_current_state(TASK_FROZEN); 68 74 69 75 spin_lock_irq(&freezer_lock); 70 - current->flags |= PF_FROZEN; 71 - if (!freezing(current) || 72 - (check_kthr_stop && kthread_should_stop())) 73 - current->flags &= ~PF_FROZEN; 76 + freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); 74 77 spin_unlock_irq(&freezer_lock); 75 78 76 - if (!(current->flags & PF_FROZEN)) 79 + if (!freeze) 77 80 break; 81 + 78 82 was_frozen = true; 79 83 schedule(); 80 84 } 85 + __set_current_state(TASK_RUNNING); 81 86 82 87 pr_debug("%s left refrigerator\n", current->comm); 83 - 84 - /* 85 - * Restore saved task state before returning. The mb'd version 86 - * needs to be used; otherwise, it might silently break 87 - * synchronization which depends on ordered task state change. 88 - */ 89 - set_current_state(save); 90 88 91 89 return was_frozen; 92 90 } ··· 99 99 signal_wake_up(p, 0); 100 100 unlock_task_sighand(p, &flags); 101 101 } 102 + } 103 + 104 + static int __set_task_frozen(struct task_struct *p, void *arg) 105 + { 106 + unsigned int state = READ_ONCE(p->__state); 107 + 108 + if (p->on_rq) 109 + return 0; 110 + 111 + if (p != current && task_curr(p)) 112 + return 0; 113 + 114 + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) 115 + return 0; 116 + 117 + /* 118 + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they 119 + * can suffer spurious wakeups. 120 + */ 121 + if (state & TASK_FREEZABLE) 122 + WARN_ON_ONCE(!(state & TASK_NORMAL)); 123 + 124 + #ifdef CONFIG_LOCKDEP 125 + /* 126 + * It's dangerous to freeze with locks held; there be dragons there. 
127 + */ 128 + if (!(state & __TASK_FREEZABLE_UNSAFE)) 129 + WARN_ON_ONCE(debug_locks && p->lockdep_depth); 130 + #endif 131 + 132 + WRITE_ONCE(p->__state, TASK_FROZEN); 133 + return TASK_FROZEN; 134 + } 135 + 136 + static bool __freeze_task(struct task_struct *p) 137 + { 138 + /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ 139 + return task_call_func(p, __set_task_frozen, NULL); 102 140 } 103 141 104 142 /** ··· 154 116 { 155 117 unsigned long flags; 156 118 157 - /* 158 - * This check can race with freezer_do_not_count, but worst case that 159 - * will result in an extra wakeup being sent to the task. It does not 160 - * race with freezer_count(), the barriers in freezer_count() and 161 - * freezer_should_skip() ensure that either freezer_count() sees 162 - * freezing == true in try_to_freeze() and freezes, or 163 - * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task 164 - * normally. 165 - */ 166 - if (freezer_should_skip(p)) 167 - return false; 168 - 169 119 spin_lock_irqsave(&freezer_lock, flags); 170 - if (!freezing(p) || frozen(p)) { 120 + if (!freezing(p) || frozen(p) || __freeze_task(p)) { 171 121 spin_unlock_irqrestore(&freezer_lock, flags); 172 122 return false; 173 123 } ··· 163 137 if (!(p->flags & PF_KTHREAD)) 164 138 fake_signal_wake_up(p); 165 139 else 166 - wake_up_state(p, TASK_INTERRUPTIBLE); 140 + wake_up_state(p, TASK_NORMAL); 167 141 168 142 spin_unlock_irqrestore(&freezer_lock, flags); 169 143 return true; 170 144 } 171 145 146 + /* 147 + * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical 148 + * state in p->jobctl. If either of them got a wakeup that was missed because 149 + * TASK_FROZEN, then their canonical state reflects that and the below will 150 + * refuse to restore the special state and instead issue the wakeup. 
151 + */ 152 + static int __set_task_special(struct task_struct *p, void *arg) 153 + { 154 + unsigned int state = 0; 155 + 156 + if (p->jobctl & JOBCTL_TRACED) 157 + state = TASK_TRACED; 158 + 159 + else if (p->jobctl & JOBCTL_STOPPED) 160 + state = TASK_STOPPED; 161 + 162 + if (state) 163 + WRITE_ONCE(p->__state, state); 164 + 165 + return state; 166 + } 167 + 172 168 void __thaw_task(struct task_struct *p) 173 169 { 174 - unsigned long flags; 170 + unsigned long flags, flags2; 175 171 176 172 spin_lock_irqsave(&freezer_lock, flags); 177 - if (frozen(p)) 178 - wake_up_process(p); 173 + if (WARN_ON_ONCE(freezing(p))) 174 + goto unlock; 175 + 176 + if (lock_task_sighand(p, &flags2)) { 177 + /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ 178 + bool ret = task_call_func(p, __set_task_special, NULL); 179 + unlock_task_sighand(p, &flags2); 180 + if (ret) 181 + goto unlock; 182 + } 183 + 184 + wake_up_state(p, TASK_FROZEN); 185 + unlock: 179 186 spin_unlock_irqrestore(&freezer_lock, flags); 180 187 } 181 188
+4 -4
kernel/futex/waitwake.c
··· 334 334 * futex_queue() calls spin_unlock() upon completion, both serializing 335 335 * access to the hash list and forcing another memory barrier. 336 336 */ 337 - set_current_state(TASK_INTERRUPTIBLE); 337 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 338 338 futex_queue(q, hb); 339 339 340 340 /* Arm the timer */ ··· 352 352 * is no timeout, or if it has yet to expire. 353 353 */ 354 354 if (!timeout || timeout->task) 355 - freezable_schedule(); 355 + schedule(); 356 356 } 357 357 __set_current_state(TASK_RUNNING); 358 358 } ··· 430 430 return ret; 431 431 } 432 432 433 - set_current_state(TASK_INTERRUPTIBLE); 433 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 434 434 435 435 for (i = 0; i < count; i++) { 436 436 u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; ··· 504 504 return; 505 505 } 506 506 507 - freezable_schedule(); 507 + schedule(); 508 508 } 509 509 510 510 /**
+12 -4
kernel/hung_task.c
··· 95 95 * Ensure the task is not frozen. 96 96 * Also, skip vfork and any other user process that freezer should skip. 97 97 */ 98 - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) 99 - return; 98 + if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) 99 + return; 100 100 101 101 /* 102 102 * When a freshly created task is scheduled once, changes its state to ··· 191 191 hung_task_show_lock = false; 192 192 rcu_read_lock(); 193 193 for_each_process_thread(g, t) { 194 + unsigned int state; 195 + 194 196 if (!max_count--) 195 197 goto unlock; 196 198 if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { ··· 200 198 goto unlock; 201 199 last_break = jiffies; 202 200 } 203 - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ 204 - if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) 201 + /* 202 + * skip the TASK_KILLABLE tasks -- these can be killed 203 + * skip the TASK_IDLE tasks -- those are genuinely idle 204 + */ 205 + state = READ_ONCE(t->__state); 206 + if ((state & TASK_UNINTERRUPTIBLE) && 207 + !(state & TASK_WAKEKILL) && 208 + !(state & TASK_NOLOAD)) 205 209 check_hung_task(t, timeout); 206 210 } 207 211 unlock:
+22 -13
kernel/power/hibernate.c
··· 92 92 */ 93 93 void hibernation_set_ops(const struct platform_hibernation_ops *ops) 94 94 { 95 + unsigned int sleep_flags; 96 + 95 97 if (ops && !(ops->begin && ops->end && ops->pre_snapshot 96 98 && ops->prepare && ops->finish && ops->enter && ops->pre_restore 97 99 && ops->restore_cleanup && ops->leave)) { 98 100 WARN_ON(1); 99 101 return; 100 102 } 101 - lock_system_sleep(); 103 + 104 + sleep_flags = lock_system_sleep(); 105 + 102 106 hibernation_ops = ops; 103 107 if (ops) 104 108 hibernation_mode = HIBERNATION_PLATFORM; 105 109 else if (hibernation_mode == HIBERNATION_PLATFORM) 106 110 hibernation_mode = HIBERNATION_SHUTDOWN; 107 111 108 - unlock_system_sleep(); 112 + unlock_system_sleep(sleep_flags); 109 113 } 110 114 EXPORT_SYMBOL_GPL(hibernation_set_ops); 111 115 ··· 717 713 int hibernate(void) 718 714 { 719 715 bool snapshot_test = false; 716 + unsigned int sleep_flags; 720 717 int error; 721 718 722 719 if (!hibernation_available()) { ··· 725 720 return -EPERM; 726 721 } 727 722 728 - lock_system_sleep(); 723 + sleep_flags = lock_system_sleep(); 729 724 /* The snapshot device should not be opened while we're running */ 730 725 if (!hibernate_acquire()) { 731 726 error = -EBUSY; ··· 799 794 pm_restore_console(); 800 795 hibernate_release(); 801 796 Unlock: 802 - unlock_system_sleep(); 797 + unlock_system_sleep(sleep_flags); 803 798 pr_info("hibernation exit\n"); 804 799 805 800 return error; ··· 814 809 */ 815 810 int hibernate_quiet_exec(int (*func)(void *data), void *data) 816 811 { 812 + unsigned int sleep_flags; 817 813 int error; 818 814 819 - lock_system_sleep(); 815 + sleep_flags = lock_system_sleep(); 820 816 821 817 if (!hibernate_acquire()) { 822 818 error = -EBUSY; ··· 897 891 hibernate_release(); 898 892 899 893 unlock: 900 - unlock_system_sleep(); 894 + unlock_system_sleep(sleep_flags); 901 895 902 896 return error; 903 897 } ··· 1106 1100 static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, 1107 1101 const char 
*buf, size_t n) 1108 1102 { 1103 + int mode = HIBERNATION_INVALID; 1104 + unsigned int sleep_flags; 1109 1105 int error = 0; 1110 - int i; 1111 1106 int len; 1112 1107 char *p; 1113 - int mode = HIBERNATION_INVALID; 1108 + int i; 1114 1109 1115 1110 if (!hibernation_available()) 1116 1111 return -EPERM; ··· 1119 1112 p = memchr(buf, '\n', n); 1120 1113 len = p ? p - buf : n; 1121 1114 1122 - lock_system_sleep(); 1115 + sleep_flags = lock_system_sleep(); 1123 1116 for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { 1124 1117 if (len == strlen(hibernation_modes[i]) 1125 1118 && !strncmp(buf, hibernation_modes[i], len)) { ··· 1149 1142 if (!error) 1150 1143 pm_pr_dbg("Hibernation mode set to '%s'\n", 1151 1144 hibernation_modes[mode]); 1152 - unlock_system_sleep(); 1145 + unlock_system_sleep(sleep_flags); 1153 1146 return error ? error : n; 1154 1147 } 1155 1148 ··· 1165 1158 static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, 1166 1159 const char *buf, size_t n) 1167 1160 { 1168 - dev_t res; 1161 + unsigned int sleep_flags; 1169 1162 int len = n; 1170 1163 char *name; 1164 + dev_t res; 1171 1165 1172 1166 if (len && buf[len-1] == '\n') 1173 1167 len--; ··· 1181 1173 if (!res) 1182 1174 return -EINVAL; 1183 1175 1184 - lock_system_sleep(); 1176 + sleep_flags = lock_system_sleep(); 1185 1177 swsusp_resume_device = res; 1186 - unlock_system_sleep(); 1178 + unlock_system_sleep(sleep_flags); 1179 + 1187 1180 pm_pr_dbg("Configured hibernation resume from disk to %u\n", 1188 1181 swsusp_resume_device); 1189 1182 noresume = 0;
+11 -7
kernel/power/main.c
··· 21 21 22 22 #ifdef CONFIG_PM_SLEEP 23 23 24 - void lock_system_sleep(void) 24 + unsigned int lock_system_sleep(void) 25 25 { 26 - current->flags |= PF_FREEZER_SKIP; 26 + unsigned int flags = current->flags; 27 + current->flags |= PF_NOFREEZE; 27 28 mutex_lock(&system_transition_mutex); 29 + return flags; 28 30 } 29 31 EXPORT_SYMBOL_GPL(lock_system_sleep); 30 32 31 - void unlock_system_sleep(void) 33 + void unlock_system_sleep(unsigned int flags) 32 34 { 33 35 /* 34 36 * Don't use freezer_count() because we don't want the call to ··· 48 46 * Which means, if we use try_to_freeze() here, it would make them 49 47 * enter the refrigerator, thus causing hibernation to lockup. 50 48 */ 51 - current->flags &= ~PF_FREEZER_SKIP; 49 + if (!(flags & PF_NOFREEZE)) 50 + current->flags &= ~PF_NOFREEZE; 52 51 mutex_unlock(&system_transition_mutex); 53 52 } 54 53 EXPORT_SYMBOL_GPL(unlock_system_sleep); ··· 266 263 static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, 267 264 const char *buf, size_t n) 268 265 { 266 + unsigned int sleep_flags; 269 267 const char * const *s; 268 + int error = -EINVAL; 270 269 int level; 271 270 char *p; 272 271 int len; 273 - int error = -EINVAL; 274 272 275 273 p = memchr(buf, '\n', n); 276 274 len = p ? p - buf : n; 277 275 278 - lock_system_sleep(); 276 + sleep_flags = lock_system_sleep(); 279 277 280 278 level = TEST_FIRST; 281 279 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) ··· 286 282 break; 287 283 } 288 284 289 - unlock_system_sleep(); 285 + unlock_system_sleep(sleep_flags); 290 286 291 287 return error ? error : n; 292 288 }
+4 -6
kernel/power/process.c
··· 50 50 if (p == current || !freeze_task(p)) 51 51 continue; 52 52 53 - if (!freezer_should_skip(p)) 54 - todo++; 53 + todo++; 55 54 } 56 55 read_unlock(&tasklist_lock); 57 56 ··· 95 96 if (!wakeup || pm_debug_messages_on) { 96 97 read_lock(&tasklist_lock); 97 98 for_each_process_thread(g, p) { 98 - if (p != current && !freezer_should_skip(p) 99 - && freezing(p) && !frozen(p)) 99 + if (p != current && freezing(p) && !frozen(p)) 100 100 sched_show_task(p); 101 101 } 102 102 read_unlock(&tasklist_lock); ··· 127 129 current->flags |= PF_SUSPEND_TASK; 128 130 129 131 if (!pm_freezing) 130 - atomic_inc(&system_freezing_cnt); 132 + static_branch_inc(&freezer_active); 131 133 132 134 pm_wakeup_clear(0); 133 135 pr_info("Freezing user space processes ... "); ··· 188 190 189 191 trace_suspend_resume(TPS("thaw_processes"), 0, true); 190 192 if (pm_freezing) 191 - atomic_dec(&system_freezing_cnt); 193 + static_branch_dec(&freezer_active); 192 194 pm_freezing = false; 193 195 pm_nosig_freezing = false; 194 196
+8 -4
kernel/power/suspend.c
··· 75 75 76 76 void s2idle_set_ops(const struct platform_s2idle_ops *ops) 77 77 { 78 - lock_system_sleep(); 78 + unsigned int sleep_flags; 79 + 80 + sleep_flags = lock_system_sleep(); 79 81 s2idle_ops = ops; 80 - unlock_system_sleep(); 82 + unlock_system_sleep(sleep_flags); 81 83 } 82 84 83 85 static void s2idle_begin(void) ··· 205 203 */ 206 204 void suspend_set_ops(const struct platform_suspend_ops *ops) 207 205 { 208 - lock_system_sleep(); 206 + unsigned int sleep_flags; 207 + 208 + sleep_flags = lock_system_sleep(); 209 209 210 210 suspend_ops = ops; 211 211 ··· 223 219 mem_sleep_current = PM_SUSPEND_MEM; 224 220 } 225 221 226 - unlock_system_sleep(); 222 + unlock_system_sleep(sleep_flags); 227 223 } 228 224 EXPORT_SYMBOL_GPL(suspend_set_ops); 229 225
+16 -12
kernel/power/user.c
··· 47 47 static int snapshot_open(struct inode *inode, struct file *filp) 48 48 { 49 49 struct snapshot_data *data; 50 + unsigned int sleep_flags; 50 51 int error; 51 52 52 53 if (!hibernation_available()) 53 54 return -EPERM; 54 55 55 - lock_system_sleep(); 56 + sleep_flags = lock_system_sleep(); 56 57 57 58 if (!hibernate_acquire()) { 58 59 error = -EBUSY; ··· 99 98 data->dev = 0; 100 99 101 100 Unlock: 102 - unlock_system_sleep(); 101 + unlock_system_sleep(sleep_flags); 103 102 104 103 return error; 105 104 } ··· 107 106 static int snapshot_release(struct inode *inode, struct file *filp) 108 107 { 109 108 struct snapshot_data *data; 109 + unsigned int sleep_flags; 110 110 111 - lock_system_sleep(); 111 + sleep_flags = lock_system_sleep(); 112 112 113 113 swsusp_free(); 114 114 data = filp->private_data; ··· 126 124 PM_POST_HIBERNATION : PM_POST_RESTORE); 127 125 hibernate_release(); 128 126 129 - unlock_system_sleep(); 127 + unlock_system_sleep(sleep_flags); 130 128 131 129 return 0; 132 130 } ··· 134 132 static ssize_t snapshot_read(struct file *filp, char __user *buf, 135 133 size_t count, loff_t *offp) 136 134 { 137 - struct snapshot_data *data; 138 - ssize_t res; 139 135 loff_t pg_offp = *offp & ~PAGE_MASK; 136 + struct snapshot_data *data; 137 + unsigned int sleep_flags; 138 + ssize_t res; 140 139 141 - lock_system_sleep(); 140 + sleep_flags = lock_system_sleep(); 142 141 143 142 data = filp->private_data; 144 143 if (!data->ready) { ··· 160 157 *offp += res; 161 158 162 159 Unlock: 163 - unlock_system_sleep(); 160 + unlock_system_sleep(sleep_flags); 164 161 165 162 return res; 166 163 } ··· 168 165 static ssize_t snapshot_write(struct file *filp, const char __user *buf, 169 166 size_t count, loff_t *offp) 170 167 { 171 - struct snapshot_data *data; 172 - ssize_t res; 173 168 loff_t pg_offp = *offp & ~PAGE_MASK; 169 + struct snapshot_data *data; 170 + unsigned long sleep_flags; 171 + ssize_t res; 174 172 175 173 if (need_wait) { 176 174 
wait_for_device_probe(); 177 175 need_wait = false; 178 176 } 179 177 180 - lock_system_sleep(); 178 + sleep_flags = lock_system_sleep(); 181 179 182 180 data = filp->private_data; 183 181 ··· 200 196 if (res > 0) 201 197 *offp += res; 202 198 unlock: 203 - unlock_system_sleep(); 199 + unlock_system_sleep(sleep_flags); 204 200 205 201 return res; 206 202 }
+1 -1
kernel/ptrace.c
··· 269 269 read_unlock(&tasklist_lock); 270 270 271 271 if (!ret && !ignore_state && 272 - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) 272 + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) 273 273 ret = -ESRCH; 274 274 275 275 return ret;
+2 -1
kernel/sched/autogroup.c
··· 161 161 struct task_struct *t; 162 162 unsigned long flags; 163 163 164 - BUG_ON(!lock_task_sighand(p, &flags)); 164 + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) 165 + return; 165 166 166 167 prev = p->signal->autogroup; 167 168 if (prev == ag) {
+12
kernel/sched/completion.c
··· 204 204 int __sched wait_for_completion_interruptible(struct completion *x) 205 205 { 206 206 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 207 + 207 208 if (t == -ERESTARTSYS) 208 209 return t; 209 210 return 0; ··· 242 241 int __sched wait_for_completion_killable(struct completion *x) 243 242 { 244 243 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 244 + 245 245 if (t == -ERESTARTSYS) 246 246 return t; 247 247 return 0; 248 248 } 249 249 EXPORT_SYMBOL(wait_for_completion_killable); 250 + 251 + int __sched wait_for_completion_state(struct completion *x, unsigned int state) 252 + { 253 + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); 254 + 255 + if (t == -ERESTARTSYS) 256 + return t; 257 + return 0; 258 + } 259 + EXPORT_SYMBOL(wait_for_completion_state); 250 260 251 261 /** 252 262 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
+27 -76
kernel/sched/core.c
··· 143 143 * Number of tasks to iterate in a single balance run. 144 144 * Limited because this is done with IRQs disabled. 145 145 */ 146 - #ifdef CONFIG_PREEMPT_RT 147 - const_debug unsigned int sysctl_sched_nr_migrate = 8; 148 - #else 149 - const_debug unsigned int sysctl_sched_nr_migrate = 32; 150 - #endif 146 + const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; 151 147 152 148 __read_mostly int scheduler_running; 153 149 ··· 478 482 * p->se.load, p->rt_priority, 479 483 * p->dl.dl_{runtime, deadline, period, flags, bw, density} 480 484 * - sched_setnuma(): p->numa_preferred_nid 481 - * - sched_move_task()/ 482 - * cpu_cgroup_fork(): p->sched_task_group 485 + * - sched_move_task(): p->sched_task_group 483 486 * - uclamp_update_active() p->uclamp* 484 487 * 485 488 * p->state <- TASK_*: ··· 2324 2329 rq = cpu_rq(new_cpu); 2325 2330 2326 2331 rq_lock(rq, rf); 2327 - BUG_ON(task_cpu(p) != new_cpu); 2332 + WARN_ON_ONCE(task_cpu(p) != new_cpu); 2328 2333 activate_task(rq, p, 0); 2329 2334 check_preempt_curr(rq, p, 0); 2330 2335 ··· 2774 2779 return -EINVAL; 2775 2780 } 2776 2781 2777 - if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { 2782 + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { 2778 2783 /* 2779 2784 * MIGRATE_ENABLE gets here because 'p == current', but for 2780 2785 * anything else we cannot do is_migration_disabled(), punt ··· 3250 3255 /* 3251 3256 * wait_task_inactive - wait for a thread to unschedule. 3252 3257 * 3253 - * If @match_state is nonzero, it's the @p->state value just checked and 3254 - * not expected to change. If it changes, i.e. @p might have woken up, 3255 - * then return zero. When we succeed in waiting for @p to be off its CPU, 3256 - * we return a positive number (its total switch count). If a second call 3257 - * a short while later returns the same number, the caller can be sure that 3258 - * @p has remained unscheduled the whole time. 
3258 + * Wait for the thread to block in any of the states set in @match_state. 3259 + * If it changes, i.e. @p might have woken up, then return zero. When we 3260 + * succeed in waiting for @p to be off its CPU, we return a positive number 3261 + * (its total switch count). If a second call a short while later returns the 3262 + * same number, the caller can be sure that @p has remained unscheduled the 3263 + * whole time. 3259 3264 * 3260 3265 * The caller must ensure that the task *will* unschedule sometime soon, 3261 3266 * else this function might spin for a *long* time. This function can't ··· 3286 3291 * 3287 3292 * NOTE! Since we don't hold any locks, it's not 3288 3293 * even sure that "rq" stays as the right runqueue! 3289 - * But we don't care, since "task_running()" will 3294 + * But we don't care, since "task_on_cpu()" will 3290 3295 * return false if the runqueue has changed and p 3291 3296 * is actually now running somewhere else! 3292 3297 */ 3293 - while (task_running(rq, p)) { 3294 - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) 3298 + while (task_on_cpu(rq, p)) { 3299 + if (!(READ_ONCE(p->__state) & match_state)) 3295 3300 return 0; 3296 3301 cpu_relax(); 3297 3302 } ··· 3303 3308 */ 3304 3309 rq = task_rq_lock(p, &rf); 3305 3310 trace_sched_wait_task(p); 3306 - running = task_running(rq, p); 3311 + running = task_on_cpu(rq, p); 3307 3312 queued = task_on_rq_queued(p); 3308 3313 ncsw = 0; 3309 - if (!match_state || READ_ONCE(p->__state) == match_state) 3314 + if (READ_ONCE(p->__state) & match_state) 3310 3315 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 3311 3316 task_rq_unlock(rq, p, &rf); 3312 3317 ··· 6425 6430 prev->sched_contributes_to_load = 6426 6431 (prev_state & TASK_UNINTERRUPTIBLE) && 6427 6432 !(prev_state & TASK_NOLOAD) && 6428 - !(prev->flags & PF_FROZEN); 6433 + !(prev_state & TASK_FROZEN); 6429 6434 6430 6435 if (prev->sched_contributes_to_load) 6431 6436 rq->nr_uninterruptible++; ··· 8645 8650 if 
(curr->sched_class != p->sched_class) 8646 8651 goto out_unlock; 8647 8652 8648 - if (task_running(p_rq, p) || !task_is_running(p)) 8653 + if (task_on_cpu(p_rq, p) || !task_is_running(p)) 8649 8654 goto out_unlock; 8650 8655 8651 8656 yielded = curr->sched_class->yield_to_task(rq, p); ··· 8857 8862 if (pid_alive(p)) 8858 8863 ppid = task_pid_nr(rcu_dereference(p->real_parent)); 8859 8864 rcu_read_unlock(); 8860 - pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", 8865 + pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", 8861 8866 free, task_pid_nr(p), ppid, 8862 8867 read_task_thread_flags(p)); 8863 8868 ··· 8885 8890 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows 8886 8891 * TASK_KILLABLE). 8887 8892 */ 8888 - if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) 8893 + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) 8889 8894 return false; 8890 8895 8891 8896 return true; ··· 9597 9602 static struct kmem_cache *task_group_cache __read_mostly; 9598 9603 #endif 9599 9604 9600 - DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 9601 - DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); 9602 - 9603 9605 void __init sched_init(void) 9604 9606 { 9605 9607 unsigned long ptr = 0; ··· 9640 9648 9641 9649 #endif /* CONFIG_RT_GROUP_SCHED */ 9642 9650 } 9643 - #ifdef CONFIG_CPUMASK_OFFSTACK 9644 - for_each_possible_cpu(i) { 9645 - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 9646 - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 9647 - per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( 9648 - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 9649 - } 9650 - #endif /* CONFIG_CPUMASK_OFFSTACK */ 9651 9651 9652 9652 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); 9653 9653 ··· 10148 10164 spin_unlock_irqrestore(&task_group_lock, flags); 10149 10165 } 10150 10166 10151 - static void sched_change_group(struct task_struct *tsk, int type) 10167 + static void 
sched_change_group(struct task_struct *tsk) 10152 10168 { 10153 10169 struct task_group *tg; 10154 10170 ··· 10164 10180 10165 10181 #ifdef CONFIG_FAIR_GROUP_SCHED 10166 10182 if (tsk->sched_class->task_change_group) 10167 - tsk->sched_class->task_change_group(tsk, type); 10183 + tsk->sched_class->task_change_group(tsk); 10168 10184 else 10169 10185 #endif 10170 10186 set_task_rq(tsk, task_cpu(tsk)); ··· 10195 10211 if (running) 10196 10212 put_prev_task(rq, tsk); 10197 10213 10198 - sched_change_group(tsk, TASK_MOVE_GROUP); 10214 + sched_change_group(tsk); 10199 10215 10200 10216 if (queued) 10201 10217 enqueue_task(rq, tsk, queue_flags); ··· 10273 10289 sched_unregister_group(tg); 10274 10290 } 10275 10291 10276 - /* 10277 - * This is called before wake_up_new_task(), therefore we really only 10278 - * have to set its group bits, all the other stuff does not apply. 10279 - */ 10280 - static void cpu_cgroup_fork(struct task_struct *task) 10281 - { 10282 - struct rq_flags rf; 10283 - struct rq *rq; 10284 - 10285 - rq = task_rq_lock(task, &rf); 10286 - 10287 - update_rq_clock(rq); 10288 - sched_change_group(task, TASK_SET_GROUP); 10289 - 10290 - task_rq_unlock(rq, task, &rf); 10291 - } 10292 - 10292 + #ifdef CONFIG_RT_GROUP_SCHED 10293 10293 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) 10294 10294 { 10295 10295 struct task_struct *task; 10296 10296 struct cgroup_subsys_state *css; 10297 - int ret = 0; 10298 10297 10299 10298 cgroup_taskset_for_each(task, css, tset) { 10300 - #ifdef CONFIG_RT_GROUP_SCHED 10301 10299 if (!sched_rt_can_attach(css_tg(css), task)) 10302 10300 return -EINVAL; 10303 - #endif 10304 - /* 10305 - * Serialize against wake_up_new_task() such that if it's 10306 - * running, we're sure to observe its full state. 10307 - */ 10308 - raw_spin_lock_irq(&task->pi_lock); 10309 - /* 10310 - * Avoid calling sched_move_task() before wake_up_new_task() 10311 - * has happened. 
This would lead to problems with PELT, due to 10312 - * move wanting to detach+attach while we're not attached yet. 10313 - */ 10314 - if (READ_ONCE(task->__state) == TASK_NEW) 10315 - ret = -EINVAL; 10316 - raw_spin_unlock_irq(&task->pi_lock); 10317 - 10318 - if (ret) 10319 - break; 10320 10301 } 10321 - return ret; 10302 + return 0; 10322 10303 } 10304 + #endif 10323 10305 10324 10306 static void cpu_cgroup_attach(struct cgroup_taskset *tset) 10325 10307 { ··· 11121 11171 .css_released = cpu_cgroup_css_released, 11122 11172 .css_free = cpu_cgroup_css_free, 11123 11173 .css_extra_stat_show = cpu_extra_stat_show, 11124 - .fork = cpu_cgroup_fork, 11174 + #ifdef CONFIG_RT_GROUP_SCHED 11125 11175 .can_attach = cpu_cgroup_can_attach, 11176 + #endif 11126 11177 .attach = cpu_cgroup_attach, 11127 11178 .legacy_cftypes = cpu_legacy_files, 11128 11179 .dfl_cftypes = cpu_files,
+2 -2
kernel/sched/core_sched.c
··· 88 88 * core has now entered/left forced idle state. Defer accounting to the 89 89 * next scheduling edge, rather than always forcing a reschedule here. 90 90 */ 91 - if (task_running(rq, p)) 91 + if (task_on_cpu(rq, p)) 92 92 resched_curr(rq); 93 93 94 94 task_rq_unlock(rq, p, &rf); ··· 205 205 default: 206 206 err = -EINVAL; 207 207 goto out; 208 - }; 208 + } 209 209 210 210 if (type == PIDTYPE_PID) { 211 211 __sched_core_set(task, cookie);
+1 -1
kernel/sched/cpudeadline.c
··· 123 123 unsigned long cap, max_cap = 0; 124 124 int cpu, max_cpu = -1; 125 125 126 - if (!static_branch_unlikely(&sched_asym_cpucapacity)) 126 + if (!sched_asym_cpucap_active()) 127 127 return 1; 128 128 129 129 /* Ensure the capacity of the CPUs fits the task. */
+1 -1
kernel/sched/cpupri.c
··· 147 147 int task_pri = convert_prio(p->prio); 148 148 int idx, cpu; 149 149 150 - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); 150 + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); 151 151 152 152 for (idx = 0; idx < task_pri; idx++) { 153 153
+56 -59
kernel/sched/deadline.c
··· 124 124 return cpus; 125 125 } 126 126 127 - static inline unsigned long __dl_bw_capacity(int i) 127 + static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) 128 128 { 129 - struct root_domain *rd = cpu_rq(i)->rd; 130 129 unsigned long cap = 0; 130 + int i; 131 131 132 - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 133 - "sched RCU must be held"); 134 - 135 - for_each_cpu_and(i, rd->span, cpu_active_mask) 132 + for_each_cpu_and(i, mask, cpu_active_mask) 136 133 cap += capacity_orig_of(i); 137 134 138 135 return cap; ··· 141 144 */ 142 145 static inline unsigned long dl_bw_capacity(int i) 143 146 { 144 - if (!static_branch_unlikely(&sched_asym_cpucapacity) && 147 + if (!sched_asym_cpucap_active() && 145 148 capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { 146 149 return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; 147 150 } else { 148 - return __dl_bw_capacity(i); 151 + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), 152 + "sched RCU must be held"); 153 + 154 + return __dl_bw_capacity(cpu_rq(i)->rd->span); 149 155 } 150 156 } 151 157 ··· 310 310 { 311 311 struct rq *rq; 312 312 313 - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); 313 + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); 314 314 315 315 if (task_on_rq_queued(p)) 316 316 return; ··· 431 431 sub_rq_bw(&p->dl, &rq->dl); 432 432 raw_spin_lock(&dl_b->lock); 433 433 __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); 434 - __dl_clear_params(p); 435 434 raw_spin_unlock(&dl_b->lock); 435 + __dl_clear_params(p); 436 436 } 437 437 438 438 return; ··· 607 607 { 608 608 struct rb_node *leftmost; 609 609 610 - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); 610 + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); 611 611 612 612 leftmost = rb_add_cached(&p->pushable_dl_tasks, 613 613 &rq->dl.pushable_dl_tasks_root, ··· 684 684 * Failed to find any suitable CPU. 685 685 * The task will never come back! 
686 686 */ 687 - BUG_ON(dl_bandwidth_enabled()); 687 + WARN_ON_ONCE(dl_bandwidth_enabled()); 688 688 689 689 /* 690 690 * If admission control is disabled we ··· 770 770 static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 771 771 static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); 772 772 773 + static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, 774 + struct rq *rq) 775 + { 776 + /* for non-boosted task, pi_of(dl_se) == dl_se */ 777 + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 778 + dl_se->runtime = pi_of(dl_se)->dl_runtime; 779 + } 780 + 773 781 /* 774 782 * We are being explicitly informed that a new instance is starting, 775 783 * and this means that: ··· 811 803 * future; in fact, we must consider execution overheads (time 812 804 * spent on hardirq context, etc.). 813 805 */ 814 - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; 815 - dl_se->runtime = dl_se->dl_runtime; 806 + replenish_dl_new_period(dl_se, rq); 816 807 } 817 808 818 809 /* ··· 837 830 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 838 831 struct rq *rq = rq_of_dl_rq(dl_rq); 839 832 840 - BUG_ON(pi_of(dl_se)->dl_runtime <= 0); 833 + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); 841 834 842 835 /* 843 836 * This could be the case for a !-dl task that is boosted. 844 837 * Just go with full inherited parameters. 
845 838 */ 846 - if (dl_se->dl_deadline == 0) { 847 - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 848 - dl_se->runtime = pi_of(dl_se)->dl_runtime; 849 - } 839 + if (dl_se->dl_deadline == 0) 840 + replenish_dl_new_period(dl_se, rq); 850 841 851 842 if (dl_se->dl_yielded && dl_se->runtime > 0) 852 843 dl_se->runtime = 0; ··· 871 866 */ 872 867 if (dl_time_before(dl_se->deadline, rq_clock(rq))) { 873 868 printk_deferred_once("sched: DL replenish lagged too much\n"); 874 - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 875 - dl_se->runtime = pi_of(dl_se)->dl_runtime; 869 + replenish_dl_new_period(dl_se, rq); 876 870 } 877 871 878 872 if (dl_se->dl_yielded) ··· 1028 1024 return; 1029 1025 } 1030 1026 1031 - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; 1032 - dl_se->runtime = pi_of(dl_se)->dl_runtime; 1027 + replenish_dl_new_period(dl_se, rq); 1033 1028 } 1034 1029 } 1035 1030 ··· 1336 1333 1337 1334 trace_sched_stat_runtime(curr, delta_exec, 0); 1338 1335 1339 - curr->se.sum_exec_runtime += delta_exec; 1340 - account_group_exec_runtime(curr, delta_exec); 1341 - 1342 - curr->se.exec_start = now; 1343 - cgroup_account_cputime(curr, delta_exec); 1336 + update_current_exec_runtime(curr, now, delta_exec); 1344 1337 1345 1338 if (dl_entity_is_special(dl_se)) 1346 1339 return; ··· 1615 1616 { 1616 1617 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 1617 1618 1618 - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); 1619 + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); 1619 1620 1620 1621 rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); 1621 1622 ··· 1639 1640 static void 1640 1641 enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) 1641 1642 { 1642 - BUG_ON(on_dl_rq(dl_se)); 1643 + WARN_ON_ONCE(on_dl_rq(dl_se)); 1643 1644 1644 1645 update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); 1645 1646 ··· 1813 1814 1814 1815 #ifdef CONFIG_SMP 1815 1816 1817 + static inline bool dl_task_is_earliest_deadline(struct task_struct *p, 1818 
+ struct rq *rq) 1819 + { 1820 + return (!rq->dl.dl_nr_running || 1821 + dl_time_before(p->dl.deadline, 1822 + rq->dl.earliest_dl.curr)); 1823 + } 1824 + 1816 1825 static int find_later_rq(struct task_struct *task); 1817 1826 1818 1827 static int ··· 1856 1849 * Take the capacity of the CPU into account to 1857 1850 * ensure it fits the requirement of the task. 1858 1851 */ 1859 - if (static_branch_unlikely(&sched_asym_cpucapacity)) 1852 + if (sched_asym_cpucap_active()) 1860 1853 select_rq |= !dl_task_fits_capacity(p, cpu); 1861 1854 1862 1855 if (select_rq) { 1863 1856 int target = find_later_rq(p); 1864 1857 1865 1858 if (target != -1 && 1866 - (dl_time_before(p->dl.deadline, 1867 - cpu_rq(target)->dl.earliest_dl.curr) || 1868 - (cpu_rq(target)->dl.dl_nr_running == 0))) 1859 + dl_task_is_earliest_deadline(p, cpu_rq(target))) 1869 1860 cpu = target; 1870 1861 } 1871 1862 rcu_read_unlock(); ··· 2022 2017 return NULL; 2023 2018 2024 2019 dl_se = pick_next_dl_entity(dl_rq); 2025 - BUG_ON(!dl_se); 2020 + WARN_ON_ONCE(!dl_se); 2026 2021 p = dl_task_of(dl_se); 2027 2022 2028 2023 return p; ··· 2092 2087 2093 2088 static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 2094 2089 { 2095 - if (!task_running(rq, p) && 2090 + if (!task_on_cpu(rq, p) && 2096 2091 cpumask_test_cpu(cpu, &p->cpus_mask)) 2097 2092 return 1; 2098 2093 return 0; ··· 2230 2225 2231 2226 later_rq = cpu_rq(cpu); 2232 2227 2233 - if (later_rq->dl.dl_nr_running && 2234 - !dl_time_before(task->dl.deadline, 2235 - later_rq->dl.earliest_dl.curr)) { 2228 + if (!dl_task_is_earliest_deadline(task, later_rq)) { 2236 2229 /* 2237 2230 * Target rq has tasks of equal or earlier deadline, 2238 2231 * retrying does not release any lock and is unlikely ··· 2244 2241 if (double_lock_balance(rq, later_rq)) { 2245 2242 if (unlikely(task_rq(task) != rq || 2246 2243 !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || 2247 - task_running(rq, task) || 2244 + task_on_cpu(rq, task) || 2248 2245 
!dl_task(task) || 2249 2246 !task_on_rq_queued(task))) { 2250 2247 double_unlock_balance(rq, later_rq); ··· 2258 2255 * its earliest one has a later deadline than our 2259 2256 * task, the rq is a good one. 2260 2257 */ 2261 - if (!later_rq->dl.dl_nr_running || 2262 - dl_time_before(task->dl.deadline, 2263 - later_rq->dl.earliest_dl.curr)) 2258 + if (dl_task_is_earliest_deadline(task, later_rq)) 2264 2259 break; 2265 2260 2266 2261 /* Otherwise we try again. */ ··· 2278 2277 2279 2278 p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); 2280 2279 2281 - BUG_ON(rq->cpu != task_cpu(p)); 2282 - BUG_ON(task_current(rq, p)); 2283 - BUG_ON(p->nr_cpus_allowed <= 1); 2280 + WARN_ON_ONCE(rq->cpu != task_cpu(p)); 2281 + WARN_ON_ONCE(task_current(rq, p)); 2282 + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); 2284 2283 2285 - BUG_ON(!task_on_rq_queued(p)); 2286 - BUG_ON(!dl_task(p)); 2284 + WARN_ON_ONCE(!task_on_rq_queued(p)); 2285 + WARN_ON_ONCE(!dl_task(p)); 2287 2286 2288 2287 return p; 2289 2288 } ··· 2429 2428 * - it will preempt the last one we pulled (if any). 
2430 2429 */ 2431 2430 if (p && dl_time_before(p->dl.deadline, dmin) && 2432 - (!this_rq->dl.dl_nr_running || 2433 - dl_time_before(p->dl.deadline, 2434 - this_rq->dl.earliest_dl.curr))) { 2431 + dl_task_is_earliest_deadline(p, this_rq)) { 2435 2432 WARN_ON(p == src_rq->curr); 2436 2433 WARN_ON(!task_on_rq_queued(p)); 2437 2434 ··· 2474 2475 */ 2475 2476 static void task_woken_dl(struct rq *rq, struct task_struct *p) 2476 2477 { 2477 - if (!task_running(rq, p) && 2478 + if (!task_on_cpu(rq, p) && 2478 2479 !test_tsk_need_resched(rq->curr) && 2479 2480 p->nr_cpus_allowed > 1 && 2480 2481 dl_task(rq->curr) && ··· 2491 2492 struct root_domain *src_rd; 2492 2493 struct rq *rq; 2493 2494 2494 - BUG_ON(!dl_task(p)); 2495 + WARN_ON_ONCE(!dl_task(p)); 2495 2496 2496 2497 rq = task_rq(p); 2497 2498 src_rd = rq->rd; ··· 3006 3007 int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, 3007 3008 const struct cpumask *trial) 3008 3009 { 3009 - int ret = 1, trial_cpus; 3010 + unsigned long flags, cap; 3010 3011 struct dl_bw *cur_dl_b; 3011 - unsigned long flags; 3012 + int ret = 1; 3012 3013 3013 3014 rcu_read_lock_sched(); 3014 3015 cur_dl_b = dl_bw_of(cpumask_any(cur)); 3015 - trial_cpus = cpumask_weight(trial); 3016 - 3016 + cap = __dl_bw_capacity(trial); 3017 3017 raw_spin_lock_irqsave(&cur_dl_b->lock, flags); 3018 - if (cur_dl_b->bw != -1 && 3019 - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) 3018 + if (__dl_overflow(cur_dl_b, cap, 0, 0)) 3020 3019 ret = 0; 3021 3020 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 3022 3021 rcu_read_unlock_sched();
+111 -116
kernel/sched/fair.c
··· 799 799 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ 800 800 } 801 801 802 - static void attach_entity_cfs_rq(struct sched_entity *se); 803 - 804 802 /* 805 803 * With new tasks being created, their initial util_avgs are extrapolated 806 804 * based on the cfs_rq's current util_avg: ··· 833 835 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); 834 836 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 835 837 836 - if (cap > 0) { 837 - if (cfs_rq->avg.util_avg != 0) { 838 - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; 839 - sa->util_avg /= (cfs_rq->avg.load_avg + 1); 840 - 841 - if (sa->util_avg > cap) 842 - sa->util_avg = cap; 843 - } else { 844 - sa->util_avg = cap; 845 - } 846 - } 847 - 848 - sa->runnable_avg = sa->util_avg; 849 - 850 838 if (p->sched_class != &fair_sched_class) { 851 839 /* 852 840 * For !fair tasks do: ··· 848 864 return; 849 865 } 850 866 851 - attach_entity_cfs_rq(se); 867 + if (cap > 0) { 868 + if (cfs_rq->avg.util_avg != 0) { 869 + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; 870 + sa->util_avg /= (cfs_rq->avg.load_avg + 1); 871 + 872 + if (sa->util_avg > cap) 873 + sa->util_avg = cap; 874 + } else { 875 + sa->util_avg = cap; 876 + } 877 + } 878 + 879 + sa->runnable_avg = sa->util_avg; 852 880 } 853 881 854 882 #else /* !CONFIG_SMP */ ··· 1588 1592 1589 1593 #ifdef CONFIG_SCHED_SMT 1590 1594 /* Forward declarations of select_idle_sibling helpers */ 1591 - static inline bool test_idle_cores(int cpu, bool def); 1595 + static inline bool test_idle_cores(int cpu); 1592 1596 static inline int numa_idle_core(int idle_core, int cpu) 1593 1597 { 1594 1598 if (!static_branch_likely(&sched_smt_present) || 1595 - idle_core >= 0 || !test_idle_cores(cpu, false)) 1599 + idle_core >= 0 || !test_idle_cores(cpu)) 1596 1600 return idle_core; 1597 1601 1598 1602 /* ··· 2596 2600 if (!join) 2597 2601 return; 2598 2602 2599 - BUG_ON(irqs_disabled()); 2603 + 
WARN_ON_ONCE(irqs_disabled()); 2600 2604 double_lock_irq(&my_grp->lock, &grp->lock); 2601 2605 2602 2606 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { ··· 3834 3838 * @cfs_rq: cfs_rq to update 3835 3839 * 3836 3840 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) 3837 - * avg. The immediate corollary is that all (fair) tasks must be attached, see 3838 - * post_init_entity_util_avg(). 3841 + * avg. The immediate corollary is that all (fair) tasks must be attached. 3839 3842 * 3840 3843 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3841 3844 * ··· 3998 4003 #define UPDATE_TG 0x1 3999 4004 #define SKIP_AGE_LOAD 0x2 4000 4005 #define DO_ATTACH 0x4 4006 + #define DO_DETACH 0x8 4001 4007 4002 4008 /* Update task and its cfs_rq load average */ 4003 4009 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ··· 4028 4032 attach_entity_load_avg(cfs_rq, se); 4029 4033 update_tg_load_avg(cfs_rq); 4030 4034 4035 + } else if (flags & DO_DETACH) { 4036 + /* 4037 + * DO_DETACH means we're here from dequeue_entity() 4038 + * and we are migrating task out of the CPU. 4039 + */ 4040 + detach_entity_load_avg(cfs_rq, se); 4041 + update_tg_load_avg(cfs_rq); 4031 4042 } else if (decayed) { 4032 4043 cfs_rq_util_change(cfs_rq, 0); 4033 4044 ··· 4067 4064 4068 4065 /* 4069 4066 * tasks cannot exit without having gone through wake_up_new_task() -> 4070 - * post_init_entity_util_avg() which will have added things to the 4071 - * cfs_rq, so we can remove unconditionally. 4067 + * enqueue_task_fair() which will have added things to the cfs_rq, 4068 + * so we can remove unconditionally. 
4072 4069 */ 4073 4070 4074 4071 sync_entity_load_avg(se); ··· 4265 4262 4266 4263 static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 4267 4264 { 4268 - if (!static_branch_unlikely(&sched_asym_cpucapacity)) 4265 + if (!sched_asym_cpucap_active()) 4269 4266 return; 4270 4267 4271 4268 if (!p || p->nr_cpus_allowed == 1) { ··· 4295 4292 #define UPDATE_TG 0x0 4296 4293 #define SKIP_AGE_LOAD 0x0 4297 4294 #define DO_ATTACH 0x0 4295 + #define DO_DETACH 0x0 4298 4296 4299 4297 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) 4300 4298 { ··· 4438 4434 /* 4439 4435 * When enqueuing a sched_entity, we must: 4440 4436 * - Update loads to have both entity and cfs_rq synced with now. 4441 - * - Add its load to cfs_rq->runnable_avg 4437 + * - For group_entity, update its runnable_weight to reflect the new 4438 + * h_nr_running of its group cfs_rq. 4442 4439 * - For group_entity, update its weight to reflect the new share of 4443 4440 * its group cfs_rq 4444 4441 * - Add its new weight to cfs_rq->load.weight ··· 4516 4511 static void 4517 4512 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 4518 4513 { 4514 + int action = UPDATE_TG; 4515 + 4516 + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) 4517 + action |= DO_DETACH; 4518 + 4519 4519 /* 4520 4520 * Update run-time statistics of the 'current'. 4521 4521 */ ··· 4529 4519 /* 4530 4520 * When dequeuing a sched_entity, we must: 4531 4521 * - Update loads to have both entity and cfs_rq synced with now. 4532 - * - Subtract its load from the cfs_rq->runnable_avg. 4522 + * - For group_entity, update its runnable_weight to reflect the new 4523 + * h_nr_running of its group cfs_rq. 4533 4524 * - Subtract its previous weight from cfs_rq->load.weight. 4534 4525 * - For group entity, update its weight to reflect the new share 4535 4526 * of its group cfs_rq. 
4536 4527 */ 4537 - update_load_avg(cfs_rq, se, UPDATE_TG); 4528 + update_load_avg(cfs_rq, se, action); 4538 4529 se_update_runnable(se); 4539 4530 4540 4531 update_stats_dequeue_fair(cfs_rq, se, flags); ··· 5904 5893 #ifdef CONFIG_SMP 5905 5894 5906 5895 /* Working cpumask for: load_balance, load_balance_newidle. */ 5907 - DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 5908 - DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 5896 + static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); 5897 + static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); 5909 5898 5910 5899 #ifdef CONFIG_NO_HZ_COMMON 5911 5900 ··· 6271 6260 WRITE_ONCE(sds->has_idle_cores, val); 6272 6261 } 6273 6262 6274 - static inline bool test_idle_cores(int cpu, bool def) 6263 + static inline bool test_idle_cores(int cpu) 6275 6264 { 6276 6265 struct sched_domain_shared *sds; 6277 6266 ··· 6279 6268 if (sds) 6280 6269 return READ_ONCE(sds->has_idle_cores); 6281 6270 6282 - return def; 6271 + return false; 6283 6272 } 6284 6273 6285 6274 /* ··· 6295 6284 int cpu; 6296 6285 6297 6286 rcu_read_lock(); 6298 - if (test_idle_cores(core, true)) 6287 + if (test_idle_cores(core)) 6299 6288 goto unlock; 6300 6289 6301 6290 for_each_cpu(cpu, cpu_smt_mask(core)) { ··· 6320 6309 { 6321 6310 bool idle = true; 6322 6311 int cpu; 6323 - 6324 - if (!static_branch_likely(&sched_smt_present)) 6325 - return __select_idle_cpu(core, p); 6326 6312 6327 6313 for_each_cpu(cpu, cpu_smt_mask(core)) { 6328 6314 if (!available_idle_cpu(cpu)) { ··· 6347 6339 /* 6348 6340 * Scan the local SMT mask for idle CPUs. 
6349 6341 */ 6350 - static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6342 + static int select_idle_smt(struct task_struct *p, int target) 6351 6343 { 6352 6344 int cpu; 6353 6345 6354 - for_each_cpu(cpu, cpu_smt_mask(target)) { 6355 - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || 6356 - !cpumask_test_cpu(cpu, sched_domain_span(sd))) 6346 + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { 6347 + if (cpu == target) 6357 6348 continue; 6358 6349 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) 6359 6350 return cpu; ··· 6367 6360 { 6368 6361 } 6369 6362 6370 - static inline bool test_idle_cores(int cpu, bool def) 6363 + static inline bool test_idle_cores(int cpu) 6371 6364 { 6372 - return def; 6365 + return false; 6373 6366 } 6374 6367 6375 6368 static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) ··· 6377 6370 return __select_idle_cpu(core, p); 6378 6371 } 6379 6372 6380 - static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) 6373 + static inline int select_idle_smt(struct task_struct *p, int target) 6381 6374 { 6382 6375 return -1; 6383 6376 } ··· 6396 6389 struct sched_domain_shared *sd_share; 6397 6390 struct rq *this_rq = this_rq(); 6398 6391 int this = smp_processor_id(); 6399 - struct sched_domain *this_sd; 6392 + struct sched_domain *this_sd = NULL; 6400 6393 u64 time = 0; 6401 - 6402 - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 6403 - if (!this_sd) 6404 - return -1; 6405 6394 6406 6395 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); 6407 6396 6408 6397 if (sched_feat(SIS_PROP) && !has_idle_core) { 6409 6398 u64 avg_cost, avg_idle, span_avg; 6410 6399 unsigned long now = jiffies; 6400 + 6401 + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 6402 + if (!this_sd) 6403 + return -1; 6411 6404 6412 6405 /* 6413 6406 * If we're busy, the assumption that the last idle period ··· 6462 6455 if (has_idle_core) 6463 
6456 set_idle_cores(target, false); 6464 6457 6465 - if (sched_feat(SIS_PROP) && !has_idle_core) { 6458 + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { 6466 6459 time = cpu_clock(this) - time; 6467 6460 6468 6461 /* ··· 6513 6506 6514 6507 static inline bool asym_fits_capacity(unsigned long task_util, int cpu) 6515 6508 { 6516 - if (static_branch_unlikely(&sched_asym_cpucapacity)) 6509 + if (sched_asym_cpucap_active()) 6517 6510 return fits_capacity(task_util, capacity_of(cpu)); 6518 6511 6519 6512 return true; ··· 6533 6526 * On asymmetric system, update task utilization because we will check 6534 6527 * that the task fits with cpu's capacity. 6535 6528 */ 6536 - if (static_branch_unlikely(&sched_asym_cpucapacity)) { 6529 + if (sched_asym_cpucap_active()) { 6537 6530 sync_entity_load_avg(&p->se); 6538 6531 task_util = uclamp_task_util(p); 6539 6532 } ··· 6587 6580 * For asymmetric CPU capacity systems, our domain of interest is 6588 6581 * sd_asym_cpucapacity rather than sd_llc. 
6589 6582 */ 6590 - if (static_branch_unlikely(&sched_asym_cpucapacity)) { 6583 + if (sched_asym_cpucap_active()) { 6591 6584 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); 6592 6585 /* 6593 6586 * On an asymmetric CPU capacity system where an exclusive ··· 6608 6601 return target; 6609 6602 6610 6603 if (sched_smt_active()) { 6611 - has_idle_core = test_idle_cores(target, false); 6604 + has_idle_core = test_idle_cores(target); 6612 6605 6613 6606 if (!has_idle_core && cpus_share_cache(prev, target)) { 6614 - i = select_idle_smt(p, sd, prev); 6607 + i = select_idle_smt(p, prev); 6615 6608 if ((unsigned int)i < nr_cpumask_bits) 6616 6609 return i; 6617 6610 } ··· 7083 7076 return new_cpu; 7084 7077 } 7085 7078 7086 - static void detach_entity_cfs_rq(struct sched_entity *se); 7087 - 7088 7079 /* 7089 7080 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and 7090 7081 * cfs_rq_of(p) references at time of call are still valid and identify the ··· 7104 7099 se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); 7105 7100 } 7106 7101 7107 - if (p->on_rq == TASK_ON_RQ_MIGRATING) { 7108 - /* 7109 - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' 7110 - * rq->lock and can modify state directly. 
7111 - */ 7112 - lockdep_assert_rq_held(task_rq(p)); 7113 - detach_entity_cfs_rq(se); 7114 - 7115 - } else { 7102 + if (!task_on_rq_migrating(p)) { 7116 7103 remove_entity_load_avg(se); 7117 7104 7118 7105 /* ··· 7276 7279 return; 7277 7280 7278 7281 find_matching_se(&se, &pse); 7279 - BUG_ON(!pse); 7282 + WARN_ON_ONCE(!pse); 7280 7283 7281 7284 cse_is_idle = se_is_idle(se); 7282 7285 pse_is_idle = se_is_idle(pse); ··· 7935 7938 /* Record that we found at least one task that could run on dst_cpu */ 7936 7939 env->flags &= ~LBF_ALL_PINNED; 7937 7940 7938 - if (task_running(env->src_rq, p)) { 7941 + if (task_on_cpu(env->src_rq, p)) { 7939 7942 schedstat_inc(p->stats.nr_failed_migrations_running); 7940 7943 return 0; 7941 7944 } ··· 8009 8012 return NULL; 8010 8013 } 8011 8014 8012 - static const unsigned int sched_nr_migrate_break = 32; 8013 - 8014 8015 /* 8015 8016 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from 8016 8017 * busiest_rq, as part of a balancing operation within domain "sd". ··· 8044 8049 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) 8045 8050 break; 8046 8051 8047 - p = list_last_entry(tasks, struct task_struct, se.group_node); 8048 - 8049 8052 env->loop++; 8050 - /* We've more or less seen every task there is, call it quits */ 8051 - if (env->loop > env->loop_max) 8053 + /* 8054 + * We've more or less seen every task there is, call it quits 8055 + * unless we haven't found any movable task yet. 
8056 + */ 8057 + if (env->loop > env->loop_max && 8058 + !(env->flags & LBF_ALL_PINNED)) 8052 8059 break; 8053 8060 8054 8061 /* take a breather every nr_migrate tasks */ 8055 8062 if (env->loop > env->loop_break) { 8056 - env->loop_break += sched_nr_migrate_break; 8063 + env->loop_break += SCHED_NR_MIGRATE_BREAK; 8057 8064 env->flags |= LBF_NEED_BREAK; 8058 8065 break; 8059 8066 } 8067 + 8068 + p = list_last_entry(tasks, struct task_struct, se.group_node); 8060 8069 8061 8070 if (!can_migrate_task(p, env)) 8062 8071 goto next; ··· 8158 8159 { 8159 8160 lockdep_assert_rq_held(rq); 8160 8161 8161 - BUG_ON(task_rq(p) != rq); 8162 + WARN_ON_ONCE(task_rq(p) != rq); 8162 8163 activate_task(rq, p, ENQUEUE_NOCLOCK); 8163 8164 check_preempt_curr(rq, p, 0); 8164 8165 } ··· 10098 10099 struct rq *busiest; 10099 10100 struct rq_flags rf; 10100 10101 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); 10101 - 10102 10102 struct lb_env env = { 10103 10103 .sd = sd, 10104 10104 .dst_cpu = this_cpu, 10105 10105 .dst_rq = this_rq, 10106 10106 .dst_grpmask = sched_group_span(sd->groups), 10107 10107 .idle = idle, 10108 - .loop_break = sched_nr_migrate_break, 10108 + .loop_break = SCHED_NR_MIGRATE_BREAK, 10109 10109 .cpus = cpus, 10110 10110 .fbq_type = all, 10111 10111 .tasks = LIST_HEAD_INIT(env.tasks), ··· 10132 10134 goto out_balanced; 10133 10135 } 10134 10136 10135 - BUG_ON(busiest == env.dst_rq); 10137 + WARN_ON_ONCE(busiest == env.dst_rq); 10136 10138 10137 10139 schedstat_add(sd->lb_imbalance[idle], env.imbalance); 10138 10140 ··· 10180 10182 10181 10183 if (env.flags & LBF_NEED_BREAK) { 10182 10184 env.flags &= ~LBF_NEED_BREAK; 10183 - goto more_balance; 10185 + /* Stop if we tried all running tasks */ 10186 + if (env.loop < busiest->nr_running) 10187 + goto more_balance; 10184 10188 } 10185 10189 10186 10190 /* ··· 10213 10213 env.dst_cpu = env.new_dst_cpu; 10214 10214 env.flags &= ~LBF_DST_PINNED; 10215 10215 env.loop = 0; 10216 - env.loop_break = 
sched_nr_migrate_break; 10216 + env.loop_break = SCHED_NR_MIGRATE_BREAK; 10217 10217 10218 10218 /* 10219 10219 * Go back to "more_balance" rather than "redo" since we ··· 10245 10245 */ 10246 10246 if (!cpumask_subset(cpus, env.dst_grpmask)) { 10247 10247 env.loop = 0; 10248 - env.loop_break = sched_nr_migrate_break; 10248 + env.loop_break = SCHED_NR_MIGRATE_BREAK; 10249 10249 goto redo; 10250 10250 } 10251 10251 goto out_all_pinned; ··· 10430 10430 * we need to fix it. Originally reported by 10431 10431 * Bjorn Helgaas on a 128-CPU setup. 10432 10432 */ 10433 - BUG_ON(busiest_rq == target_rq); 10433 + WARN_ON_ONCE(busiest_rq == target_rq); 10434 10434 10435 10435 /* Search for an sd spanning us and the target CPU. */ 10436 10436 rcu_read_lock(); ··· 10916 10916 * can be a simple update of blocked load or a complete load balance with 10917 10917 * tasks movement depending of flags. 10918 10918 */ 10919 - static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, 10920 - enum cpu_idle_type idle) 10919 + static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) 10921 10920 { 10922 10921 /* Earliest time when we have to do rebalance again */ 10923 10922 unsigned long now = jiffies; ··· 11031 11032 if (idle != CPU_IDLE) 11032 11033 return false; 11033 11034 11034 - _nohz_idle_balance(this_rq, flags, idle); 11035 + _nohz_idle_balance(this_rq, flags); 11035 11036 11036 11037 return true; 11037 11038 } ··· 11051 11052 * (ie NOHZ_STATS_KICK set) and will do the same. 
11052 11053 */ 11053 11054 if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) 11054 - _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); 11055 + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); 11055 11056 } 11056 11057 11057 11058 static void nohz_newidle_balance(struct rq *this_rq) ··· 11551 11552 { 11552 11553 struct cfs_rq *cfs_rq = cfs_rq_of(se); 11553 11554 11555 + #ifdef CONFIG_SMP 11556 + /* 11557 + * In case the task sched_avg hasn't been attached: 11558 + * - A forked task which hasn't been woken up by wake_up_new_task(). 11559 + * - A task which has been woken up by try_to_wake_up() but is 11560 + * waiting for actually being woken up by sched_ttwu_pending(). 11561 + */ 11562 + if (!se->avg.last_update_time) 11563 + return; 11564 + #endif 11565 + 11554 11566 /* Catch up with the cfs_rq and remove our load when we leave */ 11555 11567 update_load_avg(cfs_rq, se, 0); 11556 11568 detach_entity_load_avg(cfs_rq, se); ··· 11572 11562 static void attach_entity_cfs_rq(struct sched_entity *se) 11573 11563 { 11574 11564 struct cfs_rq *cfs_rq = cfs_rq_of(se); 11575 - 11576 - #ifdef CONFIG_FAIR_GROUP_SCHED 11577 - /* 11578 - * Since the real-depth could have been changed (only FAIR 11579 - * class maintain depth value), reset depth properly. 11580 - */ 11581 - se->depth = se->parent ? se->parent->depth + 1 : 0; 11582 - #endif 11583 11565 11584 11566 /* Synchronize entity with its cfs_rq */ 11585 11567 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); ··· 11668 11666 } 11669 11667 11670 11668 #ifdef CONFIG_FAIR_GROUP_SCHED 11671 - static void task_set_group_fair(struct task_struct *p) 11669 + static void task_change_group_fair(struct task_struct *p) 11672 11670 { 11673 - struct sched_entity *se = &p->se; 11671 + /* 11672 + * We couldn't detach or attach a forked task which 11673 + * hasn't been woken up by wake_up_new_task(). 
11674 + */ 11675 + if (READ_ONCE(p->__state) == TASK_NEW) 11676 + return; 11674 11677 11675 - set_task_rq(p, task_cpu(p)); 11676 - se->depth = se->parent ? se->parent->depth + 1 : 0; 11677 - } 11678 - 11679 - static void task_move_group_fair(struct task_struct *p) 11680 - { 11681 11678 detach_task_cfs_rq(p); 11682 - set_task_rq(p, task_cpu(p)); 11683 11679 11684 11680 #ifdef CONFIG_SMP 11685 11681 /* Tell se's cfs_rq has been changed -- migrated */ 11686 11682 p->se.avg.last_update_time = 0; 11687 11683 #endif 11684 + set_task_rq(p, task_cpu(p)); 11688 11685 attach_task_cfs_rq(p); 11689 - } 11690 - 11691 - static void task_change_group_fair(struct task_struct *p, int type) 11692 - { 11693 - switch (type) { 11694 - case TASK_SET_GROUP: 11695 - task_set_group_fair(p); 11696 - break; 11697 - 11698 - case TASK_MOVE_GROUP: 11699 - task_move_group_fair(p); 11700 - break; 11701 - } 11702 11686 } 11703 11687 11704 11688 void free_fair_sched_group(struct task_group *tg) ··· 12063 12075 __init void init_sched_fair_class(void) 12064 12076 { 12065 12077 #ifdef CONFIG_SMP 12078 + int i; 12079 + 12080 + for_each_possible_cpu(i) { 12081 + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); 12082 + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); 12083 + } 12084 + 12066 12085 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); 12067 12086 12068 12087 #ifdef CONFIG_NO_HZ_COMMON
+7 -11
kernel/sched/rt.c
··· 509 509 unsigned int cpu_cap; 510 510 511 511 /* Only heterogeneous systems can benefit from this check */ 512 - if (!static_branch_unlikely(&sched_asym_cpucapacity)) 512 + if (!sched_asym_cpucap_active()) 513 513 return true; 514 514 515 515 min_cap = uclamp_eff_value(p, UCLAMP_MIN); ··· 843 843 * We cannot be left wanting - that would mean some runtime 844 844 * leaked out of the system. 845 845 */ 846 - BUG_ON(want); 846 + WARN_ON_ONCE(want); 847 847 balanced: 848 848 /* 849 849 * Disable all the borrow logic by pretending we have inf ··· 1062 1062 1063 1063 trace_sched_stat_runtime(curr, delta_exec, 0); 1064 1064 1065 - curr->se.sum_exec_runtime += delta_exec; 1066 - account_group_exec_runtime(curr, delta_exec); 1067 - 1068 - curr->se.exec_start = now; 1069 - cgroup_account_cputime(curr, delta_exec); 1065 + update_current_exec_runtime(curr, now, delta_exec); 1070 1066 1071 1067 if (!rt_bandwidth_enabled()) 1072 1068 return; ··· 1845 1849 1846 1850 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1847 1851 { 1848 - if (!task_running(rq, p) && 1852 + if (!task_on_cpu(rq, p) && 1849 1853 cpumask_test_cpu(cpu, &p->cpus_mask)) 1850 1854 return 1; 1851 1855 ··· 1893 1897 * If we're on asym system ensure we consider the different capacities 1894 1898 * of the CPUs when searching for the lowest_mask. 
1895 1899 */ 1896 - if (static_branch_unlikely(&sched_asym_cpucapacity)) { 1900 + if (sched_asym_cpucap_active()) { 1897 1901 1898 1902 ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, 1899 1903 task, lowest_mask, ··· 2000 2004 */ 2001 2005 if (unlikely(task_rq(task) != rq || 2002 2006 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || 2003 - task_running(rq, task) || 2007 + task_on_cpu(rq, task) || 2004 2008 !rt_task(task) || 2005 2009 !task_on_rq_queued(task))) { 2006 2010 ··· 2458 2462 */ 2459 2463 static void task_woken_rt(struct rq *rq, struct task_struct *p) 2460 2464 { 2461 - bool need_to_push = !task_running(rq, p) && 2465 + bool need_to_push = !task_on_cpu(rq, p) && 2462 2466 !test_tsk_need_resched(rq->curr) && 2463 2467 p->nr_cpus_allowed > 1 && 2464 2468 (dl_task(rq->curr) || rt_task(rq->curr)) &&
+42 -23
kernel/sched/sched.h
··· 321 321 u64 total_bw; 322 322 }; 323 323 324 - /* 325 - * Verify the fitness of task @p to run on @cpu taking into account the 326 - * CPU original capacity and the runtime/deadline ratio of the task. 327 - * 328 - * The function will return true if the CPU original capacity of the 329 - * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the 330 - * task and false otherwise. 331 - */ 332 - static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) 333 - { 334 - unsigned long cap = arch_scale_cpu_capacity(cpu); 335 - 336 - return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; 337 - } 338 - 339 324 extern void init_dl_bw(struct dl_bw *dl_b); 340 325 extern int sched_dl_global_validate(void); 341 326 extern void sched_dl_do_global(void); ··· 1800 1815 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); 1801 1816 extern struct static_key_false sched_asym_cpucapacity; 1802 1817 1818 + static __always_inline bool sched_asym_cpucap_active(void) 1819 + { 1820 + return static_branch_unlikely(&sched_asym_cpucapacity); 1821 + } 1822 + 1803 1823 struct sched_group_capacity { 1804 1824 atomic_t ref; 1805 1825 /* ··· 1932 1942 set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); 1933 1943 p->se.cfs_rq = tg->cfs_rq[cpu]; 1934 1944 p->se.parent = tg->se[cpu]; 1945 + p->se.depth = tg->se[cpu] ? 
tg->se[cpu]->depth + 1 : 0; 1935 1946 #endif 1936 1947 1937 1948 #ifdef CONFIG_RT_GROUP_SCHED ··· 2051 2060 return rq->curr == p; 2052 2061 } 2053 2062 2054 - static inline int task_running(struct rq *rq, struct task_struct *p) 2063 + static inline int task_on_cpu(struct rq *rq, struct task_struct *p) 2055 2064 { 2056 2065 #ifdef CONFIG_SMP 2057 2066 return p->on_cpu; ··· 2195 2204 2196 2205 void (*update_curr)(struct rq *rq); 2197 2206 2198 - #define TASK_SET_GROUP 0 2199 - #define TASK_MOVE_GROUP 1 2200 - 2201 2207 #ifdef CONFIG_FAIR_GROUP_SCHED 2202 - void (*task_change_group)(struct task_struct *p, int type); 2208 + void (*task_change_group)(struct task_struct *p); 2203 2209 #endif 2204 2210 }; 2205 2211 ··· 2422 2434 extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); 2423 2435 2424 2436 extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 2437 + 2438 + #ifdef CONFIG_PREEMPT_RT 2439 + #define SCHED_NR_MIGRATE_BREAK 8 2440 + #else 2441 + #define SCHED_NR_MIGRATE_BREAK 32 2442 + #endif 2425 2443 2426 2444 extern const_debug unsigned int sysctl_sched_nr_migrate; 2427 2445 extern const_debug unsigned int sysctl_sched_migration_cost; ··· 2703 2709 __acquires(rq1->lock) 2704 2710 __acquires(rq2->lock) 2705 2711 { 2706 - BUG_ON(!irqs_disabled()); 2707 - BUG_ON(rq1 != rq2); 2712 + WARN_ON_ONCE(!irqs_disabled()); 2713 + WARN_ON_ONCE(rq1 != rq2); 2708 2714 raw_spin_rq_lock(rq1); 2709 2715 __acquire(rq2->lock); /* Fake it out ;) */ 2710 2716 double_rq_clock_clear_update(rq1, rq2); ··· 2720 2726 __releases(rq1->lock) 2721 2727 __releases(rq2->lock) 2722 2728 { 2723 - BUG_ON(rq1 != rq2); 2729 + WARN_ON_ONCE(rq1 != rq2); 2724 2730 raw_spin_rq_unlock(rq1); 2725 2731 __release(rq2->lock); 2726 2732 } ··· 2889 2895 unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, 2890 2896 enum cpu_util_type type, 2891 2897 struct task_struct *p); 2898 + 2899 + /* 2900 + * Verify the fitness of task @p to run on @cpu taking 
into account the 2901 + * CPU original capacity and the runtime/deadline ratio of the task. 2902 + * 2903 + * The function will return true if the original capacity of @cpu is 2904 + * greater than or equal to task's deadline density right shifted by 2905 + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. 2906 + */ 2907 + static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) 2908 + { 2909 + unsigned long cap = arch_scale_cpu_capacity(cpu); 2910 + 2911 + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); 2912 + } 2892 2913 2893 2914 static inline unsigned long cpu_bw_dl(struct rq *rq) 2894 2915 { ··· 3165 3156 extern int sched_dynamic_mode(const char *str); 3166 3157 extern void sched_dynamic_update(int mode); 3167 3158 #endif 3159 + 3160 + static inline void update_current_exec_runtime(struct task_struct *curr, 3161 + u64 now, u64 delta_exec) 3162 + { 3163 + curr->se.sum_exec_runtime += delta_exec; 3164 + account_group_exec_runtime(curr, delta_exec); 3165 + 3166 + curr->se.exec_start = now; 3167 + cgroup_account_cputime(curr, delta_exec); 3168 + } 3168 3169 3169 3170 #endif /* _KERNEL_SCHED_SCHED_H */
+4 -7
kernel/sched/stop_task.c
··· 71 71 static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 72 72 { 73 73 struct task_struct *curr = rq->curr; 74 - u64 delta_exec; 74 + u64 now, delta_exec; 75 75 76 - delta_exec = rq_clock_task(rq) - curr->se.exec_start; 76 + now = rq_clock_task(rq); 77 + delta_exec = now - curr->se.exec_start; 77 78 if (unlikely((s64)delta_exec < 0)) 78 79 delta_exec = 0; 79 80 80 81 schedstat_set(curr->stats.exec_max, 81 82 max(curr->stats.exec_max, delta_exec)); 82 83 83 - curr->se.sum_exec_runtime += delta_exec; 84 - account_group_exec_runtime(curr, delta_exec); 85 - 86 - curr->se.exec_start = rq_clock_task(rq); 87 - cgroup_account_cputime(curr, delta_exec); 84 + update_current_exec_runtime(curr, now, delta_exec); 88 85 } 89 86 90 87 /*
+7 -7
kernel/signal.c
··· 2305 2305 read_unlock(&tasklist_lock); 2306 2306 cgroup_enter_frozen(); 2307 2307 preempt_enable_no_resched(); 2308 - freezable_schedule(); 2308 + schedule(); 2309 2309 cgroup_leave_frozen(true); 2310 2310 2311 2311 /* ··· 2474 2474 2475 2475 /* Now we don't run again until woken by SIGCONT or SIGKILL */ 2476 2476 cgroup_enter_frozen(); 2477 - freezable_schedule(); 2477 + schedule(); 2478 2478 return true; 2479 2479 } else { 2480 2480 /* ··· 2549 2549 * immediately (if there is a non-fatal signal pending), and 2550 2550 * put the task into sleep. 2551 2551 */ 2552 - __set_current_state(TASK_INTERRUPTIBLE); 2552 + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 2553 2553 clear_thread_flag(TIF_SIGPENDING); 2554 2554 spin_unlock_irq(&current->sighand->siglock); 2555 2555 cgroup_enter_frozen(); 2556 - freezable_schedule(); 2556 + schedule(); 2557 2557 } 2558 2558 2559 2559 static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) ··· 3601 3601 recalc_sigpending(); 3602 3602 spin_unlock_irq(&tsk->sighand->siglock); 3603 3603 3604 - __set_current_state(TASK_INTERRUPTIBLE); 3605 - ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, 3606 - HRTIMER_MODE_REL); 3604 + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 3605 + ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, 3606 + HRTIMER_MODE_REL); 3607 3607 spin_lock_irq(&tsk->sighand->siglock); 3608 3608 __set_task_blocked(tsk, &tsk->real_blocked); 3609 3609 sigemptyset(&tsk->real_blocked);
+2 -2
kernel/time/hrtimer.c
··· 2037 2037 struct restart_block *restart; 2038 2038 2039 2039 do { 2040 - set_current_state(TASK_INTERRUPTIBLE); 2040 + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 2041 2041 hrtimer_sleeper_start_expires(t, mode); 2042 2042 2043 2043 if (likely(t->task)) 2044 - freezable_schedule(); 2044 + schedule(); 2045 2045 2046 2046 hrtimer_cancel(&t->timer); 2047 2047 mode = HRTIMER_MODE_ABS;
+12 -6
kernel/umh.c
··· 28 28 #include <linux/async.h> 29 29 #include <linux/uaccess.h> 30 30 #include <linux/initrd.h> 31 + #include <linux/freezer.h> 31 32 32 33 #include <trace/events/module.h> 33 34 ··· 404 403 */ 405 404 int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 406 405 { 406 + unsigned int state = TASK_UNINTERRUPTIBLE; 407 407 DECLARE_COMPLETION_ONSTACK(done); 408 408 int retval = 0; 409 409 ··· 438 436 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 439 437 goto unlock; 440 438 441 - if (wait & UMH_KILLABLE) { 442 - retval = wait_for_completion_killable(&done); 443 - if (!retval) 444 - goto wait_done; 439 + if (wait & UMH_KILLABLE) 440 + state |= TASK_KILLABLE; 445 441 442 + if (wait & UMH_FREEZABLE) 443 + state |= TASK_FREEZABLE; 444 + 445 + retval = wait_for_completion_state(&done, state); 446 + if (!retval) 447 + goto wait_done; 448 + 449 + if (wait & UMH_KILLABLE) { 446 450 /* umh_complete() will see NULL and free sub_info */ 447 451 if (xchg(&sub_info->complete, NULL)) 448 452 goto unlock; 449 - /* fallthrough, umh_complete() was already called */ 450 453 } 451 454 452 - wait_for_completion(&done); 453 455 wait_done: 454 456 retval = sub_info->retval; 455 457 out:
+2 -2
mm/khugepaged.c
··· 730 730 DEFINE_WAIT(wait); 731 731 732 732 add_wait_queue(&khugepaged_wait, &wait); 733 - freezable_schedule_timeout_interruptible( 734 - msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 733 + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); 734 + schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); 735 735 remove_wait_queue(&khugepaged_wait, &wait); 736 736 } 737 737
+5 -7
net/sunrpc/sched.c
··· 269 269 270 270 static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) 271 271 { 272 - freezable_schedule_unsafe(); 272 + schedule(); 273 273 if (signal_pending_state(mode, current)) 274 274 return -ERESTARTSYS; 275 275 return 0; ··· 333 333 * to enforce taking of the wq->lock and hence avoid races with 334 334 * rpc_complete_task(). 335 335 */ 336 - int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) 336 + int rpc_wait_for_completion_task(struct rpc_task *task) 337 337 { 338 - if (action == NULL) 339 - action = rpc_wait_bit_killable; 340 338 return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, 341 - action, TASK_KILLABLE); 339 + rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); 342 340 } 343 - EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); 341 + EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task); 344 342 345 343 /* 346 344 * Make an RPC task runnable. ··· 962 964 trace_rpc_task_sync_sleep(task, task->tk_action); 963 965 status = out_of_line_wait_on_bit(&task->tk_runstate, 964 966 RPC_TASK_QUEUED, rpc_wait_bit_killable, 965 - TASK_KILLABLE); 967 + TASK_KILLABLE|TASK_FREEZABLE); 966 968 if (status < 0) { 967 969 /* 968 970 * When a sync task receives a signal, it exits with
+3 -5
net/unix/af_unix.c
··· 2560 2560 struct sk_buff *last, unsigned int last_len, 2561 2561 bool freezable) 2562 2562 { 2563 + unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; 2563 2564 struct sk_buff *tail; 2564 2565 DEFINE_WAIT(wait); 2565 2566 2566 2567 unix_state_lock(sk); 2567 2568 2568 2569 for (;;) { 2569 - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2570 + prepare_to_wait(sk_sleep(sk), &wait, state); 2570 2571 2571 2572 tail = skb_peek_tail(&sk->sk_receive_queue); 2572 2573 if (tail != last || ··· 2580 2579 2581 2580 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2582 2581 unix_state_unlock(sk); 2583 - if (freezable) 2584 - timeo = freezable_schedule_timeout(timeo); 2585 - else 2586 - timeo = schedule_timeout(timeo); 2582 + timeo = schedule_timeout(timeo); 2587 2583 unix_state_lock(sk); 2588 2584 2589 2585 if (sock_flag(sk, SOCK_DEAD))