Merge tag 'sched-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+1

arch/riscv/Kconfig

··· 39 39 select ARCH_HAS_MMIOWB 40 40 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE 41 41 select ARCH_HAS_PMEM_API 42 + select ARCH_HAS_PREEMPT_LAZY 42 43 select ARCH_HAS_PREPARE_SYNC_CORE_CMD 43 44 select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU 44 45 select ARCH_HAS_PTE_SPECIAL

+6 -4

arch/riscv/include/asm/thread_info.h

··· 107 107 * - pending work-to-be-done flags are in lowest half-word 108 108 * - other flags in upper half-word(s) 109 109 */ 110 - #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ 111 - #define TIF_SIGPENDING 2 /* signal pending */ 112 - #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ 110 + #define TIF_NEED_RESCHED 0 /* rescheduling necessary */ 111 + #define TIF_NEED_RESCHED_LAZY 1 /* Lazy rescheduling needed */ 112 + #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ 113 + #define TIF_SIGPENDING 3 /* signal pending */ 113 114 #define TIF_RESTORE_SIGMASK 4 /* restore signal mask in do_signal() */ 114 115 #define TIF_MEMDIE 5 /* is terminating due to OOM killer */ 115 116 #define TIF_NOTIFY_SIGNAL 9 /* signal notifications exist */ ··· 118 117 #define TIF_32BIT 11 /* compat-mode 32bit process */ 119 118 #define TIF_RISCV_V_DEFER_RESTORE 12 /* restore Vector before returing to user */ 120 119 120 + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 121 + #define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) 121 122 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 122 123 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 123 - #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 124 124 #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) 125 125 #define _TIF_UPROBE (1 << TIF_UPROBE) 126 126 #define _TIF_RISCV_V_DEFER_RESTORE (1 << TIF_RISCV_V_DEFER_RESTORE)

+1

arch/x86/Kconfig

··· 93 93 select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS 94 94 select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE 95 95 select ARCH_HAS_PMEM_API if X86_64 96 + select ARCH_HAS_PREEMPT_LAZY 96 97 select ARCH_HAS_PTE_DEVMAP if X86_64 97 98 select ARCH_HAS_PTE_SPECIAL 98 99 select ARCH_HAS_HW_PTE_YOUNG

+4 -2

arch/x86/include/asm/thread_info.h

··· 87 87 #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ 88 88 #define TIF_SIGPENDING 2 /* signal pending */ 89 89 #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ 90 - #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ 91 - #define TIF_SSBD 5 /* Speculative store bypass disable */ 90 + #define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ 91 + #define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ 92 + #define TIF_SSBD 6 /* Speculative store bypass disable */ 92 93 #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ 93 94 #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ 94 95 #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ ··· 111 110 #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) 112 111 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) 113 112 #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) 113 + #define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) 114 114 #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) 115 115 #define _TIF_SSBD (1 << TIF_SSBD) 116 116 #define _TIF_SPEC_IB (1 << TIF_SPEC_IB)

+1 -1

fs/exec.c

··· 990 990 active_mm = tsk->active_mm; 991 991 tsk->active_mm = mm; 992 992 tsk->mm = mm; 993 - mm_init_cid(mm); 993 + mm_init_cid(mm, tsk); 994 994 /* 995 995 * This prevents preemption while active_mm is being loaded and 996 996 * it and mm are being updated, which could cause problems for

+2 -1

include/linux/entry-common.h

··· 64 64 65 65 #define EXIT_TO_USER_MODE_WORK \ 66 66 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 67 - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 67 + _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 68 + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 68 69 ARCH_EXIT_TO_USER_MODE_WORK) 69 70 70 71 /**

+3 -2

include/linux/entry-kvm.h

+63 -9

include/linux/mm_types.h

··· 782 782 struct mm_cid { 783 783 u64 time; 784 784 int cid; 785 + int recent_cid; 785 786 }; 786 787 #endif 787 788 ··· 853 852 * When the next mm_cid scan is due (in jiffies). 854 853 */ 855 854 unsigned long mm_cid_next_scan; 855 + /** 856 + * @nr_cpus_allowed: Number of CPUs allowed for mm. 857 + * 858 + * Number of CPUs allowed in the union of all mm's 859 + * threads allowed CPUs. 860 + */ 861 + unsigned int nr_cpus_allowed; 862 + /** 863 + * @max_nr_cid: Maximum number of concurrency IDs allocated. 864 + * 865 + * Track the highest number of concurrency IDs allocated for the 866 + * mm. 867 + */ 868 + atomic_t max_nr_cid; 869 + /** 870 + * @cpus_allowed_lock: Lock protecting mm cpus_allowed. 871 + * 872 + * Provide mutual exclusion for mm cpus_allowed and 873 + * mm nr_cpus_allowed updates. 874 + */ 875 + raw_spinlock_t cpus_allowed_lock; 856 876 #endif 857 877 #ifdef CONFIG_MMU 858 878 atomic_long_t pgtables_bytes; /* size of all page tables */ ··· 1192 1170 return cid & ~MM_CID_LAZY_PUT; 1193 1171 } 1194 1172 1173 + /* 1174 + * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 1175 + */ 1176 + static inline cpumask_t *mm_cpus_allowed(struct mm_struct *mm) 1177 + { 1178 + unsigned long bitmap = (unsigned long)mm; 1179 + 1180 + bitmap += offsetof(struct mm_struct, cpu_bitmap); 1181 + /* Skip cpu_bitmap */ 1182 + bitmap += cpumask_size(); 1183 + return (struct cpumask *)bitmap; 1184 + } 1185 + 1195 1186 /* Accessor for struct mm_struct's cidmask. 
*/ 1196 1187 static inline cpumask_t *mm_cidmask(struct mm_struct *mm) 1197 1188 { 1198 - unsigned long cid_bitmap = (unsigned long)mm; 1189 + unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); 1199 1190 1200 - cid_bitmap += offsetof(struct mm_struct, cpu_bitmap); 1201 - /* Skip cpu_bitmap */ 1191 + /* Skip mm_cpus_allowed */ 1202 1192 cid_bitmap += cpumask_size(); 1203 1193 return (struct cpumask *)cid_bitmap; 1204 1194 } 1205 1195 1206 - static inline void mm_init_cid(struct mm_struct *mm) 1196 + static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 1207 1197 { 1208 1198 int i; 1209 1199 ··· 1223 1189 struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 1224 1190 1225 1191 pcpu_cid->cid = MM_CID_UNSET; 1192 + pcpu_cid->recent_cid = MM_CID_UNSET; 1226 1193 pcpu_cid->time = 0; 1227 1194 } 1195 + mm->nr_cpus_allowed = p->nr_cpus_allowed; 1196 + atomic_set(&mm->max_nr_cid, 0); 1197 + raw_spin_lock_init(&mm->cpus_allowed_lock); 1198 + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 1228 1199 cpumask_clear(mm_cidmask(mm)); 1229 1200 } 1230 1201 1231 - static inline int mm_alloc_cid_noprof(struct mm_struct *mm) 1202 + static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) 1232 1203 { 1233 1204 mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); 1234 1205 if (!mm->pcpu_cid) 1235 1206 return -ENOMEM; 1236 - mm_init_cid(mm); 1207 + mm_init_cid(mm, p); 1237 1208 return 0; 1238 1209 } 1239 1210 #define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__)) ··· 1251 1212 1252 1213 static inline unsigned int mm_cid_size(void) 1253 1214 { 1254 - return cpumask_size(); 1215 + return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). 
*/ 1216 + } 1217 + 1218 + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) 1219 + { 1220 + struct cpumask *mm_allowed = mm_cpus_allowed(mm); 1221 + 1222 + if (!mm) 1223 + return; 1224 + /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ 1225 + raw_spin_lock(&mm->cpus_allowed_lock); 1226 + cpumask_or(mm_allowed, mm_allowed, cpumask); 1227 + WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); 1228 + raw_spin_unlock(&mm->cpus_allowed_lock); 1255 1229 } 1256 1230 #else /* CONFIG_SCHED_MM_CID */ 1257 - static inline void mm_init_cid(struct mm_struct *mm) { } 1258 - static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; } 1231 + static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } 1232 + static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } 1259 1233 static inline void mm_destroy_cid(struct mm_struct *mm) { } 1234 + 1260 1235 static inline unsigned int mm_cid_size(void) 1261 1236 { 1262 1237 return 0; 1263 1238 } 1239 + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } 1264 1240 #endif /* CONFIG_SCHED_MM_CID */ 1265 1241 1266 1242 struct mmu_gather;

+7 -1

include/linux/preempt.h

··· 486 486 extern bool preempt_model_none(void); 487 487 extern bool preempt_model_voluntary(void); 488 488 extern bool preempt_model_full(void); 489 + extern bool preempt_model_lazy(void); 489 490 490 491 #else 491 492 ··· 501 500 static inline bool preempt_model_full(void) 502 501 { 503 502 return IS_ENABLED(CONFIG_PREEMPT); 503 + } 504 + 505 + static inline bool preempt_model_lazy(void) 506 + { 507 + return IS_ENABLED(CONFIG_PREEMPT_LAZY); 504 508 } 505 509 506 510 #endif ··· 525 519 */ 526 520 static inline bool preempt_model_preemptible(void) 527 521 { 528 - return preempt_model_full() || preempt_model_rt(); 522 + return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); 529 523 } 530 524 531 525 #endif /* __LINUX_PREEMPT_H */

+3 -2

include/linux/sched.h

··· 1898 1898 1899 1899 #ifdef CONFIG_THREAD_INFO_IN_TASK 1900 1900 # define task_thread_info(task) (&(task)->thread_info) 1901 - #elif !defined(__HAVE_THREAD_FUNCTIONS) 1901 + #else 1902 1902 # define task_thread_info(task) ((struct thread_info *)(task)->stack) 1903 1903 #endif 1904 1904 ··· 2002 2002 2003 2003 static inline void clear_tsk_need_resched(struct task_struct *tsk) 2004 2004 { 2005 - clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); 2005 + atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, 2006 + (atomic_long_t *)&task_thread_info(tsk)->flags); 2006 2007 } 2007 2008 2008 2009 static inline int test_tsk_need_resched(struct task_struct *tsk)

-1

include/linux/sched/ext.h

··· 199 199 #ifdef CONFIG_EXT_GROUP_SCHED 200 200 struct cgroup *cgrp_moving_from; 201 201 #endif 202 - /* must be the last field, see init_scx_entity() */ 203 202 struct list_head tasks_node; 204 203 }; 205 204

+1 -1

include/linux/sched/task_stack.h

··· 34 34 #endif 35 35 } 36 36 37 - #elif !defined(__HAVE_THREAD_FUNCTIONS) 37 + #else 38 38 39 39 #define task_stack_page(task) ((void *)(task)->stack) 40 40

+17 -4

include/linux/thread_info.h

··· 59 59 60 60 #include <asm/thread_info.h> 61 61 62 + #ifndef TIF_NEED_RESCHED_LAZY 63 + #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY 64 + #error Inconsistent PREEMPT_LAZY 65 + #endif 66 + #define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED 67 + #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED 68 + #endif 69 + 62 70 #ifdef __KERNEL__ 63 71 64 72 #ifndef arch_set_restart_data ··· 187 179 188 180 #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H 189 181 190 - static __always_inline bool tif_need_resched(void) 182 + static __always_inline bool tif_test_bit(int bit) 191 183 { 192 - return arch_test_bit(TIF_NEED_RESCHED, 184 + return arch_test_bit(bit, 193 185 (unsigned long *)(&current_thread_info()->flags)); 194 186 } 195 187 196 188 #else 197 189 198 - static __always_inline bool tif_need_resched(void) 190 + static __always_inline bool tif_test_bit(int bit) 199 191 { 200 - return test_bit(TIF_NEED_RESCHED, 192 + return test_bit(bit, 201 193 (unsigned long *)(&current_thread_info()->flags)); 202 194 } 203 195 204 196 #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ 197 + 198 + static __always_inline bool tif_need_resched(void) 199 + { 200 + return tif_test_bit(TIF_NEED_RESCHED); 201 + } 205 202 206 203 #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES 207 204 static inline int arch_within_stack_frames(const void * const stack,

+363 -83

include/linux/wait_bit.h

··· 8 8 #include <linux/wait.h> 9 9 10 10 struct wait_bit_key { 11 - void *flags; 11 + unsigned long *flags; 12 12 int bit_nr; 13 13 unsigned long timeout; 14 14 }; ··· 23 23 24 24 typedef int wait_bit_action_f(struct wait_bit_key *key, int mode); 25 25 26 - void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit); 26 + void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit); 27 27 int __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); 28 28 int __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry, wait_bit_action_f *action, unsigned int mode); 29 - void wake_up_bit(void *word, int bit); 30 - int out_of_line_wait_on_bit(void *word, int, wait_bit_action_f *action, unsigned int mode); 31 - int out_of_line_wait_on_bit_timeout(void *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); 32 - int out_of_line_wait_on_bit_lock(void *word, int, wait_bit_action_f *action, unsigned int mode); 33 - struct wait_queue_head *bit_waitqueue(void *word, int bit); 29 + void wake_up_bit(unsigned long *word, int bit); 30 + int out_of_line_wait_on_bit(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); 31 + int out_of_line_wait_on_bit_timeout(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode, unsigned long timeout); 32 + int out_of_line_wait_on_bit_lock(unsigned long *word, int, wait_bit_action_f *action, unsigned int mode); 33 + struct wait_queue_head *bit_waitqueue(unsigned long *word, int bit); 34 34 extern void __init wait_bit_init(void); 35 35 36 36 int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync, void *key); ··· 49 49 extern int bit_wait(struct wait_bit_key *key, int mode); 50 50 extern int bit_wait_io(struct wait_bit_key *key, int mode); 51 51 extern int bit_wait_timeout(struct wait_bit_key *key, int mode); 52 - extern 
int bit_wait_io_timeout(struct wait_bit_key *key, int mode); 53 52 54 53 /** 55 54 * wait_on_bit - wait for a bit to be cleared 56 - * @word: the word being waited on, a kernel virtual address 57 - * @bit: the bit of the word being waited on 55 + * @word: the address containing the bit being waited on 56 + * @bit: the bit at that address being waited on 58 57 * @mode: the task state to sleep in 59 58 * 60 - * There is a standard hashed waitqueue table for generic use. This 61 - * is the part of the hashtable's accessor API that waits on a bit. 62 - * For instance, if one were to have waiters on a bitflag, one would 63 - * call wait_on_bit() in threads waiting for the bit to clear. 64 - * One uses wait_on_bit() where one is waiting for the bit to clear, 65 - * but has no intention of setting it. 66 - * Returned value will be zero if the bit was cleared, or non-zero 67 - * if the process received a signal and the mode permitted wakeup 68 - * on that signal. 59 + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) 60 + * to be cleared. The clearing of the bit must be signalled with 61 + * wake_up_bit(), often as clear_and_wake_up_bit(). 62 + * 63 + * The process will wait on a waitqueue selected by hash from a shared 64 + * pool. It will only be woken on a wake_up for the target bit, even 65 + * if other processes on the same queue are waiting for other bits. 66 + * 67 + * Returned value will be zero if the bit was cleared in which case the 68 + * call has ACQUIRE semantics, or %-EINTR if the process received a 69 + * signal and the mode permitted wake up on that signal. 
69 70 */ 70 71 static inline int 71 72 wait_on_bit(unsigned long *word, int bit, unsigned mode) ··· 81 80 82 81 /** 83 82 * wait_on_bit_io - wait for a bit to be cleared 84 - * @word: the word being waited on, a kernel virtual address 85 - * @bit: the bit of the word being waited on 83 + * @word: the address containing the bit being waited on 84 + * @bit: the bit at that address being waited on 86 85 * @mode: the task state to sleep in 87 86 * 88 - * Use the standard hashed waitqueue table to wait for a bit 89 - * to be cleared. This is similar to wait_on_bit(), but calls 90 - * io_schedule() instead of schedule() for the actual waiting. 87 + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) 88 + * to be cleared. The clearing of the bit must be signalled with 89 + * wake_up_bit(), often as clear_and_wake_up_bit(). 91 90 * 92 - * Returned value will be zero if the bit was cleared, or non-zero 93 - * if the process received a signal and the mode permitted wakeup 94 - * on that signal. 91 + * This is similar to wait_on_bit(), but calls io_schedule() instead of 92 + * schedule() for the actual waiting. 93 + * 94 + * Returned value will be zero if the bit was cleared in which case the 95 + * call has ACQUIRE semantics, or %-EINTR if the process received a 96 + * signal and the mode permitted wake up on that signal. 
95 97 */ 96 98 static inline int 97 99 wait_on_bit_io(unsigned long *word, int bit, unsigned mode) ··· 108 104 } 109 105 110 106 /** 111 - * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses 112 - * @word: the word being waited on, a kernel virtual address 113 - * @bit: the bit of the word being waited on 107 + * wait_on_bit_timeout - wait for a bit to be cleared or a timeout to elapse 108 + * @word: the address containing the bit being waited on 109 + * @bit: the bit at that address being waited on 114 110 * @mode: the task state to sleep in 115 111 * @timeout: timeout, in jiffies 116 112 * 117 - * Use the standard hashed waitqueue table to wait for a bit 118 - * to be cleared. This is similar to wait_on_bit(), except also takes a 119 - * timeout parameter. 113 + * Wait for the given bit in an unsigned long or bitmap (see 114 + * DECLARE_BITMAP()) to be cleared, or for a timeout to expire. The 115 + * clearing of the bit must be signalled with wake_up_bit(), often as 116 + * clear_and_wake_up_bit(). 120 117 * 121 - * Returned value will be zero if the bit was cleared before the 122 - * @timeout elapsed, or non-zero if the @timeout elapsed or process 123 - * received a signal and the mode permitted wakeup on that signal. 118 + * This is similar to wait_on_bit(), except it also takes a timeout 119 + * parameter. 120 + * 121 + * Returned value will be zero if the bit was cleared in which case the 122 + * call has ACQUIRE semantics, or %-EINTR if the process received a 123 + * signal and the mode permitted wake up on that signal, or %-EAGAIN if the 124 + * timeout elapsed. 
124 125 */ 125 126 static inline int 126 127 wait_on_bit_timeout(unsigned long *word, int bit, unsigned mode, ··· 141 132 142 133 /** 143 134 * wait_on_bit_action - wait for a bit to be cleared 144 - * @word: the word being waited on, a kernel virtual address 145 - * @bit: the bit of the word being waited on 135 + * @word: the address containing the bit waited on 136 + * @bit: the bit at that address being waited on 146 137 * @action: the function used to sleep, which may take special actions 147 138 * @mode: the task state to sleep in 148 139 * 149 - * Use the standard hashed waitqueue table to wait for a bit 150 - * to be cleared, and allow the waiting action to be specified. 151 - * This is like wait_on_bit() but allows fine control of how the waiting 152 - * is done. 140 + * Wait for the given bit in an unsigned long or bitmap (see DECLARE_BITMAP()) 141 + * to be cleared. The clearing of the bit must be signalled with 142 + * wake_up_bit(), often as clear_and_wake_up_bit(). 153 143 * 154 - * Returned value will be zero if the bit was cleared, or non-zero 155 - * if the process received a signal and the mode permitted wakeup 156 - * on that signal. 144 + * This is similar to wait_on_bit(), but calls @action() instead of 145 + * schedule() for the actual waiting. 146 + * 147 + * Returned value will be zero if the bit was cleared in which case the 148 + * call has ACQUIRE semantics, or the error code returned by @action if 149 + * that call returned non-zero. 
157 150 */ 158 151 static inline int 159 152 wait_on_bit_action(unsigned long *word, int bit, wait_bit_action_f *action, ··· 168 157 } 169 158 170 159 /** 171 - * wait_on_bit_lock - wait for a bit to be cleared, when wanting to set it 172 - * @word: the word being waited on, a kernel virtual address 173 - * @bit: the bit of the word being waited on 160 + * wait_on_bit_lock - wait for a bit to be cleared, then set it 161 + * @word: the address containing the bit being waited on 162 + * @bit: the bit of the word being waited on and set 174 163 * @mode: the task state to sleep in 175 164 * 176 - * There is a standard hashed waitqueue table for generic use. This 177 - * is the part of the hashtable's accessor API that waits on a bit 178 - * when one intends to set it, for instance, trying to lock bitflags. 179 - * For instance, if one were to have waiters trying to set bitflag 180 - * and waiting for it to clear before setting it, one would call 181 - * wait_on_bit() in threads waiting to be able to set the bit. 182 - * One uses wait_on_bit_lock() where one is waiting for the bit to 183 - * clear with the intention of setting it, and when done, clearing it. 165 + * Wait for the given bit in an unsigned long or bitmap (see 166 + * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be 167 + * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As 168 + * soon as it is clear, atomically set it and return. 184 169 * 185 - * Returns zero if the bit was (eventually) found to be clear and was 186 - * set. Returns non-zero if a signal was delivered to the process and 187 - * the @mode allows that signal to wake the process. 170 + * This is similar to wait_on_bit(), but sets the bit before returning. 
171 + * 172 + * Returned value will be zero if the bit was successfully set in which 173 + * case the call has the same memory sequencing semantics as 174 + * test_and_clear_bit(), or %-EINTR if the process received a signal and 175 + * the mode permitted wake up on that signal. 188 176 */ 189 177 static inline int 190 178 wait_on_bit_lock(unsigned long *word, int bit, unsigned mode) ··· 195 185 } 196 186 197 187 /** 198 - * wait_on_bit_lock_io - wait for a bit to be cleared, when wanting to set it 199 - * @word: the word being waited on, a kernel virtual address 200 - * @bit: the bit of the word being waited on 188 + * wait_on_bit_lock_io - wait for a bit to be cleared, then set it 189 + * @word: the address containing the bit being waited on 190 + * @bit: the bit of the word being waited on and set 201 191 * @mode: the task state to sleep in 202 192 * 203 - * Use the standard hashed waitqueue table to wait for a bit 204 - * to be cleared and then to atomically set it. This is similar 205 - * to wait_on_bit(), but calls io_schedule() instead of schedule() 206 - * for the actual waiting. 193 + * Wait for the given bit in an unsigned long or bitmap (see 194 + * DECLARE_BITMAP()) to be cleared. The clearing of the bit must be 195 + * signalled with wake_up_bit(), often as clear_and_wake_up_bit(). As 196 + * soon as it is clear, atomically set it and return. 197 + * 198 + * This is similar to wait_on_bit_lock(), but calls io_schedule() instead 199 + * of schedule(). 207 200 * 208 201 * Returns zero if the bit was (eventually) found to be clear and was 209 202 * set. 
Returns non-zero if a signal was delivered to the process and ··· 222 209 } 223 210 224 211 /** 225 - * wait_on_bit_lock_action - wait for a bit to be cleared, when wanting to set it 226 - * @word: the word being waited on, a kernel virtual address 227 - * @bit: the bit of the word being waited on 212 + * wait_on_bit_lock_action - wait for a bit to be cleared, then set it 213 + * @word: the address containing the bit being waited on 214 + * @bit: the bit of the word being waited on and set 228 215 * @action: the function used to sleep, which may take special actions 229 216 * @mode: the task state to sleep in 230 217 * 231 - * Use the standard hashed waitqueue table to wait for a bit 232 - * to be cleared and then to set it, and allow the waiting action 233 - * to be specified. 234 - * This is like wait_on_bit() but allows fine control of how the waiting 235 - * is done. 218 + * This is similar to wait_on_bit_lock(), but calls @action() instead of 219 + * schedule() for the actual waiting. 236 220 * 237 - * Returns zero if the bit was (eventually) found to be clear and was 238 - * set. Returns non-zero if a signal was delivered to the process and 239 - * the @mode allows that signal to wake the process. 221 + * Returned value will be zero if the bit was successfully set in which 222 + * case the call has the same memory sequencing semantics as 223 + * test_and_clear_bit(), or the error code returned by @action if that 224 + * call returned non-zero. 
240 225 */ 241 226 static inline int 242 227 wait_on_bit_lock_action(unsigned long *word, int bit, wait_bit_action_f *action, ··· 280 269 #define __wait_var_event(var, condition) \ 281 270 ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ 282 271 schedule()) 272 + #define __wait_var_event_io(var, condition) \ 273 + ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ 274 + io_schedule()) 283 275 276 + /** 277 + * wait_var_event - wait for a variable to be updated and notified 278 + * @var: the address of variable being waited on 279 + * @condition: the condition to wait for 280 + * 281 + * Wait for a @condition to be true, only re-checking when a wake up is 282 + * received for the given @var (an arbitrary kernel address which need 283 + * not be directly related to the given condition, but usually is). 284 + * 285 + * The process will wait on a waitqueue selected by hash from a shared 286 + * pool. It will only be woken on a wake_up for the given address. 287 + * 288 + * The condition should normally use smp_load_acquire() or a similarly 289 + * ordered access to ensure that any changes to memory made before the 290 + * condition became true will be visible after the wait completes. 291 + */ 284 292 #define wait_var_event(var, condition) \ 285 293 do { \ 286 294 might_sleep(); \ ··· 308 278 __wait_var_event(var, condition); \ 309 279 } while (0) 310 280 281 + /** 282 + * wait_var_event_io - wait for a variable to be updated and notified 283 + * @var: the address of variable being waited on 284 + * @condition: the condition to wait for 285 + * 286 + * Wait for an IO related @condition to be true, only re-checking when a 287 + * wake up is received for the given @var (an arbitrary kernel address 288 + * which need not be directly related to the given condition, but 289 + * usually is). 290 + * 291 + * The process will wait on a waitqueue selected by hash from a shared 292 + * pool. It will only be woken on a wake_up for the given address. 
293 + * 294 + * This is similar to wait_var_event(), but calls io_schedule() instead 295 + * of schedule(). 296 + * 297 + * The condition should normally use smp_load_acquire() or a similarly 298 + * ordered access to ensure that any changes to memory made before the 299 + * condition became true will be visible after the wait completes. 300 + */ 301 + #define wait_var_event_io(var, condition) \ 302 + do { \ 303 + might_sleep(); \ 304 + if (condition) \ 305 + break; \ 306 + __wait_var_event_io(var, condition); \ 307 + } while (0) 308 + 311 309 #define __wait_var_event_killable(var, condition) \ 312 310 ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ 313 311 schedule()) 314 312 313 + /** 314 + * wait_var_event_killable - wait for a variable to be updated and notified 315 + * @var: the address of variable being waited on 316 + * @condition: the condition to wait for 317 + * 318 + * Wait for a @condition to be true or a fatal signal to be received, 319 + * only re-checking the condition when a wake up is received for the given 320 + * @var (an arbitrary kernel address which need not be directly related 321 + * to the given condition, but usually is). 322 + * 323 + * This is similar to wait_var_event() but returns a value which is 324 + * 0 if the condition became true, or %-ERESTARTSYS if a fatal signal 325 + * was received. 326 + * 327 + * The condition should normally use smp_load_acquire() or a similarly 328 + * ordered access to ensure that any changes to memory made before the 329 + * condition became true will be visible after the wait completes. 
330 + */ 315 331 #define wait_var_event_killable(var, condition) \ 316 332 ({ \ 317 333 int __ret = 0; \ ··· 372 296 TASK_UNINTERRUPTIBLE, 0, timeout, \ 373 297 __ret = schedule_timeout(__ret)) 374 298 299 + /** 300 + * wait_var_event_timeout - wait for a variable to be updated or a timeout to expire 301 + * @var: the address of variable being waited on 302 + * @condition: the condition to wait for 303 + * @timeout: maximum time to wait in jiffies 304 + * 305 + * Wait for a @condition to be true or a timeout to expire, only 306 + * re-checking the condition when a wake up is received for the given 307 + * @var (an arbitrary kernel address which need not be directly related 308 + * to the given condition, but usually is). 309 + * 310 + * This is similar to wait_var_event() but returns a value which is 0 if 311 + * the timeout expired and the condition was still false, or the 312 + * remaining time left in the timeout (but at least 1) if the condition 313 + * was found to be true. 314 + * 315 + * The condition should normally use smp_load_acquire() or a similarly 316 + * ordered access to ensure that any changes to memory made before the 317 + * condition became true will be visible after the wait completes. 318 + */ 375 319 #define wait_var_event_timeout(var, condition, timeout) \ 376 320 ({ \ 377 321 long __ret = timeout; \ ··· 405 309 ___wait_var_event(var, condition, TASK_INTERRUPTIBLE, 0, 0, \ 406 310 schedule()) 407 311 312 + /** 313 + * wait_var_event_interruptible - wait for a variable to be updated and notified 314 + * @var: the address of variable being waited on 315 + * @condition: the condition to wait for 316 + * 317 + * Wait for a @condition to be true or a signal to be received, only 318 + * re-checking the condition when a wake up is received for the given 319 + * @var (an arbitrary kernel address which need not be directly related 320 + * to the given condition, but usually is). 
321 + * 322 + * This is similar to wait_var_event() but returns a value which is 0 if 323 + * the condition became true, or %-ERESTARTSYS if a signal was received. 324 + * 325 + * The condition should normally use smp_load_acquire() or a similarly 326 + * ordered access to ensure that any changes to memory made before the 327 + * condition became true will be visible after the wait completes. 328 + */ 408 329 #define wait_var_event_interruptible(var, condition) \ 409 330 ({ \ 410 331 int __ret = 0; \ ··· 432 319 }) 433 320 434 321 /** 435 - * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit 322 + * wait_var_event_any_lock - wait for a variable to be updated under a lock 323 + * @var: the address of the variable being waited on 324 + * @condition: condition to wait for 325 + * @lock: the object that is locked to protect updates to the variable 326 + * @type: prefix on lock and unlock operations 327 + * @state: waiting state, %TASK_UNINTERRUPTIBLE etc. 436 328 * 437 - * @bit: the bit of the word being waited on 438 - * @word: the word being waited on, a kernel virtual address 329 + * Wait for a condition which can only be reliably tested while holding 330 + * a lock. The variables assessed in the condition will normally be updated 331 + * under the same lock, and the wake up should be signalled with 332 + * wake_up_var_locked() under the same lock. 439 333 * 440 - * You can use this helper if bitflags are manipulated atomically rather than 441 - * non-atomically under a lock. 334 + * This is similar to wait_var_event(), but assumes a lock is held 335 + * while calling this function and while updating the variable. 336 + * 337 + * This must be called while the given lock is held and the lock will be 338 + * dropped when schedule() is called to wait for a wake up, and will be 339 + * reclaimed before testing the condition again. 
The functions used to 340 + * unlock and lock the object are constructed by appending _unlock and _lock 341 + * to @type. 342 + * 343 + * Return %-ERESTARTSYS if a signal arrives which is allowed to interrupt 344 + * the wait according to @state. 442 345 */ 443 - static inline void clear_and_wake_up_bit(int bit, void *word) 346 + #define wait_var_event_any_lock(var, condition, lock, type, state) \ 347 + ({ \ 348 + int __ret = 0; \ 349 + if (!(condition)) \ 350 + __ret = ___wait_var_event(var, condition, state, 0, 0, \ 351 + type ## _unlock(lock); \ 352 + schedule(); \ 353 + type ## _lock(lock)); \ 354 + __ret; \ 355 + }) 356 + 357 + /** 358 + * wait_var_event_spinlock - wait for a variable to be updated under a spinlock 359 + * @var: the address of the variable being waited on 360 + * @condition: condition to wait for 361 + * @lock: the spinlock which protects updates to the variable 362 + * 363 + * Wait for a condition which can only be reliably tested while holding 364 + * a spinlock. The variables assessed in the condition will normally be updated 365 + * under the same spinlock, and the wake up should be signalled with 366 + * wake_up_var_locked() under the same spinlock. 367 + * 368 + * This is similar to wait_var_event(), but assumes a spinlock is held 369 + * while calling this function and while updating the variable. 370 + * 371 + * This must be called while the given lock is held and the lock will be 372 + * dropped when schedule() is called to wait for a wake up, and will be 373 + * reclaimed before testing the condition again. 
374 + */ 375 + #define wait_var_event_spinlock(var, condition, lock) \ 376 + wait_var_event_any_lock(var, condition, lock, spin, TASK_UNINTERRUPTIBLE) 377 + 378 + /** 379 + * wait_var_event_mutex - wait for a variable to be updated under a mutex 380 + * @var: the address of the variable being waited on 381 + * @condition: condition to wait for 382 + * @lock: the mutex which protects updates to the variable 383 + * 384 + * Wait for a condition which can only be reliably tested while holding 385 + * a mutex. The variables assessed in the condition will normally be 386 + * updated under the same mutex, and the wake up should be signalled 387 + * with wake_up_var_locked() under the same mutex. 388 + * 389 + * This is similar to wait_var_event(), but assumes a mutex is held 390 + * while calling this function and while updating the variable. 391 + * 392 + * This must be called while the given mutex is held and the mutex will be 393 + * dropped when schedule() is called to wait for a wake up, and will be 394 + * reclaimed before testing the condition again. 395 + */ 396 + #define wait_var_event_mutex(var, condition, lock) \ 397 + wait_var_event_any_lock(var, condition, lock, mutex, TASK_UNINTERRUPTIBLE) 398 + 399 + /** 400 + * wake_up_var_protected - wake up waiters for a variable asserting that it is safe 401 + * @var: the address of the variable being waited on 402 + * @cond: the condition which affirms this is safe 403 + * 404 + * When waking waiters which use wait_var_event_any_lock() the waker must be 405 + * holding the relevant lock to avoid races. This version of wake_up_var() 406 + * asserts that the relevant lock is held and so no barrier is needed. 407 + * The @cond is only tested when CONFIG_LOCKDEP is enabled. 
408 + */ 409 + #define wake_up_var_protected(var, cond) \ 410 + do { \ 411 + lockdep_assert(cond); \ 412 + wake_up_var(var); \ 413 + } while (0) 414 + 415 + /** 416 + * wake_up_var_locked - wake up waiters for a variable while holding a spinlock or mutex 417 + * @var: the address of the variable being waited on 418 + * @lock: The spinlock or mutex that protects the variable 419 + * 420 + * Send a wake up for the given variable which should be waited for with 421 + * wait_var_event_spinlock() or wait_var_event_mutex(). Unlike wake_up_var(), 422 + * no extra barriers are needed as the locking provides sufficient sequencing. 423 + */ 424 + #define wake_up_var_locked(var, lock) \ 425 + wake_up_var_protected(var, lockdep_is_held(lock)) 426 + 427 + /** 428 + * clear_and_wake_up_bit - clear a bit and wake up anyone waiting on that bit 429 + * @bit: the bit of the word being waited on 430 + * @word: the address containing the bit being waited on 431 + * 432 + * The designated bit is cleared and any tasks waiting in wait_on_bit() 433 + * or similar will be woken. This call has RELEASE semantics so that 434 + * any changes to memory made before this call are guaranteed to be visible 435 + * after the corresponding wait_on_bit() completes. 436 + */ 437 + static inline void clear_and_wake_up_bit(int bit, unsigned long *word) 444 438 { 445 439 clear_bit_unlock(bit, word); 446 440 /* See wake_up_bit() for which memory barrier you need to use. */ 447 441 smp_mb__after_atomic(); 448 442 wake_up_bit(word, bit); 449 443 } 444 + 445 + /** 446 + * test_and_clear_wake_up_bit - clear a bit if it was set: wake up anyone waiting on that bit 447 + * @bit: the bit of the word being waited on 448 + * @word: the address of memory containing that bit 449 + * 450 + * If the bit is set and can be atomically cleared, any tasks waiting in 451 + * wait_on_bit() or similar will be woken. This call has the same 452 + * complete ordering semantics as test_and_clear_bit(). 
Any changes to 453 + * memory made before this call are guaranteed to be visible after the 454 + * corresponding wait_on_bit() completes. 455 + * 456 + * Returns %true if the bit was successfully cleared and the wake up was sent. 457 + */ 458 + static inline bool test_and_clear_wake_up_bit(int bit, unsigned long *word) 459 + { 460 + if (!test_and_clear_bit(bit, word)) 461 + return false; 462 + /* no extra barrier required */ 463 + wake_up_bit(word, bit); 464 + return true; 465 + } 466 + 467 + /** 468 + * atomic_dec_and_wake_up - decrement an atomic_t and if zero, wake up waiters 469 + * @var: the variable to dec and test 470 + * 471 + * Decrements the atomic variable and if it reaches zero, send a wake_up to any 472 + * processes waiting on the variable. 473 + * 474 + * This function has the same complete ordering semantics as atomic_dec_and_test. 475 + * 476 + * Returns %true if the variable reaches zero and the wake up was sent. 477 + */ 478 + 479 + static inline bool atomic_dec_and_wake_up(atomic_t *var) 480 + { 481 + if (!atomic_dec_and_test(var)) 482 + return false; 483 + /* No extra barrier required */ 484 + wake_up_var(var); 485 + return true; 486 + } 487 + 488 + /** 489 + * store_release_wake_up - update a variable and send a wake_up 490 + * @var: the address of the variable to be updated and woken 491 + * @val: the value to store in the variable. 492 + * 493 + * Store the given value in the variable and send a wake up to any tasks 494 + * waiting on the variable. All necessary barriers are included to ensure 495 + * the task calling wait_var_event() sees the new value and all values 496 + * written to memory before this call. 497 + */ 498 + #define store_release_wake_up(var, val) \ 499 + do { \ 500 + smp_store_release(var, val); \ 501 + smp_mb(); \ 502 + wake_up_var(var); \ 503 + } while (0) 450 504 451 505 #endif /* _LINUX_WAIT_BIT_H */

+22 -5

kernel/Kconfig.preempt

··· 11 11 select PREEMPTION 12 12 select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK 13 13 14 + config ARCH_HAS_PREEMPT_LAZY 15 + bool 16 + 14 17 choice 15 18 prompt "Preemption Model" 16 19 default PREEMPT_NONE 17 20 18 21 config PREEMPT_NONE 19 22 bool "No Forced Preemption (Server)" 23 + depends on !PREEMPT_RT 20 24 select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC 21 25 help 22 26 This is the traditional Linux preemption model, geared towards ··· 36 32 config PREEMPT_VOLUNTARY 37 33 bool "Voluntary Kernel Preemption (Desktop)" 38 34 depends on !ARCH_NO_PREEMPT 35 + depends on !PREEMPT_RT 39 36 select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC 40 37 help 41 38 This option reduces the latency of the kernel by adding more ··· 56 51 config PREEMPT 57 52 bool "Preemptible Kernel (Low-Latency Desktop)" 58 53 depends on !ARCH_NO_PREEMPT 59 - select PREEMPT_BUILD 54 + select PREEMPT_BUILD if !PREEMPT_DYNAMIC 60 55 help 61 56 This option reduces the latency of the kernel by making 62 57 all kernel code (that is not executing in a critical section) ··· 72 67 embedded system with latency requirements in the milliseconds 73 68 range. 74 69 70 + config PREEMPT_LAZY 71 + bool "Scheduler controlled preemption model" 72 + depends on !ARCH_NO_PREEMPT 73 + depends on ARCH_HAS_PREEMPT_LAZY 74 + select PREEMPT_BUILD if !PREEMPT_DYNAMIC 75 + help 76 + This option provides a scheduler driven preemption model that 77 + is fundamentally similar to full preemption, but is less 78 + eager to preempt SCHED_NORMAL tasks in an attempt to 79 + reduce lock holder preemption and recover some of the performance 80 + gains seen from using Voluntary preemption. 
81 + 82 + endchoice 83 + 75 84 config PREEMPT_RT 76 85 bool "Fully Preemptible Kernel (Real-Time)" 77 - depends on EXPERT && ARCH_SUPPORTS_RT 86 + depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST 78 87 select PREEMPTION 79 88 help 80 89 This option turns the kernel into a real-time kernel by replacing ··· 103 84 Select this if you are building a kernel for systems which 104 85 require real-time guarantees. 105 86 106 - endchoice 107 - 108 87 config PREEMPT_COUNT 109 88 bool 110 89 ··· 112 95 113 96 config PREEMPT_DYNAMIC 114 97 bool "Preemption behaviour defined on boot" 115 - depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT 98 + depends on HAVE_PREEMPT_DYNAMIC 116 99 select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY 117 100 select PREEMPT_BUILD 118 101 default y if HAVE_PREEMPT_DYNAMIC_CALL

+1 -1

kernel/entry/common.c

··· 98 98 99 99 local_irq_enable_exit_to_user(ti_work); 100 100 101 - if (ti_work & _TIF_NEED_RESCHED) 101 + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) 102 102 schedule(); 103 103 104 104 if (ti_work & _TIF_UPROBE)

+2 -2

kernel/entry/kvm.c

··· 13 13 return -EINTR; 14 14 } 15 15 16 - if (ti_work & _TIF_NEED_RESCHED) 16 + if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) 17 17 schedule(); 18 18 19 19 if (ti_work & _TIF_NOTIFY_RESUME) ··· 24 24 return ret; 25 25 26 26 ti_work = read_thread_flags(); 27 - } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); 27 + } while (ti_work & XFER_TO_GUEST_MODE_WORK); 28 28 return 0; 29 29 } 30 30

+1 -1

kernel/fork.c

··· 1299 1299 if (init_new_context(p, mm)) 1300 1300 goto fail_nocontext; 1301 1301 1302 - if (mm_alloc_cid(mm)) 1302 + if (mm_alloc_cid(mm, p)) 1303 1303 goto fail_cid; 1304 1304 1305 1305 if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,

+5 -1

kernel/futex/pi.c

··· 922 922 struct rt_mutex_waiter rt_waiter; 923 923 struct futex_hash_bucket *hb; 924 924 struct futex_q q = futex_q_init; 925 + DEFINE_WAKE_Q(wake_q); 925 926 int res, ret; 926 927 927 928 if (!IS_ENABLED(CONFIG_FUTEX_PI)) ··· 1019 1018 * such that futex_unlock_pi() is guaranteed to observe the waiter when 1020 1019 * it sees the futex_q::pi_state. 1021 1020 */ 1022 - ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); 1021 + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q); 1022 + preempt_disable(); 1023 1023 raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); 1024 + wake_up_q(&wake_q); 1025 + preempt_enable(); 1024 1026 1025 1027 if (ret) { 1026 1028 if (ret == 1)

+22 -37

kernel/locking/mutex.c

··· 56 56 } 57 57 EXPORT_SYMBOL(__mutex_init); 58 58 59 - /* 60 - * @owner: contains: 'struct task_struct *' to the current lock owner, 61 - * NULL means not owned. Since task_struct pointers are aligned at 62 - * at least L1_CACHE_BYTES, we have low bits to store extra state. 63 - * 64 - * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. 65 - * Bit1 indicates unlock needs to hand the lock to the top-waiter 66 - * Bit2 indicates handoff has been done and we're waiting for pickup. 67 - */ 68 - #define MUTEX_FLAG_WAITERS 0x01 69 - #define MUTEX_FLAG_HANDOFF 0x02 70 - #define MUTEX_FLAG_PICKUP 0x04 71 - 72 - #define MUTEX_FLAGS 0x07 73 - 74 - /* 75 - * Internal helper function; C doesn't allow us to hide it :/ 76 - * 77 - * DO NOT USE (outside of mutex code). 78 - */ 79 - static inline struct task_struct *__mutex_owner(struct mutex *lock) 80 - { 81 - return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); 82 - } 83 - 84 59 static inline struct task_struct *__owner_task(unsigned long owner) 85 60 { 86 61 return (struct task_struct *)(owner & ~MUTEX_FLAGS); ··· 550 575 struct lockdep_map *nest_lock, unsigned long ip, 551 576 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) 552 577 { 578 + DEFINE_WAKE_Q(wake_q); 553 579 struct mutex_waiter waiter; 554 580 struct ww_mutex *ww; 581 + unsigned long flags; 555 582 int ret; 556 583 557 584 if (!use_ww_ctx) ··· 596 619 return 0; 597 620 } 598 621 599 - raw_spin_lock(&lock->wait_lock); 622 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 600 623 /* 601 624 * After waiting to acquire the wait_lock, try again. 602 625 */ 603 626 if (__mutex_trylock(lock)) { 604 627 if (ww_ctx) 605 - __ww_mutex_check_waiters(lock, ww_ctx); 628 + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); 606 629 607 630 goto skip_wait; 608 631 } ··· 622 645 * Add in stamp order, waking up waiters that must kill 623 646 * themselves. 
624 647 */ 625 - ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx); 648 + ret = __ww_mutex_add_waiter(&waiter, lock, ww_ctx, &wake_q); 626 649 if (ret) 627 650 goto err_early_kill; 628 651 } ··· 657 680 goto err; 658 681 } 659 682 660 - raw_spin_unlock(&lock->wait_lock); 683 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 684 + /* Make sure we do wakeups before calling schedule */ 685 + wake_up_q(&wake_q); 686 + wake_q_init(&wake_q); 687 + 661 688 schedule_preempt_disabled(); 662 689 663 690 first = __mutex_waiter_is_first(lock, &waiter); ··· 682 701 trace_contention_begin(lock, LCB_F_MUTEX); 683 702 } 684 703 685 - raw_spin_lock(&lock->wait_lock); 704 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 686 705 } 687 - raw_spin_lock(&lock->wait_lock); 706 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 688 707 acquired: 689 708 __set_current_state(TASK_RUNNING); 690 709 ··· 695 714 */ 696 715 if (!ww_ctx->is_wait_die && 697 716 !__mutex_waiter_is_first(lock, &waiter)) 698 - __ww_mutex_check_waiters(lock, ww_ctx); 717 + __ww_mutex_check_waiters(lock, ww_ctx, &wake_q); 699 718 } 700 719 701 720 __mutex_remove_waiter(lock, &waiter); ··· 710 729 if (ww_ctx) 711 730 ww_mutex_lock_acquired(ww, ww_ctx); 712 731 713 - raw_spin_unlock(&lock->wait_lock); 732 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 733 + wake_up_q(&wake_q); 714 734 preempt_enable(); 715 735 return 0; 716 736 ··· 720 738 __mutex_remove_waiter(lock, &waiter); 721 739 err_early_kill: 722 740 trace_contention_end(lock, ret); 723 - raw_spin_unlock(&lock->wait_lock); 741 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 724 742 debug_mutex_free_waiter(&waiter); 725 743 mutex_release(&lock->dep_map, ip); 744 + wake_up_q(&wake_q); 726 745 preempt_enable(); 727 746 return ret; 728 747 } ··· 891 908 struct task_struct *next = NULL; 892 909 DEFINE_WAKE_Q(wake_q); 893 910 unsigned long owner; 911 + unsigned long flags; 894 912 895 913 mutex_release(&lock->dep_map, ip); 896 914 ··· 918 934 } 
919 935 } 920 936 921 - raw_spin_lock(&lock->wait_lock); 937 + raw_spin_lock_irqsave(&lock->wait_lock, flags); 922 938 debug_mutex_unlock(lock); 923 939 if (!list_empty(&lock->wait_list)) { 924 940 /* get the first entry from the wait-list: */ ··· 935 951 if (owner & MUTEX_FLAG_HANDOFF) 936 952 __mutex_handoff(lock, next); 937 953 938 - raw_spin_unlock(&lock->wait_lock); 939 - 954 + preempt_disable(); 955 + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 940 956 wake_up_q(&wake_q); 957 + preempt_enable(); 941 958 } 942 959 943 960 #ifndef CONFIG_DEBUG_LOCK_ALLOC

+27

kernel/locking/mutex.h

··· 20 20 #endif 21 21 }; 22 22 23 + /* 24 + * @owner: contains: 'struct task_struct *' to the current lock owner, 25 + * NULL means not owned. Since task_struct pointers are aligned at 26 + * at least L1_CACHE_BYTES, we have low bits to store extra state. 27 + * 28 + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. 29 + * Bit1 indicates unlock needs to hand the lock to the top-waiter 30 + * Bit2 indicates handoff has been done and we're waiting for pickup. 31 + */ 32 + #define MUTEX_FLAG_WAITERS 0x01 33 + #define MUTEX_FLAG_HANDOFF 0x02 34 + #define MUTEX_FLAG_PICKUP 0x04 35 + 36 + #define MUTEX_FLAGS 0x07 37 + 38 + /* 39 + * Internal helper function; C doesn't allow us to hide it :/ 40 + * 41 + * DO NOT USE (outside of mutex & scheduler code). 42 + */ 43 + static inline struct task_struct *__mutex_owner(struct mutex *lock) 44 + { 45 + if (!lock) 46 + return NULL; 47 + return (struct task_struct *)(atomic_long_read(&lock->owner) & ~MUTEX_FLAGS); 48 + } 49 + 23 50 #ifdef CONFIG_DEBUG_MUTEXES 24 51 extern void debug_mutex_lock_common(struct mutex *lock, 25 52 struct mutex_waiter *waiter);

+37 -14

kernel/locking/rtmutex.c

··· 34 34 35 35 static inline int __ww_mutex_add_waiter(struct rt_mutex_waiter *waiter, 36 36 struct rt_mutex *lock, 37 - struct ww_acquire_ctx *ww_ctx) 37 + struct ww_acquire_ctx *ww_ctx, 38 + struct wake_q_head *wake_q) 38 39 { 39 40 return 0; 40 41 } 41 42 42 43 static inline void __ww_mutex_check_waiters(struct rt_mutex *lock, 43 - struct ww_acquire_ctx *ww_ctx) 44 + struct ww_acquire_ctx *ww_ctx, 45 + struct wake_q_head *wake_q) 44 46 { 45 47 } 46 48 ··· 1203 1201 struct rt_mutex_waiter *waiter, 1204 1202 struct task_struct *task, 1205 1203 struct ww_acquire_ctx *ww_ctx, 1206 - enum rtmutex_chainwalk chwalk) 1204 + enum rtmutex_chainwalk chwalk, 1205 + struct wake_q_head *wake_q) 1207 1206 { 1208 1207 struct task_struct *owner = rt_mutex_owner(lock); 1209 1208 struct rt_mutex_waiter *top_waiter = waiter; ··· 1248 1245 1249 1246 /* Check whether the waiter should back out immediately */ 1250 1247 rtm = container_of(lock, struct rt_mutex, rtmutex); 1251 - res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx); 1248 + preempt_disable(); 1249 + res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); 1250 + wake_up_q(wake_q); 1251 + preempt_enable(); 1252 1252 if (res) { 1253 1253 raw_spin_lock(&task->pi_lock); 1254 1254 rt_mutex_dequeue(lock, waiter); ··· 1681 1675 * @state: The task state for sleeping 1682 1676 * @chwalk: Indicator whether full or partial chainwalk is requested 1683 1677 * @waiter: Initializer waiter for blocking 1678 + * @wake_q: The wake_q to wake tasks after we release the wait_lock 1684 1679 */ 1685 1680 static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, 1686 1681 struct ww_acquire_ctx *ww_ctx, 1687 1682 unsigned int state, 1688 1683 enum rtmutex_chainwalk chwalk, 1689 - struct rt_mutex_waiter *waiter) 1684 + struct rt_mutex_waiter *waiter, 1685 + struct wake_q_head *wake_q) 1690 1686 { 1691 1687 struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); 1692 1688 struct ww_mutex *ww = ww_container_of(rtm); ··· 1699 1691 
/* Try to acquire the lock again: */ 1700 1692 if (try_to_take_rt_mutex(lock, current, NULL)) { 1701 1693 if (build_ww_mutex() && ww_ctx) { 1702 - __ww_mutex_check_waiters(rtm, ww_ctx); 1694 + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); 1703 1695 ww_mutex_lock_acquired(ww, ww_ctx); 1704 1696 } 1705 1697 return 0; ··· 1709 1701 1710 1702 trace_contention_begin(lock, LCB_F_RT); 1711 1703 1712 - ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk); 1704 + ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); 1713 1705 if (likely(!ret)) 1714 1706 ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); 1715 1707 ··· 1717 1709 /* acquired the lock */ 1718 1710 if (build_ww_mutex() && ww_ctx) { 1719 1711 if (!ww_ctx->is_wait_die) 1720 - __ww_mutex_check_waiters(rtm, ww_ctx); 1712 + __ww_mutex_check_waiters(rtm, ww_ctx, wake_q); 1721 1713 ww_mutex_lock_acquired(ww, ww_ctx); 1722 1714 } 1723 1715 } else { ··· 1739 1731 1740 1732 static inline int __rt_mutex_slowlock_locked(struct rt_mutex_base *lock, 1741 1733 struct ww_acquire_ctx *ww_ctx, 1742 - unsigned int state) 1734 + unsigned int state, 1735 + struct wake_q_head *wake_q) 1743 1736 { 1744 1737 struct rt_mutex_waiter waiter; 1745 1738 int ret; ··· 1749 1740 waiter.ww_ctx = ww_ctx; 1750 1741 1751 1742 ret = __rt_mutex_slowlock(lock, ww_ctx, state, RT_MUTEX_MIN_CHAINWALK, 1752 - &waiter); 1743 + &waiter, wake_q); 1753 1744 1754 1745 debug_rt_mutex_free_waiter(&waiter); 1755 1746 return ret; ··· 1765 1756 struct ww_acquire_ctx *ww_ctx, 1766 1757 unsigned int state) 1767 1758 { 1759 + DEFINE_WAKE_Q(wake_q); 1768 1760 unsigned long flags; 1769 1761 int ret; 1770 1762 ··· 1787 1777 * irqsave/restore variants. 
1788 1778 */ 1789 1779 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1790 - ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state); 1780 + ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q); 1781 + preempt_disable(); 1791 1782 raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1783 + wake_up_q(&wake_q); 1784 + preempt_enable(); 1792 1785 rt_mutex_post_schedule(); 1793 1786 1794 1787 return ret; ··· 1817 1804 /** 1818 1805 * rtlock_slowlock_locked - Slow path lock acquisition for RT locks 1819 1806 * @lock: The underlying RT mutex 1807 + * @wake_q: The wake_q to wake tasks after we release the wait_lock 1820 1808 */ 1821 - static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock) 1809 + static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock, 1810 + struct wake_q_head *wake_q) 1822 1811 __releases(&lock->wait_lock) __acquires(&lock->wait_lock) 1823 1812 { 1824 1813 struct rt_mutex_waiter waiter; ··· 1838 1823 1839 1824 trace_contention_begin(lock, LCB_F_RT); 1840 1825 1841 - task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK); 1826 + task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK, wake_q); 1842 1827 1843 1828 for (;;) { 1844 1829 /* Try to acquire the lock again */ ··· 1849 1834 owner = rt_mutex_owner(lock); 1850 1835 else 1851 1836 owner = NULL; 1837 + preempt_disable(); 1852 1838 raw_spin_unlock_irq(&lock->wait_lock); 1839 + wake_up_q(wake_q); 1840 + wake_q_init(wake_q); 1841 + preempt_enable(); 1853 1842 1854 1843 if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner)) 1855 1844 schedule_rtlock(); ··· 1878 1859 static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock) 1879 1860 { 1880 1861 unsigned long flags; 1862 + DEFINE_WAKE_Q(wake_q); 1881 1863 1882 1864 raw_spin_lock_irqsave(&lock->wait_lock, flags); 1883 - rtlock_slowlock_locked(lock); 1865 + rtlock_slowlock_locked(lock, &wake_q); 1866 + preempt_disable(); 1884 1867 
raw_spin_unlock_irqrestore(&lock->wait_lock, flags); 1868 + wake_up_q(&wake_q); 1869 + preempt_enable(); 1885 1870 } 1886 1871 1887 1872 #endif /* RT_MUTEX_BUILD_SPINLOCKS */

+9 -3

kernel/locking/rtmutex_api.c

··· 275 275 * @lock: the rt_mutex to take 276 276 * @waiter: the pre-initialized rt_mutex_waiter 277 277 * @task: the task to prepare 278 + * @wake_q: the wake_q to wake tasks after we release the wait_lock 278 279 * 279 280 * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock 280 281 * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. ··· 292 291 */ 293 292 int __sched __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, 294 293 struct rt_mutex_waiter *waiter, 295 - struct task_struct *task) 294 + struct task_struct *task, 295 + struct wake_q_head *wake_q) 296 296 { 297 297 int ret; 298 298 ··· 304 302 305 303 /* We enforce deadlock detection for futexes */ 306 304 ret = task_blocks_on_rt_mutex(lock, waiter, task, NULL, 307 - RT_MUTEX_FULL_CHAINWALK); 305 + RT_MUTEX_FULL_CHAINWALK, wake_q); 308 306 309 307 if (ret && !rt_mutex_owner(lock)) { 310 308 /* ··· 343 341 struct task_struct *task) 344 342 { 345 343 int ret; 344 + DEFINE_WAKE_Q(wake_q); 346 345 347 346 raw_spin_lock_irq(&lock->wait_lock); 348 - ret = __rt_mutex_start_proxy_lock(lock, waiter, task); 347 + ret = __rt_mutex_start_proxy_lock(lock, waiter, task, &wake_q); 349 348 if (unlikely(ret)) 350 349 remove_waiter(lock, waiter); 350 + preempt_disable(); 351 351 raw_spin_unlock_irq(&lock->wait_lock); 352 + wake_up_q(&wake_q); 353 + preempt_enable(); 352 354 353 355 return ret; 354 356 }

+2 -1

kernel/locking/rtmutex_common.h

··· 83 83 extern void rt_mutex_proxy_unlock(struct rt_mutex_base *lock); 84 84 extern int __rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, 85 85 struct rt_mutex_waiter *waiter, 86 - struct task_struct *task); 86 + struct task_struct *task, 87 + struct wake_q_head *); 87 88 extern int rt_mutex_start_proxy_lock(struct rt_mutex_base *lock, 88 89 struct rt_mutex_waiter *waiter, 89 90 struct task_struct *task);

+7 -1

kernel/locking/rwbase_rt.c

··· 69 69 unsigned int state) 70 70 { 71 71 struct rt_mutex_base *rtm = &rwb->rtmutex; 72 + DEFINE_WAKE_Q(wake_q); 72 73 int ret; 73 74 74 75 rwbase_pre_schedule(); ··· 111 110 * For rwlocks this returns 0 unconditionally, so the below 112 111 * !ret conditionals are optimized out. 113 112 */ 114 - ret = rwbase_rtmutex_slowlock_locked(rtm, state); 113 + ret = rwbase_rtmutex_slowlock_locked(rtm, state, &wake_q); 115 114 116 115 /* 117 116 * On success the rtmutex is held, so there can't be a writer ··· 122 121 */ 123 122 if (!ret) 124 123 atomic_inc(&rwb->readers); 124 + 125 + preempt_disable(); 125 126 raw_spin_unlock_irq(&rtm->wait_lock); 127 + wake_up_q(&wake_q); 128 + preempt_enable(); 129 + 126 130 if (!ret) 127 131 rwbase_rtmutex_unlock(rtm); 128 132

+2 -2

kernel/locking/rwsem.c

··· 1413 1413 #define rwbase_rtmutex_lock_state(rtm, state) \ 1414 1414 __rt_mutex_lock(rtm, state) 1415 1415 1416 - #define rwbase_rtmutex_slowlock_locked(rtm, state) \ 1417 - __rt_mutex_slowlock_locked(rtm, NULL, state) 1416 + #define rwbase_rtmutex_slowlock_locked(rtm, state, wq) \ 1417 + __rt_mutex_slowlock_locked(rtm, NULL, state, wq) 1418 1418 1419 1419 #define rwbase_rtmutex_unlock(rtm) \ 1420 1420 __rt_mutex_unlock(rtm)

+3 -2

kernel/locking/spinlock_rt.c

··· 162 162 } 163 163 164 164 static __always_inline int 165 - rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state) 165 + rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state, 166 + struct wake_q_head *wake_q) 166 167 { 167 - rtlock_slowlock_locked(rtm); 168 + rtlock_slowlock_locked(rtm, wake_q); 168 169 return 0; 169 170 } 170 171

+30 -21

kernel/locking/ww_mutex.h

··· 70 70 return atomic_long_read(&lock->owner) & MUTEX_FLAG_WAITERS; 71 71 } 72 72 73 - static inline void lock_wait_lock(struct mutex *lock) 73 + static inline void lock_wait_lock(struct mutex *lock, unsigned long *flags) 74 74 { 75 - raw_spin_lock(&lock->wait_lock); 75 + raw_spin_lock_irqsave(&lock->wait_lock, *flags); 76 76 } 77 77 78 - static inline void unlock_wait_lock(struct mutex *lock) 78 + static inline void unlock_wait_lock(struct mutex *lock, unsigned long *flags) 79 79 { 80 - raw_spin_unlock(&lock->wait_lock); 80 + raw_spin_unlock_irqrestore(&lock->wait_lock, *flags); 81 81 } 82 82 83 83 static inline void lockdep_assert_wait_lock_held(struct mutex *lock) ··· 144 144 return rt_mutex_has_waiters(&lock->rtmutex); 145 145 } 146 146 147 - static inline void lock_wait_lock(struct rt_mutex *lock) 147 + static inline void lock_wait_lock(struct rt_mutex *lock, unsigned long *flags) 148 148 { 149 - raw_spin_lock(&lock->rtmutex.wait_lock); 149 + raw_spin_lock_irqsave(&lock->rtmutex.wait_lock, *flags); 150 150 } 151 151 152 - static inline void unlock_wait_lock(struct rt_mutex *lock) 152 + static inline void unlock_wait_lock(struct rt_mutex *lock, unsigned long *flags) 153 153 { 154 - raw_spin_unlock(&lock->rtmutex.wait_lock); 154 + raw_spin_unlock_irqrestore(&lock->rtmutex.wait_lock, *flags); 155 155 } 156 156 157 157 static inline void lockdep_assert_wait_lock_held(struct rt_mutex *lock) ··· 275 275 */ 276 276 static bool 277 277 __ww_mutex_die(struct MUTEX *lock, struct MUTEX_WAITER *waiter, 278 - struct ww_acquire_ctx *ww_ctx) 278 + struct ww_acquire_ctx *ww_ctx, struct wake_q_head *wake_q) 279 279 { 280 280 if (!ww_ctx->is_wait_die) 281 281 return false; ··· 284 284 #ifndef WW_RT 285 285 debug_mutex_wake_waiter(lock, waiter); 286 286 #endif 287 - wake_up_process(waiter->task); 287 + wake_q_add(wake_q, waiter->task); 288 288 } 289 289 290 290 return true; ··· 299 299 */ 300 300 static bool __ww_mutex_wound(struct MUTEX *lock, 301 301 struct ww_acquire_ctx 
*ww_ctx, 302 - struct ww_acquire_ctx *hold_ctx) 302 + struct ww_acquire_ctx *hold_ctx, 303 + struct wake_q_head *wake_q) 303 304 { 304 305 struct task_struct *owner = __ww_mutex_owner(lock); 305 306 ··· 332 331 * wakeup pending to re-read the wounded state. 333 332 */ 334 333 if (owner != current) 335 - wake_up_process(owner); 334 + wake_q_add(wake_q, owner); 336 335 337 336 return true; 338 337 } ··· 353 352 * The current task must not be on the wait list. 354 353 */ 355 354 static void 356 - __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx) 355 + __ww_mutex_check_waiters(struct MUTEX *lock, struct ww_acquire_ctx *ww_ctx, 356 + struct wake_q_head *wake_q) 357 357 { 358 358 struct MUTEX_WAITER *cur; 359 359 ··· 366 364 if (!cur->ww_ctx) 367 365 continue; 368 366 369 - if (__ww_mutex_die(lock, cur, ww_ctx) || 370 - __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx)) 367 + if (__ww_mutex_die(lock, cur, ww_ctx, wake_q) || 368 + __ww_mutex_wound(lock, cur->ww_ctx, ww_ctx, wake_q)) 371 369 break; 372 370 } 373 371 } ··· 379 377 static __always_inline void 380 378 ww_mutex_set_context_fastpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 381 379 { 380 + DEFINE_WAKE_Q(wake_q); 381 + unsigned long flags; 382 + 382 383 ww_mutex_lock_acquired(lock, ctx); 383 384 384 385 /* ··· 409 404 * Uh oh, we raced in fastpath, check if any of the waiters need to 410 405 * die or wound us. 
411 406 */ 412 - lock_wait_lock(&lock->base); 413 - __ww_mutex_check_waiters(&lock->base, ctx); 414 - unlock_wait_lock(&lock->base); 407 + lock_wait_lock(&lock->base, &flags); 408 + __ww_mutex_check_waiters(&lock->base, ctx, &wake_q); 409 + preempt_disable(); 410 + unlock_wait_lock(&lock->base, &flags); 411 + wake_up_q(&wake_q); 412 + preempt_enable(); 415 413 } 416 414 417 415 static __always_inline int ··· 496 488 static inline int 497 489 __ww_mutex_add_waiter(struct MUTEX_WAITER *waiter, 498 490 struct MUTEX *lock, 499 - struct ww_acquire_ctx *ww_ctx) 491 + struct ww_acquire_ctx *ww_ctx, 492 + struct wake_q_head *wake_q) 500 493 { 501 494 struct MUTEX_WAITER *cur, *pos = NULL; 502 495 bool is_wait_die; ··· 541 532 pos = cur; 542 533 543 534 /* Wait-Die: ensure younger waiters die. */ 544 - __ww_mutex_die(lock, cur, ww_ctx); 535 + __ww_mutex_die(lock, cur, ww_ctx, wake_q); 545 536 } 546 537 547 538 __ww_waiter_add(lock, waiter, pos); ··· 559 550 * such that either we or the fastpath will wound @ww->ctx. 560 551 */ 561 552 smp_mb(); 562 - __ww_mutex_wound(lock, ww_ctx, ww->ctx); 553 + __ww_mutex_wound(lock, ww_ctx, ww->ctx, wake_q); 563 554 } 564 555 565 556 return 0;

+197 -92

kernel/sched/core.c

··· 832 832 833 833 rq_lock(rq, &rf); 834 834 update_rq_clock(rq); 835 - rq->curr->sched_class->task_tick(rq, rq->curr, 1); 835 + rq->donor->sched_class->task_tick(rq, rq->curr, 1); 836 836 rq_unlock(rq, &rf); 837 837 838 838 return HRTIMER_NORESTART; ··· 941 941 * this avoids any races wrt polling state changes and thereby avoids 942 942 * spurious IPIs. 943 943 */ 944 - static inline bool set_nr_and_not_polling(struct task_struct *p) 944 + static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) 945 945 { 946 - struct thread_info *ti = task_thread_info(p); 947 - return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); 946 + return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); 948 947 } 949 948 950 949 /* ··· 968 969 } 969 970 970 971 #else 971 - static inline bool set_nr_and_not_polling(struct task_struct *p) 972 + static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) 972 973 { 973 - set_tsk_need_resched(p); 974 + set_ti_thread_flag(ti, tif); 974 975 return true; 975 976 } 976 977 ··· 1075 1076 * might also involve a cross-CPU call to trigger the scheduler on 1076 1077 * the target CPU. 1077 1078 */ 1078 - void resched_curr(struct rq *rq) 1079 + static void __resched_curr(struct rq *rq, int tif) 1079 1080 { 1080 1081 struct task_struct *curr = rq->curr; 1082 + struct thread_info *cti = task_thread_info(curr); 1081 1083 int cpu; 1082 1084 1083 1085 lockdep_assert_rq_held(rq); 1084 1086 1085 - if (test_tsk_need_resched(curr)) 1087 + /* 1088 + * Always immediately preempt the idle task; no point in delaying doing 1089 + * actual work. 
1090 + */ 1091 + if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) 1092 + tif = TIF_NEED_RESCHED; 1093 + 1094 + if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) 1086 1095 return; 1087 1096 1088 1097 cpu = cpu_of(rq); 1089 1098 1090 1099 if (cpu == smp_processor_id()) { 1091 - set_tsk_need_resched(curr); 1092 - set_preempt_need_resched(); 1100 + set_ti_thread_flag(cti, tif); 1101 + if (tif == TIF_NEED_RESCHED) 1102 + set_preempt_need_resched(); 1093 1103 return; 1094 1104 } 1095 1105 1096 - if (set_nr_and_not_polling(curr)) 1097 - smp_send_reschedule(cpu); 1098 - else 1106 + if (set_nr_and_not_polling(cti, tif)) { 1107 + if (tif == TIF_NEED_RESCHED) 1108 + smp_send_reschedule(cpu); 1109 + } else { 1099 1110 trace_sched_wake_idle_without_ipi(cpu); 1111 + } 1112 + } 1113 + 1114 + void resched_curr(struct rq *rq) 1115 + { 1116 + __resched_curr(rq, TIF_NEED_RESCHED); 1117 + } 1118 + 1119 + #ifdef CONFIG_PREEMPT_DYNAMIC 1120 + static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); 1121 + static __always_inline bool dynamic_preempt_lazy(void) 1122 + { 1123 + return static_branch_unlikely(&sk_dynamic_preempt_lazy); 1124 + } 1125 + #else 1126 + static __always_inline bool dynamic_preempt_lazy(void) 1127 + { 1128 + return IS_ENABLED(CONFIG_PREEMPT_LAZY); 1129 + } 1130 + #endif 1131 + 1132 + static __always_inline int get_lazy_tif_bit(void) 1133 + { 1134 + if (dynamic_preempt_lazy()) 1135 + return TIF_NEED_RESCHED_LAZY; 1136 + 1137 + return TIF_NEED_RESCHED; 1138 + } 1139 + 1140 + void resched_curr_lazy(struct rq *rq) 1141 + { 1142 + __resched_curr(rq, get_lazy_tif_bit()); 1100 1143 } 1101 1144 1102 1145 void resched_cpu(int cpu) ··· 1233 1192 * and testing of the above solutions didn't appear to report 1234 1193 * much benefits. 
1235 1194 */ 1236 - if (set_nr_and_not_polling(rq->idle)) 1195 + if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) 1237 1196 smp_send_reschedule(cpu); 1238 1197 else 1239 1198 trace_sched_wake_idle_without_ipi(cpu); ··· 1440 1399 * requests are serialized using a mutex to reduce the risk of conflicting 1441 1400 * updates or API abuses. 1442 1401 */ 1443 - static DEFINE_MUTEX(uclamp_mutex); 1402 + static __maybe_unused DEFINE_MUTEX(uclamp_mutex); 1444 1403 1445 1404 /* Max allowed minimum utilization */ 1446 1405 static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; ··· 2065 2024 */ 2066 2025 uclamp_rq_inc(rq, p); 2067 2026 2068 - if (!(flags & ENQUEUE_RESTORE)) { 2027 + psi_enqueue(p, flags); 2028 + 2029 + if (!(flags & ENQUEUE_RESTORE)) 2069 2030 sched_info_enqueue(rq, p); 2070 - psi_enqueue(p, flags & ENQUEUE_MIGRATED); 2071 - } 2072 2031 2073 2032 if (sched_core_enabled(rq)) 2074 2033 sched_core_enqueue(rq, p); ··· 2085 2044 if (!(flags & DEQUEUE_NOCLOCK)) 2086 2045 update_rq_clock(rq); 2087 2046 2088 - if (!(flags & DEQUEUE_SAVE)) { 2047 + if (!(flags & DEQUEUE_SAVE)) 2089 2048 sched_info_dequeue(rq, p); 2090 - psi_dequeue(p, !(flags & DEQUEUE_SLEEP)); 2091 - } 2049 + 2050 + psi_dequeue(p, flags); 2092 2051 2093 2052 /* 2094 2053 * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' ··· 2176 2135 2177 2136 void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) 2178 2137 { 2179 - if (p->sched_class == rq->curr->sched_class) 2180 - rq->curr->sched_class->wakeup_preempt(rq, p, flags); 2181 - else if (sched_class_above(p->sched_class, rq->curr->sched_class)) 2138 + struct task_struct *donor = rq->donor; 2139 + 2140 + if (p->sched_class == donor->sched_class) 2141 + donor->sched_class->wakeup_preempt(rq, p, flags); 2142 + else if (sched_class_above(p->sched_class, donor->sched_class)) 2182 2143 resched_curr(rq); 2183 2144 2184 2145 /* 2185 2146 * A queue event has occurred, and 
we're going to schedule. In 2186 2147 * this case, we can save a useless back to back clock update. 2187 2148 */ 2188 - if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) 2149 + if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr)) 2189 2150 rq_clock_skip_update(rq); 2190 2151 } 2191 2152 ··· 2663 2620 2664 2621 // XXX validate p is still the highest prio task 2665 2622 if (task_rq(p) == rq) { 2666 - deactivate_task(rq, p, 0); 2667 - set_task_cpu(p, lowest_rq->cpu); 2668 - activate_task(lowest_rq, p, 0); 2623 + move_queued_task_locked(rq, lowest_rq, p); 2669 2624 resched_curr(lowest_rq); 2670 2625 } 2671 2626 ··· 2723 2682 lockdep_assert_held(&p->pi_lock); 2724 2683 2725 2684 queued = task_on_rq_queued(p); 2726 - running = task_current(rq, p); 2685 + running = task_current_donor(rq, p); 2727 2686 2728 2687 if (queued) { 2729 2688 /* ··· 2737 2696 put_prev_task(rq, p); 2738 2697 2739 2698 p->sched_class->set_cpus_allowed(p, ctx); 2699 + mm_set_cpus_allowed(p->mm, ctx->new_mask); 2740 2700 2741 2701 if (queued) 2742 2702 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); ··· 3350 3308 rq_pin_lock(src_rq, &srf); 3351 3309 rq_pin_lock(dst_rq, &drf); 3352 3310 3353 - deactivate_task(src_rq, p, 0); 3354 - set_task_cpu(p, cpu); 3355 - activate_task(dst_rq, p, 0); 3311 + move_queued_task_locked(src_rq, dst_rq, p); 3356 3312 wakeup_preempt(dst_rq, p, 0); 3357 3313 3358 3314 rq_unpin_lock(dst_rq, &drf); ··· 4464 4424 * Perform scheduler related setup for a newly forked process p. 4465 4425 * p is forked by current. 4466 4426 * 4467 - * __sched_fork() is basic setup used by init_idle() too: 4427 + * __sched_fork() is basic setup which is also used by sched_init() to 4428 + * initialize the boot CPU's idle task. 4468 4429 */ 4469 4430 static void __sched_fork(unsigned long clone_flags, struct task_struct *p) 4470 4431 { ··· 5558 5517 * project cycles that may never be accounted to this 5559 5518 * thread, breaking clock_gettime(). 
5560 5519 */ 5561 - if (task_current(rq, p) && task_on_rq_queued(p)) { 5520 + if (task_current_donor(rq, p) && task_on_rq_queued(p)) { 5562 5521 prefetch_curr_exec_start(p); 5563 5522 update_rq_clock(rq); 5564 5523 p->sched_class->update_curr(rq); ··· 5626 5585 { 5627 5586 int cpu = smp_processor_id(); 5628 5587 struct rq *rq = cpu_rq(cpu); 5629 - struct task_struct *curr; 5588 + /* accounting goes to the donor task */ 5589 + struct task_struct *donor; 5630 5590 struct rq_flags rf; 5631 5591 unsigned long hw_pressure; 5632 5592 u64 resched_latency; ··· 5638 5596 sched_clock_tick(); 5639 5597 5640 5598 rq_lock(rq, &rf); 5599 + donor = rq->donor; 5641 5600 5642 - curr = rq->curr; 5643 - psi_account_irqtime(rq, curr, NULL); 5601 + psi_account_irqtime(rq, donor, NULL); 5644 5602 5645 5603 update_rq_clock(rq); 5646 5604 hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 5647 5605 update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); 5648 - curr->sched_class->task_tick(rq, curr, 0); 5606 + 5607 + if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) 5608 + resched_curr(rq); 5609 + 5610 + donor->sched_class->task_tick(rq, donor, 0); 5649 5611 if (sched_feat(LATENCY_WARN)) 5650 5612 resched_latency = cpu_resched_latency(rq); 5651 5613 calc_global_load_tick(rq); 5652 5614 sched_core_tick(rq); 5653 - task_tick_mm_cid(rq, curr); 5615 + task_tick_mm_cid(rq, donor); 5654 5616 scx_tick(rq); 5655 5617 5656 5618 rq_unlock(rq, &rf); ··· 5664 5618 5665 5619 perf_event_task_tick(); 5666 5620 5667 - if (curr->flags & PF_WQ_WORKER) 5668 - wq_worker_tick(curr); 5621 + if (donor->flags & PF_WQ_WORKER) 5622 + wq_worker_tick(donor); 5669 5623 5670 5624 #ifdef CONFIG_SMP 5671 5625 if (!scx_switched_all()) { ··· 5732 5686 struct task_struct *curr = rq->curr; 5733 5687 5734 5688 if (cpu_online(cpu)) { 5689 + /* 5690 + * Since this is a remote tick for full dynticks mode, 5691 + * we are always sure that there is no proxy (only a 5692 + * single task is running). 
5693 + */ 5694 + SCHED_WARN_ON(rq->curr != rq->donor); 5735 5695 update_rq_clock(rq); 5736 5696 5737 5697 if (!is_idle_task(curr)) { ··· 6361 6309 if (sched_task_is_throttled(p, this)) 6362 6310 goto next; 6363 6311 6364 - deactivate_task(src, p, 0); 6365 - set_task_cpu(p, this); 6366 - activate_task(dst, p, 0); 6367 - 6312 + move_queued_task_locked(src, dst, p); 6368 6313 resched_curr(dst); 6369 6314 6370 6315 success = true; ··· 6556 6507 #define SM_RTLOCK_WAIT 2 6557 6508 6558 6509 /* 6510 + * Helper function for __schedule() 6511 + * 6512 + * If a task does not have signals pending, deactivate it 6513 + * Otherwise marks the task's __state as RUNNING 6514 + */ 6515 + static bool try_to_block_task(struct rq *rq, struct task_struct *p, 6516 + unsigned long task_state) 6517 + { 6518 + int flags = DEQUEUE_NOCLOCK; 6519 + 6520 + if (signal_pending_state(task_state, p)) { 6521 + WRITE_ONCE(p->__state, TASK_RUNNING); 6522 + return false; 6523 + } 6524 + 6525 + p->sched_contributes_to_load = 6526 + (task_state & TASK_UNINTERRUPTIBLE) && 6527 + !(task_state & TASK_NOLOAD) && 6528 + !(task_state & TASK_FROZEN); 6529 + 6530 + if (unlikely(is_special_task_state(task_state))) 6531 + flags |= DEQUEUE_SPECIAL; 6532 + 6533 + /* 6534 + * __schedule() ttwu() 6535 + * prev_state = prev->state; if (p->on_rq && ...) 6536 + * if (prev_state) goto out; 6537 + * p->on_rq = 0; smp_acquire__after_ctrl_dep(); 6538 + * p->state = TASK_WAKING 6539 + * 6540 + * Where __schedule() and ttwu() have matching control dependencies. 6541 + * 6542 + * After this, schedule() must not care about p->state any more. 6543 + */ 6544 + block_task(rq, p, flags); 6545 + return true; 6546 + } 6547 + 6548 + /* 6559 6549 * __schedule() is the main scheduler function. 
6560 6550 * 6561 6551 * The main means of driving the scheduler and thus entering this function are: ··· 6702 6614 goto picked; 6703 6615 } 6704 6616 } else if (!preempt && prev_state) { 6705 - if (signal_pending_state(prev_state, prev)) { 6706 - WRITE_ONCE(prev->__state, TASK_RUNNING); 6707 - } else { 6708 - int flags = DEQUEUE_NOCLOCK; 6709 - 6710 - prev->sched_contributes_to_load = 6711 - (prev_state & TASK_UNINTERRUPTIBLE) && 6712 - !(prev_state & TASK_NOLOAD) && 6713 - !(prev_state & TASK_FROZEN); 6714 - 6715 - if (unlikely(is_special_task_state(prev_state))) 6716 - flags |= DEQUEUE_SPECIAL; 6717 - 6718 - /* 6719 - * __schedule() ttwu() 6720 - * prev_state = prev->state; if (p->on_rq && ...) 6721 - * if (prev_state) goto out; 6722 - * p->on_rq = 0; smp_acquire__after_ctrl_dep(); 6723 - * p->state = TASK_WAKING 6724 - * 6725 - * Where __schedule() and ttwu() have matching control dependencies. 6726 - * 6727 - * After this, schedule() must not care about p->state any more. 6728 - */ 6729 - block_task(rq, prev, flags); 6730 - block = true; 6731 - } 6617 + block = try_to_block_task(rq, prev, prev_state); 6732 6618 switch_count = &prev->nvcsw; 6733 6619 } 6734 6620 6735 6621 next = pick_next_task(rq, prev, &rf); 6622 + rq_set_donor(rq, next); 6736 6623 picked: 6737 6624 clear_tsk_need_resched(prev); 6738 6625 clear_preempt_need_resched(); ··· 7214 7151 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 7215 7152 7216 7153 queued = task_on_rq_queued(p); 7217 - running = task_current(rq, p); 7154 + running = task_current_donor(rq, p); 7218 7155 if (queued) 7219 7156 dequeue_task(rq, p, queue_flag); 7220 7157 if (running) ··· 7414 7351 * preempt_schedule <- NOP 7415 7352 * preempt_schedule_notrace <- NOP 7416 7353 * irqentry_exit_cond_resched <- NOP 7354 + * dynamic_preempt_lazy <- false 7417 7355 * 7418 7356 * VOLUNTARY: 7419 7357 * cond_resched <- __cond_resched ··· 7422 7358 * preempt_schedule <- NOP 7423 7359 * preempt_schedule_notrace <- NOP 
7424 7360 * irqentry_exit_cond_resched <- NOP 7361 + * dynamic_preempt_lazy <- false 7425 7362 * 7426 7363 * FULL: 7427 7364 * cond_resched <- RET0 ··· 7430 7365 * preempt_schedule <- preempt_schedule 7431 7366 * preempt_schedule_notrace <- preempt_schedule_notrace 7432 7367 * irqentry_exit_cond_resched <- irqentry_exit_cond_resched 7368 + * dynamic_preempt_lazy <- false 7369 + * 7370 + * LAZY: 7371 + * cond_resched <- RET0 7372 + * might_resched <- RET0 7373 + * preempt_schedule <- preempt_schedule 7374 + * preempt_schedule_notrace <- preempt_schedule_notrace 7375 + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched 7376 + * dynamic_preempt_lazy <- true 7433 7377 */ 7434 7378 7435 7379 enum { ··· 7446 7372 preempt_dynamic_none, 7447 7373 preempt_dynamic_voluntary, 7448 7374 preempt_dynamic_full, 7375 + preempt_dynamic_lazy, 7449 7376 }; 7450 7377 7451 7378 int preempt_dynamic_mode = preempt_dynamic_undefined; 7452 7379 7453 7380 int sched_dynamic_mode(const char *str) 7454 7381 { 7382 + #ifndef CONFIG_PREEMPT_RT 7455 7383 if (!strcmp(str, "none")) 7456 7384 return preempt_dynamic_none; 7457 7385 7458 7386 if (!strcmp(str, "voluntary")) 7459 7387 return preempt_dynamic_voluntary; 7388 + #endif 7460 7389 7461 7390 if (!strcmp(str, "full")) 7462 7391 return preempt_dynamic_full; 7463 7392 7393 + #ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY 7394 + if (!strcmp(str, "lazy")) 7395 + return preempt_dynamic_lazy; 7396 + #endif 7397 + 7464 7398 return -EINVAL; 7465 7399 } 7400 + 7401 + #define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) 7402 + #define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) 7466 7403 7467 7404 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 7468 7405 #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) 7469 7406 #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) 7470 7407 #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) 7471 - #define 
preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) 7472 - #define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) 7408 + #define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) 7409 + #define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) 7473 7410 #else 7474 7411 #error "Unsupported PREEMPT_DYNAMIC mechanism" 7475 7412 #endif ··· 7500 7415 preempt_dynamic_enable(preempt_schedule); 7501 7416 preempt_dynamic_enable(preempt_schedule_notrace); 7502 7417 preempt_dynamic_enable(irqentry_exit_cond_resched); 7418 + preempt_dynamic_key_disable(preempt_lazy); 7503 7419 7504 7420 switch (mode) { 7505 7421 case preempt_dynamic_none: ··· 7510 7424 preempt_dynamic_disable(preempt_schedule); 7511 7425 preempt_dynamic_disable(preempt_schedule_notrace); 7512 7426 preempt_dynamic_disable(irqentry_exit_cond_resched); 7427 + preempt_dynamic_key_disable(preempt_lazy); 7513 7428 if (mode != preempt_dynamic_mode) 7514 7429 pr_info("Dynamic Preempt: none\n"); 7515 7430 break; ··· 7522 7435 preempt_dynamic_disable(preempt_schedule); 7523 7436 preempt_dynamic_disable(preempt_schedule_notrace); 7524 7437 preempt_dynamic_disable(irqentry_exit_cond_resched); 7438 + preempt_dynamic_key_disable(preempt_lazy); 7525 7439 if (mode != preempt_dynamic_mode) 7526 7440 pr_info("Dynamic Preempt: voluntary\n"); 7527 7441 break; ··· 7534 7446 preempt_dynamic_enable(preempt_schedule); 7535 7447 preempt_dynamic_enable(preempt_schedule_notrace); 7536 7448 preempt_dynamic_enable(irqentry_exit_cond_resched); 7449 + preempt_dynamic_key_disable(preempt_lazy); 7537 7450 if (mode != preempt_dynamic_mode) 7538 7451 pr_info("Dynamic Preempt: full\n"); 7452 + break; 7453 + 7454 + case preempt_dynamic_lazy: 7455 + if (!klp_override) 7456 + preempt_dynamic_disable(cond_resched); 7457 + preempt_dynamic_disable(might_resched); 7458 + preempt_dynamic_enable(preempt_schedule); 7459 + preempt_dynamic_enable(preempt_schedule_notrace); 7460 + 
preempt_dynamic_enable(irqentry_exit_cond_resched); 7461 + preempt_dynamic_key_enable(preempt_lazy); 7462 + if (mode != preempt_dynamic_mode) 7463 + pr_info("Dynamic Preempt: lazy\n"); 7539 7464 break; 7540 7465 } 7541 7466 ··· 7612 7511 sched_dynamic_update(preempt_dynamic_none); 7613 7512 } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { 7614 7513 sched_dynamic_update(preempt_dynamic_voluntary); 7514 + } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { 7515 + sched_dynamic_update(preempt_dynamic_lazy); 7615 7516 } else { 7616 7517 /* Default static call setting, nothing to do */ 7617 7518 WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); ··· 7634 7531 PREEMPT_MODEL_ACCESSOR(none); 7635 7532 PREEMPT_MODEL_ACCESSOR(voluntary); 7636 7533 PREEMPT_MODEL_ACCESSOR(full); 7534 + PREEMPT_MODEL_ACCESSOR(lazy); 7637 7535 7638 7536 #else /* !CONFIG_PREEMPT_DYNAMIC: */ 7639 7537 ··· 7787 7683 struct rq *rq = cpu_rq(cpu); 7788 7684 unsigned long flags; 7789 7685 7790 - __sched_fork(0, idle); 7791 - 7792 7686 raw_spin_lock_irqsave(&idle->pi_lock, flags); 7793 7687 raw_spin_rq_lock(rq); 7794 7688 ··· 7801 7699 7802 7700 #ifdef CONFIG_SMP 7803 7701 /* 7804 - * It's possible that init_idle() gets called multiple times on a task, 7805 - * in that case do_set_cpus_allowed() will not do the right thing. 7806 - * 7807 - * And since this is boot we can forgo the serialization. 7702 + * No validation and serialization required at boot time and for 7703 + * setting up the idle tasks of not yet online CPUs. 
7808 7704 */ 7809 7705 set_cpus_allowed_common(idle, &ac); 7810 7706 #endif ··· 7821 7721 rcu_read_unlock(); 7822 7722 7823 7723 rq->idle = idle; 7724 + rq_set_donor(rq, idle); 7824 7725 rcu_assign_pointer(rq->curr, idle); 7825 7726 idle->on_rq = TASK_ON_RQ_QUEUED; 7826 7727 #ifdef CONFIG_SMP ··· 7911 7810 7912 7811 rq = task_rq_lock(p, &rf); 7913 7812 queued = task_on_rq_queued(p); 7914 - running = task_current(rq, p); 7813 + running = task_current_donor(rq, p); 7915 7814 7916 7815 if (queued) 7917 7816 dequeue_task(rq, p, DEQUEUE_SAVE); ··· 8647 8546 * but because we are the idle thread, we just pick up running again 8648 8547 * when this runqueue becomes "idle". 8649 8548 */ 8549 + __sched_fork(0, current); 8650 8550 init_idle(current, smp_processor_id()); 8651 8551 8652 8552 calc_load_update = jiffies + LOAD_FREQ; ··· 9062 8960 9063 8961 update_rq_clock(rq); 9064 8962 9065 - running = task_current(rq, tsk); 8963 + running = task_current_donor(rq, tsk); 9066 8964 queued = task_on_rq_queued(tsk); 9067 8965 9068 8966 if (queued) ··· 10355 10253 */ 10356 10254 if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10357 10255 return -1; 10256 + WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); 10358 10257 return src_cid; 10359 10258 } 10360 10259 ··· 10368 10265 { 10369 10266 struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 10370 10267 struct mm_struct *mm = t->mm; 10371 - int src_cid, dst_cid, src_cpu; 10268 + int src_cid, src_cpu; 10269 + bool dst_cid_is_set; 10372 10270 struct rq *src_rq; 10373 10271 10374 10272 lockdep_assert_rq_held(dst_rq); ··· 10386 10282 * allocation closest to 0 in cases where few threads migrate around 10387 10283 * many CPUs. 10388 10284 * 10389 - * If destination cid is already set, we may have to just clear 10390 - * the src cid to ensure compactness in frequent migrations 10391 - * scenarios. 
10285 + * If destination cid or recent cid is already set, we may have 10286 + * to just clear the src cid to ensure compactness in frequent 10287 + * migrations scenarios. 10392 10288 * 10393 10289 * It is not useful to clear the src cid when the number of threads is 10394 10290 * greater or equal to the number of allowed CPUs, because user-space ··· 10396 10292 * allowed CPUs. 10397 10293 */ 10398 10294 dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 10399 - dst_cid = READ_ONCE(dst_pcpu_cid->cid); 10400 - if (!mm_cid_is_unset(dst_cid) && 10401 - atomic_read(&mm->mm_users) >= t->nr_cpus_allowed) 10295 + dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || 10296 + !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); 10297 + if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) 10402 10298 return; 10403 10299 src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 10404 10300 src_rq = cpu_rq(src_cpu); ··· 10409 10305 src_cid); 10410 10306 if (src_cid == -1) 10411 10307 return; 10412 - if (!mm_cid_is_unset(dst_cid)) { 10308 + if (dst_cid_is_set) { 10413 10309 __mm_cid_put(mm, src_cid); 10414 10310 return; 10415 10311 } 10416 10312 /* Move src_cid to dst cpu. */ 10417 10313 mm_cid_snapshot_time(dst_rq, mm); 10418 10314 WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 10315 + WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); 10419 10316 } 10420 10317 10421 10318 static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, ··· 10655 10550 * Matches barrier in sched_mm_cid_remote_clear_old(). 10656 10551 */ 10657 10552 smp_mb(); 10658 - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); 10553 + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); 10659 10554 } 10660 10555 rseq_set_notify_resume(t); 10661 10556 }

+23 -34

kernel/sched/deadline.c

··· 1339 1339 #endif 1340 1340 1341 1341 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 1342 - if (dl_task(rq->curr)) 1342 + if (dl_task(rq->donor)) 1343 1343 wakeup_preempt_dl(rq, p, 0); 1344 1344 else 1345 1345 resched_curr(rq); ··· 1736 1736 */ 1737 1737 static void update_curr_dl(struct rq *rq) 1738 1738 { 1739 - struct task_struct *curr = rq->curr; 1740 - struct sched_dl_entity *dl_se = &curr->dl; 1739 + struct task_struct *donor = rq->donor; 1740 + struct sched_dl_entity *dl_se = &donor->dl; 1741 1741 s64 delta_exec; 1742 1742 1743 - if (!dl_task(curr) || !on_dl_rq(dl_se)) 1743 + if (!dl_task(donor) || !on_dl_rq(dl_se)) 1744 1744 return; 1745 1745 1746 1746 /* ··· 2213 2213 static int 2214 2214 select_task_rq_dl(struct task_struct *p, int cpu, int flags) 2215 2215 { 2216 - struct task_struct *curr; 2216 + struct task_struct *curr, *donor; 2217 2217 bool select_rq; 2218 2218 struct rq *rq; 2219 2219 ··· 2224 2224 2225 2225 rcu_read_lock(); 2226 2226 curr = READ_ONCE(rq->curr); /* unlocked access */ 2227 + donor = READ_ONCE(rq->donor); 2227 2228 2228 2229 /* 2229 2230 * If we are dealing with a -deadline task, we must ··· 2235 2234 * other hand, if it has a shorter deadline, we 2236 2235 * try to make it stay here, it might be important. 2237 2236 */ 2238 - select_rq = unlikely(dl_task(curr)) && 2237 + select_rq = unlikely(dl_task(donor)) && 2239 2238 (curr->nr_cpus_allowed < 2 || 2240 - !dl_entity_preempt(&p->dl, &curr->dl)) && 2239 + !dl_entity_preempt(&p->dl, &donor->dl)) && 2241 2240 p->nr_cpus_allowed > 1; 2242 2241 2243 2242 /* ··· 2300 2299 * let's hope p can move out. 
2301 2300 */ 2302 2301 if (rq->curr->nr_cpus_allowed == 1 || 2303 - !cpudl_find(&rq->rd->cpudl, rq->curr, NULL)) 2302 + !cpudl_find(&rq->rd->cpudl, rq->donor, NULL)) 2304 2303 return; 2305 2304 2306 2305 /* ··· 2339 2338 static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, 2340 2339 int flags) 2341 2340 { 2342 - if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { 2341 + if (dl_entity_preempt(&p->dl, &rq->donor->dl)) { 2343 2342 resched_curr(rq); 2344 2343 return; 2345 2344 } ··· 2349 2348 * In the unlikely case current and p have the same deadline 2350 2349 * let us try to decide what's the best thing to do... 2351 2350 */ 2352 - if ((p->dl.deadline == rq->curr->dl.deadline) && 2351 + if ((p->dl.deadline == rq->donor->dl.deadline) && 2353 2352 !test_tsk_need_resched(rq->curr)) 2354 2353 check_preempt_equal_dl(rq, p); 2355 2354 #endif /* CONFIG_SMP */ ··· 2381 2380 if (!first) 2382 2381 return; 2383 2382 2384 - if (rq->curr->sched_class != &dl_sched_class) 2383 + if (rq->donor->sched_class != &dl_sched_class) 2385 2384 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); 2386 2385 2387 2386 deadline_queue_push_tasks(rq); ··· 2488 2487 /* Only try algorithms three times */ 2489 2488 #define DL_MAX_TRIES 3 2490 2489 2491 - static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 2492 - { 2493 - if (!task_on_cpu(rq, p) && 2494 - cpumask_test_cpu(cpu, &p->cpus_mask)) 2495 - return 1; 2496 - return 0; 2497 - } 2498 - 2499 2490 /* 2500 2491 * Return the earliest pushable rq's task, which is suitable to be executed 2501 2492 * on the CPU, NULL otherwise: ··· 2506 2513 if (next_node) { 2507 2514 p = __node_2_pdl(next_node); 2508 2515 2509 - if (pick_dl_task(rq, p, cpu)) 2516 + if (task_is_pushable(rq, p, cpu)) 2510 2517 return p; 2511 2518 2512 2519 next_node = rb_next(next_node); ··· 2700 2707 * can move away, it makes sense to just reschedule 2701 2708 * without going further in pushing next_task. 
2702 2709 */ 2703 - if (dl_task(rq->curr) && 2704 - dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) && 2710 + if (dl_task(rq->donor) && 2711 + dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) && 2705 2712 rq->curr->nr_cpus_allowed > 1) { 2706 2713 resched_curr(rq); 2707 2714 return 0; ··· 2744 2751 goto retry; 2745 2752 } 2746 2753 2747 - deactivate_task(rq, next_task, 0); 2748 - set_task_cpu(next_task, later_rq->cpu); 2749 - activate_task(later_rq, next_task, 0); 2754 + move_queued_task_locked(rq, later_rq, next_task); 2750 2755 ret = 1; 2751 2756 2752 2757 resched_curr(later_rq); ··· 2824 2833 * deadline than the current task of its runqueue. 2825 2834 */ 2826 2835 if (dl_time_before(p->dl.deadline, 2827 - src_rq->curr->dl.deadline)) 2836 + src_rq->donor->dl.deadline)) 2828 2837 goto skip; 2829 2838 2830 2839 if (is_migration_disabled(p)) { 2831 2840 push_task = get_push_task(src_rq); 2832 2841 } else { 2833 - deactivate_task(src_rq, p, 0); 2834 - set_task_cpu(p, this_cpu); 2835 - activate_task(this_rq, p, 0); 2842 + move_queued_task_locked(src_rq, this_rq, p); 2836 2843 dmin = p->dl.deadline; 2837 2844 resched = true; 2838 2845 } ··· 2863 2874 if (!task_on_cpu(rq, p) && 2864 2875 !test_tsk_need_resched(rq->curr) && 2865 2876 p->nr_cpus_allowed > 1 && 2866 - dl_task(rq->curr) && 2877 + dl_task(rq->donor) && 2867 2878 (rq->curr->nr_cpus_allowed < 2 || 2868 - !dl_entity_preempt(&p->dl, &rq->curr->dl))) { 2879 + !dl_entity_preempt(&p->dl, &rq->donor->dl))) { 2869 2880 push_dl_tasks(rq); 2870 2881 } 2871 2882 } ··· 3040 3051 return; 3041 3052 } 3042 3053 3043 - if (rq->curr != p) { 3054 + if (rq->donor != p) { 3044 3055 #ifdef CONFIG_SMP 3045 3056 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 3046 3057 deadline_queue_push_tasks(rq); 3047 3058 #endif 3048 - if (dl_task(rq->curr)) 3059 + if (dl_task(rq->donor)) 3049 3060 wakeup_preempt_dl(rq, p, 0); 3050 3061 else 3051 3062 resched_curr(rq); ··· 3074 3085 if (!rq->dl.overloaded) 3075 3086 
deadline_queue_pull_task(rq); 3076 3087 3077 - if (task_current(rq, p)) { 3088 + if (task_current_donor(rq, p)) { 3078 3089 /* 3079 3090 * If we now have a earlier deadline task than p, 3080 3091 * then reschedule, provided p is still on this

+4 -3

kernel/sched/debug.c

··· 245 245 static int sched_dynamic_show(struct seq_file *m, void *v) 246 246 { 247 247 static const char * preempt_modes[] = { 248 - "none", "voluntary", "full" 248 + "none", "voluntary", "full", "lazy", 249 249 }; 250 - int i; 250 + int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); 251 + int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; 251 252 252 - for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { 253 + for (; i < j; i++) { 253 254 if (preempt_dynamic_mode == i) 254 255 seq_puts(m, "("); 255 256 seq_puts(m, preempt_modes[i]);

+1 -6

kernel/sched/ext.c

··· 3567 3567 3568 3568 void init_scx_entity(struct sched_ext_entity *scx) 3569 3569 { 3570 - /* 3571 - * init_idle() calls this function again after fork sequence is 3572 - * complete. Don't touch ->tasks_node as it's already linked. 3573 - */ 3574 - memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); 3575 - 3570 + memset(scx, 0, sizeof(*scx)); 3576 3571 INIT_LIST_HEAD(&scx->dsq_list.node); 3577 3572 RB_CLEAR_NODE(&scx->dsq_priq); 3578 3573 scx->sticky_cpu = -1;

+18 -24

kernel/sched/fair.c

··· 1200 1200 */ 1201 1201 s64 update_curr_common(struct rq *rq) 1202 1202 { 1203 - struct task_struct *curr = rq->curr; 1203 + struct task_struct *donor = rq->donor; 1204 1204 s64 delta_exec; 1205 1205 1206 - delta_exec = update_curr_se(rq, &curr->se); 1206 + delta_exec = update_curr_se(rq, &donor->se); 1207 1207 if (likely(delta_exec > 0)) 1208 - update_curr_task(curr, delta_exec); 1208 + update_curr_task(donor, delta_exec); 1209 1209 1210 1210 return delta_exec; 1211 1211 } ··· 1251 1251 return; 1252 1252 1253 1253 if (resched || did_preempt_short(cfs_rq, curr)) { 1254 - resched_curr(rq); 1254 + resched_curr_lazy(rq); 1255 1255 clear_buddies(cfs_rq, curr); 1256 1256 } 1257 1257 } 1258 1258 1259 1259 static void update_curr_fair(struct rq *rq) 1260 1260 { 1261 - update_curr(cfs_rq_of(&rq->curr->se)); 1261 + update_curr(cfs_rq_of(&rq->donor->se)); 1262 1262 } 1263 1263 1264 1264 static inline void ··· 5280 5280 * 5281 5281 * EEVDF: placement strategy #1 / #2 5282 5282 */ 5283 - if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { 5283 + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) { 5284 5284 struct sched_entity *curr = cfs_rq->curr; 5285 5285 unsigned long load; 5286 5286 ··· 5678 5678 * validating it and just reschedule. 
5679 5679 */ 5680 5680 if (queued) { 5681 - resched_curr(rq_of(cfs_rq)); 5681 + resched_curr_lazy(rq_of(cfs_rq)); 5682 5682 return; 5683 5683 } 5684 - /* 5685 - * don't let the period tick interfere with the hrtick preemption 5686 - */ 5687 - if (!sched_feat(DOUBLE_TICK) && 5688 - hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) 5689 - return; 5690 5684 #endif 5691 5685 } 5692 5686 ··· 6816 6822 s64 delta = slice - ran; 6817 6823 6818 6824 if (delta < 0) { 6819 - if (task_current(rq, p)) 6825 + if (task_current_donor(rq, p)) 6820 6826 resched_curr(rq); 6821 6827 return; 6822 6828 } ··· 6831 6837 */ 6832 6838 static void hrtick_update(struct rq *rq) 6833 6839 { 6834 - struct task_struct *curr = rq->curr; 6840 + struct task_struct *donor = rq->donor; 6835 6841 6836 - if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) 6842 + if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class) 6837 6843 return; 6838 6844 6839 - hrtick_start_fair(rq, curr); 6845 + hrtick_start_fair(rq, donor); 6840 6846 } 6841 6847 #else /* !CONFIG_SCHED_HRTICK */ 6842 6848 static inline void ··· 8757 8763 */ 8758 8764 static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) 8759 8765 { 8760 - struct task_struct *curr = rq->curr; 8761 - struct sched_entity *se = &curr->se, *pse = &p->se; 8762 - struct cfs_rq *cfs_rq = task_cfs_rq(curr); 8766 + struct task_struct *donor = rq->donor; 8767 + struct sched_entity *se = &donor->se, *pse = &p->se; 8768 + struct cfs_rq *cfs_rq = task_cfs_rq(donor); 8763 8769 int cse_is_idle, pse_is_idle; 8764 8770 8765 8771 if (unlikely(se == pse)) ··· 8788 8794 * prevents us from potentially nominating it as a false LAST_BUDDY 8789 8795 * below. 
8790 8796 */ 8791 - if (test_tsk_need_resched(curr)) 8797 + if (test_tsk_need_resched(rq->curr)) 8792 8798 return; 8793 8799 8794 8800 if (!sched_feat(WAKEUP_PREEMPTION)) ··· 8836 8842 return; 8837 8843 8838 8844 preempt: 8839 - resched_curr(rq); 8845 + resched_curr_lazy(rq); 8840 8846 } 8841 8847 8842 8848 static struct task_struct *pick_task_fair(struct rq *rq) ··· 13087 13093 * our priority decreased, or if we are not currently running on 13088 13094 * this runqueue and our priority is higher than the current's 13089 13095 */ 13090 - if (task_current(rq, p)) { 13096 + if (task_current_donor(rq, p)) { 13091 13097 if (p->prio > oldprio) 13092 13098 resched_curr(rq); 13093 13099 } else ··· 13194 13200 * kick off the schedule if running, otherwise just see 13195 13201 * if we can still preempt the current task. 13196 13202 */ 13197 - if (task_current(rq, p)) 13203 + if (task_current_donor(rq, p)) 13198 13204 resched_curr(rq); 13199 13205 else 13200 13206 wakeup_preempt(rq, p, 0);

+1 -2

kernel/sched/features.h

··· 19 19 */ 20 20 SCHED_FEAT(RUN_TO_PARITY, true) 21 21 /* 22 - * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for 22 + * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for 23 23 * current. 24 24 */ 25 25 SCHED_FEAT(PREEMPT_SHORT, true) ··· 56 56 57 57 SCHED_FEAT(HRTICK, false) 58 58 SCHED_FEAT(HRTICK_DL, false) 59 - SCHED_FEAT(DOUBLE_TICK, false) 60 59 61 60 /* 62 61 * Decrement CPU capacity based on time not spent running tasks

-1

kernel/sched/idle.c

··· 271 271 tick_nohz_idle_enter(); 272 272 273 273 while (!need_resched()) { 274 - rmb(); 275 274 276 275 /* 277 276 * Interrupts shouldn't be re-enabled from that point on until

+1 -1

kernel/sched/pelt.c

··· 476 476 bool update_other_load_avgs(struct rq *rq) 477 477 { 478 478 u64 now = rq_clock_pelt(rq); 479 - const struct sched_class *curr_class = rq->curr->sched_class; 479 + const struct sched_class *curr_class = rq->donor->sched_class; 480 480 unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); 481 481 482 482 lockdep_assert_rq_held(rq);

+27 -40

kernel/sched/rt.c

··· 528 528 529 529 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 530 530 { 531 - struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 531 + struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor; 532 532 struct rq *rq = rq_of_rt_rq(rt_rq); 533 533 struct sched_rt_entity *rt_se; 534 534 ··· 542 542 else if (!on_rt_rq(rt_se)) 543 543 enqueue_rt_entity(rt_se, 0); 544 544 545 - if (rt_rq->highest_prio.curr < curr->prio) 545 + if (rt_rq->highest_prio.curr < donor->prio) 546 546 resched_curr(rq); 547 547 } 548 548 } ··· 988 988 */ 989 989 static void update_curr_rt(struct rq *rq) 990 990 { 991 - struct task_struct *curr = rq->curr; 991 + struct task_struct *donor = rq->donor; 992 992 s64 delta_exec; 993 993 994 - if (curr->sched_class != &rt_sched_class) 994 + if (donor->sched_class != &rt_sched_class) 995 995 return; 996 996 997 997 delta_exec = update_curr_common(rq); ··· 999 999 return; 1000 1000 1001 1001 #ifdef CONFIG_RT_GROUP_SCHED 1002 - struct sched_rt_entity *rt_se = &curr->rt; 1002 + struct sched_rt_entity *rt_se = &donor->rt; 1003 1003 1004 1004 if (!rt_bandwidth_enabled()) 1005 1005 return; ··· 1535 1535 static int 1536 1536 select_task_rq_rt(struct task_struct *p, int cpu, int flags) 1537 1537 { 1538 - struct task_struct *curr; 1538 + struct task_struct *curr, *donor; 1539 1539 struct rq *rq; 1540 1540 bool test; 1541 1541 ··· 1547 1547 1548 1548 rcu_read_lock(); 1549 1549 curr = READ_ONCE(rq->curr); /* unlocked access */ 1550 + donor = READ_ONCE(rq->donor); 1550 1551 1551 1552 /* 1552 1553 * If the current task on @p's runqueue is an RT task, then ··· 1576 1575 * systems like big.LITTLE. 
1577 1576 */ 1578 1577 test = curr && 1579 - unlikely(rt_task(curr)) && 1580 - (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio); 1578 + unlikely(rt_task(donor)) && 1579 + (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio); 1581 1580 1582 1581 if (test || !rt_task_fits_capacity(p, cpu)) { 1583 1582 int target = find_lowest_rq(p); ··· 1607 1606 1608 1607 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1609 1608 { 1610 - /* 1611 - * Current can't be migrated, useless to reschedule, 1612 - * let's hope p can move out. 1613 - */ 1614 1609 if (rq->curr->nr_cpus_allowed == 1 || 1615 - !cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) 1610 + !cpupri_find(&rq->rd->cpupri, rq->donor, NULL)) 1616 1611 return; 1617 1612 1618 1613 /* ··· 1651 1654 */ 1652 1655 static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) 1653 1656 { 1654 - if (p->prio < rq->curr->prio) { 1657 + struct task_struct *donor = rq->donor; 1658 + 1659 + if (p->prio < donor->prio) { 1655 1660 resched_curr(rq); 1656 1661 return; 1657 1662 } ··· 1671 1672 * to move current somewhere else, making room for our non-migratable 1672 1673 * task. 1673 1674 */ 1674 - if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) 1675 + if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr)) 1675 1676 check_preempt_equal_prio(rq, p); 1676 1677 #endif 1677 1678 } ··· 1696 1697 * utilization. 
We only care of the case where we start to schedule a 1697 1698 * rt task 1698 1699 */ 1699 - if (rq->curr->sched_class != &rt_sched_class) 1700 + if (rq->donor->sched_class != &rt_sched_class) 1700 1701 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); 1701 1702 1702 1703 rt_queue_push_tasks(rq); ··· 1772 1773 /* Only try algorithms three times */ 1773 1774 #define RT_MAX_TRIES 3 1774 1775 1775 - static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1776 - { 1777 - if (!task_on_cpu(rq, p) && 1778 - cpumask_test_cpu(cpu, &p->cpus_mask)) 1779 - return 1; 1780 - 1781 - return 0; 1782 - } 1783 - 1784 1776 /* 1785 1777 * Return the highest pushable rq's task, which is suitable to be executed 1786 1778 * on the CPU, NULL otherwise ··· 1785 1795 return NULL; 1786 1796 1787 1797 plist_for_each_entry(p, head, pushable_tasks) { 1788 - if (pick_rt_task(rq, p, cpu)) 1798 + if (task_is_pushable(rq, p, cpu)) 1789 1799 return p; 1790 1800 } 1791 1801 ··· 1958 1968 1959 1969 BUG_ON(rq->cpu != task_cpu(p)); 1960 1970 BUG_ON(task_current(rq, p)); 1971 + BUG_ON(task_current_donor(rq, p)); 1961 1972 BUG_ON(p->nr_cpus_allowed <= 1); 1962 1973 1963 1974 BUG_ON(!task_on_rq_queued(p)); ··· 1991 2000 * higher priority than current. If that's the case 1992 2001 * just reschedule current. 1993 2002 */ 1994 - if (unlikely(next_task->prio < rq->curr->prio)) { 2003 + if (unlikely(next_task->prio < rq->donor->prio)) { 1995 2004 resched_curr(rq); 1996 2005 return 0; 1997 2006 } ··· 2012 2021 * Note that the stoppers are masqueraded as SCHED_FIFO 2013 2022 * (cf. sched_set_stop_task()), so we can't rely on rt_task(). 
2014 2023 */ 2015 - if (rq->curr->sched_class != &rt_sched_class) 2024 + if (rq->donor->sched_class != &rt_sched_class) 2016 2025 return 0; 2017 2026 2018 2027 cpu = find_lowest_rq(rq->curr); ··· 2079 2088 goto retry; 2080 2089 } 2081 2090 2082 - deactivate_task(rq, next_task, 0); 2083 - set_task_cpu(next_task, lowest_rq->cpu); 2084 - activate_task(lowest_rq, next_task, 0); 2091 + move_queued_task_locked(rq, lowest_rq, next_task); 2085 2092 resched_curr(lowest_rq); 2086 2093 ret = 1; 2087 2094 ··· 2344 2355 * p if it is lower in priority than the 2345 2356 * current task on the run queue 2346 2357 */ 2347 - if (p->prio < src_rq->curr->prio) 2358 + if (p->prio < src_rq->donor->prio) 2348 2359 goto skip; 2349 2360 2350 2361 if (is_migration_disabled(p)) { 2351 2362 push_task = get_push_task(src_rq); 2352 2363 } else { 2353 - deactivate_task(src_rq, p, 0); 2354 - set_task_cpu(p, this_cpu); 2355 - activate_task(this_rq, p, 0); 2364 + move_queued_task_locked(src_rq, this_rq, p); 2356 2365 resched = true; 2357 2366 } 2358 2367 /* ··· 2386 2399 bool need_to_push = !task_on_cpu(rq, p) && 2387 2400 !test_tsk_need_resched(rq->curr) && 2388 2401 p->nr_cpus_allowed > 1 && 2389 - (dl_task(rq->curr) || rt_task(rq->curr)) && 2402 + (dl_task(rq->donor) || rt_task(rq->donor)) && 2390 2403 (rq->curr->nr_cpus_allowed < 2 || 2391 - rq->curr->prio <= p->prio); 2404 + rq->donor->prio <= p->prio); 2392 2405 2393 2406 if (need_to_push) 2394 2407 push_rt_tasks(rq); ··· 2472 2485 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2473 2486 rt_queue_push_tasks(rq); 2474 2487 #endif /* CONFIG_SMP */ 2475 - if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2488 + if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq))) 2476 2489 resched_curr(rq); 2477 2490 } 2478 2491 } ··· 2487 2500 if (!task_on_rq_queued(p)) 2488 2501 return; 2489 2502 2490 - if (task_current(rq, p)) { 2503 + if (task_current_donor(rq, p)) { 2491 2504 #ifdef CONFIG_SMP 2492 2505 /* 2493 2506 * If our priority decreases 
while running, we ··· 2513 2526 * greater than the current running task 2514 2527 * then reschedule. 2515 2528 */ 2516 - if (p->prio < rq->curr->prio) 2529 + if (p->prio < rq->donor->prio) 2517 2530 resched_curr(rq); 2518 2531 } 2519 2532 }

+110 -45

kernel/sched/sched.h

··· 1148 1148 */ 1149 1149 unsigned int nr_uninterruptible; 1150 1150 1151 - struct task_struct __rcu *curr; 1151 + union { 1152 + struct task_struct __rcu *donor; /* Scheduler context */ 1153 + struct task_struct __rcu *curr; /* Execution context */ 1154 + }; 1152 1155 struct sched_dl_entity *dl_server; 1153 1156 struct task_struct *idle; 1154 1157 struct task_struct *stop; ··· 1347 1344 #define task_rq(p) cpu_rq(task_cpu(p)) 1348 1345 #define cpu_curr(cpu) (cpu_rq(cpu)->curr) 1349 1346 #define raw_rq() raw_cpu_ptr(&runqueues) 1347 + 1348 + static inline void rq_set_donor(struct rq *rq, struct task_struct *t) 1349 + { 1350 + /* Do nothing */ 1351 + } 1350 1352 1351 1353 #ifdef CONFIG_SCHED_CORE 1352 1354 static inline struct cpumask *sched_group_span(struct sched_group *sg); ··· 2094 2086 2095 2087 #endif /* CONFIG_SMP */ 2096 2088 2097 - #include "stats.h" 2098 - 2099 - #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) 2100 - 2101 - extern void __sched_core_account_forceidle(struct rq *rq); 2102 - 2103 - static inline void sched_core_account_forceidle(struct rq *rq) 2104 - { 2105 - if (schedstat_enabled()) 2106 - __sched_core_account_forceidle(rq); 2107 - } 2108 - 2109 - extern void __sched_core_tick(struct rq *rq); 2110 - 2111 - static inline void sched_core_tick(struct rq *rq) 2112 - { 2113 - if (sched_core_enabled(rq) && schedstat_enabled()) 2114 - __sched_core_tick(rq); 2115 - } 2116 - 2117 - #else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ 2118 - 2119 - static inline void sched_core_account_forceidle(struct rq *rq) { } 2120 - 2121 - static inline void sched_core_tick(struct rq *rq) { } 2122 - 2123 - #endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ 2124 - 2125 2089 #ifdef CONFIG_CGROUP_SCHED 2126 2090 2127 2091 /* ··· 2241 2261 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 2242 2262 } 2243 2263 2264 + /* 2265 + * Is p the current execution context? 
2266 + */ 2244 2267 static inline int task_current(struct rq *rq, struct task_struct *p) 2245 2268 { 2246 2269 return rq->curr == p; 2270 + } 2271 + 2272 + /* 2273 + * Is p the current scheduling context? 2274 + * 2275 + * Note that it might be the current execution context at the same time if 2276 + * rq->curr == rq->donor == p. 2277 + */ 2278 + static inline int task_current_donor(struct rq *rq, struct task_struct *p) 2279 + { 2280 + return rq->donor == p; 2247 2281 } 2248 2282 2249 2283 static inline int task_on_cpu(struct rq *rq, struct task_struct *p) ··· 2446 2452 2447 2453 static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 2448 2454 { 2449 - WARN_ON_ONCE(rq->curr != prev); 2455 + WARN_ON_ONCE(rq->donor != prev); 2450 2456 prev->sched_class->put_prev_task(rq, prev, NULL); 2451 2457 } 2452 2458 ··· 2610 2616 2611 2617 static inline struct task_struct *get_push_task(struct rq *rq) 2612 2618 { 2613 - struct task_struct *p = rq->curr; 2619 + struct task_struct *p = rq->donor; 2614 2620 2615 2621 lockdep_assert_rq_held(rq); 2616 2622 ··· 2690 2696 extern void init_sched_fair_class(void); 2691 2697 2692 2698 extern void resched_curr(struct rq *rq); 2699 + extern void resched_curr_lazy(struct rq *rq); 2693 2700 extern void resched_cpu(int cpu); 2694 2701 2695 2702 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); ··· 3195 3200 static inline void nohz_run_idle_balance(int cpu) { } 3196 3201 #endif 3197 3202 3203 + #include "stats.h" 3204 + 3205 + #if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS) 3206 + 3207 + extern void __sched_core_account_forceidle(struct rq *rq); 3208 + 3209 + static inline void sched_core_account_forceidle(struct rq *rq) 3210 + { 3211 + if (schedstat_enabled()) 3212 + __sched_core_account_forceidle(rq); 3213 + } 3214 + 3215 + extern void __sched_core_tick(struct rq *rq); 3216 + 3217 + static inline void sched_core_tick(struct rq *rq) 3218 + { 3219 + if (sched_core_enabled(rq) && 
schedstat_enabled()) 3220 + __sched_core_tick(rq); 3221 + } 3222 + 3223 + #else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */ 3224 + 3225 + static inline void sched_core_account_forceidle(struct rq *rq) { } 3226 + 3227 + static inline void sched_core_tick(struct rq *rq) { } 3228 + 3229 + #endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */ 3230 + 3198 3231 #ifdef CONFIG_IRQ_TIME_ACCOUNTING 3199 3232 3200 3233 struct irqtime { ··· 3653 3630 __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3654 3631 } 3655 3632 3656 - static inline int __mm_cid_try_get(struct mm_struct *mm) 3633 + static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) 3657 3634 { 3658 - struct cpumask *cpumask; 3659 - int cid; 3635 + struct cpumask *cidmask = mm_cidmask(mm); 3636 + struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3637 + int cid = __this_cpu_read(pcpu_cid->recent_cid); 3660 3638 3661 - cpumask = mm_cidmask(mm); 3639 + /* Try to re-use recent cid. This improves cache locality. */ 3640 + if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) 3641 + return cid; 3662 3642 /* 3643 + * Expand cid allocation if the maximum number of concurrency 3644 + * IDs allocated (max_nr_cid) is below the number cpus allowed 3645 + * and number of threads. Expanding cid allocation as much as 3646 + * possible improves cache locality. 3647 + */ 3648 + cid = atomic_read(&mm->max_nr_cid); 3649 + while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { 3650 + if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) 3651 + continue; 3652 + if (!cpumask_test_and_set_cpu(cid, cidmask)) 3653 + return cid; 3654 + } 3655 + /* 3656 + * Find the first available concurrency id. 3663 3657 * Retry finding first zero bit if the mask is temporarily 3664 3658 * filled. This only happens during concurrent remote-clear 3665 3659 * which owns a cid without holding a rq lock. 
3666 3660 */ 3667 3661 for (;;) { 3668 - cid = cpumask_first_zero(cpumask); 3669 - if (cid < nr_cpu_ids) 3662 + cid = cpumask_first_zero(cidmask); 3663 + if (cid < READ_ONCE(mm->nr_cpus_allowed)) 3670 3664 break; 3671 3665 cpu_relax(); 3672 3666 } 3673 - if (cpumask_test_and_set_cpu(cid, cpumask)) 3667 + if (cpumask_test_and_set_cpu(cid, cidmask)) 3674 3668 return -1; 3675 3669 3676 3670 return cid; ··· 3705 3665 WRITE_ONCE(pcpu_cid->time, rq->clock); 3706 3666 } 3707 3667 3708 - static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm) 3668 + static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, 3669 + struct mm_struct *mm) 3709 3670 { 3710 3671 int cid; 3711 3672 ··· 3716 3675 * guarantee forward progress. 3717 3676 */ 3718 3677 if (!READ_ONCE(use_cid_lock)) { 3719 - cid = __mm_cid_try_get(mm); 3678 + cid = __mm_cid_try_get(t, mm); 3720 3679 if (cid >= 0) 3721 3680 goto end; 3722 3681 raw_spin_lock(&cid_lock); 3723 3682 } else { 3724 3683 raw_spin_lock(&cid_lock); 3725 - cid = __mm_cid_try_get(mm); 3684 + cid = __mm_cid_try_get(t, mm); 3726 3685 if (cid >= 0) 3727 3686 goto unlock; 3728 3687 } ··· 3742 3701 * all newcoming allocations observe the use_cid_lock flag set. 
3743 3702 */ 3744 3703 do { 3745 - cid = __mm_cid_try_get(mm); 3704 + cid = __mm_cid_try_get(t, mm); 3746 3705 cpu_relax(); 3747 3706 } while (cid < 0); 3748 3707 /* ··· 3759 3718 return cid; 3760 3719 } 3761 3720 3762 - static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm) 3721 + static inline int mm_cid_get(struct rq *rq, struct task_struct *t, 3722 + struct mm_struct *mm) 3763 3723 { 3764 3724 struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3765 3725 struct cpumask *cpumask; ··· 3777 3735 if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3778 3736 __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3779 3737 } 3780 - cid = __mm_cid_get(rq, mm); 3738 + cid = __mm_cid_get(rq, t, mm); 3781 3739 __this_cpu_write(pcpu_cid->cid, cid); 3740 + __this_cpu_write(pcpu_cid->recent_cid, cid); 3782 3741 3783 3742 return cid; 3784 3743 } ··· 3832 3789 prev->mm_cid = -1; 3833 3790 } 3834 3791 if (next->mm_cid_active) 3835 - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm); 3792 + next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3836 3793 } 3837 3794 3838 3795 #else /* !CONFIG_SCHED_MM_CID: */ ··· 3845 3802 3846 3803 extern u64 avg_vruntime(struct cfs_rq *cfs_rq); 3847 3804 extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); 3805 + #ifdef CONFIG_SMP 3806 + static inline 3807 + void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task) 3808 + { 3809 + lockdep_assert_rq_held(src_rq); 3810 + lockdep_assert_rq_held(dst_rq); 3811 + 3812 + deactivate_task(src_rq, task, 0); 3813 + set_task_cpu(task, dst_rq->cpu); 3814 + activate_task(dst_rq, task, 0); 3815 + } 3816 + 3817 + static inline 3818 + bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu) 3819 + { 3820 + if (!task_on_cpu(rq, p) && 3821 + cpumask_test_cpu(cpu, &p->cpus_mask)) 3822 + return true; 3823 + 3824 + return false; 3825 + } 3826 + #endif 3848 3827 3849 3828 #ifdef CONFIG_RT_MUTEXES 3850 3829

+19 -10

kernel/sched/stats.h

··· 127 127 * go through migration requeues. In this case, *sleeping* states need 128 128 * to be transferred. 129 129 */ 130 - static inline void psi_enqueue(struct task_struct *p, bool migrate) 130 + static inline void psi_enqueue(struct task_struct *p, int flags) 131 131 { 132 132 int clear = 0, set = 0; 133 133 134 134 if (static_branch_likely(&psi_disabled)) 135 135 return; 136 136 137 + /* Same runqueue, nothing changed for psi */ 138 + if (flags & ENQUEUE_RESTORE) 139 + return; 140 + 137 141 if (p->se.sched_delayed) { 138 142 /* CPU migration of "sleeping" task */ 139 - SCHED_WARN_ON(!migrate); 143 + SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED)); 140 144 if (p->in_memstall) 141 145 set |= TSK_MEMSTALL; 142 146 if (p->in_iowait) 143 147 set |= TSK_IOWAIT; 144 - } else if (migrate) { 148 + } else if (flags & ENQUEUE_MIGRATED) { 145 149 /* CPU migration of runnable task */ 146 150 set = TSK_RUNNING; 147 151 if (p->in_memstall) ··· 162 158 psi_task_change(p, clear, set); 163 159 } 164 160 165 - static inline void psi_dequeue(struct task_struct *p, bool migrate) 161 + static inline void psi_dequeue(struct task_struct *p, int flags) 166 162 { 167 163 if (static_branch_likely(&psi_disabled)) 168 164 return; 169 165 170 - /* 171 - * When migrating a task to another CPU, clear all psi 172 - * state. The enqueue callback above will work it out. 173 - */ 174 - if (migrate) 175 - psi_task_change(p, p->psi_flags, 0); 166 + /* Same runqueue, nothing changed for psi */ 167 + if (flags & DEQUEUE_SAVE) 168 + return; 176 169 177 170 /* 178 171 * A voluntary sleep is a dequeue followed by a task switch. To ··· 177 176 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. 178 177 * Do nothing here. 179 178 */ 179 + if (flags & DEQUEUE_SLEEP) 180 + return; 181 + 182 + /* 183 + * When migrating a task to another CPU, clear all psi 184 + * state. The enqueue callback above will work it out. 
185 + */ 186 + psi_task_change(p, p->psi_flags, 0); 180 187 } 181 188 182 189 static inline void psi_ttwu_dequeue(struct task_struct *p)

+2 -2

kernel/sched/syscalls.c

··· 91 91 } 92 92 93 93 queued = task_on_rq_queued(p); 94 - running = task_current(rq, p); 94 + running = task_current_donor(rq, p); 95 95 if (queued) 96 96 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); 97 97 if (running) ··· 713 713 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); 714 714 715 715 queued = task_on_rq_queued(p); 716 - running = task_current(rq, p); 716 + running = task_current_donor(rq, p); 717 717 if (queued) 718 718 dequeue_task(rq, p, queue_flags); 719 719 if (running)

+57 -33

kernel/sched/wait_bit.c

··· 9 9 10 10 static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned; 11 11 12 - wait_queue_head_t *bit_waitqueue(void *word, int bit) 12 + wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit) 13 13 { 14 14 const int shift = BITS_PER_LONG == 32 ? 5 : 6; 15 15 unsigned long val = (unsigned long)word << shift | bit; ··· 55 55 } 56 56 EXPORT_SYMBOL(__wait_on_bit); 57 57 58 - int __sched out_of_line_wait_on_bit(void *word, int bit, 58 + int __sched out_of_line_wait_on_bit(unsigned long *word, int bit, 59 59 wait_bit_action_f *action, unsigned mode) 60 60 { 61 61 struct wait_queue_head *wq_head = bit_waitqueue(word, bit); ··· 66 66 EXPORT_SYMBOL(out_of_line_wait_on_bit); 67 67 68 68 int __sched out_of_line_wait_on_bit_timeout( 69 - void *word, int bit, wait_bit_action_f *action, 69 + unsigned long *word, int bit, wait_bit_action_f *action, 70 70 unsigned mode, unsigned long timeout) 71 71 { 72 72 struct wait_queue_head *wq_head = bit_waitqueue(word, bit); ··· 108 108 } 109 109 EXPORT_SYMBOL(__wait_on_bit_lock); 110 110 111 - int __sched out_of_line_wait_on_bit_lock(void *word, int bit, 111 + int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit, 112 112 wait_bit_action_f *action, unsigned mode) 113 113 { 114 114 struct wait_queue_head *wq_head = bit_waitqueue(word, bit); ··· 118 118 } 119 119 EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); 120 120 121 - void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 121 + void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit) 122 122 { 123 123 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 124 124 ··· 128 128 EXPORT_SYMBOL(__wake_up_bit); 129 129 130 130 /** 131 - * wake_up_bit - wake up a waiter on a bit 132 - * @word: the word being waited on, a kernel virtual address 133 - * @bit: the bit of the word being waited on 131 + * wake_up_bit - wake up waiters on a bit 132 + * @word: the address containing the bit being waited 
on 133 + * @bit: the bit at that address being waited on 134 134 * 135 - * There is a standard hashed waitqueue table for generic use. This 136 - * is the part of the hash-table's accessor API that wakes up waiters 137 - * on a bit. For instance, if one were to have waiters on a bitflag, 138 - * one would call wake_up_bit() after clearing the bit. 135 + * Wake up any process waiting in wait_on_bit() or similar for the 136 + * given bit to be cleared. 139 137 * 140 - * In order for this to function properly, as it uses waitqueue_active() 141 - * internally, some kind of memory barrier must be done prior to calling 142 - * this. Typically, this will be smp_mb__after_atomic(), but in some 143 - * cases where bitflags are manipulated non-atomically under a lock, one 144 - * may need to use a less regular barrier, such fs/inode.c's smp_mb(), 145 - * because spin_unlock() does not guarantee a memory barrier. 138 + * The wake-up is sent to tasks in a waitqueue selected by hash from a 139 + * shared pool. Only those tasks on that queue which have requested 140 + * wake_up on this specific address and bit will be woken, and only if the 141 + * bit is clear. 142 + * 143 + * In order for this to function properly there must be a full memory 144 + * barrier after the bit is cleared and before this function is called. 145 + * If the bit was cleared atomically, such as by clear_bit() then 146 + * smp_mb__after_atomic() can be used, otherwise smp_mb() is needed. 147 + * If the bit was cleared with a fully-ordered operation, no further 148 + * barrier is required. 149 + * 150 + * Normally the bit should be cleared by an operation with RELEASE 151 + * semantics so that any changes to memory made before the bit is 152 + * cleared are guaranteed to be visible after the matching wait_on_bit() 153 + * completes. 
146 154 */ 147 - void wake_up_bit(void *word, int bit) 155 + void wake_up_bit(unsigned long *word, int bit) 148 156 { 149 157 __wake_up_bit(bit_waitqueue(word, bit), word, bit); 150 158 } ··· 196 188 } 197 189 EXPORT_SYMBOL(init_wait_var_entry); 198 190 191 + /** 192 + * wake_up_var - wake up waiters on a variable (kernel address) 193 + * @var: the address of the variable being waited on 194 + * 195 + * Wake up any process waiting in wait_var_event() or similar for the 196 + * given variable to change. wait_var_event() can be waiting for an 197 + * arbitrary condition to be true and associates that condition with an 198 + * address. Calling wake_up_var() suggests that the condition has been 199 + * made true, but does not strictly require the condition to use the 200 + * address given. 201 + * 202 + * The wake-up is sent to tasks in a waitqueue selected by hash from a 203 + * shared pool. Only those tasks on that queue which have requested 204 + * wake_up on this specific address will be woken. 205 + * 206 + * In order for this to function properly there must be a full memory 207 + * barrier after the variable is updated (or more accurately, after the 208 + * condition waited on has been made to be true) and before this function 209 + * is called. If the variable was updated atomically, such as by 210 + * atomic_dec() then smp_mb__after_atomic() can be used. If the 211 + * variable was updated by a fully ordered operation such as 212 + * atomic_dec_and_test() then no extra barrier is required. Otherwise 213 + * smp_mb() is needed. 214 + * 215 + * Normally the variable should be updated (the condition should be made 216 + * to be true) by an operation with RELEASE semantics such as 217 + * smp_store_release() so that any changes to memory made before the 218 + * variable was updated are guaranteed to be visible after the matching 219 + * wait_var_event() completes. 
220 + */ 199 221 void wake_up_var(void *var) 200 222 { 201 223 __wake_up_bit(__var_waitqueue(var), var, -1); ··· 265 227 return 0; 266 228 } 267 229 EXPORT_SYMBOL_GPL(bit_wait_timeout); 268 - 269 - __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 270 - { 271 - unsigned long now = READ_ONCE(jiffies); 272 - 273 - if (time_after_eq(now, word->timeout)) 274 - return -EAGAIN; 275 - io_schedule_timeout(word->timeout - now); 276 - if (signal_pending_state(mode, current)) 277 - return -EINTR; 278 - 279 - return 0; 280 - } 281 - EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 282 230 283 231 void __init wait_bit_init(void) 284 232 {

+4 -10

kernel/softirq.c

··· 748 748 749 749 static bool tasklet_clear_sched(struct tasklet_struct *t) 750 750 { 751 - if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { 752 - wake_up_var(&t->state); 751 + if (test_and_clear_wake_up_bit(TASKLET_STATE_SCHED, &t->state)) 753 752 return true; 754 - } 755 753 756 754 WARN_ONCE(1, "tasklet SCHED state not set: %s %pS\n", 757 755 t->use_callback ? "callback" : "func", ··· 869 871 if (in_interrupt()) 870 872 pr_notice("Attempt to kill tasklet from interrupt\n"); 871 873 872 - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) 873 - wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); 874 + wait_on_bit_lock(&t->state, TASKLET_STATE_SCHED, TASK_UNINTERRUPTIBLE); 874 875 875 876 tasklet_unlock_wait(t); 876 877 tasklet_clear_sched(t); ··· 879 882 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) 880 883 void tasklet_unlock(struct tasklet_struct *t) 881 884 { 882 - smp_mb__before_atomic(); 883 - clear_bit(TASKLET_STATE_RUN, &t->state); 884 - smp_mb__after_atomic(); 885 - wake_up_var(&t->state); 885 + clear_and_wake_up_bit(TASKLET_STATE_RUN, &t->state); 886 886 } 887 887 EXPORT_SYMBOL_GPL(tasklet_unlock); 888 888 889 889 void tasklet_unlock_wait(struct tasklet_struct *t) 890 890 { 891 - wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); 891 + wait_on_bit(&t->state, TASKLET_STATE_RUN, TASK_UNINTERRUPTIBLE); 892 892 } 893 893 EXPORT_SYMBOL_GPL(tasklet_unlock_wait); 894 894 #endif

Configure Feed

Configure Feed