Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

+4

Documentation/admin-guide/kernel-parameters.txt

··· 6500 6500 Memory area to be used by remote processor image, 6501 6501 managed by CMA. 6502 6502 6503 + rseq_debug= [KNL] Enable or disable restartable sequence 6504 + debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE. 6505 + Format: <bool> 6506 + 6503 6507 rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling 6504 6508 when CONFIG_RT_GROUP_SCHED=y. Defaults to 6505 6509 !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.

+1 -1

arch/arm64/kernel/entry-common.c

··· 100 100 static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) 101 101 { 102 102 local_irq_disable(); 103 - exit_to_user_mode_prepare(regs); 103 + exit_to_user_mode_prepare_legacy(regs); 104 104 local_daif_mask(); 105 105 mte_check_tfsr_exit(); 106 106 exit_to_user_mode();

+2 -1

arch/x86/entry/syscall_32.c

··· 274 274 * fetch EBP before invoking any of the syscall entry work 275 275 * functions. 276 276 */ 277 - syscall_enter_from_user_mode_prepare(regs); 277 + enter_from_user_mode(regs); 278 278 279 279 instrumentation_begin(); 280 + local_irq_enable(); 280 281 /* Fetch EBP from where the vDSO stashed it. */ 281 282 if (IS_ENABLED(CONFIG_X86_64)) { 282 283 /*

+10 -10

arch/x86/include/asm/ptrace.h

··· 187 187 extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); 188 188 189 189 190 - static inline unsigned long regs_return_value(struct pt_regs *regs) 190 + static __always_inline unsigned long regs_return_value(struct pt_regs *regs) 191 191 { 192 192 return regs->ax; 193 193 } 194 194 195 - static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) 195 + static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) 196 196 { 197 197 regs->ax = rc; 198 198 } ··· 277 277 } 278 278 #endif 279 279 280 - static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) 280 + static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) 281 281 { 282 282 return regs->sp; 283 283 } 284 284 285 - static inline unsigned long instruction_pointer(struct pt_regs *regs) 285 + static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) 286 286 { 287 287 return regs->ip; 288 288 } 289 289 290 - static inline void instruction_pointer_set(struct pt_regs *regs, 291 - unsigned long val) 290 + static __always_inline 291 + void instruction_pointer_set(struct pt_regs *regs, unsigned long val) 292 292 { 293 293 regs->ip = val; 294 294 } 295 295 296 - static inline unsigned long frame_pointer(struct pt_regs *regs) 296 + static __always_inline unsigned long frame_pointer(struct pt_regs *regs) 297 297 { 298 298 return regs->bp; 299 299 } 300 300 301 - static inline unsigned long user_stack_pointer(struct pt_regs *regs) 301 + static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) 302 302 { 303 303 return regs->sp; 304 304 } 305 305 306 - static inline void user_stack_pointer_set(struct pt_regs *regs, 307 - unsigned long val) 306 + static __always_inline 307 + void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) 308 308 { 309 309 regs->sp = val; 310 310 }

+3

drivers/hv/mshv_root_main.c

··· 29 29 #include <linux/crash_dump.h> 30 30 #include <linux/panic_notifier.h> 31 31 #include <linux/vmalloc.h> 32 + #include <linux/rseq.h> 32 33 33 34 #include "mshv_eventfd.h" 34 35 #include "mshv.h" ··· 560 559 vp->run.flags.intercept_suspend = 1; 561 560 } 562 561 } while (!vp->run.flags.intercept_suspend); 562 + 563 + rseq_virt_userspace_exit(); 563 564 564 565 return ret; 565 566 }

+1 -1

fs/binfmt_elf.c

··· 46 46 #include <linux/cred.h> 47 47 #include <linux/dax.h> 48 48 #include <linux/uaccess.h> 49 - #include <linux/rseq.h> 49 + #include <uapi/linux/rseq.h> 50 50 #include <asm/param.h> 51 51 #include <asm/page.h> 52 52

+1 -1

fs/exec.c

··· 1774 1774 force_fatal_sig(SIGSEGV); 1775 1775 1776 1776 sched_mm_cid_after_execve(current); 1777 - rseq_set_notify_resume(current); 1777 + rseq_force_update(); 1778 1778 current->in_execve = 0; 1779 1779 1780 1780 return retval;

+3

include/asm-generic/thread_info_tif.h

··· 45 45 # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) 46 46 #endif 47 47 48 + #define TIF_RSEQ 11 // Run RSEQ fast path 49 + #define _TIF_RSEQ BIT(TIF_RSEQ) 50 + 48 51 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */

+15

include/linux/bitmap.h

··· 45 45 * bitmap_copy(dst, src, nbits) *dst = *src 46 46 * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 47 47 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 48 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst 48 49 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 49 50 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) 50 51 * bitmap_complement(dst, src, nbits) *dst = ~(*src) ··· 166 165 const unsigned long *bitmap2, unsigned int nbits); 167 166 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, 168 167 const unsigned long *bitmap2, unsigned int nbits); 168 + unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, 169 + const unsigned long *bitmap2, unsigned int nbits); 169 170 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, 170 171 const unsigned long *bitmap2, unsigned int nbits); 171 172 bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, ··· 338 335 *dst = *src1 | *src2; 339 336 else 340 337 __bitmap_or(dst, src1, src2, nbits); 338 + } 339 + 340 + static __always_inline 341 + unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, 342 + const unsigned long *src2, unsigned int nbits) 343 + { 344 + if (small_const_nbits(nbits)) { 345 + *dst = *src1 | *src2; 346 + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); 347 + } else { 348 + return __bitmap_weighted_or(dst, src1, src2, nbits); 349 + } 341 350 } 342 351 343 352 static __always_inline

+15 -15

include/linux/cleanup.h

··· 208 208 */ 209 209 210 210 #define DEFINE_FREE(_name, _type, _free) \ 211 - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } 211 + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } 212 212 213 213 #define __free(_name) __cleanup(__free_##_name) 214 214 ··· 220 220 __val; \ 221 221 }) 222 222 223 - static inline __must_check 223 + static __always_inline __must_check 224 224 const volatile void * __must_check_fn(const volatile void *val) 225 225 { return val; } 226 226 ··· 278 278 279 279 #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ 280 280 typedef _type class_##_name##_t; \ 281 - static inline void class_##_name##_destructor(_type *p) \ 281 + static __always_inline void class_##_name##_destructor(_type *p) \ 282 282 { _type _T = *p; _exit; } \ 283 - static inline _type class_##_name##_constructor(_init_args) \ 283 + static __always_inline _type class_##_name##_constructor(_init_args) \ 284 284 { _type t = _init; return t; } 285 285 286 286 #define EXTEND_CLASS(_name, ext, _init, _init_args...) 
\ 287 287 typedef class_##_name##_t class_##_name##ext##_t; \ 288 - static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ 288 + static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ 289 289 { class_##_name##_destructor(p); } \ 290 - static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ 290 + static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ 291 291 { class_##_name##_t t = _init; return t; } 292 292 293 293 #define CLASS(_name, var) \ ··· 360 360 }) 361 361 362 362 #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ 363 - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ 363 + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ 364 364 { \ 365 365 void *_ptr = (void *)(__force unsigned long)*(_exp); \ 366 366 if (IS_ERR(_ptr)) { \ ··· 368 368 } \ 369 369 return _ptr; \ 370 370 } \ 371 - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ 371 + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ 372 372 { \ 373 373 long _rc = (__force unsigned long)*(_exp); \ 374 374 if (!_rc) { \ ··· 397 397 EXTEND_CLASS(_name, _ext, \ 398 398 ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ 399 399 class_##_name##_t _T) \ 400 - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 400 + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 401 401 { return class_##_name##_lock_ptr(_T); } \ 402 - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 402 + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 403 403 { return class_##_name##_lock_err(_T); } 404 404 405 405 /* ··· 479 479 __VA_ARGS__; \ 480 480 } class_##_name##_t; \ 481 481 \ 482 - static inline void class_##_name##_destructor(class_##_name##_t *_T) \ 482 + static __always_inline void 
class_##_name##_destructor(class_##_name##_t *_T) \ 483 483 { \ 484 484 if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ 485 485 } \ ··· 487 487 __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) 488 488 489 489 #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ 490 - static inline class_##_name##_t class_##_name##_constructor(_type *l) \ 490 + static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ 491 491 { \ 492 492 class_##_name##_t _t = { .lock = l }, *_T = &_t; \ 493 493 _lock; \ ··· 495 495 } 496 496 497 497 #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ 498 - static inline class_##_name##_t class_##_name##_constructor(void) \ 498 + static __always_inline class_##_name##_t class_##_name##_constructor(void) \ 499 499 { \ 500 500 class_##_name##_t _t = { .lock = (void*)1 }, \ 501 501 *_T __maybe_unused = &_t; \ ··· 521 521 if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ 522 522 _t; }), \ 523 523 typeof_member(class_##_name##_t, lock) l) \ 524 - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 524 + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 525 525 { return class_##_name##_lock_ptr(_T); } \ 526 - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 526 + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 527 527 { return class_##_name##_lock_err(_T); } 528 528 529 529 #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \

+24 -2

include/linux/cpumask.h

··· 126 126 #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) 127 127 128 128 extern atomic_t __num_online_cpus; 129 + extern unsigned int __num_possible_cpus; 129 130 130 131 extern cpumask_t cpus_booted_once_mask; 131 132 ··· 730 729 } 731 730 732 731 /** 732 + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result 733 + * @dstp: the cpumask result 734 + * @src1p: the first input 735 + * @src2p: the second input 736 + * 737 + * Return: The number of bits set in the resulting cpumask @dstp 738 + */ 739 + static __always_inline 740 + unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, 741 + const struct cpumask *src2p) 742 + { 743 + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), 744 + cpumask_bits(src2p), small_cpumask_bits); 745 + } 746 + 747 + /** 733 748 * cpumask_xor - *dstp = *src1p ^ *src2p 734 749 * @dstp: the cpumask result 735 750 * @src1p: the first input ··· 1153 1136 #define __assign_cpu(cpu, mask, val) \ 1154 1137 __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) 1155 1138 1156 - #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) 1157 1139 #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) 1158 1140 #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) 1159 1141 #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) 1160 1142 #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) 1161 1143 1162 1144 void set_cpu_online(unsigned int cpu, bool online); 1145 + void set_cpu_possible(unsigned int cpu, bool possible); 1163 1146 1164 1147 /** 1165 1148 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * ··· 1212 1195 { 1213 1196 return raw_atomic_read(&__num_online_cpus); 1214 1197 } 1215 - #define num_possible_cpus() cpumask_weight(cpu_possible_mask) 1198 + 1199 + static 
__always_inline unsigned int num_possible_cpus(void) 1200 + { 1201 + return __num_possible_cpus; 1202 + } 1203 + 1216 1204 #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) 1217 1205 #define num_present_cpus() cpumask_weight(cpu_present_mask) 1218 1206 #define num_active_cpus() cpumask_weight(cpu_active_mask)

+11 -27

include/linux/entry-common.h

··· 3 3 #define __LINUX_ENTRYCOMMON_H 4 4 5 5 #include <linux/irq-entry-common.h> 6 + #include <linux/livepatch.h> 6 7 #include <linux/ptrace.h> 8 + #include <linux/resume_user_mode.h> 7 9 #include <linux/seccomp.h> 8 10 #include <linux/sched.h> 9 - #include <linux/livepatch.h> 10 - #include <linux/resume_user_mode.h> 11 11 12 12 #include <asm/entry-common.h> 13 13 #include <asm/syscall.h> ··· 37 37 SYSCALL_WORK_SYSCALL_AUDIT | \ 38 38 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ 39 39 ARCH_SYSCALL_WORK_ENTER) 40 + 40 41 #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ 41 42 SYSCALL_WORK_SYSCALL_TRACE | \ 42 43 SYSCALL_WORK_SYSCALL_AUDIT | \ ··· 45 44 SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ 46 45 ARCH_SYSCALL_WORK_EXIT) 47 46 48 - /** 49 - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts 50 - * @regs: Pointer to currents pt_regs 51 - * 52 - * Invoked from architecture specific syscall entry code with interrupts 53 - * disabled. The calling code has to be non-instrumentable. When the 54 - * function returns all state is correct, interrupts are enabled and the 55 - * subsequent functions can be instrumented. 56 - * 57 - * This handles lockdep, RCU (context tracking) and tracing state, i.e. 58 - * the functionality provided by enter_from_user_mode(). 59 - * 60 - * This is invoked when there is extra architecture specific functionality 61 - * to be done between establishing state and handling user mode entry work. 
62 - */ 63 - void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); 64 - 65 - long syscall_trace_enter(struct pt_regs *regs, long syscall, 66 - unsigned long work); 47 + long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); 67 48 68 49 /** 69 50 * syscall_enter_from_user_mode_work - Check and handle work before invoking ··· 54 71 * @syscall: The syscall number 55 72 * 56 73 * Invoked from architecture specific syscall entry code with interrupts 57 - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra 58 - * architecture specific work. 74 + * enabled after invoking enter_from_user_mode(), enabling interrupts and 75 + * extra architecture specific work. 59 76 * 60 77 * Returns: The original or a modified syscall number 61 78 * ··· 91 108 * function returns all state is correct, interrupts are enabled and the 92 109 * subsequent functions can be instrumented. 93 110 * 94 - * This is combination of syscall_enter_from_user_mode_prepare() and 95 - * syscall_enter_from_user_mode_work(). 111 + * This is the combination of enter_from_user_mode() and 112 + * syscall_enter_from_user_mode_work() to be used when there is no 113 + * architecture specific work to be done between the two. 96 114 * 97 115 * Returns: The original or a modified syscall number. See 98 116 * syscall_enter_from_user_mode_work() for further explanation. ··· 146 162 local_irq_enable(); 147 163 } 148 164 149 - rseq_syscall(regs); 165 + rseq_debug_syscall_return(regs); 150 166 151 167 /* 152 168 * Do one-time syscall specific work. If these work items are ··· 156 172 if (unlikely(work & SYSCALL_WORK_EXIT)) 157 173 syscall_exit_work(regs, work); 158 174 local_irq_disable_exit_to_user(); 159 - exit_to_user_mode_prepare(regs); 175 + syscall_exit_to_user_mode_prepare(regs); 160 176 } 161 177 162 178 /**

+63 -12

include/linux/irq-entry-common.h

··· 2 2 #ifndef __LINUX_IRQENTRYCOMMON_H 3 3 #define __LINUX_IRQENTRYCOMMON_H 4 4 5 + #include <linux/context_tracking.h> 6 + #include <linux/kmsan.h> 7 + #include <linux/rseq_entry.h> 5 8 #include <linux/static_call_types.h> 6 9 #include <linux/syscalls.h> 7 - #include <linux/context_tracking.h> 8 10 #include <linux/tick.h> 9 - #include <linux/kmsan.h> 10 11 #include <linux/unwind_deferred.h> 11 12 12 13 #include <asm/entry-common.h> ··· 30 29 #define EXIT_TO_USER_MODE_WORK \ 31 30 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 32 31 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 33 - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 32 + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ 34 33 ARCH_EXIT_TO_USER_MODE_WORK) 35 34 36 35 /** ··· 68 67 69 68 /** 70 69 * enter_from_user_mode - Establish state when coming from user mode 70 + * @regs: Pointer to current's pt_regs 71 71 * 72 72 * Syscall/interrupt entry disables interrupts, but user mode is traced as 73 73 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 
··· 197 195 */ 198 196 void arch_do_signal_or_restart(struct pt_regs *regs); 199 197 200 - /** 201 - * exit_to_user_mode_loop - do any pending work before leaving to user space 202 - */ 203 - unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 204 - unsigned long ti_work); 198 + /* Handle pending TIF work */ 199 + unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); 205 200 206 201 /** 207 - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 202 + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 208 203 * @regs: Pointer to pt_regs on entry stack 209 204 * 210 205 * 1) check that interrupts are disabled ··· 209 210 * 3) call exit_to_user_mode_loop() if any flags from 210 211 * EXIT_TO_USER_MODE_WORK are set 211 212 * 4) check that interrupts are still disabled 213 + * 214 + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below 212 215 */ 213 - static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) 216 + static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) 214 217 { 215 218 unsigned long ti_work; 216 219 ··· 226 225 ti_work = exit_to_user_mode_loop(regs, ti_work); 227 226 228 227 arch_exit_to_user_mode_prepare(regs, ti_work); 228 + } 229 229 230 + static __always_inline void __exit_to_user_mode_validate(void) 231 + { 230 232 /* Ensure that kernel state is sane for a return to userspace */ 231 233 kmap_assert_nomap(); 232 234 lockdep_assert_irqs_disabled(); 233 235 lockdep_sys_exit(); 236 + } 237 + 238 + /* Temporary workaround to keep ARM64 alive */ 239 + static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) 240 + { 241 + __exit_to_user_mode_prepare(regs); 242 + rseq_exit_to_user_mode_legacy(); 243 + __exit_to_user_mode_validate(); 244 + } 245 + 246 + /** 247 + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 248 + * @regs: Pointer to pt_regs on entry stack 249 
+ * 250 + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for 251 + * syscalls and interrupts. 252 + */ 253 + static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) 254 + { 255 + __exit_to_user_mode_prepare(regs); 256 + rseq_syscall_exit_to_user_mode(); 257 + __exit_to_user_mode_validate(); 258 + } 259 + 260 + /** 261 + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 262 + * @regs: Pointer to pt_regs on entry stack 263 + * 264 + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for 265 + * syscalls and interrupts. 266 + */ 267 + static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) 268 + { 269 + __exit_to_user_mode_prepare(regs); 270 + rseq_irqentry_exit_to_user_mode(); 271 + __exit_to_user_mode_validate(); 234 272 } 235 273 236 274 /** ··· 314 274 * 315 275 * The function establishes state (lockdep, RCU (context tracking), tracing) 316 276 */ 317 - void irqentry_enter_from_user_mode(struct pt_regs *regs); 277 + static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) 278 + { 279 + enter_from_user_mode(regs); 280 + rseq_note_user_irq_entry(); 281 + } 318 282 319 283 /** 320 284 * irqentry_exit_to_user_mode - Interrupt exit work ··· 333 289 * Interrupt exit is not invoking #1 which is the syscall specific one time 334 290 * work. 335 291 */ 336 - void irqentry_exit_to_user_mode(struct pt_regs *regs); 292 + static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) 293 + { 294 + instrumentation_begin(); 295 + irqentry_exit_to_user_mode_prepare(regs); 296 + instrumentation_end(); 297 + exit_to_user_mode(); 298 + } 337 299 338 300 #ifndef irqentry_state 339 301 /** ··· 404 354 * Conditional reschedule with additional sanity checks. 
405 355 */ 406 356 void raw_irqentry_exit_cond_resched(void); 357 + 407 358 #ifdef CONFIG_PREEMPT_DYNAMIC 408 359 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 409 360 #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched

+2 -7

include/linux/irq_work.h

··· 2 2 #ifndef _LINUX_IRQ_WORK_H 3 3 #define _LINUX_IRQ_WORK_H 4 4 5 - #include <linux/smp_types.h> 5 + #include <linux/irq_work_types.h> 6 6 #include <linux/rcuwait.h> 7 + #include <linux/smp_types.h> 7 8 8 9 /* 9 10 * An entry can be in one of four states: ··· 14 13 * pending next, 3 -> {busy} : queued, pending callback 15 14 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 16 15 */ 17 - 18 - struct irq_work { 19 - struct __call_single_node node; 20 - void (*func)(struct irq_work *); 21 - struct rcuwait irqwait; 22 - }; 23 16 24 17 #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ 25 18 .node = { .u_flags = (_flags), }, \

+14

include/linux/irq_work_types.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_IRQ_WORK_TYPES_H 3 + #define _LINUX_IRQ_WORK_TYPES_H 4 + 5 + #include <linux/smp_types.h> 6 + #include <linux/types.h> 7 + 8 + struct irq_work { 9 + struct __call_single_node node; 10 + void (*func)(struct irq_work *); 11 + struct rcuwait irqwait; 12 + }; 13 + 14 + #endif

-25

include/linux/mm.h

··· 2408 2408 /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ 2409 2409 #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) 2410 2410 2411 - #ifdef CONFIG_SCHED_MM_CID 2412 - void sched_mm_cid_before_execve(struct task_struct *t); 2413 - void sched_mm_cid_after_execve(struct task_struct *t); 2414 - void sched_mm_cid_fork(struct task_struct *t); 2415 - void sched_mm_cid_exit_signals(struct task_struct *t); 2416 - static inline int task_mm_cid(struct task_struct *t) 2417 - { 2418 - return t->mm_cid; 2419 - } 2420 - #else 2421 - static inline void sched_mm_cid_before_execve(struct task_struct *t) { } 2422 - static inline void sched_mm_cid_after_execve(struct task_struct *t) { } 2423 - static inline void sched_mm_cid_fork(struct task_struct *t) { } 2424 - static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } 2425 - static inline int task_mm_cid(struct task_struct *t) 2426 - { 2427 - /* 2428 - * Use the processor id as a fall-back when the mm cid feature is 2429 - * disabled. This provides functional per-cpu data structure accesses 2430 - * in user-space, althrough it won't provide the memory usage benefits. 2431 - */ 2432 - return raw_smp_processor_id(); 2433 - } 2434 - #endif 2435 - 2436 2411 #ifdef CONFIG_MMU 2437 2412 extern bool can_do_mlock(void); 2438 2413 #else

+13 -115

include/linux/mm_types.h

··· 20 20 #include <linux/seqlock.h> 21 21 #include <linux/percpu_counter.h> 22 22 #include <linux/types.h> 23 + #include <linux/rseq_types.h> 23 24 #include <linux/bitmap.h> 24 25 25 26 #include <asm/mmu.h> ··· 923 922 #define vma_policy(vma) NULL 924 923 #endif 925 924 926 - #ifdef CONFIG_SCHED_MM_CID 927 - struct mm_cid { 928 - u64 time; 929 - int cid; 930 - int recent_cid; 931 - }; 932 - #endif 933 - 934 925 /* 935 926 * Opaque type representing current mm_struct flag state. Must be accessed via 936 927 * mm_flags_xxx() helper functions. ··· 984 991 */ 985 992 atomic_t mm_users; 986 993 987 - #ifdef CONFIG_SCHED_MM_CID 988 - /** 989 - * @pcpu_cid: Per-cpu current cid. 990 - * 991 - * Keep track of the currently allocated mm_cid for each cpu. 992 - * The per-cpu mm_cid values are serialized by their respective 993 - * runqueue locks. 994 - */ 995 - struct mm_cid __percpu *pcpu_cid; 996 - /* 997 - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 998 - * 999 - * When the next mm_cid scan is due (in jiffies). 1000 - */ 1001 - unsigned long mm_cid_next_scan; 1002 - /** 1003 - * @nr_cpus_allowed: Number of CPUs allowed for mm. 1004 - * 1005 - * Number of CPUs allowed in the union of all mm's 1006 - * threads allowed CPUs. 1007 - */ 1008 - unsigned int nr_cpus_allowed; 1009 - /** 1010 - * @max_nr_cid: Maximum number of allowed concurrency 1011 - * IDs allocated. 1012 - * 1013 - * Track the highest number of allowed concurrency IDs 1014 - * allocated for the mm. 1015 - */ 1016 - atomic_t max_nr_cid; 1017 - /** 1018 - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. 1019 - * 1020 - * Provide mutual exclusion for mm cpus_allowed and 1021 - * mm nr_cpus_allowed updates. 
1022 - */ 1023 - raw_spinlock_t cpus_allowed_lock; 1024 - #endif 994 + /* MM CID related storage */ 995 + struct mm_mm_cid mm_cid; 996 + 1025 997 #ifdef CONFIG_MMU 1026 998 atomic_long_t pgtables_bytes; /* size of all page tables */ 1027 999 #endif ··· 1328 1370 } 1329 1371 1330 1372 #ifdef CONFIG_SCHED_MM_CID 1331 - 1332 - enum mm_cid_state { 1333 - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ 1334 - MM_CID_LAZY_PUT = (1U << 31), 1335 - }; 1336 - 1337 - static inline bool mm_cid_is_unset(int cid) 1338 - { 1339 - return cid == MM_CID_UNSET; 1340 - } 1341 - 1342 - static inline bool mm_cid_is_lazy_put(int cid) 1343 - { 1344 - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 1345 - } 1346 - 1347 - static inline bool mm_cid_is_valid(int cid) 1348 - { 1349 - return !(cid & MM_CID_LAZY_PUT); 1350 - } 1351 - 1352 - static inline int mm_cid_set_lazy_put(int cid) 1353 - { 1354 - return cid | MM_CID_LAZY_PUT; 1355 - } 1356 - 1357 - static inline int mm_cid_clear_lazy_put(int cid) 1358 - { 1359 - return cid & ~MM_CID_LAZY_PUT; 1360 - } 1361 - 1362 1373 /* 1363 1374 * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 1364 1375 */ ··· 1342 1415 } 1343 1416 1344 1417 /* Accessor for struct mm_struct's cidmask. 
*/ 1345 - static inline cpumask_t *mm_cidmask(struct mm_struct *mm) 1418 + static inline unsigned long *mm_cidmask(struct mm_struct *mm) 1346 1419 { 1347 1420 unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); 1348 1421 1349 1422 /* Skip mm_cpus_allowed */ 1350 1423 cid_bitmap += cpumask_size(); 1351 - return (struct cpumask *)cid_bitmap; 1424 + return (unsigned long *)cid_bitmap; 1352 1425 } 1353 1426 1354 - static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 1355 - { 1356 - int i; 1357 - 1358 - for_each_possible_cpu(i) { 1359 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 1360 - 1361 - pcpu_cid->cid = MM_CID_UNSET; 1362 - pcpu_cid->recent_cid = MM_CID_UNSET; 1363 - pcpu_cid->time = 0; 1364 - } 1365 - mm->nr_cpus_allowed = p->nr_cpus_allowed; 1366 - atomic_set(&mm->max_nr_cid, 0); 1367 - raw_spin_lock_init(&mm->cpus_allowed_lock); 1368 - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 1369 - cpumask_clear(mm_cidmask(mm)); 1370 - } 1427 + void mm_init_cid(struct mm_struct *mm, struct task_struct *p); 1371 1428 1372 1429 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) 1373 1430 { 1374 - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); 1375 - if (!mm->pcpu_cid) 1431 + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); 1432 + if (!mm->mm_cid.pcpu) 1376 1433 return -ENOMEM; 1377 1434 mm_init_cid(mm, p); 1378 1435 return 0; ··· 1365 1454 1366 1455 static inline void mm_destroy_cid(struct mm_struct *mm) 1367 1456 { 1368 - free_percpu(mm->pcpu_cid); 1369 - mm->pcpu_cid = NULL; 1457 + free_percpu(mm->mm_cid.pcpu); 1458 + mm->mm_cid.pcpu = NULL; 1370 1459 } 1371 1460 1372 1461 static inline unsigned int mm_cid_size(void) 1373 1462 { 1374 - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ 1463 + /* mm_cpus_allowed(), mm_cidmask(). 
*/ 1464 + return cpumask_size() + bitmap_size(num_possible_cpus()); 1375 1465 } 1376 1466 1377 - static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) 1378 - { 1379 - struct cpumask *mm_allowed = mm_cpus_allowed(mm); 1380 - 1381 - if (!mm) 1382 - return; 1383 - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ 1384 - raw_spin_lock(&mm->cpus_allowed_lock); 1385 - cpumask_or(mm_allowed, mm_allowed, cpumask); 1386 - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); 1387 - raw_spin_unlock(&mm->cpus_allowed_lock); 1388 - } 1389 1467 #else /* CONFIG_SCHED_MM_CID */ 1390 1468 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } 1391 1469 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } 1392 1470 static inline void mm_destroy_cid(struct mm_struct *mm) { } 1393 - 1394 1471 static inline unsigned int mm_cid_size(void) 1395 1472 { 1396 1473 return 0; 1397 1474 } 1398 - static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } 1399 1475 #endif /* CONFIG_SCHED_MM_CID */ 1400 1476 1401 1477 struct mmu_gather;

+1 -1

include/linux/resume_user_mode.h

··· 59 59 mem_cgroup_handle_over_high(GFP_KERNEL); 60 60 blkcg_maybe_throttle_current(); 61 61 62 - rseq_handle_notify_resume(NULL, regs); 62 + rseq_handle_slowpath(regs); 63 63 } 64 64 65 65 #endif /* LINUX_RESUME_USER_MODE_H */

+127 -97

include/linux/rseq.h

··· 3 3 #define _LINUX_RSEQ_H 4 4 5 5 #ifdef CONFIG_RSEQ 6 - 7 - #include <linux/preempt.h> 8 6 #include <linux/sched.h> 9 7 10 - #ifdef CONFIG_MEMBARRIER 11 - # define RSEQ_EVENT_GUARD irq 12 - #else 13 - # define RSEQ_EVENT_GUARD preempt 14 - #endif 8 + #include <uapi/linux/rseq.h> 9 + 10 + void __rseq_handle_slowpath(struct pt_regs *regs); 11 + 12 + /* Invoked from resume_user_mode_work() */ 13 + static inline void rseq_handle_slowpath(struct pt_regs *regs) 14 + { 15 + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { 16 + if (current->rseq.event.slowpath) 17 + __rseq_handle_slowpath(regs); 18 + } else { 19 + /* '&' is intentional to spare one conditional branch */ 20 + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) 21 + __rseq_handle_slowpath(regs); 22 + } 23 + } 24 + 25 + void __rseq_signal_deliver(int sig, struct pt_regs *regs); 15 26 16 27 /* 17 - * Map the event mask on the user-space ABI enum rseq_cs_flags 18 - * for direct mask checks. 28 + * Invoked from signal delivery to fixup based on the register context before 29 + * switching to the signal delivery context. 
19 30 */ 20 - enum rseq_event_mask_bits { 21 - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, 22 - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, 23 - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, 24 - }; 25 - 26 - enum rseq_event_mask { 27 - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), 28 - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), 29 - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), 30 - }; 31 - 32 - static inline void rseq_set_notify_resume(struct task_struct *t) 31 + static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) 33 32 { 34 - if (t->rseq) 35 - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 33 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 34 + /* '&' is intentional to spare one conditional branch */ 35 + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) 36 + __rseq_signal_deliver(ksig->sig, regs); 37 + } else { 38 + if (current->rseq.event.has_rseq) 39 + __rseq_signal_deliver(ksig->sig, regs); 40 + } 36 41 } 37 42 38 - void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 39 - 40 - static inline void rseq_handle_notify_resume(struct ksignal *ksig, 41 - struct pt_regs *regs) 43 + static inline void rseq_raise_notify_resume(struct task_struct *t) 42 44 { 43 - if (current->rseq) 44 - __rseq_handle_notify_resume(ksig, regs); 45 + set_tsk_thread_flag(t, TIF_RSEQ); 45 46 } 46 47 47 - static inline void rseq_signal_deliver(struct ksignal *ksig, 48 - struct pt_regs *regs) 48 + /* Invoked from context switch to force evaluation on exit to user */ 49 + static __always_inline void rseq_sched_switch_event(struct task_struct *t) 49 50 { 50 - scoped_guard(RSEQ_EVENT_GUARD) 51 - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); 52 - rseq_handle_notify_resume(ksig, regs); 51 + struct rseq_event *ev = &t->rseq.event; 52 + 53 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 54 + /* 55 + * Avoid a boat load of conditionals by 
using simple logic 56 + * to determine whether NOTIFY_RESUME needs to be raised. 57 + * 58 + * It's required when the CPU or MM CID has changed or 59 + * the entry was from user space. 60 + */ 61 + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; 62 + 63 + if (raise) { 64 + ev->sched_switch = true; 65 + rseq_raise_notify_resume(t); 66 + } 67 + } else { 68 + if (ev->has_rseq) { 69 + t->rseq.event.sched_switch = true; 70 + rseq_raise_notify_resume(t); 71 + } 72 + } 53 73 } 54 74 55 - /* rseq_preempt() requires preemption to be disabled. */ 56 - static inline void rseq_preempt(struct task_struct *t) 75 + /* 76 + * Invoked from __set_task_cpu() when a task migrates or from 77 + * mm_cid_schedin() when the CID changes to enforce an IDs update. 78 + * 79 + * This does not raise TIF_NOTIFY_RESUME as that happens in 80 + * rseq_sched_switch_event(). 81 + */ 82 + static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) 57 83 { 58 - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); 59 - rseq_set_notify_resume(t); 84 + t->rseq.event.ids_changed = true; 60 85 } 61 86 62 - /* rseq_migrate() requires preemption to be disabled. */ 63 - static inline void rseq_migrate(struct task_struct *t) 87 + /* Enforce a full update after RSEQ registration and when execve() failed */ 88 + static inline void rseq_force_update(void) 64 89 { 65 - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); 66 - rseq_set_notify_resume(t); 90 + if (current->rseq.event.has_rseq) { 91 + current->rseq.event.ids_changed = true; 92 + current->rseq.event.sched_switch = true; 93 + rseq_raise_notify_resume(current); 94 + } 95 + } 96 + 97 + /* 98 + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, 99 + * which clears TIF_NOTIFY_RESUME on architectures that don't use the 100 + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. 
101 + * 102 + * To avoid updating user space RSEQ in that case just to do it eventually 103 + * again before returning to user space, __rseq_handle_slowpath() 104 + * does nothing when invoked with NULL register state. 105 + * 106 + * After returning from guest mode, before exiting to userspace, hypervisors 107 + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. 108 + */ 109 + static inline void rseq_virt_userspace_exit(void) 110 + { 111 + /* 112 + * The generic optimization for deferring RSEQ updates until the next 113 + * exit relies on having a dedicated TIF_RSEQ. 114 + */ 115 + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && 116 + current->rseq.event.sched_switch) 117 + rseq_raise_notify_resume(current); 118 + } 119 + 120 + static inline void rseq_reset(struct task_struct *t) 121 + { 122 + memset(&t->rseq, 0, sizeof(t->rseq)); 123 + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; 124 + } 125 + 126 + static inline void rseq_execve(struct task_struct *t) 127 + { 128 + rseq_reset(t); 67 129 } 68 130 69 131 /* 70 132 * If parent process has a registered restartable sequences area, the 71 133 * child inherits. Unregister rseq for a clone with CLONE_VM set. 134 + * 135 + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault 136 + * on the COW page on exit to user space, when the child stays on the same 137 + * CPU as the parent. That's obviously not guaranteed, but in overcommit 138 + * scenarios it is more likely and optimizes for the fork/exec case without 139 + * taking the fault. 
72 140 */ 73 141 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) 74 142 { 75 - if (clone_flags & CLONE_VM) { 76 - t->rseq = NULL; 77 - t->rseq_len = 0; 78 - t->rseq_sig = 0; 79 - t->rseq_event_mask = 0; 80 - } else { 143 + if (clone_flags & CLONE_VM) 144 + rseq_reset(t); 145 + else 81 146 t->rseq = current->rseq; 82 - t->rseq_len = current->rseq_len; 83 - t->rseq_sig = current->rseq_sig; 84 - t->rseq_event_mask = current->rseq_event_mask; 85 - } 86 147 } 87 148 88 - static inline void rseq_execve(struct task_struct *t) 89 - { 90 - t->rseq = NULL; 91 - t->rseq_len = 0; 92 - t->rseq_sig = 0; 93 - t->rseq_event_mask = 0; 94 - } 95 - 96 - #else 97 - 98 - static inline void rseq_set_notify_resume(struct task_struct *t) 99 - { 100 - } 101 - static inline void rseq_handle_notify_resume(struct ksignal *ksig, 102 - struct pt_regs *regs) 103 - { 104 - } 105 - static inline void rseq_signal_deliver(struct ksignal *ksig, 106 - struct pt_regs *regs) 107 - { 108 - } 109 - static inline void rseq_preempt(struct task_struct *t) 110 - { 111 - } 112 - static inline void rseq_migrate(struct task_struct *t) 113 - { 114 - } 115 - static inline void rseq_fork(struct task_struct *t, u64 clone_flags) 116 - { 117 - } 118 - static inline void rseq_execve(struct task_struct *t) 119 - { 120 - } 121 - 122 - #endif 149 + #else /* CONFIG_RSEQ */ 150 + static inline void rseq_handle_slowpath(struct pt_regs *regs) { } 151 + static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 152 + static inline void rseq_sched_switch_event(struct task_struct *t) { } 153 + static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } 154 + static inline void rseq_force_update(void) { } 155 + static inline void rseq_virt_userspace_exit(void) { } 156 + static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } 157 + static inline void rseq_execve(struct task_struct *t) { } 158 + #endif /* !CONFIG_RSEQ */ 123 159 124 160 #ifdef 
CONFIG_DEBUG_RSEQ 125 - 126 161 void rseq_syscall(struct pt_regs *regs); 127 - 128 - #else 129 - 130 - static inline void rseq_syscall(struct pt_regs *regs) 131 - { 132 - } 133 - 134 - #endif 162 + #else /* CONFIG_DEBUG_RSEQ */ 163 + static inline void rseq_syscall(struct pt_regs *regs) { } 164 + #endif /* !CONFIG_DEBUG_RSEQ */ 135 165 136 166 #endif /* _LINUX_RSEQ_H */

+616

include/linux/rseq_entry.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_RSEQ_ENTRY_H 3 + #define _LINUX_RSEQ_ENTRY_H 4 + 5 + /* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ 6 + #ifdef CONFIG_RSEQ_STATS 7 + #include <linux/percpu.h> 8 + 9 + struct rseq_stats { 10 + unsigned long exit; 11 + unsigned long signal; 12 + unsigned long slowpath; 13 + unsigned long fastpath; 14 + unsigned long ids; 15 + unsigned long cs; 16 + unsigned long clear; 17 + unsigned long fixup; 18 + }; 19 + 20 + DECLARE_PER_CPU(struct rseq_stats, rseq_stats); 21 + 22 + /* 23 + * Slow path has interrupts and preemption enabled, but the fast path 24 + * runs with interrupts disabled so there is no point in having the 25 + * preemption checks implied in __this_cpu_inc() for every operation. 26 + */ 27 + #ifdef RSEQ_BUILD_SLOW_PATH 28 + #define rseq_stat_inc(which) this_cpu_inc((which)) 29 + #else 30 + #define rseq_stat_inc(which) raw_cpu_inc((which)) 31 + #endif 32 + 33 + #else /* CONFIG_RSEQ_STATS */ 34 + #define rseq_stat_inc(x) do { } while (0) 35 + #endif /* !CONFIG_RSEQ_STATS */ 36 + 37 + #ifdef CONFIG_RSEQ 38 + #include <linux/jump_label.h> 39 + #include <linux/rseq.h> 40 + #include <linux/uaccess.h> 41 + 42 + #include <linux/tracepoint-defs.h> 43 + 44 + #ifdef CONFIG_TRACEPOINTS 45 + DECLARE_TRACEPOINT(rseq_update); 46 + DECLARE_TRACEPOINT(rseq_ip_fixup); 47 + void __rseq_trace_update(struct task_struct *t); 48 + void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 49 + unsigned long offset, unsigned long abort_ip); 50 + 51 + static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) 52 + { 53 + if (tracepoint_enabled(rseq_update) && ids) 54 + __rseq_trace_update(t); 55 + } 56 + 57 + static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 58 + unsigned long offset, unsigned long abort_ip) 59 + { 60 + if (tracepoint_enabled(rseq_ip_fixup)) 61 + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 62 + } 63 + 64 + 
#else /* CONFIG_TRACEPOINTS */ 65 + static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } 66 + static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 67 + unsigned long offset, unsigned long abort_ip) { } 68 + #endif /* !CONFIG_TRACEPOINTS */ 69 + 70 + DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 71 + 72 + #ifdef RSEQ_BUILD_SLOW_PATH 73 + #define rseq_inline 74 + #else 75 + #define rseq_inline __always_inline 76 + #endif 77 + 78 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); 79 + bool rseq_debug_validate_ids(struct task_struct *t); 80 + 81 + static __always_inline void rseq_note_user_irq_entry(void) 82 + { 83 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) 84 + current->rseq.event.user_irq = true; 85 + } 86 + 87 + /* 88 + * Check whether there is a valid critical section and whether the 89 + * instruction pointer in @regs is inside the critical section. 90 + * 91 + * - If the critical section is invalid, terminate the task. 92 + * 93 + * - If valid and the instruction pointer is inside, set it to the abort IP. 94 + * 95 + * - If valid and the instruction pointer is outside, clear the critical 96 + * section address. 97 + * 98 + * Returns true, if the section was valid and either fixup or clear was 99 + * done, false otherwise. 100 + * 101 + * In the failure case task::rseq_event::fatal is set when an invalid 102 + * section was found. It's clear when the failure was an unresolved page 103 + * fault. 104 + * 105 + * If inlined into the exit to user path with interrupts disabled, the 106 + * caller has to protect against page faults with pagefault_disable(). 107 + * 108 + * In preemptible task context this would be counterproductive as the page 109 + * faults could not be fully resolved. As a consequence unresolved page 110 + * faults in task context are fatal too. 
111 + */ 112 + 113 + #ifdef RSEQ_BUILD_SLOW_PATH 114 + /* 115 + * The debug version is put out of line, but kept here so the code stays 116 + * together. 117 + * 118 + * @csaddr has already been checked by the caller to be in user space 119 + */ 120 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, 121 + unsigned long csaddr) 122 + { 123 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 124 + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; 125 + unsigned long ip = instruction_pointer(regs); 126 + u64 __user *uc_head = (u64 __user *) ucs; 127 + u32 usig, __user *uc_sig; 128 + 129 + scoped_user_rw_access(ucs, efault) { 130 + /* 131 + * Evaluate the user pile and exit if one of the conditions 132 + * is not fulfilled. 133 + */ 134 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 135 + if (unlikely(start_ip >= tasksize)) 136 + goto die; 137 + /* If outside, just clear the critical section. */ 138 + if (ip < start_ip) 139 + goto clear; 140 + 141 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 142 + cs_end = start_ip + offset; 143 + /* Check for overflow and wraparound */ 144 + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) 145 + goto die; 146 + 147 + /* If not inside, clear it. */ 148 + if (ip >= cs_end) 149 + goto clear; 150 + 151 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 152 + /* Ensure it's "valid" */ 153 + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) 154 + goto die; 155 + /* Validate that the abort IP is not in the critical section */ 156 + if (unlikely(abort_ip - start_ip < offset)) 157 + goto die; 158 + 159 + /* 160 + * Check version and flags for 0. No point in emitting 161 + * deprecated warnings before dying. That could be done in 162 + * the slow path eventually, but *shrug*. 163 + */ 164 + unsafe_get_user(head, uc_head, efault); 165 + if (unlikely(head)) 166 + goto die; 167 + 168 + /* abort_ip - 4 is >= 0. 
See abort_ip check above */ 169 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 170 + unsafe_get_user(usig, uc_sig, efault); 171 + if (unlikely(usig != t->rseq.sig)) 172 + goto die; 173 + 174 + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ 175 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 176 + /* If not in interrupt from user context, let it die */ 177 + if (unlikely(!t->rseq.event.user_irq)) 178 + goto die; 179 + } 180 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 181 + instruction_pointer_set(regs, (unsigned long)abort_ip); 182 + rseq_stat_inc(rseq_stats.fixup); 183 + break; 184 + clear: 185 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 186 + rseq_stat_inc(rseq_stats.clear); 187 + abort_ip = 0ULL; 188 + } 189 + 190 + if (unlikely(abort_ip)) 191 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 192 + return true; 193 + die: 194 + t->rseq.event.fatal = true; 195 + efault: 196 + return false; 197 + } 198 + 199 + /* 200 + * On debug kernels validate that user space did not mess with it if the 201 + * debug branch is enabled. 202 + */ 203 + bool rseq_debug_validate_ids(struct task_struct *t) 204 + { 205 + struct rseq __user *rseq = t->rseq.usrptr; 206 + u32 cpu_id, uval, node_id; 207 + 208 + /* 209 + * On the first exit after registering the rseq region CPU ID is 210 + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! 211 + */ 212 + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 
213 + cpu_to_node(t->rseq.ids.cpu_id) : 0; 214 + 215 + scoped_user_read_access(rseq, efault) { 216 + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); 217 + if (cpu_id != t->rseq.ids.cpu_id) 218 + goto die; 219 + unsafe_get_user(uval, &rseq->cpu_id, efault); 220 + if (uval != cpu_id) 221 + goto die; 222 + unsafe_get_user(uval, &rseq->node_id, efault); 223 + if (uval != node_id) 224 + goto die; 225 + unsafe_get_user(uval, &rseq->mm_cid, efault); 226 + if (uval != t->rseq.ids.mm_cid) 227 + goto die; 228 + } 229 + return true; 230 + die: 231 + t->rseq.event.fatal = true; 232 + efault: 233 + return false; 234 + } 235 + 236 + #endif /* RSEQ_BUILD_SLOW_PATH */ 237 + 238 + /* 239 + * This only ensures that abort_ip is in the user address space and 240 + * validates that it is preceded by the signature. 241 + * 242 + * No other sanity checks are done here, that's what the debug code is for. 243 + */ 244 + static rseq_inline bool 245 + rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) 246 + { 247 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 248 + unsigned long ip = instruction_pointer(regs); 249 + unsigned long tasksize = TASK_SIZE; 250 + u64 start_ip, abort_ip, offset; 251 + u32 usig, __user *uc_sig; 252 + 253 + rseq_stat_inc(rseq_stats.cs); 254 + 255 + if (unlikely(csaddr >= tasksize)) { 256 + t->rseq.event.fatal = true; 257 + return false; 258 + } 259 + 260 + if (static_branch_unlikely(&rseq_debug_enabled)) 261 + return rseq_debug_update_user_cs(t, regs, csaddr); 262 + 263 + scoped_user_rw_access(ucs, efault) { 264 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 265 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 266 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 267 + 268 + /* 269 + * No sanity checks. If user space screwed it up, it can 270 + * keep the pieces. That's what debug code is for. 271 + * 272 + * If outside, just clear the critical section. 
273 + */ 274 + if (ip - start_ip >= offset) 275 + goto clear; 276 + 277 + /* 278 + * Two requirements for @abort_ip: 279 + * - Must be in user space as x86 IRET would happily return to 280 + * the kernel. 281 + * - The four bytes preceding the instruction at @abort_ip must 282 + * contain the signature. 283 + * 284 + * The latter protects against the following attack vector: 285 + * 286 + * An attacker with limited abilities to write, creates a critical 287 + * section descriptor, sets the abort IP to a library function or 288 + * some other ROP gadget and stores the address of the descriptor 289 + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP 290 + * protection. 291 + */ 292 + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) 293 + goto die; 294 + 295 + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ 296 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 297 + unsafe_get_user(usig, uc_sig, efault); 298 + if (unlikely(usig != t->rseq.sig)) 299 + goto die; 300 + 301 + /* Invalidate the critical section */ 302 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 303 + /* Update the instruction pointer */ 304 + instruction_pointer_set(regs, (unsigned long)abort_ip); 305 + rseq_stat_inc(rseq_stats.fixup); 306 + break; 307 + clear: 308 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 309 + rseq_stat_inc(rseq_stats.clear); 310 + abort_ip = 0ULL; 311 + } 312 + 313 + if (unlikely(abort_ip)) 314 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 315 + return true; 316 + die: 317 + t->rseq.event.fatal = true; 318 + efault: 319 + return false; 320 + } 321 + 322 + /* 323 + * Updates CPU ID, Node ID and MM CID and reads the critical section 324 + * address, when @csaddr != NULL. This allows to put the ID update and the 325 + * read under the same uaccess region to spare a separate begin/end. 
326 + * 327 + * As this is either invoked from a C wrapper with @csaddr = NULL or from 328 + * the fast path code with a valid pointer, a clever compiler should be 329 + * able to optimize the read out. Spares a duplicate implementation. 330 + * 331 + * Returns true, if the operation was successful, false otherwise. 332 + * 333 + * In the failure case task::rseq_event::fatal is set when invalid data 334 + * was found on debug kernels. It's clear when the failure was an unresolved page 335 + * fault. 336 + * 337 + * If inlined into the exit to user path with interrupts disabled, the 338 + * caller has to protect against page faults with pagefault_disable(). 339 + * 340 + * In preemptible task context this would be counterproductive as the page 341 + * faults could not be fully resolved. As a consequence unresolved page 342 + * faults in task context are fatal too. 343 + */ 344 + static rseq_inline 345 + bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, 346 + u32 node_id, u64 *csaddr) 347 + { 348 + struct rseq __user *rseq = t->rseq.usrptr; 349 + 350 + if (static_branch_unlikely(&rseq_debug_enabled)) { 351 + if (!rseq_debug_validate_ids(t)) 352 + return false; 353 + } 354 + 355 + scoped_user_rw_access(rseq, efault) { 356 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); 357 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); 358 + unsafe_put_user(node_id, &rseq->node_id, efault); 359 + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); 360 + if (csaddr) 361 + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); 362 + } 363 + 364 + /* Cache the new values */ 365 + t->rseq.ids.cpu_cid = ids->cpu_cid; 366 + rseq_stat_inc(rseq_stats.ids); 367 + rseq_trace_update(t, ids); 368 + return true; 369 + efault: 370 + return false; 371 + } 372 + 373 + /* 374 + * Update user space with new IDs and conditionally check whether the task 375 + * is in a critical section. 
376 + */ 377 + static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, 378 + struct rseq_ids *ids, u32 node_id) 379 + { 380 + u64 csaddr; 381 + 382 + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) 383 + return false; 384 + 385 + /* 386 + * On architectures which utilize the generic entry code this 387 + * allows to skip the critical section when the entry was not from 388 + * a user space interrupt, unless debug mode is enabled. 389 + */ 390 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 391 + if (!static_branch_unlikely(&rseq_debug_enabled)) { 392 + if (likely(!t->rseq.event.user_irq)) 393 + return true; 394 + } 395 + } 396 + if (likely(!csaddr)) 397 + return true; 398 + /* Sigh, this really needs to do work */ 399 + return rseq_update_user_cs(t, regs, csaddr); 400 + } 401 + 402 + /* 403 + * If you want to use this then convert your architecture to the generic 404 + * entry code. I'm tired of building workarounds for people who can't be 405 + * bothered to make the maintenance of generic infrastructure less 406 + * burdensome. Just sucking everything into the architecture code and 407 + * thereby making others chase the horrible hacks and keep them working is 408 + * neither acceptable nor sustainable. 
409 + */ 410 + #ifdef CONFIG_GENERIC_ENTRY 411 + 412 + /* 413 + * This is inlined into the exit path because: 414 + * 415 + * 1) It's a one time comparison in the fast path when there is no event to 416 + * handle 417 + * 418 + * 2) The access to the user space rseq memory (TLS) is unlikely to fault 419 + * so the straight inline operation is: 420 + * 421 + * - Four 32-bit stores only if CPU ID / MM CID need to be updated 422 + * - One 64-bit load to retrieve the critical section address 423 + * 424 + * 3) In the unlikely case that the critical section address is != NULL: 425 + * 426 + * - One 64-bit load to retrieve the start IP 427 + * - One 64-bit load to retrieve the offset for calculating the end 428 + * - One 64-bit load to retrieve the abort IP 429 + * - One 32-bit load to retrieve the signature 430 + * - One store to clear the critical section address 431 + * 432 + * The non-debug case implements only the minimal required checking. It 433 + * provides protection against a rogue abort IP in kernel space, which 434 + * would be exploitable at least on x86, and also against a rogue CS 435 + * descriptor by checking the signature at the abort IP. Any fallout from 436 + * invalid critical section descriptors is a user space problem. The debug 437 + * case provides the full set of checks and terminates the task if a 438 + * condition is not met. 439 + * 440 + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and 441 + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq 442 + * slow path there will handle the failure. 
443 + */ 444 + static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) 445 + { 446 + /* 447 + * Page faults need to be disabled as this is called with 448 + * interrupts disabled 449 + */ 450 + guard(pagefault)(); 451 + if (likely(!t->rseq.event.ids_changed)) { 452 + struct rseq __user *rseq = t->rseq.usrptr; 453 + /* 454 + * If IDs have not changed rseq_event::user_irq must be true 455 + * See rseq_sched_switch_event(). 456 + */ 457 + u64 csaddr; 458 + 459 + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) 460 + return false; 461 + 462 + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { 463 + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) 464 + return false; 465 + } 466 + return true; 467 + } 468 + 469 + struct rseq_ids ids = { 470 + .cpu_id = task_cpu(t), 471 + .mm_cid = task_mm_cid(t), 472 + }; 473 + u32 node_id = cpu_to_node(ids.cpu_id); 474 + 475 + return rseq_update_usr(t, regs, &ids, node_id); 476 + } 477 + 478 + static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) 479 + { 480 + struct task_struct *t = current; 481 + 482 + /* 483 + * If the task did not go through schedule or got the flag enforced 484 + * by the rseq syscall or execve, then nothing to do here. 485 + * 486 + * CPU ID and MM CID can only change when going through a context 487 + * switch. 488 + * 489 + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit 490 + * only when rseq_event::has_rseq is true. That conditional is 491 + * required to avoid setting the TIF bit if RSEQ is not registered 492 + * for a task. rseq_event::sched_switch is cleared when RSEQ is 493 + * unregistered by a task so it's sufficient to check for the 494 + * sched_switch bit alone. 495 + * 496 + * A sane compiler requires three instructions for the nothing to do 497 + * case including clearing the events, but your mileage might vary. 
498 + */ 499 + if (unlikely((t->rseq.event.sched_switch))) { 500 + rseq_stat_inc(rseq_stats.fastpath); 501 + 502 + if (unlikely(!rseq_exit_user_update(regs, t))) 503 + return true; 504 + } 505 + /* Clear state so next entry starts from a clean slate */ 506 + t->rseq.event.events = 0; 507 + return false; 508 + } 509 + 510 + /* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ 511 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 512 + static __always_inline bool test_tif_rseq(unsigned long ti_work) 513 + { 514 + return ti_work & _TIF_RSEQ; 515 + } 516 + 517 + static __always_inline void clear_tif_rseq(void) 518 + { 519 + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); 520 + clear_thread_flag(TIF_RSEQ); 521 + } 522 + #else 523 + static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } 524 + static __always_inline void clear_tif_rseq(void) { } 525 + #endif 526 + 527 + static __always_inline bool 528 + rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 529 + { 530 + if (likely(!test_tif_rseq(ti_work))) 531 + return false; 532 + 533 + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { 534 + current->rseq.event.slowpath = true; 535 + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 536 + return true; 537 + } 538 + 539 + clear_tif_rseq(); 540 + return false; 541 + } 542 + 543 + #else /* CONFIG_GENERIC_ENTRY */ 544 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 545 + { 546 + return false; 547 + } 548 + #endif /* !CONFIG_GENERIC_ENTRY */ 549 + 550 + static __always_inline void rseq_syscall_exit_to_user_mode(void) 551 + { 552 + struct rseq_event *ev = &current->rseq.event; 553 + 554 + rseq_stat_inc(rseq_stats.exit); 555 + 556 + /* Needed to remove the store for the !lockdep case */ 557 + if (IS_ENABLED(CONFIG_LOCKDEP)) { 558 + WARN_ON_ONCE(ev->sched_switch); 559 + ev->events = 0; 560 + } 561 + } 562 + 563 + static __always_inline void 
rseq_irqentry_exit_to_user_mode(void) 564 + { 565 + struct rseq_event *ev = &current->rseq.event; 566 + 567 + rseq_stat_inc(rseq_stats.exit); 568 + 569 + lockdep_assert_once(!ev->sched_switch); 570 + 571 + /* 572 + * Ensure that event (especially user_irq) is cleared when the 573 + * interrupt did not result in a schedule and therefore the 574 + * rseq processing could not clear it. 575 + */ 576 + ev->events = 0; 577 + } 578 + 579 + /* Required to keep ARM64 working */ 580 + static __always_inline void rseq_exit_to_user_mode_legacy(void) 581 + { 582 + struct rseq_event *ev = &current->rseq.event; 583 + 584 + rseq_stat_inc(rseq_stats.exit); 585 + 586 + if (static_branch_unlikely(&rseq_debug_enabled)) 587 + WARN_ON_ONCE(ev->sched_switch); 588 + 589 + /* 590 + * Ensure that event (especially user_irq) is cleared when the 591 + * interrupt did not result in a schedule and therefore the 592 + * rseq processing did not clear it. 593 + */ 594 + ev->events = 0; 595 + } 596 + 597 + void __rseq_debug_syscall_return(struct pt_regs *regs); 598 + 599 + static inline void rseq_debug_syscall_return(struct pt_regs *regs) 600 + { 601 + if (static_branch_unlikely(&rseq_debug_enabled)) 602 + __rseq_debug_syscall_return(regs); 603 + } 604 + #else /* CONFIG_RSEQ */ 605 + static inline void rseq_note_user_irq_entry(void) { } 606 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 607 + { 608 + return false; 609 + } 610 + static inline void rseq_syscall_exit_to_user_mode(void) { } 611 + static inline void rseq_irqentry_exit_to_user_mode(void) { } 612 + static inline void rseq_exit_to_user_mode_legacy(void) { } 613 + static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } 614 + #endif /* !CONFIG_RSEQ */ 615 + 616 + #endif /* _LINUX_RSEQ_ENTRY_H */

+164

include/linux/rseq_types.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_RSEQ_TYPES_H 3 + #define _LINUX_RSEQ_TYPES_H 4 + 5 + #include <linux/irq_work_types.h> 6 + #include <linux/types.h> 7 + #include <linux/workqueue_types.h> 8 + 9 + #ifdef CONFIG_RSEQ 10 + struct rseq; 11 + 12 + /** 13 + * struct rseq_event - Storage for rseq related event management 14 + * @all: Compound to initialize and clear the data efficiently 15 + * @events: Compound to access events with a single load/store 16 + * @sched_switch: True if the task was scheduled and needs update on 17 + * exit to user 18 + * @ids_changed: Indicator that IDs need to be updated 19 + * @user_irq: True on interrupt entry from user mode 20 + * @has_rseq: True if the task has a rseq pointer installed 21 + * @error: Compound error code for the slow path to analyze 22 + * @fatal: User space data corrupted or invalid 23 + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME 24 + * is required 25 + * 26 + * @sched_switch and @ids_changed must be adjacent and the combo must be 27 + * 16bit aligned to allow a single store, when both are set at the same 28 + * time in the scheduler. 29 + */ 30 + struct rseq_event { 31 + union { 32 + u64 all; 33 + struct { 34 + union { 35 + u32 events; 36 + struct { 37 + u8 sched_switch; 38 + u8 ids_changed; 39 + u8 user_irq; 40 + }; 41 + }; 42 + 43 + u8 has_rseq; 44 + u8 __pad; 45 + union { 46 + u16 error; 47 + struct { 48 + u8 fatal; 49 + u8 slowpath; 50 + }; 51 + }; 52 + }; 53 + }; 54 + }; 55 + 56 + /** 57 + * struct rseq_ids - Cache for ids, which need to be updated 58 + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the 59 + * compiler emit a single compare on 64-bit 60 + * @cpu_id: The CPU ID which was written last to user space 61 + * @mm_cid: The MM CID which was written last to user space 62 + * 63 + * @cpu_id and @mm_cid are updated when the data is written to user space. 
64 + */ 65 + struct rseq_ids { 66 + union { 67 + u64 cpu_cid; 68 + struct { 69 + u32 cpu_id; 70 + u32 mm_cid; 71 + }; 72 + }; 73 + }; 74 + 75 + /** 76 + * struct rseq_data - Storage for all rseq related data 77 + * @usrptr: Pointer to the registered user space RSEQ memory 78 + * @len: Length of the RSEQ region 79 + * @sig: Signature of critical section abort IPs 80 + * @event: Storage for event management 81 + * @ids: Storage for cached CPU ID and MM CID 82 + */ 83 + struct rseq_data { 84 + struct rseq __user *usrptr; 85 + u32 len; 86 + u32 sig; 87 + struct rseq_event event; 88 + struct rseq_ids ids; 89 + }; 90 + 91 + #else /* CONFIG_RSEQ */ 92 + struct rseq_data { }; 93 + #endif /* !CONFIG_RSEQ */ 94 + 95 + #ifdef CONFIG_SCHED_MM_CID 96 + 97 + #define MM_CID_UNSET BIT(31) 98 + #define MM_CID_ONCPU BIT(30) 99 + #define MM_CID_TRANSIT BIT(29) 100 + 101 + /** 102 + * struct sched_mm_cid - Storage for per task MM CID data 103 + * @active: MM CID is active for the task 104 + * @cid: The CID associated to the task either permanently or 105 + * borrowed from the CPU 106 + */ 107 + struct sched_mm_cid { 108 + unsigned int active; 109 + unsigned int cid; 110 + }; 111 + 112 + /** 113 + * struct mm_cid_pcpu - Storage for per CPU MM_CID data 114 + * @cid: The CID associated to the CPU either permanently or 115 + * while a task with a CID is running 116 + */ 117 + struct mm_cid_pcpu { 118 + unsigned int cid; 119 + }____cacheline_aligned_in_smp; 120 + 121 + /** 122 + * struct mm_mm_cid - Storage for per MM CID data 123 + * @pcpu: Per CPU storage for CIDs associated to a CPU 124 + * @percpu: Set, when CIDs are in per CPU mode 125 + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase 126 + * @max_cids: The exclusive maximum CID value for allocation and convergence 127 + * @irq_work: irq_work to handle the affinity mode change case 128 + * @work: Regular work to handle the affinity mode change case 129 + * @lock: Spinlock to protect against affinity setting 
which can't take @mutex 130 + * @mutex: Mutex to serialize forks and exits related to this mm 131 + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map 132 + * is growth only. 133 + * @users: The number of tasks sharing this MM. Separate from mm::mm_users 134 + * as that is modified by mmget()/mm_put() by other entities which 135 + * do not actually share the MM. 136 + * @pcpu_thrs: Threshold for switching back from per CPU mode 137 + * @update_deferred: A deferred switch back to per task mode is pending. 138 + */ 139 + struct mm_mm_cid { 140 + /* Hotpath read mostly members */ 141 + struct mm_cid_pcpu __percpu *pcpu; 142 + unsigned int percpu; 143 + unsigned int transit; 144 + unsigned int max_cids; 145 + 146 + /* Rarely used. Moves @lock and @mutex into the second cacheline */ 147 + struct irq_work irq_work; 148 + struct work_struct work; 149 + 150 + raw_spinlock_t lock; 151 + struct mutex mutex; 152 + 153 + /* Low frequency modified */ 154 + unsigned int nr_cpus_allowed; 155 + unsigned int users; 156 + unsigned int pcpu_thrs; 157 + unsigned int update_deferred; 158 + }____cacheline_aligned_in_smp; 159 + #else /* CONFIG_SCHED_MM_CID */ 160 + struct mm_mm_cid { }; 161 + struct sched_mm_cid { }; 162 + #endif /* !CONFIG_SCHED_MM_CID */ 163 + 164 + #endif

+29 -28

include/linux/sched.h

··· 41 41 #include <linux/task_io_accounting.h> 42 42 #include <linux/posix-timers_types.h> 43 43 #include <linux/restart_block.h> 44 - #include <uapi/linux/rseq.h> 44 + #include <linux/rseq_types.h> 45 45 #include <linux/seqlock_types.h> 46 46 #include <linux/kcsan.h> 47 47 #include <linux/rv.h> ··· 1406 1406 unsigned long numa_pages_migrated; 1407 1407 #endif /* CONFIG_NUMA_BALANCING */ 1408 1408 1409 - #ifdef CONFIG_RSEQ 1410 - struct rseq __user *rseq; 1411 - u32 rseq_len; 1412 - u32 rseq_sig; 1413 - /* 1414 - * RmW on rseq_event_mask must be performed atomically 1415 - * with respect to preemption. 1416 - */ 1417 - unsigned long rseq_event_mask; 1418 - # ifdef CONFIG_DEBUG_RSEQ 1419 - /* 1420 - * This is a place holder to save a copy of the rseq fields for 1421 - * validation of read-only fields. The struct rseq has a 1422 - * variable-length array at the end, so it cannot be used 1423 - * directly. Reserve a size large enough for the known fields. 1424 - */ 1425 - char rseq_fields[sizeof(struct rseq)]; 1426 - # endif 1427 - #endif 1428 - 1429 - #ifdef CONFIG_SCHED_MM_CID 1430 - int mm_cid; /* Current cid in mm */ 1431 - int last_mm_cid; /* Most recent cid in mm */ 1432 - int migrate_from_cpu; 1433 - int mm_cid_active; /* Whether cid bitmap is active */ 1434 - struct callback_head cid_work; 1435 - #endif 1409 + struct rseq_data rseq; 1410 + struct sched_mm_cid mm_cid; 1436 1411 1437 1412 struct tlbflush_unmap_batch tlb_ubc; 1438 1413 ··· 2298 2323 #else 2299 2324 #define alloc_tag_save(_tag) NULL 2300 2325 #define alloc_tag_restore(_tag, _old) do {} while (0) 2326 + #endif 2327 + 2328 + /* Avoids recursive inclusion hell */ 2329 + #ifdef CONFIG_SCHED_MM_CID 2330 + void sched_mm_cid_before_execve(struct task_struct *t); 2331 + void sched_mm_cid_after_execve(struct task_struct *t); 2332 + void sched_mm_cid_fork(struct task_struct *t); 2333 + void sched_mm_cid_exit(struct task_struct *t); 2334 + static __always_inline int task_mm_cid(struct task_struct *t) 2335 + 
{ 2336 + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); 2337 + } 2338 + #else 2339 + static inline void sched_mm_cid_before_execve(struct task_struct *t) { } 2340 + static inline void sched_mm_cid_after_execve(struct task_struct *t) { } 2341 + static inline void sched_mm_cid_fork(struct task_struct *t) { } 2342 + static inline void sched_mm_cid_exit(struct task_struct *t) { } 2343 + static __always_inline int task_mm_cid(struct task_struct *t) 2344 + { 2345 + /* 2346 + * Use the processor id as a fall-back when the mm cid feature is 2347 + * disabled. This provides functional per-cpu data structure accesses 2348 + * in user-space, although it won't provide the memory usage benefits. 2349 + */ 2350 + return task_cpu(t); 2351 + } 2301 2352 #endif 2302 2353 2303 2354 #ifndef MODULE

+5

include/linux/thread_info.h

··· 67 67 #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED 68 68 #endif 69 69 70 + #ifndef TIF_RSEQ 71 + # define TIF_RSEQ TIF_NOTIFY_RESUME 72 + # define _TIF_RSEQ _TIF_NOTIFY_RESUME 73 + #endif 74 + 70 75 #ifdef __KERNEL__ 71 76 72 77 #ifndef arch_set_restart_data

+2 -2

include/trace/events/rseq.h

··· 21 21 ), 22 22 23 23 TP_fast_assign( 24 - __entry->cpu_id = raw_smp_processor_id(); 24 + __entry->cpu_id = t->rseq.ids.cpu_id; 25 25 __entry->node_id = cpu_to_node(__entry->cpu_id); 26 - __entry->mm_cid = task_mm_cid(t); 26 + __entry->mm_cid = t->rseq.ids.mm_cid; 27 27 ), 28 28 29 29 TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,

+7 -14

include/uapi/linux/rseq.h

··· 114 114 /* 115 115 * Restartable sequences flags field. 116 116 * 117 - * This field should only be updated by the thread which 118 - * registered this data structure. Read by the kernel. 119 - * Mainly used for single-stepping through rseq critical sections 120 - * with debuggers. 121 - * 122 - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 123 - * Inhibit instruction sequence block restart on preemption 124 - * for this thread. 125 - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 126 - * Inhibit instruction sequence block restart on signal 127 - * delivery for this thread. 128 - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 129 - * Inhibit instruction sequence block restart on migration for 130 - * this thread. 117 + * This field was initially intended to allow event masking for 118 + * single-stepping through rseq critical sections with debuggers. 119 + * The kernel does not support this anymore and the relevant bits 120 + * are checked for being always false: 121 + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 122 + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 123 + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 131 124 */ 132 125 __u32 flags; 133 126

+27 -1

init/Kconfig

··· 1913 1913 1914 1914 If unsure, say Y. 1915 1915 1916 + config RSEQ_STATS 1917 + default n 1918 + bool "Enable lightweight statistics of restartable sequences" if EXPERT 1919 + depends on RSEQ && DEBUG_FS 1920 + help 1921 + Enable lightweight counters which expose information about the 1922 + frequency of RSEQ operations via debugfs. Mostly interesting for 1923 + kernel debugging or performance analysis. While lightweight it's 1924 + still adding code into the user/kernel mode transitions. 1925 + 1926 + If unsure, say N. 1927 + 1928 + config RSEQ_DEBUG_DEFAULT_ENABLE 1929 + default n 1930 + bool "Enable restartable sequences debug mode by default" if EXPERT 1931 + depends on RSEQ 1932 + help 1933 + This enables the static branch for debug mode of restartable 1934 + sequences. 1935 + 1936 + This also can be controlled on the kernel command line via the 1937 + command line parameter "rseq_debug=0/1" and through debugfs. 1938 + 1939 + If unsure, say N. 1940 + 1916 1941 config DEBUG_RSEQ 1917 1942 default n 1918 1943 bool "Enable debugging of rseq() system call" if EXPERT 1919 - depends on RSEQ && DEBUG_KERNEL 1944 + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY 1945 + select RSEQ_DEBUG_DEFAULT_ENABLE 1920 1946 help 1921 1947 Enable extra debugging checks for the rseq system call. 1922 1948

+3

init/init_task.c

··· 250 250 #ifdef CONFIG_SECCOMP_FILTER 251 251 .seccomp = { .filter_count = ATOMIC_INIT(0) }, 252 252 #endif 253 + #ifdef CONFIG_SCHED_MM_CID 254 + .mm_cid = { .cid = MM_CID_UNSET, }, 255 + #endif 253 256 }; 254 257 EXPORT_SYMBOL(init_task); 255 258

+19

kernel/cpu.c

··· 3085 3085 #ifdef CONFIG_INIT_ALL_POSSIBLE 3086 3086 struct cpumask __cpu_possible_mask __ro_after_init 3087 3087 = {CPU_BITS_ALL}; 3088 + unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; 3088 3089 #else 3089 3090 struct cpumask __cpu_possible_mask __ro_after_init; 3091 + unsigned int __num_possible_cpus __ro_after_init; 3090 3092 #endif 3091 3093 EXPORT_SYMBOL(__cpu_possible_mask); 3094 + EXPORT_SYMBOL(__num_possible_cpus); 3092 3095 3093 3096 struct cpumask __cpu_online_mask __read_mostly; 3094 3097 EXPORT_SYMBOL(__cpu_online_mask); ··· 3119 3116 void init_cpu_possible(const struct cpumask *src) 3120 3117 { 3121 3118 cpumask_copy(&__cpu_possible_mask, src); 3119 + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); 3122 3120 } 3123 3121 3124 3122 void set_cpu_online(unsigned int cpu, bool online) ··· 3140 3136 } else { 3141 3137 if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) 3142 3138 atomic_dec(&__num_online_cpus); 3139 + } 3140 + } 3141 + 3142 + /* 3143 + * This should be marked __init, but there is a boatload of call sites 3144 + * which need to be fixed up to do so. Sigh... 3145 + */ 3146 + void set_cpu_possible(unsigned int cpu, bool possible) 3147 + { 3148 + if (possible) { 3149 + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) 3150 + __num_possible_cpus++; 3151 + } else { 3152 + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) 3153 + __num_possible_cpus--; 3143 3154 } 3144 3155 } 3145 3156

+22 -17

kernel/entry/common.c

··· 11 11 /* Workaround to allow gradual conversion of architecture code */ 12 12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } 13 13 14 - /** 15 - * exit_to_user_mode_loop - do any pending work before leaving to user space 16 - * @regs: Pointer to pt_regs on entry stack 17 - * @ti_work: TIF work flags as read by the caller 18 - */ 19 - __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 20 - unsigned long ti_work) 14 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 15 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) 16 + #else 17 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) 18 + #endif 19 + 20 + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, 21 + unsigned long ti_work) 21 22 { 22 23 /* 23 24 * Before returning to user space ensure that all pending work 24 25 * items have been completed. 25 26 */ 26 - while (ti_work & EXIT_TO_USER_MODE_WORK) { 27 + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { 27 28 28 29 local_irq_enable_exit_to_user(ti_work); 29 30 ··· 63 62 return ti_work; 64 63 } 65 64 66 - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) 65 + /** 66 + * exit_to_user_mode_loop - do any pending work before leaving to user space 67 + * @regs: Pointer to pt_regs on entry stack 68 + * @ti_work: TIF work flags as read by the caller 69 + */ 70 + __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 71 + unsigned long ti_work) 67 72 { 68 - enter_from_user_mode(regs); 69 - } 73 + for (;;) { 74 + ti_work = __exit_to_user_mode_loop(regs, ti_work); 70 75 71 - noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) 72 - { 73 - instrumentation_begin(); 74 - exit_to_user_mode_prepare(regs); 75 - instrumentation_end(); 76 - exit_to_user_mode(); 76 + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) 77 + return ti_work; 78 + ti_work = read_thread_flags(); 79 + } 77 80 } 78 81 79 82 noinstr irqentry_state_t 
irqentry_enter(struct pt_regs *regs)

-8

kernel/entry/syscall-common.c

··· 63 63 return ret ? : syscall; 64 64 } 65 65 66 - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) 67 - { 68 - enter_from_user_mode(regs); 69 - instrumentation_begin(); 70 - local_irq_enable(); 71 - instrumentation_end(); 72 - } 73 - 74 66 /* 75 67 * If SYSCALL_EMU is set, then the only reason to report is when 76 68 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall

+1

kernel/exit.c

··· 911 911 user_events_exit(tsk); 912 912 913 913 io_uring_files_cancel(); 914 + sched_mm_cid_exit(tsk); 914 915 exit_signals(tsk); /* sets PF_EXITING */ 915 916 916 917 seccomp_filter_release(tsk);

+3 -4

kernel/fork.c

··· 955 955 #endif 956 956 957 957 #ifdef CONFIG_SCHED_MM_CID 958 - tsk->mm_cid = -1; 959 - tsk->last_mm_cid = -1; 960 - tsk->mm_cid_active = 0; 961 - tsk->migrate_from_cpu = -1; 958 + tsk->mm_cid.cid = MM_CID_UNSET; 959 + tsk->mm_cid.active = 0; 962 960 #endif 963 961 return tsk; 964 962 ··· 2454 2456 exit_nsproxy_namespaces(p); 2455 2457 bad_fork_cleanup_mm: 2456 2458 if (p->mm) { 2459 + sched_mm_cid_exit(p); 2457 2460 mm_clear_owner(p->mm, p); 2458 2461 mmput(p->mm); 2459 2462 }

+3 -3

kernel/ptrace.c

··· 793 793 unsigned long size, void __user *data) 794 794 { 795 795 struct ptrace_rseq_configuration conf = { 796 - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, 797 - .rseq_abi_size = task->rseq_len, 798 - .signature = task->rseq_sig, 796 + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, 797 + .rseq_abi_size = task->rseq.len, 798 + .signature = task->rseq.sig, 799 799 .flags = 0, 800 800 }; 801 801

+289 -384

kernel/rseq.c

··· 8 8 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> 9 9 */ 10 10 11 - #include <linux/sched.h> 12 - #include <linux/uaccess.h> 13 - #include <linux/syscalls.h> 14 - #include <linux/rseq.h> 15 - #include <linux/types.h> 16 - #include <linux/ratelimit.h> 17 - #include <asm/ptrace.h> 18 - 19 - #define CREATE_TRACE_POINTS 20 - #include <trace/events/rseq.h> 21 - 22 - /* The original rseq structure size (including padding) is 32 bytes. */ 23 - #define ORIG_RSEQ_SIZE 32 24 - 25 - #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ 26 - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ 27 - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) 28 - 29 - #ifdef CONFIG_DEBUG_RSEQ 30 - static struct rseq *rseq_kernel_fields(struct task_struct *t) 31 - { 32 - return (struct rseq *) t->rseq_fields; 33 - } 34 - 35 - static int rseq_validate_ro_fields(struct task_struct *t) 36 - { 37 - static DEFINE_RATELIMIT_STATE(_rs, 38 - DEFAULT_RATELIMIT_INTERVAL, 39 - DEFAULT_RATELIMIT_BURST); 40 - u32 cpu_id_start, cpu_id, node_id, mm_cid; 41 - struct rseq __user *rseq = t->rseq; 42 - 43 - /* 44 - * Validate fields which are required to be read-only by 45 - * user-space. 
46 - */ 47 - if (!user_read_access_begin(rseq, t->rseq_len)) 48 - goto efault; 49 - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); 50 - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); 51 - unsafe_get_user(node_id, &rseq->node_id, efault_end); 52 - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); 53 - user_read_access_end(); 54 - 55 - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || 56 - cpu_id != rseq_kernel_fields(t)->cpu_id || 57 - node_id != rseq_kernel_fields(t)->node_id || 58 - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { 59 - 60 - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" 61 - "\tcpu_id_start: %u ?= %u\n" 62 - "\tcpu_id: %u ?= %u\n" 63 - "\tnode_id: %u ?= %u\n" 64 - "\tmm_cid: %u ?= %u\n", 65 - t->pid, t->comm, 66 - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, 67 - cpu_id, rseq_kernel_fields(t)->cpu_id, 68 - node_id, rseq_kernel_fields(t)->node_id, 69 - mm_cid, rseq_kernel_fields(t)->mm_cid); 70 - } 71 - 72 - /* For now, only print a console warning on mismatch. */ 73 - return 0; 74 - 75 - efault_end: 76 - user_read_access_end(); 77 - efault: 78 - return -EFAULT; 79 - } 80 - 81 11 /* 82 - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent 83 - * state. 84 - */ 85 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 86 - do { \ 87 - unsafe_put_user(value, &t->rseq->field, error_label); \ 88 - rseq_kernel_fields(t)->field = value; \ 89 - } while (0) 90 - 91 - #else 92 - static int rseq_validate_ro_fields(struct task_struct *t) 93 - { 94 - return 0; 95 - } 96 - 97 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 98 - unsafe_put_user(value, &t->rseq->field, error_label) 99 - #endif 100 - 101 - /* 102 - * 103 12 * Restartable sequences are a lightweight interface that allows 104 13 * user-level code to be executed atomically relative to scheduler 105 14 * preemption and signal delivery. Typically used for implementing ··· 67 158 * F1. 
<failure> 68 159 */ 69 160 70 - static int rseq_update_cpu_node_id(struct task_struct *t) 161 + /* Required to select the proper per_cpu ops for rseq_stats_inc() */ 162 + #define RSEQ_BUILD_SLOW_PATH 163 + 164 + #include <linux/debugfs.h> 165 + #include <linux/ratelimit.h> 166 + #include <linux/rseq_entry.h> 167 + #include <linux/sched.h> 168 + #include <linux/syscalls.h> 169 + #include <linux/uaccess.h> 170 + #include <linux/types.h> 171 + #include <asm/ptrace.h> 172 + 173 + #define CREATE_TRACE_POINTS 174 + #include <trace/events/rseq.h> 175 + 176 + DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 177 + 178 + static inline void rseq_control_debug(bool on) 71 179 { 72 - struct rseq __user *rseq = t->rseq; 73 - u32 cpu_id = raw_smp_processor_id(); 74 - u32 node_id = cpu_to_node(cpu_id); 75 - u32 mm_cid = task_mm_cid(t); 180 + if (on) 181 + static_branch_enable(&rseq_debug_enabled); 182 + else 183 + static_branch_disable(&rseq_debug_enabled); 184 + } 76 185 77 - /* 78 - * Validate read-only rseq fields. 79 - */ 80 - if (rseq_validate_ro_fields(t)) 81 - goto efault; 82 - WARN_ON_ONCE((int) mm_cid < 0); 83 - if (!user_write_access_begin(rseq, t->rseq_len)) 84 - goto efault; 186 + static int __init rseq_setup_debug(char *str) 187 + { 188 + bool on; 85 189 86 - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); 87 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 88 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 89 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 190 + if (kstrtobool(str, &on)) 191 + return -EINVAL; 192 + rseq_control_debug(on); 193 + return 1; 194 + } 195 + __setup("rseq_debug=", rseq_setup_debug); 90 196 91 - /* 92 - * Additional feature fields added after ORIG_RSEQ_SIZE 93 - * need to be conditionally updated only if 94 - * t->rseq_len != ORIG_RSEQ_SIZE. 
95 - */ 96 - user_write_access_end(); 197 + #ifdef CONFIG_TRACEPOINTS 198 + /* 199 + * Out of line, so the actual update functions can be in a header to be 200 + * inlined into the exit to user code. 201 + */ 202 + void __rseq_trace_update(struct task_struct *t) 203 + { 97 204 trace_rseq_update(t); 98 - return 0; 99 - 100 - efault_end: 101 - user_write_access_end(); 102 - efault: 103 - return -EFAULT; 104 205 } 105 206 106 - static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) 207 + void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 208 + unsigned long offset, unsigned long abort_ip) 107 209 { 108 - struct rseq __user *rseq = t->rseq; 109 - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, 110 - mm_cid = 0; 210 + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); 211 + } 212 + #endif /* CONFIG_TRACEPOINTS */ 111 213 112 - /* 113 - * Validate read-only rseq fields. 114 - */ 115 - if (rseq_validate_ro_fields(t)) 116 - goto efault; 214 + #ifdef CONFIG_DEBUG_FS 215 + #ifdef CONFIG_RSEQ_STATS 216 + DEFINE_PER_CPU(struct rseq_stats, rseq_stats); 117 217 118 - if (!user_write_access_begin(rseq, t->rseq_len)) 119 - goto efault; 218 + static int rseq_stats_show(struct seq_file *m, void *p) 219 + { 220 + struct rseq_stats stats = { }; 221 + unsigned int cpu; 120 222 121 - /* 122 - * Reset all fields to their initial state. 123 - * 124 - * All fields have an initial state of 0 except cpu_id which is set to 125 - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after 126 - * unregistration can figure out that rseq needs to be registered 127 - * again. 
128 - */ 129 - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); 130 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 131 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 132 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 223 + for_each_possible_cpu(cpu) { 224 + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); 225 + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); 226 + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); 227 + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); 228 + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); 229 + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); 230 + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); 231 + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); 232 + } 133 233 134 - /* 135 - * Additional feature fields added after ORIG_RSEQ_SIZE 136 - * need to be conditionally reset only if 137 - * t->rseq_len != ORIG_RSEQ_SIZE. 138 - */ 139 - user_write_access_end(); 234 + seq_printf(m, "exit: %16lu\n", stats.exit); 235 + seq_printf(m, "signal: %16lu\n", stats.signal); 236 + seq_printf(m, "slowp: %16lu\n", stats.slowpath); 237 + seq_printf(m, "fastp: %16lu\n", stats.fastpath); 238 + seq_printf(m, "ids: %16lu\n", stats.ids); 239 + seq_printf(m, "cs: %16lu\n", stats.cs); 240 + seq_printf(m, "clear: %16lu\n", stats.clear); 241 + seq_printf(m, "fixup: %16lu\n", stats.fixup); 140 242 return 0; 141 - 142 - efault_end: 143 - user_write_access_end(); 144 - efault: 145 - return -EFAULT; 146 243 } 147 244 148 - /* 149 - * Get the user-space pointer value stored in the 'rseq_cs' field. 
150 - */ 151 - static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) 245 + static int rseq_stats_open(struct inode *inode, struct file *file) 152 246 { 153 - if (!rseq_cs) 154 - return -EFAULT; 247 + return single_open(file, rseq_stats_show, inode->i_private); 248 + } 155 249 156 - #ifdef CONFIG_64BIT 157 - if (get_user(*rseq_cs, &rseq->rseq_cs)) 158 - return -EFAULT; 250 + static const struct file_operations stat_ops = { 251 + .open = rseq_stats_open, 252 + .read = seq_read, 253 + .llseek = seq_lseek, 254 + .release = single_release, 255 + }; 256 + 257 + static int __init rseq_stats_init(struct dentry *root_dir) 258 + { 259 + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); 260 + return 0; 261 + } 159 262 #else 160 - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) 161 - return -EFAULT; 162 - #endif 263 + static inline void rseq_stats_init(struct dentry *root_dir) { } 264 + #endif /* CONFIG_RSEQ_STATS */ 163 265 266 + static int rseq_debug_show(struct seq_file *m, void *p) 267 + { 268 + bool on = static_branch_unlikely(&rseq_debug_enabled); 269 + 270 + seq_printf(m, "%d\n", on); 164 271 return 0; 165 272 } 166 273 167 - /* 168 - * If the rseq_cs field of 'struct rseq' contains a valid pointer to 169 - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. 170 - */ 171 - static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 274 + static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, 275 + size_t count, loff_t *ppos) 172 276 { 173 - struct rseq_cs __user *urseq_cs; 174 - u64 ptr; 175 - u32 __user *usig; 176 - u32 sig; 177 - int ret; 277 + bool on; 178 278 179 - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); 180 - if (ret) 181 - return ret; 182 - 183 - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. 
*/ 184 - if (!ptr) { 185 - memset(rseq_cs, 0, sizeof(*rseq_cs)); 186 - return 0; 187 - } 188 - /* Check that the pointer value fits in the user-space process space. */ 189 - if (ptr >= TASK_SIZE) 190 - return -EINVAL; 191 - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; 192 - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) 193 - return -EFAULT; 194 - 195 - if (rseq_cs->start_ip >= TASK_SIZE || 196 - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || 197 - rseq_cs->abort_ip >= TASK_SIZE || 198 - rseq_cs->version > 0) 199 - return -EINVAL; 200 - /* Check for overflow. */ 201 - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) 202 - return -EINVAL; 203 - /* Ensure that abort_ip is not in the critical section. */ 204 - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) 279 + if (kstrtobool_from_user(ubuf, count, &on)) 205 280 return -EINVAL; 206 281 207 - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); 208 - ret = get_user(sig, usig); 209 - if (ret) 210 - return ret; 282 + rseq_control_debug(on); 283 + return count; 284 + } 211 285 212 - if (current->rseq_sig != sig) { 213 - printk_ratelimited(KERN_WARNING 214 - "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", 215 - sig, current->rseq_sig, current->pid, usig); 216 - return -EINVAL; 217 - } 286 + static int rseq_debug_open(struct inode *inode, struct file *file) 287 + { 288 + return single_open(file, rseq_debug_show, inode->i_private); 289 + } 290 + 291 + static const struct file_operations debug_ops = { 292 + .open = rseq_debug_open, 293 + .read = seq_read, 294 + .write = rseq_debug_write, 295 + .llseek = seq_lseek, 296 + .release = single_release, 297 + }; 298 + 299 + static int __init rseq_debugfs_init(void) 300 + { 301 + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); 302 + 303 + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); 304 + rseq_stats_init(root_dir); 218 305 return 0; 219 306 } 307 + __initcall(rseq_debugfs_init); 308 + #endif /* CONFIG_DEBUG_FS */ 220 309 221 - static bool rseq_warn_flags(const char *str, u32 flags) 310 + static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) 222 311 { 223 - u32 test_flags; 224 - 225 - if (!flags) 226 - return false; 227 - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; 228 - if (test_flags) 229 - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); 230 - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; 231 - if (test_flags) 232 - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); 233 - return true; 312 + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); 234 313 } 235 314 236 - static int rseq_need_restart(struct task_struct *t, u32 cs_flags) 315 + static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) 237 316 { 238 - u32 flags, event_mask; 239 - int ret; 317 + struct rseq __user *urseq = t->rseq.usrptr; 318 + u64 csaddr; 240 319 241 - if (rseq_warn_flags("rseq_cs", cs_flags)) 242 - return -EINVAL; 243 - 244 - /* Get thread flags. 
*/ 245 - ret = get_user(flags, &t->rseq->flags); 246 - if (ret) 247 - return ret; 248 - 249 - if (rseq_warn_flags("rseq", flags)) 250 - return -EINVAL; 251 - 252 - /* 253 - * Load and clear event mask atomically with respect to 254 - * scheduler preemption and membarrier IPIs. 255 - */ 256 - scoped_guard(RSEQ_EVENT_GUARD) { 257 - event_mask = t->rseq_event_mask; 258 - t->rseq_event_mask = 0; 259 - } 260 - 261 - return !!event_mask; 320 + scoped_user_read_access(urseq, efault) 321 + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); 322 + if (likely(!csaddr)) 323 + return true; 324 + return rseq_update_user_cs(t, regs, csaddr); 325 + efault: 326 + return false; 262 327 } 263 328 264 - static int clear_rseq_cs(struct rseq __user *rseq) 329 + static void rseq_slowpath_update_usr(struct pt_regs *regs) 265 330 { 266 331 /* 267 - * The rseq_cs field is set to NULL on preemption or signal 268 - * delivery on top of rseq assembly block, as well as on top 269 - * of code outside of the rseq assembly block. This performs 270 - * a lazy clear of the rseq_cs field. 271 - * 272 - * Set rseq_cs to NULL. 332 + * Preserve rseq state and user_irq state. The generic entry code 333 + * clears user_irq on the way out, the non-generic entry 334 + * architectures are not having user_irq. 273 335 */ 274 - #ifdef CONFIG_64BIT 275 - return put_user(0UL, &rseq->rseq_cs); 276 - #else 277 - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) 278 - return -EFAULT; 279 - return 0; 280 - #endif 281 - } 282 - 283 - /* 284 - * Unsigned comparison will be true when ip >= start_ip, and when 285 - * ip < start_ip + post_commit_offset. 
286 - */ 287 - static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) 288 - { 289 - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; 290 - } 291 - 292 - static int rseq_ip_fixup(struct pt_regs *regs) 293 - { 294 - unsigned long ip = instruction_pointer(regs); 336 + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; 295 337 struct task_struct *t = current; 296 - struct rseq_cs rseq_cs; 297 - int ret; 298 - 299 - ret = rseq_get_rseq_cs(t, &rseq_cs); 300 - if (ret) 301 - return ret; 302 - 303 - /* 304 - * Handle potentially not being within a critical section. 305 - * If not nested over a rseq critical section, restart is useless. 306 - * Clear the rseq_cs pointer and return. 307 - */ 308 - if (!in_rseq_cs(ip, &rseq_cs)) 309 - return clear_rseq_cs(t->rseq); 310 - ret = rseq_need_restart(t, rseq_cs.flags); 311 - if (ret <= 0) 312 - return ret; 313 - ret = clear_rseq_cs(t->rseq); 314 - if (ret) 315 - return ret; 316 - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, 317 - rseq_cs.abort_ip); 318 - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); 319 - return 0; 320 - } 321 - 322 - /* 323 - * This resume handler must always be executed between any of: 324 - * - preemption, 325 - * - signal delivery, 326 - * and return to user-space. 327 - * 328 - * This is how we can ensure that the entire rseq critical section 329 - * will issue the commit instruction only if executed atomically with 330 - * respect to other threads scheduled on the same CPU, and with respect 331 - * to signal handlers. 332 - */ 333 - void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) 334 - { 335 - struct task_struct *t = current; 336 - int ret, sig; 338 + struct rseq_ids ids; 339 + u32 node_id; 340 + bool event; 337 341 338 342 if (unlikely(t->flags & PF_EXITING)) 339 343 return; 340 344 341 - /* 342 - * regs is NULL if and only if the caller is in a syscall path. 
Skip 343 - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and 344 - * kill a misbehaving userspace on debug kernels. 345 - */ 346 - if (regs) { 347 - ret = rseq_ip_fixup(regs); 348 - if (unlikely(ret < 0)) 349 - goto error; 350 - } 351 - if (unlikely(rseq_update_cpu_node_id(t))) 352 - goto error; 353 - return; 345 + rseq_stat_inc(rseq_stats.slowpath); 354 346 355 - error: 356 - sig = ksig ? ksig->sig : 0; 357 - force_sigsegv(sig); 347 + /* 348 + * Read and clear the event pending bit first. If the task 349 + * was not preempted or migrated or a signal is on the way, 350 + * there is no point in doing any of the heavy lifting here 351 + * on production kernels. In that case TIF_NOTIFY_RESUME 352 + * was raised by some other functionality. 353 + * 354 + * This is correct because the read/clear operation is 355 + * guarded against scheduler preemption, which makes it CPU 356 + * local atomic. If the task is preempted right after 357 + * re-enabling preemption then TIF_NOTIFY_RESUME is set 358 + * again and this function is invoked another time _before_ 359 + * the task is able to return to user mode. 360 + * 361 + * On a debug kernel, invoke the fixup code unconditionally 362 + * with the result handed in to allow the detection of 363 + * inconsistencies. 364 + */ 365 + scoped_guard(irq) { 366 + event = t->rseq.event.sched_switch; 367 + t->rseq.event.all &= evt_mask.all; 368 + ids.cpu_id = task_cpu(t); 369 + ids.mm_cid = task_mm_cid(t); 370 + } 371 + 372 + if (!event) 373 + return; 374 + 375 + node_id = cpu_to_node(ids.cpu_id); 376 + 377 + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { 378 + /* 379 + * Clear the errors just in case this might survive magically, but 380 + * leave the rest intact. 
381 + */ 382 + t->rseq.event.error = 0; 383 + force_sig(SIGSEGV); 384 + } 358 385 } 359 386 360 - #ifdef CONFIG_DEBUG_RSEQ 387 + void __rseq_handle_slowpath(struct pt_regs *regs) 388 + { 389 + /* 390 + * If invoked from hypervisors before entering the guest via 391 + * resume_user_mode_work(), then @regs is a NULL pointer. 392 + * 393 + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises 394 + * it before returning from the ioctl() to user space when 395 + * rseq_event.sched_switch is set. 396 + * 397 + * So it's safe to ignore here instead of pointlessly updating it 398 + * in the vcpu_run() loop. 399 + */ 400 + if (!regs) 401 + return; 402 + 403 + rseq_slowpath_update_usr(regs); 404 + } 405 + 406 + void __rseq_signal_deliver(int sig, struct pt_regs *regs) 407 + { 408 + rseq_stat_inc(rseq_stats.signal); 409 + /* 410 + * Don't update IDs, they are handled on exit to user if 411 + * necessary. The important thing is to abort a critical section of 412 + * the interrupted context as after this point the instruction 413 + * pointer in @regs points to the signal handler. 414 + */ 415 + if (unlikely(!rseq_handle_cs(current, regs))) { 416 + /* 417 + * Clear the errors just in case this might survive 418 + * magically, but leave the rest intact. 419 + */ 420 + current->rseq.event.error = 0; 421 + force_sigsegv(sig); 422 + } 423 + } 361 424 362 425 /* 363 426 * Terminate the process if a syscall is issued within a restartable 364 427 * sequence. 
365 428 */ 366 - void rseq_syscall(struct pt_regs *regs) 429 + void __rseq_debug_syscall_return(struct pt_regs *regs) 367 430 { 368 - unsigned long ip = instruction_pointer(regs); 369 431 struct task_struct *t = current; 370 - struct rseq_cs rseq_cs; 432 + u64 csaddr; 371 433 372 - if (!t->rseq) 434 + if (!t->rseq.event.has_rseq) 373 435 return; 374 - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) 375 - force_sig(SIGSEGV); 436 + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) 437 + goto fail; 438 + if (likely(!csaddr)) 439 + return; 440 + if (unlikely(csaddr >= TASK_SIZE)) 441 + goto fail; 442 + if (rseq_debug_update_user_cs(t, regs, csaddr)) 443 + return; 444 + fail: 445 + force_sig(SIGSEGV); 376 446 } 377 447 448 + #ifdef CONFIG_DEBUG_RSEQ 449 + /* Kept around to keep GENERIC_ENTRY=n architectures supported. */ 450 + void rseq_syscall(struct pt_regs *regs) 451 + { 452 + __rseq_debug_syscall_return(regs); 453 + } 378 454 #endif 455 + 456 + static bool rseq_reset_ids(void) 457 + { 458 + struct rseq_ids ids = { 459 + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, 460 + .mm_cid = 0, 461 + }; 462 + 463 + /* 464 + * If this fails, terminate it because this leaves the kernel in 465 + * stupid state as exit to user space will try to fixup the ids 466 + * again. 467 + */ 468 + if (rseq_set_ids(current, &ids, 0)) 469 + return true; 470 + 471 + force_sig(SIGSEGV); 472 + return false; 473 + } 474 + 475 + /* The original rseq structure size (including padding) is 32 bytes. */ 476 + #define ORIG_RSEQ_SIZE 32 379 477 380 478 /* 381 479 * sys_rseq - setup restartable sequences for caller thread. 
382 480 */ 383 - SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, 384 - int, flags, u32, sig) 481 + SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) 385 482 { 386 - int ret; 387 - u64 rseq_cs; 388 - 389 483 if (flags & RSEQ_FLAG_UNREGISTER) { 390 484 if (flags & ~RSEQ_FLAG_UNREGISTER) 391 485 return -EINVAL; 392 486 /* Unregister rseq for current thread. */ 393 - if (current->rseq != rseq || !current->rseq) 487 + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) 394 488 return -EINVAL; 395 - if (rseq_len != current->rseq_len) 489 + if (rseq_len != current->rseq.len) 396 490 return -EINVAL; 397 - if (current->rseq_sig != sig) 491 + if (current->rseq.sig != sig) 398 492 return -EPERM; 399 - ret = rseq_reset_rseq_cpu_node_id(current); 400 - if (ret) 401 - return ret; 402 - current->rseq = NULL; 403 - current->rseq_sig = 0; 404 - current->rseq_len = 0; 493 + if (!rseq_reset_ids()) 494 + return -EFAULT; 495 + rseq_reset(current); 405 496 return 0; 406 497 } 407 498 408 499 if (unlikely(flags)) 409 500 return -EINVAL; 410 501 411 - if (current->rseq) { 502 + if (current->rseq.usrptr) { 412 503 /* 413 504 * If rseq is already registered, check whether 414 505 * the provided address differs from the prior 415 506 * one. 416 507 */ 417 - if (current->rseq != rseq || rseq_len != current->rseq_len) 508 + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) 418 509 return -EINVAL; 419 - if (current->rseq_sig != sig) 510 + if (current->rseq.sig != sig) 420 511 return -EPERM; 421 512 /* Already registered. */ 422 513 return -EBUSY; ··· 440 531 if (!access_ok(rseq, rseq_len)) 441 532 return -EFAULT; 442 533 443 - /* 444 - * If the rseq_cs pointer is non-NULL on registration, clear it to 445 - * avoid a potential segfault on return to user-space. 
The proper thing 446 - * to do would have been to fail the registration but this would break 447 - * older libcs that reuse the rseq area for new threads without 448 - * clearing the fields. 449 - */ 450 - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) 451 - return -EFAULT; 452 - if (rseq_cs && clear_rseq_cs(rseq)) 453 - return -EFAULT; 534 + scoped_user_write_access(rseq, efault) { 535 + /* 536 + * If the rseq_cs pointer is non-NULL on registration, clear it to 537 + * avoid a potential segfault on return to user-space. The proper thing 538 + * to do would have been to fail the registration but this would break 539 + * older libcs that reuse the rseq area for new threads without 540 + * clearing the fields. Don't bother reading it, just reset it. 541 + */ 542 + unsafe_put_user(0UL, &rseq->rseq_cs, efault); 543 + /* Initialize IDs in user space */ 544 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); 545 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); 546 + unsafe_put_user(0U, &rseq->node_id, efault); 547 + unsafe_put_user(0U, &rseq->mm_cid, efault); 548 + } 454 549 455 - #ifdef CONFIG_DEBUG_RSEQ 456 - /* 457 - * Initialize the in-kernel rseq fields copy for validation of 458 - * read-only fields. 459 - */ 460 - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || 461 - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || 462 - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || 463 - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) 464 - return -EFAULT; 465 - #endif 466 550 /* 467 551 * Activate the registration by setting the rseq area address, length 468 552 * and signature in the task struct. 
469 553 */ 470 - current->rseq = rseq; 471 - current->rseq_len = rseq_len; 472 - current->rseq_sig = sig; 554 + current->rseq.usrptr = rseq; 555 + current->rseq.len = rseq_len; 556 + current->rseq.sig = sig; 473 557 474 558 /* 475 559 * If rseq was previously inactive, and has just been 476 560 * registered, ensure the cpu_id_start and cpu_id fields 477 561 * are updated before returning to user-space. 478 562 */ 479 - rseq_set_notify_resume(current); 480 - 563 + current->rseq.event.has_rseq = true; 564 + rseq_force_update(); 481 565 return 0; 566 + 567 + efault: 568 + return -EFAULT; 482 569 }

+442 -472

kernel/sched/core.c

··· 2131 2131 { 2132 2132 if (task_on_rq_migrating(p)) 2133 2133 flags |= ENQUEUE_MIGRATED; 2134 - if (flags & ENQUEUE_MIGRATED) 2135 - sched_mm_cid_migrate_to(rq, p); 2136 2134 2137 2135 enqueue_task(rq, p, flags); 2138 2136 ··· 2641 2643 return 0; 2642 2644 } 2643 2645 2646 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); 2647 + 2644 2648 /* 2645 2649 * sched_class::set_cpus_allowed must do the below, but is not required to 2646 2650 * actually call this function. ··· 2656 2656 2657 2657 cpumask_copy(&p->cpus_mask, ctx->new_mask); 2658 2658 p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); 2659 + mm_update_cpus_allowed(p->mm, ctx->new_mask); 2659 2660 2660 2661 /* 2661 2662 * Swap in a new user_cpus_ptr if SCA_USER flag set ··· 2668 2667 static void 2669 2668 do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2670 2669 { 2671 - scoped_guard (sched_change, p, DEQUEUE_SAVE) { 2670 + scoped_guard (sched_change, p, DEQUEUE_SAVE) 2672 2671 p->sched_class->set_cpus_allowed(p, ctx); 2673 - mm_set_cpus_allowed(p->mm, ctx->new_mask); 2674 - } 2675 2672 } 2676 2673 2677 2674 /* ··· 3262 3263 if (p->sched_class->migrate_task_rq) 3263 3264 p->sched_class->migrate_task_rq(p, new_cpu); 3264 3265 p->se.nr_migrations++; 3265 - rseq_migrate(p); 3266 - sched_mm_cid_migrate_from(p); 3267 3266 perf_event_task_migrate(p); 3268 3267 } 3269 3268 ··· 4412 4415 init_numa_balancing(clone_flags, p); 4413 4416 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4414 4417 p->migration_pending = NULL; 4415 - init_sched_mm_cid(p); 4416 4418 } 4417 4419 4418 4420 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); ··· 4687 4691 p->sched_task_group = tg; 4688 4692 } 4689 4693 #endif 4690 - rseq_migrate(p); 4691 4694 /* 4692 4695 * We're setting the CPU for the first time, we don't migrate, 4693 4696 * so use __set_task_cpu(). ··· 4750 4755 * as we're not fully set-up yet. 
4751 4756 */ 4752 4757 p->recent_used_cpu = task_cpu(p); 4753 - rseq_migrate(p); 4754 4758 __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4755 4759 rq = __task_rq_lock(p, &rf); 4756 4760 update_rq_clock(rq); ··· 5043 5049 kcov_prepare_switch(prev); 5044 5050 sched_info_switch(rq, prev, next); 5045 5051 perf_event_task_sched_out(prev, next); 5046 - rseq_preempt(prev); 5047 5052 fire_sched_out_preempt_notifiers(prev, next); 5048 5053 kmap_local_sched_out(); 5049 5054 prepare_task(next); ··· 5205 5212 * 5206 5213 * kernel -> user switch + mmdrop_lazy_tlb() active 5207 5214 * user -> user switch 5208 - * 5209 - * switch_mm_cid() needs to be updated if the barriers provided 5210 - * by context_switch() are modified. 5211 5215 */ 5212 - if (!next->mm) { // to kernel 5216 + if (!next->mm) { // to kernel 5213 5217 enter_lazy_tlb(prev->active_mm, next); 5214 5218 5215 5219 next->active_mm = prev->active_mm; 5216 - if (prev->mm) // from user 5220 + if (prev->mm) // from user 5217 5221 mmgrab_lazy_tlb(prev->active_mm); 5218 5222 else 5219 5223 prev->active_mm = NULL; 5220 - } else { // to user 5224 + } else { // to user 5221 5225 membarrier_switch_mm(rq, prev->active_mm, next->mm); 5222 5226 /* 5223 5227 * sys_membarrier() requires an smp_mb() between setting ··· 5227 5237 switch_mm_irqs_off(prev->active_mm, next->mm, next); 5228 5238 lru_gen_use_mm(next->mm); 5229 5239 5230 - if (!prev->mm) { // from kernel 5240 + if (!prev->mm) { // from kernel 5231 5241 /* will mmdrop_lazy_tlb() in finish_task_switch(). */ 5232 5242 rq->prev_mm = prev->active_mm; 5233 5243 prev->active_mm = NULL; 5234 5244 } 5235 5245 } 5236 5246 5237 - /* switch_mm_cid() requires the memory barriers above. */ 5238 - switch_mm_cid(rq, prev, next); 5247 + mm_cid_switch_to(prev, next); 5248 + 5249 + /* 5250 + * Tell rseq that the task was scheduled in. Must be after 5251 + * switch_mm_cid() to get the TIF flag set. 
5252 + */ 5253 + rseq_sched_switch_event(next); 5239 5254 5240 5255 prepare_lock_switch(rq, next, rf); 5241 5256 ··· 5525 5530 resched_latency = cpu_resched_latency(rq); 5526 5531 calc_global_load_tick(rq); 5527 5532 sched_core_tick(rq); 5528 - task_tick_mm_cid(rq, donor); 5529 5533 scx_tick(rq); 5530 5534 5531 5535 rq_unlock(rq, &rf); ··· 10254 10260 } 10255 10261 10256 10262 #ifdef CONFIG_SCHED_MM_CID 10257 - 10258 10263 /* 10259 - * @cid_lock: Guarantee forward-progress of cid allocation. 10264 + * Concurrency IDentifier management 10260 10265 * 10261 - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 10262 - * is only used when contention is detected by the lock-free allocation so 10263 - * forward progress can be guaranteed. 10264 - */ 10265 - DEFINE_RAW_SPINLOCK(cid_lock); 10266 - 10267 - /* 10268 - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 10266 + * Serialization rules: 10269 10267 * 10270 - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 10271 - * detected, it is set to 1 to ensure that all newly coming allocations are 10272 - * serialized by @cid_lock until the allocation which detected contention 10273 - * completes and sets @use_cid_lock back to 0. This guarantees forward progress 10274 - * of a cid allocation. 10275 - */ 10276 - int use_cid_lock; 10277 - 10278 - /* 10279 - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 10280 - * concurrently with respect to the execution of the source runqueue context 10281 - * switch. 10268 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore 10269 + * protects mm::mm_cid::users. 10282 10270 * 10283 - * There is one basic properties we want to guarantee here: 10271 + * mm::mm_cid::lock: Serializes mm_update_max_cids() and 10272 + * mm_update_cpus_allowed(). Nests in mm_cid::mutex 10273 + * and runqueue lock. 
10284 10274 * 10285 - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 10286 - * used by a task. That would lead to concurrent allocation of the cid and 10287 - * userspace corruption. 10275 + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks 10276 + * and can only be modified with atomic operations. 10288 10277 * 10289 - * Provide this guarantee by introducing a Dekker memory ordering to guarantee 10290 - * that a pair of loads observe at least one of a pair of stores, which can be 10291 - * shown as: 10278 + * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue 10279 + * lock. 10292 10280 * 10293 - * X = Y = 0 10281 + * CID ownership: 10294 10282 * 10295 - * w[X]=1 w[Y]=1 10296 - * MB MB 10297 - * r[Y]=y r[X]=x 10283 + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or 10284 + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the 10285 + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, 10286 + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the 10287 + * task needs to drop the CID into the pool when scheduling out. Both bits 10288 + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is 10289 + * actually handed over to user space in the RSEQ memory. 10298 10290 * 10299 - * Which guarantees that x==0 && y==0 is impossible. But rather than using 10300 - * values 0 and 1, this algorithm cares about specific state transitions of the 10301 - * runqueue current task (as updated by the scheduler context switch), and the 10302 - * per-mm/cpu cid value. 10291 + * Mode switching: 10303 10292 * 10304 - * Let's introduce task (Y) which has task->mm == mm and task (N) which has 10305 - * task->mm != mm for the rest of the discussion. 
There are two scheduler state 10306 - * transitions on context switch we care about: 10293 + * Switching to per CPU mode happens when the user count becomes greater 10294 + * than the maximum number of CIDs, which is calculated by: 10307 10295 * 10308 - * (TSA) Store to rq->curr with transition from (N) to (Y) 10296 + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); 10297 + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); 10309 10298 * 10310 - * (TSB) Store to rq->curr with transition from (Y) to (N) 10299 + * The +25% allowance is useful for tight CPU masks in scenarios where only 10300 + * a few threads are created and destroyed to avoid frequent mode 10301 + * switches. Though this allowance shrinks, the closer opt_cids becomes to 10302 + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. 10311 10303 * 10312 - * On the remote-clear side, there is one transition we care about: 10304 + * At the point of switching to per CPU mode the new user is not yet 10305 + * visible in the system, so the task which initiated the fork() runs the 10306 + * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and 10307 + * either transfers each tasks owned CID to the CPU the task runs on or 10308 + * drops it into the CID pool if a task is not on a CPU at that point in 10309 + * time. Tasks which schedule in before the task walk reaches them do the 10310 + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes 10311 + * it's guaranteed that no task related to that MM owns a CID anymore. 10313 10312 * 10314 - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 10313 + * Switching back to task mode happens when the user count goes below the 10314 + * threshold which was recorded on the per CPU mode switch: 10315 10315 * 10316 - * There is also a transition to UNSET state which can be performed from all 10317 - * sides (scheduler, remote-clear). 
It is always performed with a cmpxchg which 10318 - * guarantees that only a single thread will succeed: 10316 + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); 10319 10317 * 10320 - * (TMB) cmpxchg to *pcpu_cid to mark UNSET 10318 + * This threshold is updated when a affinity change increases the number of 10319 + * allowed CPUs for the MM, which might cause a switch back to per task 10320 + * mode. 10321 10321 * 10322 - * Just to be clear, what we do _not_ want to happen is a transition to UNSET 10323 - * when a thread is actively using the cid (property (1)). 10322 + * If the switch back was initiated by a exiting task, then that task runs 10323 + * the fixup function. If it was initiated by a affinity change, then it's 10324 + * run either in the deferred update function in context of a workqueue or 10325 + * by a task which forks a new one or by a task which exits. Whatever 10326 + * happens first. mm_cid_fixup_cpus_to_task() walks through the possible 10327 + * CPUs and either transfers the CPU owned CIDs to a related task which 10328 + * runs on the CPU or drops it into the pool. Tasks which schedule in on a 10329 + * CPU which the walk did not cover yet do the handover themself. 10324 10330 * 10325 - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 10331 + * This transition from CPU to per task ownership happens in two phases: 10326 10332 * 10327 - * Scenario A) (TSA)+(TMA) (from next task perspective) 10333 + * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task 10334 + * CID and denotes that the CID is only temporarily owned by the 10335 + * task. When it schedules out the task drops the CID back into the 10336 + * pool if this bit is set. 10328 10337 * 10329 - * CPU0 CPU1 10338 + * 2) The initiating context walks the per CPU space and after completion 10339 + * clears mm:mm_cid.transit. So after that point the CIDs are strictly 10340 + * task owned again. 
10330 10341 * 10331 - * Context switch CS-1 Remote-clear 10332 - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) 10333 - * (implied barrier after cmpxchg) 10334 - * - switch_mm_cid() 10335 - * - memory barrier (see switch_mm_cid() 10336 - * comment explaining how this barrier 10337 - * is combined with other scheduler 10338 - * barriers) 10339 - * - mm_cid_get (next) 10340 - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 10342 + * This two phase transition is required to prevent CID space exhaustion 10343 + * during the transition as a direct transfer of ownership would fail if 10344 + * two tasks are scheduled in on the same CPU before the fixup freed per 10345 + * CPU CIDs. 10341 10346 * 10342 - * This Dekker ensures that either task (Y) is observed by the 10343 - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 10344 - * observed. 10345 - * 10346 - * If task (Y) store is observed by rcu_dereference(), it means that there is 10347 - * still an active task on the cpu. Remote-clear will therefore not transition 10348 - * to UNSET, which fulfills property (1). 10349 - * 10350 - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 10351 - * it will move its state to UNSET, which clears the percpu cid perhaps 10352 - * uselessly (which is not an issue for correctness). Because task (Y) is not 10353 - * observed, CPU1 can move ahead to set the state to UNSET. Because moving 10354 - * state to UNSET is done with a cmpxchg expecting that the old state has the 10355 - * LAZY flag set, only one thread will successfully UNSET. 10356 - * 10357 - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 10358 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 10359 - * CPU1 will observe task (Y) and do nothing more, which is fine. 
10360 - * 10361 - * What we are effectively preventing with this Dekker is a scenario where 10362 - * neither LAZY flag nor store (Y) are observed, which would fail property (1) 10363 - * because this would UNSET a cid which is actively used. 10347 + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID 10348 + * related to that MM is owned by a CPU anymore. 10364 10349 */ 10365 10350 10366 - void sched_mm_cid_migrate_from(struct task_struct *t) 10351 + /* 10352 + * Update the CID range properties when the constraints change. Invoked via 10353 + * fork(), exit() and affinity changes 10354 + */ 10355 + static void __mm_update_max_cids(struct mm_mm_cid *mc) 10367 10356 { 10368 - t->migrate_from_cpu = task_cpu(t); 10357 + unsigned int opt_cids, max_cids; 10358 + 10359 + /* Calculate the new optimal constraint */ 10360 + opt_cids = min(mc->nr_cpus_allowed, mc->users); 10361 + 10362 + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ 10363 + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); 10364 + WRITE_ONCE(mc->max_cids, max_cids); 10369 10365 } 10370 10366 10371 - static 10372 - int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 10373 - struct task_struct *t, 10374 - struct mm_cid *src_pcpu_cid) 10367 + static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) 10375 10368 { 10376 - struct mm_struct *mm = t->mm; 10377 - struct task_struct *src_task; 10378 - int src_cid, last_mm_cid; 10369 + unsigned int opt_cids; 10379 10370 10380 - if (!mm) 10381 - return -1; 10371 + opt_cids = min(mc->nr_cpus_allowed, mc->users); 10372 + /* Has to be at least 1 because 0 indicates PCPU mode off */ 10373 + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); 10374 + } 10382 10375 10383 - last_mm_cid = t->last_mm_cid; 10384 - /* 10385 - * If the migrated task has no last cid, or if the current 10386 - * task on src rq uses the cid, it means the source cid does not need 10387 - * to be 
moved to the destination cpu. 10388 - */ 10389 - if (last_mm_cid == -1) 10390 - return -1; 10391 - src_cid = READ_ONCE(src_pcpu_cid->cid); 10392 - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 10393 - return -1; 10376 + static bool mm_update_max_cids(struct mm_struct *mm) 10377 + { 10378 + struct mm_mm_cid *mc = &mm->mm_cid; 10394 10379 10395 - /* 10396 - * If we observe an active task using the mm on this rq, it means we 10397 - * are not the last task to be migrated from this cpu for this mm, so 10398 - * there is no need to move src_cid to the destination cpu. 10399 - */ 10400 - guard(rcu)(); 10401 - src_task = rcu_dereference(src_rq->curr); 10402 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10403 - t->last_mm_cid = -1; 10404 - return -1; 10380 + lockdep_assert_held(&mm->mm_cid.lock); 10381 + 10382 + /* Clear deferred mode switch flag. A change is handled by the caller */ 10383 + mc->update_deferred = false; 10384 + __mm_update_max_cids(mc); 10385 + 10386 + /* Check whether owner mode must be changed */ 10387 + if (!mc->percpu) { 10388 + /* Enable per CPU mode when the number of users is above max_cids */ 10389 + if (mc->users > mc->max_cids) 10390 + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10391 + } else { 10392 + /* Switch back to per task if user count under threshold */ 10393 + if (mc->users < mc->pcpu_thrs) 10394 + mc->pcpu_thrs = 0; 10405 10395 } 10406 10396 10407 - return src_cid; 10397 + /* Mode change required? 
*/ 10398 + if (!!mc->percpu == !!mc->pcpu_thrs) 10399 + return false; 10400 + /* When switching back to per TASK mode, set the transition flag */ 10401 + if (!mc->pcpu_thrs) 10402 + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); 10403 + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); 10404 + return true; 10408 10405 } 10409 10406 10410 - static 10411 - int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 10412 - struct task_struct *t, 10413 - struct mm_cid *src_pcpu_cid, 10414 - int src_cid) 10407 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) 10415 10408 { 10416 - struct task_struct *src_task; 10417 - struct mm_struct *mm = t->mm; 10418 - int lazy_cid; 10409 + struct cpumask *mm_allowed; 10410 + struct mm_mm_cid *mc; 10411 + unsigned int weight; 10419 10412 10420 - if (src_cid == -1) 10421 - return -1; 10422 - 10413 + if (!mm || !READ_ONCE(mm->mm_cid.users)) 10414 + return; 10423 10415 /* 10424 - * Attempt to clear the source cpu cid to move it to the destination 10425 - * cpu. 10416 + * mm::mm_cid::mm_cpus_allowed is the superset of each threads 10417 + * allowed CPUs mask which means it can only grow. 10426 10418 */ 10427 - lazy_cid = mm_cid_set_lazy_put(src_cid); 10428 - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 10429 - return -1; 10419 + mc = &mm->mm_cid; 10420 + guard(raw_spinlock)(&mc->lock); 10421 + mm_allowed = mm_cpus_allowed(mm); 10422 + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); 10423 + if (weight == mc->nr_cpus_allowed) 10424 + return; 10430 10425 10431 - /* 10432 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10433 - * rq->curr->mm matches the scheduler barrier in context_switch() 10434 - * between store to rq->curr and load of prev and next task's 10435 - * per-mm/cpu cid. 
10436 - * 10437 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10438 - * rq->curr->mm_cid_active matches the barrier in 10439 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10440 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10441 - * load of per-mm/cpu cid. 10442 - */ 10426 + WRITE_ONCE(mc->nr_cpus_allowed, weight); 10427 + __mm_update_max_cids(mc); 10428 + if (!mc->percpu) 10429 + return; 10443 10430 10444 - /* 10445 - * If we observe an active task using the mm on this rq after setting 10446 - * the lazy-put flag, this task will be responsible for transitioning 10447 - * from lazy-put flag set to MM_CID_UNSET. 10448 - */ 10449 - scoped_guard (rcu) { 10450 - src_task = rcu_dereference(src_rq->curr); 10451 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10431 + /* Adjust the threshold to the wider set */ 10432 + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10433 + /* Switch back to per task mode? */ 10434 + if (mc->users >= mc->pcpu_thrs) 10435 + return; 10436 + 10437 + /* Don't queue twice */ 10438 + if (mc->update_deferred) 10439 + return; 10440 + 10441 + /* Queue the irq work, which schedules the real work */ 10442 + mc->update_deferred = true; 10443 + irq_work_queue(&mc->irq_work); 10444 + } 10445 + 10446 + static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) 10447 + { 10448 + if (cid_on_cpu(t->mm_cid.cid)) { 10449 + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); 10450 + 10451 + t->mm_cid.cid = cid_to_transit_cid(cid); 10452 + pcp->cid = t->mm_cid.cid; 10453 + } 10454 + } 10455 + 10456 + static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) 10457 + { 10458 + unsigned int cpu; 10459 + 10460 + /* Walk the CPUs and fixup all stale CIDs */ 10461 + for_each_possible_cpu(cpu) { 10462 + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); 10463 + struct rq *rq = cpu_rq(cpu); 10464 + 10465 + /* Remote access to mm::mm_cid::pcpu 
requires rq_lock */ 10466 + guard(rq_lock_irq)(rq); 10467 + /* Is the CID still owned by the CPU? */ 10468 + if (cid_on_cpu(pcp->cid)) { 10452 10469 /* 10453 - * We observed an active task for this mm, there is therefore 10454 - * no point in moving this cid to the destination cpu. 10470 + * If rq->curr has @mm, transfer it with the 10471 + * transition bit set. Otherwise drop it. 10455 10472 */ 10456 - t->last_mm_cid = -1; 10457 - return -1; 10473 + if (rq->curr->mm == mm && rq->curr->mm_cid.active) 10474 + mm_cid_transit_to_task(rq->curr, pcp); 10475 + else 10476 + mm_drop_cid_on_cpu(mm, pcp); 10477 + 10478 + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { 10479 + unsigned int cid = rq->curr->mm_cid.cid; 10480 + 10481 + /* Ensure it has the transition bit set */ 10482 + if (!cid_in_transit(cid)) { 10483 + cid = cid_to_transit_cid(cid); 10484 + rq->curr->mm_cid.cid = cid; 10485 + pcp->cid = cid; 10486 + } 10458 10487 } 10459 10488 } 10460 - 10461 - /* 10462 - * The src_cid is unused, so it can be unset. 10463 - */ 10464 - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10465 - return -1; 10466 - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); 10467 - return src_cid; 10489 + /* Clear the transition bit */ 10490 + WRITE_ONCE(mm->mm_cid.transit, 0); 10468 10491 } 10469 10492 10470 - /* 10471 - * Migration to dst cpu. Called with dst_rq lock held. 10472 - * Interrupts are disabled, which keeps the window of cid ownership without the 10473 - * source rq lock held small. 
10474 - */ 10475 - void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 10493 + static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) 10476 10494 { 10477 - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 10478 - struct mm_struct *mm = t->mm; 10479 - int src_cid, src_cpu; 10480 - bool dst_cid_is_set; 10481 - struct rq *src_rq; 10482 - 10483 - lockdep_assert_rq_held(dst_rq); 10484 - 10485 - if (!mm) 10486 - return; 10487 - src_cpu = t->migrate_from_cpu; 10488 - if (src_cpu == -1) { 10489 - t->last_mm_cid = -1; 10490 - return; 10491 - } 10492 - /* 10493 - * Move the src cid if the dst cid is unset. This keeps id 10494 - * allocation closest to 0 in cases where few threads migrate around 10495 - * many CPUs. 10496 - * 10497 - * If destination cid or recent cid is already set, we may have 10498 - * to just clear the src cid to ensure compactness in frequent 10499 - * migrations scenarios. 10500 - * 10501 - * It is not useful to clear the src cid when the number of threads is 10502 - * greater or equal to the number of allowed CPUs, because user-space 10503 - * can expect that the number of allowed cids can reach the number of 10504 - * allowed CPUs. 
10505 - */ 10506 - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 10507 - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || 10508 - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); 10509 - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) 10510 - return; 10511 - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 10512 - src_rq = cpu_rq(src_cpu); 10513 - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 10514 - if (src_cid == -1) 10515 - return; 10516 - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 10517 - src_cid); 10518 - if (src_cid == -1) 10519 - return; 10520 - if (dst_cid_is_set) { 10521 - __mm_cid_put(mm, src_cid); 10522 - return; 10523 - } 10524 - /* Move src_cid to dst cpu. */ 10525 - mm_cid_snapshot_time(dst_rq, mm); 10526 - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 10527 - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); 10528 - } 10529 - 10530 - static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 10531 - int cpu) 10532 - { 10533 - struct rq *rq = cpu_rq(cpu); 10534 - struct task_struct *t; 10535 - int cid, lazy_cid; 10536 - 10537 - cid = READ_ONCE(pcpu_cid->cid); 10538 - if (!mm_cid_is_valid(cid)) 10539 - return; 10540 - 10541 - /* 10542 - * Clear the cpu cid if it is set to keep cid allocation compact. If 10543 - * there happens to be other tasks left on the source cpu using this 10544 - * mm, the next task using this mm will reallocate its cid on context 10545 - * switch. 10546 - */ 10547 - lazy_cid = mm_cid_set_lazy_put(cid); 10548 - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 10549 - return; 10550 - 10551 - /* 10552 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10553 - * rq->curr->mm matches the scheduler barrier in context_switch() 10554 - * between store to rq->curr and load of prev and next task's 10555 - * per-mm/cpu cid. 
10556 - * 10557 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10558 - * rq->curr->mm_cid_active matches the barrier in 10559 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10560 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10561 - * load of per-mm/cpu cid. 10562 - */ 10563 - 10564 - /* 10565 - * If we observe an active task using the mm on this rq after setting 10566 - * the lazy-put flag, that task will be responsible for transitioning 10567 - * from lazy-put flag set to MM_CID_UNSET. 10568 - */ 10569 - scoped_guard (rcu) { 10570 - t = rcu_dereference(rq->curr); 10571 - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) 10572 - return; 10573 - } 10574 - 10575 - /* 10576 - * The cid is unused, so it can be unset. 10577 - * Disable interrupts to keep the window of cid ownership without rq 10578 - * lock small. 10579 - */ 10580 - scoped_guard (irqsave) { 10581 - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10582 - __mm_cid_put(mm, cid); 10495 + if (cid_on_task(t->mm_cid.cid)) { 10496 + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); 10497 + pcp->cid = t->mm_cid.cid; 10583 10498 } 10584 10499 } 10585 10500 10586 - static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 10501 + static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) 10587 10502 { 10588 - struct rq *rq = cpu_rq(cpu); 10589 - struct mm_cid *pcpu_cid; 10590 - struct task_struct *curr; 10591 - u64 rq_clock; 10592 - 10593 - /* 10594 - * rq->clock load is racy on 32-bit but one spurious clear once in a 10595 - * while is irrelevant. 10596 - */ 10597 - rq_clock = READ_ONCE(rq->clock); 10598 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10599 - 10600 - /* 10601 - * In order to take care of infrequently scheduled tasks, bump the time 10602 - * snapshot associated with this cid if an active task using the mm is 10603 - * observed on this rq. 
10604 - */ 10605 - scoped_guard (rcu) { 10606 - curr = rcu_dereference(rq->curr); 10607 - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 10608 - WRITE_ONCE(pcpu_cid->time, rq_clock); 10609 - return; 10610 - } 10611 - } 10612 - 10613 - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 10614 - return; 10615 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10616 - } 10617 - 10618 - static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 10619 - int weight) 10620 - { 10621 - struct mm_cid *pcpu_cid; 10622 - int cid; 10623 - 10624 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10625 - cid = READ_ONCE(pcpu_cid->cid); 10626 - if (!mm_cid_is_valid(cid) || cid < weight) 10627 - return; 10628 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10629 - } 10630 - 10631 - static void task_mm_cid_work(struct callback_head *work) 10632 - { 10633 - unsigned long now = jiffies, old_scan, next_scan; 10634 - struct task_struct *t = current; 10635 - struct cpumask *cidmask; 10636 - struct mm_struct *mm; 10637 - int weight, cpu; 10638 - 10639 - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); 10640 - 10641 - work->next = work; /* Prevent double-add */ 10642 - if (t->flags & PF_EXITING) 10643 - return; 10644 - mm = t->mm; 10645 - if (!mm) 10646 - return; 10647 - old_scan = READ_ONCE(mm->mm_cid_next_scan); 10648 - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10649 - if (!old_scan) { 10650 - unsigned long res; 10651 - 10652 - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 10653 - if (res != old_scan) 10654 - old_scan = res; 10503 + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ 10504 + guard(task_rq_lock)(t); 10505 + /* If the task is not active it is not in the users count */ 10506 + if (!t->mm_cid.active) 10507 + return false; 10508 + if (cid_on_task(t->mm_cid.cid)) { 10509 + /* If running on the CPU, transfer the CID, otherwise drop it */ 10510 + if (task_rq(t)->curr == t) 10511 + mm_cid_transfer_to_cpu(t, 
per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); 10655 10512 else 10656 - old_scan = next_scan; 10513 + mm_unset_cid_on_task(t); 10657 10514 } 10658 - if (time_before(now, old_scan)) 10659 - return; 10660 - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 10661 - return; 10662 - cidmask = mm_cidmask(mm); 10663 - /* Clear cids that were not recently used. */ 10664 - for_each_possible_cpu(cpu) 10665 - sched_mm_cid_remote_clear_old(mm, cpu); 10666 - weight = cpumask_weight(cidmask); 10667 - /* 10668 - * Clear cids that are greater or equal to the cidmask weight to 10669 - * recompact it. 10670 - */ 10671 - for_each_possible_cpu(cpu) 10672 - sched_mm_cid_remote_clear_weight(mm, cpu, weight); 10515 + return true; 10673 10516 } 10674 10517 10675 - void init_sched_mm_cid(struct task_struct *t) 10518 + static void mm_cid_fixup_tasks_to_cpus(void) 10676 10519 { 10677 - struct mm_struct *mm = t->mm; 10678 - int mm_users = 0; 10520 + struct mm_struct *mm = current->mm; 10521 + struct task_struct *p, *t; 10522 + unsigned int users; 10679 10523 10680 - if (mm) { 10681 - mm_users = atomic_read(&mm->mm_users); 10682 - if (mm_users == 1) 10683 - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10524 + /* 10525 + * This can obviously race with a concurrent affinity change, which 10526 + * increases the number of allowed CPUs for this mm, but that does 10527 + * not affect the mode and only changes the CID constraints. A 10528 + * possible switch back to per task mode happens either in the 10529 + * deferred handler function or in the next fork()/exit(). 10530 + * 10531 + * The caller has already transferred. The newly incoming task is 10532 + * already accounted for, but not yet visible. 
10533 + */ 10534 + users = mm->mm_cid.users - 2; 10535 + if (!users) 10536 + return; 10537 + 10538 + guard(rcu)(); 10539 + for_other_threads(current, t) { 10540 + if (mm_cid_fixup_task_to_cpu(t, mm)) 10541 + users--; 10684 10542 } 10685 - t->cid_work.next = &t->cid_work; /* Protect against double add */ 10686 - init_task_work(&t->cid_work, task_mm_cid_work); 10687 - } 10688 10543 10689 - void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 10690 - { 10691 - struct callback_head *work = &curr->cid_work; 10692 - unsigned long now = jiffies; 10693 - 10694 - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 10695 - work->next != work) 10696 - return; 10697 - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 10544 + if (!users) 10698 10545 return; 10699 10546 10700 - /* No page allocation under rq lock */ 10701 - task_work_add(curr, work, TWA_RESUME); 10702 - } 10703 - 10704 - void sched_mm_cid_exit_signals(struct task_struct *t) 10705 - { 10706 - struct mm_struct *mm = t->mm; 10707 - struct rq *rq; 10708 - 10709 - if (!mm) 10710 - return; 10711 - 10712 - preempt_disable(); 10713 - rq = this_rq(); 10714 - guard(rq_lock_irqsave)(rq); 10715 - preempt_enable_no_resched(); /* holding spinlock */ 10716 - WRITE_ONCE(t->mm_cid_active, 0); 10717 - /* 10718 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10719 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10720 - */ 10721 - smp_mb(); 10722 - mm_cid_put(mm); 10723 - t->last_mm_cid = t->mm_cid = -1; 10724 - } 10725 - 10726 - void sched_mm_cid_before_execve(struct task_struct *t) 10727 - { 10728 - struct mm_struct *mm = t->mm; 10729 - struct rq *rq; 10730 - 10731 - if (!mm) 10732 - return; 10733 - 10734 - preempt_disable(); 10735 - rq = this_rq(); 10736 - guard(rq_lock_irqsave)(rq); 10737 - preempt_enable_no_resched(); /* holding spinlock */ 10738 - WRITE_ONCE(t->mm_cid_active, 0); 10739 - /* 10740 - * Store t->mm_cid_active before loading per-mm/cpu cid. 
10741 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10742 - */ 10743 - smp_mb(); 10744 - mm_cid_put(mm); 10745 - t->last_mm_cid = t->mm_cid = -1; 10746 - } 10747 - 10748 - void sched_mm_cid_after_execve(struct task_struct *t) 10749 - { 10750 - struct mm_struct *mm = t->mm; 10751 - struct rq *rq; 10752 - 10753 - if (!mm) 10754 - return; 10755 - 10756 - preempt_disable(); 10757 - rq = this_rq(); 10758 - scoped_guard (rq_lock_irqsave, rq) { 10759 - preempt_enable_no_resched(); /* holding spinlock */ 10760 - WRITE_ONCE(t->mm_cid_active, 1); 10761 - /* 10762 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10763 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10764 - */ 10765 - smp_mb(); 10766 - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); 10547 + /* Happens only for VM_CLONE processes. */ 10548 + for_each_process_thread(p, t) { 10549 + if (t == current || t->mm != mm) 10550 + continue; 10551 + if (mm_cid_fixup_task_to_cpu(t, mm)) { 10552 + if (--users == 0) 10553 + return; 10554 + } 10767 10555 } 10556 + } 10557 + 10558 + static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) 10559 + { 10560 + t->mm_cid.active = 1; 10561 + mm->mm_cid.users++; 10562 + return mm_update_max_cids(mm); 10768 10563 } 10769 10564 10770 10565 void sched_mm_cid_fork(struct task_struct *t) 10771 10566 { 10772 - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); 10773 - t->mm_cid_active = 1; 10567 + struct mm_struct *mm = t->mm; 10568 + bool percpu; 10569 + 10570 + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); 10571 + 10572 + guard(mutex)(&mm->mm_cid.mutex); 10573 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10574 + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); 10575 + 10576 + /* First user ? 
*/ 10577 + if (!mm->mm_cid.users) { 10578 + sched_mm_cid_add_user(t, mm); 10579 + t->mm_cid.cid = mm_get_cid(mm); 10580 + /* Required for execve() */ 10581 + pcp->cid = t->mm_cid.cid; 10582 + return; 10583 + } 10584 + 10585 + if (!sched_mm_cid_add_user(t, mm)) { 10586 + if (!mm->mm_cid.percpu) 10587 + t->mm_cid.cid = mm_get_cid(mm); 10588 + return; 10589 + } 10590 + 10591 + /* Handle the mode change and transfer current's CID */ 10592 + percpu = !!mm->mm_cid.percpu; 10593 + if (!percpu) 10594 + mm_cid_transit_to_task(current, pcp); 10595 + else 10596 + mm_cid_transfer_to_cpu(current, pcp); 10597 + } 10598 + 10599 + if (percpu) { 10600 + mm_cid_fixup_tasks_to_cpus(); 10601 + } else { 10602 + mm_cid_fixup_cpus_to_tasks(mm); 10603 + t->mm_cid.cid = mm_get_cid(mm); 10604 + } 10774 10605 } 10775 - #endif /* CONFIG_SCHED_MM_CID */ 10606 + 10607 + static bool sched_mm_cid_remove_user(struct task_struct *t) 10608 + { 10609 + t->mm_cid.active = 0; 10610 + scoped_guard(preempt) { 10611 + /* Clear the transition bit */ 10612 + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); 10613 + mm_unset_cid_on_task(t); 10614 + } 10615 + t->mm->mm_cid.users--; 10616 + return mm_update_max_cids(t->mm); 10617 + } 10618 + 10619 + static bool __sched_mm_cid_exit(struct task_struct *t) 10620 + { 10621 + struct mm_struct *mm = t->mm; 10622 + 10623 + if (!sched_mm_cid_remove_user(t)) 10624 + return false; 10625 + /* 10626 + * Contrary to fork() this only deals with a switch back to per 10627 + * task mode either because the above decreased users or an 10628 + * affinity change increased the number of allowed CPUs and the 10629 + * deferred fixup did not run yet. 10630 + */ 10631 + if (WARN_ON_ONCE(mm->mm_cid.percpu)) 10632 + return false; 10633 + /* 10634 + * A failed fork(2) cleanup never gets here, so @current must have 10635 + * the same MM as @t. That's true for exit() and the failed 10636 + * pthread_create() cleanup case. 
10637 + */ 10638 + if (WARN_ON_ONCE(current->mm != mm)) 10639 + return false; 10640 + return true; 10641 + } 10642 + 10643 + /* 10644 + * When a task exits, the MM CID held by the task is not longer required as 10645 + * the task cannot return to user space. 10646 + */ 10647 + void sched_mm_cid_exit(struct task_struct *t) 10648 + { 10649 + struct mm_struct *mm = t->mm; 10650 + 10651 + if (!mm || !t->mm_cid.active) 10652 + return; 10653 + /* 10654 + * Ensure that only one instance is doing MM CID operations within 10655 + * a MM. The common case is uncontended. The rare fixup case adds 10656 + * some overhead. 10657 + */ 10658 + scoped_guard(mutex, &mm->mm_cid.mutex) { 10659 + /* mm_cid::mutex is sufficient to protect mm_cid::users */ 10660 + if (likely(mm->mm_cid.users > 1)) { 10661 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10662 + if (!__sched_mm_cid_exit(t)) 10663 + return; 10664 + /* Mode change required. Transfer currents CID */ 10665 + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); 10666 + } 10667 + mm_cid_fixup_cpus_to_tasks(mm); 10668 + return; 10669 + } 10670 + /* Last user */ 10671 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10672 + /* Required across execve() */ 10673 + if (t == current) 10674 + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); 10675 + /* Ignore mode change. There is nothing to do. */ 10676 + sched_mm_cid_remove_user(t); 10677 + } 10678 + } 10679 + 10680 + /* 10681 + * As this is the last user (execve(), process exit or failed 10682 + * fork(2)) there is no concurrency anymore. 10683 + * 10684 + * Synchronize eventually pending work to ensure that there are no 10685 + * dangling references left. @t->mm_cid.users is zero so nothing 10686 + * can queue this work anymore. 
10687 + */ 10688 + irq_work_sync(&mm->mm_cid.irq_work); 10689 + cancel_work_sync(&mm->mm_cid.work); 10690 + } 10691 + 10692 + /* Deactivate MM CID allocation across execve() */ 10693 + void sched_mm_cid_before_execve(struct task_struct *t) 10694 + { 10695 + sched_mm_cid_exit(t); 10696 + } 10697 + 10698 + /* Reactivate MM CID after successful execve() */ 10699 + void sched_mm_cid_after_execve(struct task_struct *t) 10700 + { 10701 + sched_mm_cid_fork(t); 10702 + } 10703 + 10704 + static void mm_cid_work_fn(struct work_struct *work) 10705 + { 10706 + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); 10707 + 10708 + guard(mutex)(&mm->mm_cid.mutex); 10709 + /* Did the last user task exit already? */ 10710 + if (!mm->mm_cid.users) 10711 + return; 10712 + 10713 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10714 + /* Have fork() or exit() handled it already? */ 10715 + if (!mm->mm_cid.update_deferred) 10716 + return; 10717 + /* This clears mm_cid::update_deferred */ 10718 + if (!mm_update_max_cids(mm)) 10719 + return; 10720 + /* Affinity changes can only switch back to task mode */ 10721 + if (WARN_ON_ONCE(mm->mm_cid.percpu)) 10722 + return; 10723 + } 10724 + mm_cid_fixup_cpus_to_tasks(mm); 10725 + } 10726 + 10727 + static void mm_cid_irq_work(struct irq_work *work) 10728 + { 10729 + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); 10730 + 10731 + /* 10732 + * Needs to be unconditional because mm_cid::lock cannot be held 10733 + * when scheduling work as mm_update_cpus_allowed() nests inside 10734 + * rq::lock and schedule_work() might end up in wakeup... 
10735 + */ 10736 + schedule_work(&mm->mm_cid.work); 10737 + } 10738 + 10739 + void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 10740 + { 10741 + mm->mm_cid.max_cids = 0; 10742 + mm->mm_cid.percpu = 0; 10743 + mm->mm_cid.transit = 0; 10744 + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; 10745 + mm->mm_cid.users = 0; 10746 + mm->mm_cid.pcpu_thrs = 0; 10747 + mm->mm_cid.update_deferred = 0; 10748 + raw_spin_lock_init(&mm->mm_cid.lock); 10749 + mutex_init(&mm->mm_cid.mutex); 10750 + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); 10751 + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); 10752 + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 10753 + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); 10754 + } 10755 + #else /* CONFIG_SCHED_MM_CID */ 10756 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } 10757 + #endif /* !CONFIG_SCHED_MM_CID */ 10776 10758 10777 10759 static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); 10778 10760

+4 -4

kernel/sched/membarrier.c

··· 199 199 * is negligible. 200 200 */ 201 201 smp_mb(); 202 - rseq_preempt(current); 202 + rseq_sched_switch_event(current); 203 203 } 204 204 205 205 static void ipi_sync_rq_state(void *info) ··· 407 407 * membarrier, we will end up with some thread in the mm 408 408 * running without a core sync. 409 409 * 410 - * For RSEQ, don't rseq_preempt() the caller. User code 411 - * is not supposed to issue syscalls at all from inside an 412 - * rseq critical section. 410 + * For RSEQ, don't invoke rseq_sched_switch_event() on the 411 + * caller. User code is not supposed to issue syscalls at 412 + * all from inside an rseq critical section. 413 413 */ 414 414 if (flags != MEMBARRIER_FLAG_SYNC_CORE) { 415 415 preempt_disable();

+193 -263

kernel/sched/sched.h

··· 2223 2223 smp_wmb(); 2224 2224 WRITE_ONCE(task_thread_info(p)->cpu, cpu); 2225 2225 p->wake_cpu = cpu; 2226 + rseq_sched_set_ids_changed(p); 2226 2227 #endif /* CONFIG_SMP */ 2227 2228 } 2228 2229 ··· 3680 3679 3681 3680 #ifdef CONFIG_SCHED_MM_CID 3682 3681 3683 - #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3684 - #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3685 - 3686 - extern raw_spinlock_t cid_lock; 3687 - extern int use_cid_lock; 3688 - 3689 - extern void sched_mm_cid_migrate_from(struct task_struct *t); 3690 - extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3691 - extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3692 - extern void init_sched_mm_cid(struct task_struct *t); 3693 - 3694 - static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3682 + static __always_inline bool cid_on_cpu(unsigned int cid) 3695 3683 { 3696 - if (cid < 0) 3697 - return; 3698 - cpumask_clear_cpu(cid, mm_cidmask(mm)); 3684 + return cid & MM_CID_ONCPU; 3699 3685 } 3700 3686 3701 - /* 3702 - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3703 - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3704 - * be held to transition to other states. 3705 - * 3706 - * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3707 - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
3708 - */ 3709 - static inline void mm_cid_put_lazy(struct task_struct *t) 3687 + static __always_inline bool cid_in_transit(unsigned int cid) 3710 3688 { 3689 + return cid & MM_CID_TRANSIT; 3690 + } 3691 + 3692 + static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) 3693 + { 3694 + return cid & ~MM_CID_ONCPU; 3695 + } 3696 + 3697 + static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) 3698 + { 3699 + return cid | MM_CID_ONCPU; 3700 + } 3701 + 3702 + static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) 3703 + { 3704 + return cid | MM_CID_TRANSIT; 3705 + } 3706 + 3707 + static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) 3708 + { 3709 + return cid & ~MM_CID_TRANSIT; 3710 + } 3711 + 3712 + static __always_inline bool cid_on_task(unsigned int cid) 3713 + { 3714 + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ 3715 + return cid < MM_CID_TRANSIT; 3716 + } 3717 + 3718 + static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) 3719 + { 3720 + clear_bit(cid, mm_cidmask(mm)); 3721 + } 3722 + 3723 + static __always_inline void mm_unset_cid_on_task(struct task_struct *t) 3724 + { 3725 + unsigned int cid = t->mm_cid.cid; 3726 + 3727 + t->mm_cid.cid = MM_CID_UNSET; 3728 + if (cid_on_task(cid)) 3729 + mm_drop_cid(t->mm, cid); 3730 + } 3731 + 3732 + static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) 3733 + { 3734 + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ 3735 + pcp->cid = cpu_cid_to_cid(pcp->cid); 3736 + mm_drop_cid(mm, pcp->cid); 3737 + } 3738 + 3739 + static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) 3740 + { 3741 + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); 3742 + 3743 + if (cid >= max_cids) 3744 + return MM_CID_UNSET; 3745 + if (test_and_set_bit(cid, mm_cidmask(mm))) 3746 + return MM_CID_UNSET; 3747 + return cid; 3748 
+ } 3749 + 3750 + static inline unsigned int mm_get_cid(struct mm_struct *mm) 3751 + { 3752 + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); 3753 + 3754 + while (cid == MM_CID_UNSET) { 3755 + cpu_relax(); 3756 + cid = __mm_get_cid(mm, num_possible_cpus()); 3757 + } 3758 + return cid; 3759 + } 3760 + 3761 + static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, 3762 + unsigned int max_cids) 3763 + { 3764 + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); 3765 + 3766 + /* Is it in the optimal CID space? */ 3767 + if (likely(cid < max_cids)) 3768 + return orig_cid; 3769 + 3770 + /* Try to find one in the optimal space. Otherwise keep the provided. */ 3771 + new_cid = __mm_get_cid(mm, max_cids); 3772 + if (new_cid != MM_CID_UNSET) { 3773 + mm_drop_cid(mm, cid); 3774 + /* Preserve the ONCPU mode of the original CID */ 3775 + return new_cid | (orig_cid & MM_CID_ONCPU); 3776 + } 3777 + return orig_cid; 3778 + } 3779 + 3780 + static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) 3781 + { 3782 + if (t->mm_cid.cid != cid) { 3783 + t->mm_cid.cid = cid; 3784 + rseq_sched_set_ids_changed(t); 3785 + } 3786 + } 3787 + 3788 + static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) 3789 + { 3790 + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); 3791 + } 3792 + 3793 + static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) 3794 + { 3795 + unsigned int max_cids, tcid = t->mm_cid.cid; 3711 3796 struct mm_struct *mm = t->mm; 3712 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3713 - int cid; 3714 3797 3715 - lockdep_assert_irqs_disabled(); 3716 - cid = __this_cpu_read(pcpu_cid->cid); 3717 - if (!mm_cid_is_lazy_put(cid) || 3718 - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3719 - return; 3720 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3721 - } 3722 - 3723 - static inline int 
mm_cid_pcpu_unset(struct mm_struct *mm) 3724 - { 3725 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3726 - int cid, res; 3727 - 3728 - lockdep_assert_irqs_disabled(); 3729 - cid = __this_cpu_read(pcpu_cid->cid); 3730 - for (;;) { 3731 - if (mm_cid_is_unset(cid)) 3732 - return MM_CID_UNSET; 3733 - /* 3734 - * Attempt transition from valid or lazy-put to unset. 3735 - */ 3736 - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); 3737 - if (res == cid) 3738 - break; 3739 - cid = res; 3740 - } 3741 - return cid; 3742 - } 3743 - 3744 - static inline void mm_cid_put(struct mm_struct *mm) 3745 - { 3746 - int cid; 3747 - 3748 - lockdep_assert_irqs_disabled(); 3749 - cid = mm_cid_pcpu_unset(mm); 3750 - if (cid == MM_CID_UNSET) 3751 - return; 3752 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3753 - } 3754 - 3755 - static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) 3756 - { 3757 - struct cpumask *cidmask = mm_cidmask(mm); 3758 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3759 - int cid, max_nr_cid, allowed_max_nr_cid; 3760 - 3761 - /* 3762 - * After shrinking the number of threads or reducing the number 3763 - * of allowed cpus, reduce the value of max_nr_cid so expansion 3764 - * of cid allocation will preserve cache locality if the number 3765 - * of threads or allowed cpus increase again. 3766 - */ 3767 - max_nr_cid = atomic_read(&mm->max_nr_cid); 3768 - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), 3769 - atomic_read(&mm->mm_users))), 3770 - max_nr_cid > allowed_max_nr_cid) { 3771 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. 
*/ 3772 - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { 3773 - max_nr_cid = allowed_max_nr_cid; 3774 - break; 3798 + max_cids = READ_ONCE(mm->mm_cid.max_cids); 3799 + /* Optimize for the common case where both have the ONCPU bit set */ 3800 + if (likely(cid_on_cpu(cpu_cid & tcid))) { 3801 + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { 3802 + mm_cid_update_task_cid(t, cpu_cid); 3803 + return; 3775 3804 } 3776 - } 3777 - /* Try to re-use recent cid. This improves cache locality. */ 3778 - cid = __this_cpu_read(pcpu_cid->recent_cid); 3779 - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && 3780 - !cpumask_test_and_set_cpu(cid, cidmask)) 3781 - return cid; 3782 - /* 3783 - * Expand cid allocation if the maximum number of concurrency 3784 - * IDs allocated (max_nr_cid) is below the number cpus allowed 3785 - * and number of threads. Expanding cid allocation as much as 3786 - * possible improves cache locality. 3787 - */ 3788 - cid = max_nr_cid; 3789 - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { 3790 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ 3791 - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) 3792 - continue; 3793 - if (!cpumask_test_and_set_cpu(cid, cidmask)) 3794 - return cid; 3795 - } 3796 - /* 3797 - * Find the first available concurrency id. 3798 - * Retry finding first zero bit if the mask is temporarily 3799 - * filled. This only happens during concurrent remote-clear 3800 - * which owns a cid without holding a rq lock. 3801 - */ 3802 - for (;;) { 3803 - cid = cpumask_first_zero(cidmask); 3804 - if (cid < READ_ONCE(mm->nr_cpus_allowed)) 3805 - break; 3806 - cpu_relax(); 3807 - } 3808 - if (cpumask_test_and_set_cpu(cid, cidmask)) 3809 - return -1; 3810 - 3811 - return cid; 3812 - } 3813 - 3814 - /* 3815 - * Save a snapshot of the current runqueue time of this cpu 3816 - * with the per-cpu cid value, allowing to estimate how recently it was used. 
3817 - */ 3818 - static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) 3819 - { 3820 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); 3821 - 3822 - lockdep_assert_rq_held(rq); 3823 - WRITE_ONCE(pcpu_cid->time, rq->clock); 3824 - } 3825 - 3826 - static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, 3827 - struct mm_struct *mm) 3828 - { 3829 - int cid; 3830 - 3831 - /* 3832 - * All allocations (even those using the cid_lock) are lock-free. If 3833 - * use_cid_lock is set, hold the cid_lock to perform cid allocation to 3834 - * guarantee forward progress. 3835 - */ 3836 - if (!READ_ONCE(use_cid_lock)) { 3837 - cid = __mm_cid_try_get(t, mm); 3838 - if (cid >= 0) 3839 - goto end; 3840 - raw_spin_lock(&cid_lock); 3805 + /* Try to converge into the optimal CID space */ 3806 + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); 3841 3807 } else { 3842 - raw_spin_lock(&cid_lock); 3843 - cid = __mm_cid_try_get(t, mm); 3844 - if (cid >= 0) 3845 - goto unlock; 3846 - } 3847 - 3848 - /* 3849 - * cid concurrently allocated. Retry while forcing following 3850 - * allocations to use the cid_lock to ensure forward progress. 3851 - */ 3852 - WRITE_ONCE(use_cid_lock, 1); 3853 - /* 3854 - * Set use_cid_lock before allocation. Only care about program order 3855 - * because this is only required for forward progress. 3856 - */ 3857 - barrier(); 3858 - /* 3859 - * Retry until it succeeds. It is guaranteed to eventually succeed once 3860 - * all newcoming allocations observe the use_cid_lock flag set. 3861 - */ 3862 - do { 3863 - cid = __mm_cid_try_get(t, mm); 3864 - cpu_relax(); 3865 - } while (cid < 0); 3866 - /* 3867 - * Allocate before clearing use_cid_lock. Only care about 3868 - * program order because this is for forward progress. 
3869 - */ 3870 - barrier(); 3871 - WRITE_ONCE(use_cid_lock, 0); 3872 - unlock: 3873 - raw_spin_unlock(&cid_lock); 3874 - end: 3875 - mm_cid_snapshot_time(rq, mm); 3876 - 3877 - return cid; 3878 - } 3879 - 3880 - static inline int mm_cid_get(struct rq *rq, struct task_struct *t, 3881 - struct mm_struct *mm) 3882 - { 3883 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3884 - int cid; 3885 - 3886 - lockdep_assert_rq_held(rq); 3887 - cid = __this_cpu_read(pcpu_cid->cid); 3888 - if (mm_cid_is_valid(cid)) { 3889 - mm_cid_snapshot_time(rq, mm); 3890 - return cid; 3891 - } 3892 - if (mm_cid_is_lazy_put(cid)) { 3893 - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3894 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3895 - } 3896 - cid = __mm_cid_get(rq, t, mm); 3897 - __this_cpu_write(pcpu_cid->cid, cid); 3898 - __this_cpu_write(pcpu_cid->recent_cid, cid); 3899 - 3900 - return cid; 3901 - } 3902 - 3903 - static inline void switch_mm_cid(struct rq *rq, 3904 - struct task_struct *prev, 3905 - struct task_struct *next) 3906 - { 3907 - /* 3908 - * Provide a memory barrier between rq->curr store and load of 3909 - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. 3910 - * 3911 - * Should be adapted if context_switch() is modified. 3912 - */ 3913 - if (!next->mm) { // to kernel 3914 - /* 3915 - * user -> kernel transition does not guarantee a barrier, but 3916 - * we can use the fact that it performs an atomic operation in 3917 - * mmgrab(). 3918 - */ 3919 - if (prev->mm) // from user 3920 - smp_mb__after_mmgrab(); 3921 - /* 3922 - * kernel -> kernel transition does not change rq->curr->mm 3923 - * state. It stays NULL. 3924 - */ 3925 - } else { // to user 3926 - /* 3927 - * kernel -> user transition does not provide a barrier 3928 - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. 3929 - * Provide it here. 
3930 - */ 3931 - if (!prev->mm) { // from kernel 3932 - smp_mb(); 3933 - } else { // from user 3934 - /* 3935 - * user->user transition relies on an implicit 3936 - * memory barrier in switch_mm() when 3937 - * current->mm changes. If the architecture 3938 - * switch_mm() does not have an implicit memory 3939 - * barrier, it is emitted here. If current->mm 3940 - * is unchanged, no barrier is needed. 3941 - */ 3942 - smp_mb__after_switch_mm(); 3808 + /* Hand over or drop the task owned CID */ 3809 + if (cid_on_task(tcid)) { 3810 + if (cid_on_cpu(cpu_cid)) 3811 + mm_unset_cid_on_task(t); 3812 + else 3813 + cpu_cid = cid_to_cpu_cid(tcid); 3943 3814 } 3815 + /* Still nothing, allocate a new one */ 3816 + if (!cid_on_cpu(cpu_cid)) 3817 + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); 3944 3818 } 3945 - if (prev->mm_cid_active) { 3946 - mm_cid_snapshot_time(rq, prev->mm); 3947 - mm_cid_put_lazy(prev); 3948 - prev->mm_cid = -1; 3819 + mm_cid_update_pcpu_cid(mm, cpu_cid); 3820 + mm_cid_update_task_cid(t, cpu_cid); 3821 + } 3822 + 3823 + static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) 3824 + { 3825 + unsigned int max_cids, tcid = t->mm_cid.cid; 3826 + struct mm_struct *mm = t->mm; 3827 + 3828 + max_cids = READ_ONCE(mm->mm_cid.max_cids); 3829 + /* Optimize for the common case, where both have the ONCPU bit clear */ 3830 + if (likely(cid_on_task(tcid | cpu_cid))) { 3831 + if (likely(tcid < max_cids)) { 3832 + mm_cid_update_pcpu_cid(mm, tcid); 3833 + return; 3834 + } 3835 + /* Try to converge into the optimal CID space */ 3836 + tcid = mm_cid_converge(mm, tcid, max_cids); 3837 + } else { 3838 + /* Hand over or drop the CPU owned CID */ 3839 + if (cid_on_cpu(cpu_cid)) { 3840 + if (cid_on_task(tcid)) 3841 + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); 3842 + else 3843 + tcid = cpu_cid_to_cid(cpu_cid); 3844 + } 3845 + /* Still nothing, allocate a new one */ 3846 + if (!cid_on_task(tcid)) 3847 + tcid = mm_get_cid(mm); 3848 + /* Set 
the transition mode flag if required */ 3849 + tcid |= READ_ONCE(mm->mm_cid.transit); 3949 3850 } 3950 - if (next->mm_cid_active) 3951 - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3851 + mm_cid_update_pcpu_cid(mm, tcid); 3852 + mm_cid_update_task_cid(t, tcid); 3853 + } 3854 + 3855 + static __always_inline void mm_cid_schedin(struct task_struct *next) 3856 + { 3857 + struct mm_struct *mm = next->mm; 3858 + unsigned int cpu_cid; 3859 + 3860 + if (!next->mm_cid.active) 3861 + return; 3862 + 3863 + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); 3864 + if (likely(!READ_ONCE(mm->mm_cid.percpu))) 3865 + mm_cid_from_task(next, cpu_cid); 3866 + else 3867 + mm_cid_from_cpu(next, cpu_cid); 3868 + } 3869 + 3870 + static __always_inline void mm_cid_schedout(struct task_struct *prev) 3871 + { 3872 + /* During mode transitions CIDs are temporary and need to be dropped */ 3873 + if (likely(!cid_in_transit(prev->mm_cid.cid))) 3874 + return; 3875 + 3876 + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); 3877 + prev->mm_cid.cid = MM_CID_UNSET; 3878 + } 3879 + 3880 + static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) 3881 + { 3882 + mm_cid_schedout(prev); 3883 + mm_cid_schedin(next); 3952 3884 } 3953 3885 3954 3886 #else /* !CONFIG_SCHED_MM_CID: */ 3955 - static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } 3956 - static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } 3957 - static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } 3958 - static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } 3959 - static inline void init_sched_mm_cid(struct task_struct *t) { } 3887 + static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } 3960 3888 #endif /* !CONFIG_SCHED_MM_CID */ 3961 3889 3962 3890 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);

-2

kernel/signal.c

··· 3125 3125 cgroup_threadgroup_change_begin(tsk); 3126 3126 3127 3127 if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { 3128 - sched_mm_cid_exit_signals(tsk); 3129 3128 tsk->flags |= PF_EXITING; 3130 3129 cgroup_threadgroup_change_end(tsk); 3131 3130 return; ··· 3135 3136 * From now this task is not visible for group-wide signals, 3136 3137 * see wants_signal(), do_signal_stop(). 3137 3138 */ 3138 - sched_mm_cid_exit_signals(tsk); 3139 3139 tsk->flags |= PF_EXITING; 3140 3140 3141 3141 cgroup_threadgroup_change_end(tsk);

+6

lib/bitmap.c

··· 355 355 } 356 356 EXPORT_SYMBOL(__bitmap_weight_andnot); 357 357 358 + unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, 359 + const unsigned long *bitmap2, unsigned int bits) 360 + { 361 + return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); 362 + } 363 + 358 364 void __bitmap_set(unsigned long *map, unsigned int start, int len) 359 365 { 360 366 unsigned long *p = map + BIT_WORD(start);

+7

virt/kvm/kvm_main.c

··· 49 49 #include <linux/lockdep.h> 50 50 #include <linux/kthread.h> 51 51 #include <linux/suspend.h> 52 + #include <linux/rseq.h> 52 53 53 54 #include <asm/processor.h> 54 55 #include <asm/ioctl.h> ··· 4476 4475 vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); 4477 4476 r = kvm_arch_vcpu_ioctl_run(vcpu); 4478 4477 vcpu->wants_to_run = false; 4478 + 4479 + /* 4480 + * FIXME: Remove this hack once all KVM architectures 4481 + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. 4482 + */ 4483 + rseq_virt_userspace_exit(); 4479 4484 4480 4485 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 4481 4486 break;

Configure Feed

Configure Feed