Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull rseq updates from Thomas Gleixner:
"A large overhaul of the restartable sequences and CID management:

The recent enablement of RSEQ in glibc resulted in regressions which
are caused by the related overhead. It turned out that the decision to
invoke the exit to user work was not really a decision. More or less
each context switch caused that. There is a long list of small issues
which sums up nicely and results in a 3-4% regression in I/O
benchmarks.

The other detail which caused issues due to extra work in context
switch and task migration is the CID (memory context ID) management.
It also requires the use of task work to consolidate the CID space,
which is executed in the context of an arbitrary task and results in
sporadic uncontrolled exit latencies.

The rewrite addresses this by:

- Removing deprecated and long unsupported functionality

- Moving the related data into dedicated data structures which are
optimized for fast path processing.

- Caching values so actual decisions can be made

- Replacing the current implementation with an optimized inlined
variant.

- Separating fast and slow path for architectures which use the
generic entry code, so that only fault and error handling goes into
the TIF_NOTIFY_RESUME handler.

- Rewriting the CID management so that it becomes mostly invisible in
the context switch path. That moves the work of switching modes
into the fork/exit path, which is a reasonable tradeoff. That work
is only required when a process creates more threads than there are
CPUs in the cpuset it is allowed to run on, or when enough threads exit
after that. An artificial thread pool benchmark which triggers this did
not degrade; it actually improved significantly.

The main effect in migration heavy scenarios is that runqueue lock
held time and therefore contention goes down significantly"

* tag 'core-rseq-2025-11-30' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
sched/mmcid: Switch over to the new mechanism
sched/mmcid: Implement deferred mode change
irqwork: Move data struct to a types header
sched/mmcid: Provide CID ownership mode fixup functions
sched/mmcid: Provide new scheduler CID mechanism
sched/mmcid: Introduce per task/CPU ownership infrastructure
sched/mmcid: Serialize sched_mm_cid_fork()/exit() with a mutex
sched/mmcid: Provide precomputed maximal value
sched/mmcid: Move initialization out of line
signal: Move MMCID exit out of sighand lock
sched/mmcid: Convert mm CID mask to a bitmap
cpumask: Cache num_possible_cpus()
sched/mmcid: Use cpumask_weighted_or()
cpumask: Introduce cpumask_weighted_or()
sched/mmcid: Prevent pointless work in mm_update_cpus_allowed()
sched/mmcid: Move scheduler code out of global header
sched: Fixup whitespace damage
sched/mmcid: Cacheline align MM CID storage
sched/mmcid: Use proper data structures
sched/mmcid: Revert the complex CID management
...

+2152 -1517
+4
Documentation/admin-guide/kernel-parameters.txt
··· 6500 6500 Memory area to be used by remote processor image, 6501 6501 managed by CMA. 6502 6502 6503 + rseq_debug= [KNL] Enable or disable restartable sequence 6504 + debug mode. Defaults to CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE. 6505 + Format: <bool> 6506 + 6503 6507 rt_group_sched= [KNL] Enable or disable SCHED_RR/FIFO group scheduling 6504 6508 when CONFIG_RT_GROUP_SCHED=y. Defaults to 6505 6509 !CONFIG_RT_GROUP_SCHED_DEFAULT_DISABLED.
+1 -1
arch/arm64/kernel/entry-common.c
··· 100 100 static __always_inline void arm64_exit_to_user_mode(struct pt_regs *regs) 101 101 { 102 102 local_irq_disable(); 103 - exit_to_user_mode_prepare(regs); 103 + exit_to_user_mode_prepare_legacy(regs); 104 104 local_daif_mask(); 105 105 mte_check_tfsr_exit(); 106 106 exit_to_user_mode();
+2 -1
arch/x86/entry/syscall_32.c
··· 274 274 * fetch EBP before invoking any of the syscall entry work 275 275 * functions. 276 276 */ 277 - syscall_enter_from_user_mode_prepare(regs); 277 + enter_from_user_mode(regs); 278 278 279 279 instrumentation_begin(); 280 + local_irq_enable(); 280 281 /* Fetch EBP from where the vDSO stashed it. */ 281 282 if (IS_ENABLED(CONFIG_X86_64)) { 282 283 /*
+10 -10
arch/x86/include/asm/ptrace.h
··· 187 187 extern void send_sigtrap(struct pt_regs *regs, int error_code, int si_code); 188 188 189 189 190 - static inline unsigned long regs_return_value(struct pt_regs *regs) 190 + static __always_inline unsigned long regs_return_value(struct pt_regs *regs) 191 191 { 192 192 return regs->ax; 193 193 } 194 194 195 - static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) 195 + static __always_inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) 196 196 { 197 197 regs->ax = rc; 198 198 } ··· 277 277 } 278 278 #endif 279 279 280 - static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) 280 + static __always_inline unsigned long kernel_stack_pointer(struct pt_regs *regs) 281 281 { 282 282 return regs->sp; 283 283 } 284 284 285 - static inline unsigned long instruction_pointer(struct pt_regs *regs) 285 + static __always_inline unsigned long instruction_pointer(struct pt_regs *regs) 286 286 { 287 287 return regs->ip; 288 288 } 289 289 290 - static inline void instruction_pointer_set(struct pt_regs *regs, 291 - unsigned long val) 290 + static __always_inline 291 + void instruction_pointer_set(struct pt_regs *regs, unsigned long val) 292 292 { 293 293 regs->ip = val; 294 294 } 295 295 296 - static inline unsigned long frame_pointer(struct pt_regs *regs) 296 + static __always_inline unsigned long frame_pointer(struct pt_regs *regs) 297 297 { 298 298 return regs->bp; 299 299 } 300 300 301 - static inline unsigned long user_stack_pointer(struct pt_regs *regs) 301 + static __always_inline unsigned long user_stack_pointer(struct pt_regs *regs) 302 302 { 303 303 return regs->sp; 304 304 } 305 305 306 - static inline void user_stack_pointer_set(struct pt_regs *regs, 307 - unsigned long val) 306 + static __always_inline 307 + void user_stack_pointer_set(struct pt_regs *regs, unsigned long val) 308 308 { 309 309 regs->sp = val; 310 310 }
+3
drivers/hv/mshv_root_main.c
··· 29 29 #include <linux/crash_dump.h> 30 30 #include <linux/panic_notifier.h> 31 31 #include <linux/vmalloc.h> 32 + #include <linux/rseq.h> 32 33 33 34 #include "mshv_eventfd.h" 34 35 #include "mshv.h" ··· 560 559 vp->run.flags.intercept_suspend = 1; 561 560 } 562 561 } while (!vp->run.flags.intercept_suspend); 562 + 563 + rseq_virt_userspace_exit(); 563 564 564 565 return ret; 565 566 }
+1 -1
fs/binfmt_elf.c
··· 46 46 #include <linux/cred.h> 47 47 #include <linux/dax.h> 48 48 #include <linux/uaccess.h> 49 - #include <linux/rseq.h> 49 + #include <uapi/linux/rseq.h> 50 50 #include <asm/param.h> 51 51 #include <asm/page.h> 52 52
+1 -1
fs/exec.c
··· 1774 1774 force_fatal_sig(SIGSEGV); 1775 1775 1776 1776 sched_mm_cid_after_execve(current); 1777 - rseq_set_notify_resume(current); 1777 + rseq_force_update(); 1778 1778 current->in_execve = 0; 1779 1779 1780 1780 return retval;
+3
include/asm-generic/thread_info_tif.h
··· 45 45 # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) 46 46 #endif 47 47 48 + #define TIF_RSEQ 11 // Run RSEQ fast path 49 + #define _TIF_RSEQ BIT(TIF_RSEQ) 50 + 48 51 #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */
+15
include/linux/bitmap.h
··· 45 45 * bitmap_copy(dst, src, nbits) *dst = *src 46 46 * bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2 47 47 * bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2 48 + * bitmap_weighted_or(dst, src1, src2, nbits) *dst = *src1 | *src2. Returns Hamming Weight of dst 48 49 * bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2 49 50 * bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2) 50 51 * bitmap_complement(dst, src, nbits) *dst = ~(*src) ··· 166 165 const unsigned long *bitmap2, unsigned int nbits); 167 166 void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, 168 167 const unsigned long *bitmap2, unsigned int nbits); 168 + unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, 169 + const unsigned long *bitmap2, unsigned int nbits); 169 170 void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, 170 171 const unsigned long *bitmap2, unsigned int nbits); 171 172 bool __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, ··· 338 335 *dst = *src1 | *src2; 339 336 else 340 337 __bitmap_or(dst, src1, src2, nbits); 338 + } 339 + 340 + static __always_inline 341 + unsigned int bitmap_weighted_or(unsigned long *dst, const unsigned long *src1, 342 + const unsigned long *src2, unsigned int nbits) 343 + { 344 + if (small_const_nbits(nbits)) { 345 + *dst = *src1 | *src2; 346 + return hweight_long(*dst & BITMAP_LAST_WORD_MASK(nbits)); 347 + } else { 348 + return __bitmap_weighted_or(dst, src1, src2, nbits); 349 + } 341 350 } 342 351 343 352 static __always_inline
+15 -15
include/linux/cleanup.h
··· 208 208 */ 209 209 210 210 #define DEFINE_FREE(_name, _type, _free) \ 211 - static inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } 211 + static __always_inline void __free_##_name(void *p) { _type _T = *(_type *)p; _free; } 212 212 213 213 #define __free(_name) __cleanup(__free_##_name) 214 214 ··· 220 220 __val; \ 221 221 }) 222 222 223 - static inline __must_check 223 + static __always_inline __must_check 224 224 const volatile void * __must_check_fn(const volatile void *val) 225 225 { return val; } 226 226 ··· 278 278 279 279 #define DEFINE_CLASS(_name, _type, _exit, _init, _init_args...) \ 280 280 typedef _type class_##_name##_t; \ 281 - static inline void class_##_name##_destructor(_type *p) \ 281 + static __always_inline void class_##_name##_destructor(_type *p) \ 282 282 { _type _T = *p; _exit; } \ 283 - static inline _type class_##_name##_constructor(_init_args) \ 283 + static __always_inline _type class_##_name##_constructor(_init_args) \ 284 284 { _type t = _init; return t; } 285 285 286 286 #define EXTEND_CLASS(_name, ext, _init, _init_args...) 
\ 287 287 typedef class_##_name##_t class_##_name##ext##_t; \ 288 - static inline void class_##_name##ext##_destructor(class_##_name##_t *p)\ 288 + static __always_inline void class_##_name##ext##_destructor(class_##_name##_t *p) \ 289 289 { class_##_name##_destructor(p); } \ 290 - static inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ 290 + static __always_inline class_##_name##_t class_##_name##ext##_constructor(_init_args) \ 291 291 { class_##_name##_t t = _init; return t; } 292 292 293 293 #define CLASS(_name, var) \ ··· 360 360 }) 361 361 362 362 #define __DEFINE_GUARD_LOCK_PTR(_name, _exp) \ 363 - static inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ 363 + static __always_inline void *class_##_name##_lock_ptr(class_##_name##_t *_T) \ 364 364 { \ 365 365 void *_ptr = (void *)(__force unsigned long)*(_exp); \ 366 366 if (IS_ERR(_ptr)) { \ ··· 368 368 } \ 369 369 return _ptr; \ 370 370 } \ 371 - static inline int class_##_name##_lock_err(class_##_name##_t *_T) \ 371 + static __always_inline int class_##_name##_lock_err(class_##_name##_t *_T) \ 372 372 { \ 373 373 long _rc = (__force unsigned long)*(_exp); \ 374 374 if (!_rc) { \ ··· 397 397 EXTEND_CLASS(_name, _ext, \ 398 398 ({ void *_t = _T; int _RET = (_lock); if (_T && !(_cond)) _t = ERR_PTR(_RET); _t; }), \ 399 399 class_##_name##_t _T) \ 400 - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 400 + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 401 401 { return class_##_name##_lock_ptr(_T); } \ 402 - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 402 + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 403 403 { return class_##_name##_lock_err(_T); } 404 404 405 405 /* ··· 479 479 __VA_ARGS__; \ 480 480 } class_##_name##_t; \ 481 481 \ 482 - static inline void class_##_name##_destructor(class_##_name##_t *_T) \ 482 + static __always_inline void 
class_##_name##_destructor(class_##_name##_t *_T) \ 483 483 { \ 484 484 if (!__GUARD_IS_ERR(_T->lock)) { _unlock; } \ 485 485 } \ ··· 487 487 __DEFINE_GUARD_LOCK_PTR(_name, &_T->lock) 488 488 489 489 #define __DEFINE_LOCK_GUARD_1(_name, _type, _lock) \ 490 - static inline class_##_name##_t class_##_name##_constructor(_type *l) \ 490 + static __always_inline class_##_name##_t class_##_name##_constructor(_type *l) \ 491 491 { \ 492 492 class_##_name##_t _t = { .lock = l }, *_T = &_t; \ 493 493 _lock; \ ··· 495 495 } 496 496 497 497 #define __DEFINE_LOCK_GUARD_0(_name, _lock) \ 498 - static inline class_##_name##_t class_##_name##_constructor(void) \ 498 + static __always_inline class_##_name##_t class_##_name##_constructor(void) \ 499 499 { \ 500 500 class_##_name##_t _t = { .lock = (void*)1 }, \ 501 501 *_T __maybe_unused = &_t; \ ··· 521 521 if (_T->lock && !(_cond)) _T->lock = ERR_PTR(_RET);\ 522 522 _t; }), \ 523 523 typeof_member(class_##_name##_t, lock) l) \ 524 - static inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 524 + static __always_inline void * class_##_name##_ext##_lock_ptr(class_##_name##_t *_T) \ 525 525 { return class_##_name##_lock_ptr(_T); } \ 526 - static inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 526 + static __always_inline int class_##_name##_ext##_lock_err(class_##_name##_t *_T) \ 527 527 { return class_##_name##_lock_err(_T); } 528 528 529 529 #define DEFINE_LOCK_GUARD_1_COND_3(_name, _ext, _lock) \
+24 -2
include/linux/cpumask.h
··· 126 126 #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) 127 127 128 128 extern atomic_t __num_online_cpus; 129 + extern unsigned int __num_possible_cpus; 129 130 130 131 extern cpumask_t cpus_booted_once_mask; 131 132 ··· 730 729 } 731 730 732 731 /** 732 + * cpumask_weighted_or - *dstp = *src1p | *src2p and return the weight of the result 733 + * @dstp: the cpumask result 734 + * @src1p: the first input 735 + * @src2p: the second input 736 + * 737 + * Return: The number of bits set in the resulting cpumask @dstp 738 + */ 739 + static __always_inline 740 + unsigned int cpumask_weighted_or(struct cpumask *dstp, const struct cpumask *src1p, 741 + const struct cpumask *src2p) 742 + { 743 + return bitmap_weighted_or(cpumask_bits(dstp), cpumask_bits(src1p), 744 + cpumask_bits(src2p), small_cpumask_bits); 745 + } 746 + 747 + /** 733 748 * cpumask_xor - *dstp = *src1p ^ *src2p 734 749 * @dstp: the cpumask result 735 750 * @src1p: the first input ··· 1153 1136 #define __assign_cpu(cpu, mask, val) \ 1154 1137 __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) 1155 1138 1156 - #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) 1157 1139 #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) 1158 1140 #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) 1159 1141 #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) 1160 1142 #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) 1161 1143 1162 1144 void set_cpu_online(unsigned int cpu, bool online); 1145 + void set_cpu_possible(unsigned int cpu, bool possible); 1163 1146 1164 1147 /** 1165 1148 * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * ··· 1212 1195 { 1213 1196 return raw_atomic_read(&__num_online_cpus); 1214 1197 } 1215 - #define num_possible_cpus() cpumask_weight(cpu_possible_mask) 1198 + 1199 + static 
__always_inline unsigned int num_possible_cpus(void) 1200 + { 1201 + return __num_possible_cpus; 1202 + } 1203 + 1216 1204 #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) 1217 1205 #define num_present_cpus() cpumask_weight(cpu_present_mask) 1218 1206 #define num_active_cpus() cpumask_weight(cpu_active_mask)
+11 -27
include/linux/entry-common.h
··· 3 3 #define __LINUX_ENTRYCOMMON_H 4 4 5 5 #include <linux/irq-entry-common.h> 6 + #include <linux/livepatch.h> 6 7 #include <linux/ptrace.h> 8 + #include <linux/resume_user_mode.h> 7 9 #include <linux/seccomp.h> 8 10 #include <linux/sched.h> 9 - #include <linux/livepatch.h> 10 - #include <linux/resume_user_mode.h> 11 11 12 12 #include <asm/entry-common.h> 13 13 #include <asm/syscall.h> ··· 37 37 SYSCALL_WORK_SYSCALL_AUDIT | \ 38 38 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ 39 39 ARCH_SYSCALL_WORK_ENTER) 40 + 40 41 #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ 41 42 SYSCALL_WORK_SYSCALL_TRACE | \ 42 43 SYSCALL_WORK_SYSCALL_AUDIT | \ ··· 45 44 SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ 46 45 ARCH_SYSCALL_WORK_EXIT) 47 46 48 - /** 49 - * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts 50 - * @regs: Pointer to currents pt_regs 51 - * 52 - * Invoked from architecture specific syscall entry code with interrupts 53 - * disabled. The calling code has to be non-instrumentable. When the 54 - * function returns all state is correct, interrupts are enabled and the 55 - * subsequent functions can be instrumented. 56 - * 57 - * This handles lockdep, RCU (context tracking) and tracing state, i.e. 58 - * the functionality provided by enter_from_user_mode(). 59 - * 60 - * This is invoked when there is extra architecture specific functionality 61 - * to be done between establishing state and handling user mode entry work. 
62 - */ 63 - void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); 64 - 65 - long syscall_trace_enter(struct pt_regs *regs, long syscall, 66 - unsigned long work); 47 + long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work); 67 48 68 49 /** 69 50 * syscall_enter_from_user_mode_work - Check and handle work before invoking ··· 54 71 * @syscall: The syscall number 55 72 * 56 73 * Invoked from architecture specific syscall entry code with interrupts 57 - * enabled after invoking syscall_enter_from_user_mode_prepare() and extra 58 - * architecture specific work. 74 + * enabled after invoking enter_from_user_mode(), enabling interrupts and 75 + * extra architecture specific work. 59 76 * 60 77 * Returns: The original or a modified syscall number 61 78 * ··· 91 108 * function returns all state is correct, interrupts are enabled and the 92 109 * subsequent functions can be instrumented. 93 110 * 94 - * This is combination of syscall_enter_from_user_mode_prepare() and 95 - * syscall_enter_from_user_mode_work(). 111 + * This is the combination of enter_from_user_mode() and 112 + * syscall_enter_from_user_mode_work() to be used when there is no 113 + * architecture specific work to be done between the two. 96 114 * 97 115 * Returns: The original or a modified syscall number. See 98 116 * syscall_enter_from_user_mode_work() for further explanation. ··· 146 162 local_irq_enable(); 147 163 } 148 164 149 - rseq_syscall(regs); 165 + rseq_debug_syscall_return(regs); 150 166 151 167 /* 152 168 * Do one-time syscall specific work. If these work items are ··· 156 172 if (unlikely(work & SYSCALL_WORK_EXIT)) 157 173 syscall_exit_work(regs, work); 158 174 local_irq_disable_exit_to_user(); 159 - exit_to_user_mode_prepare(regs); 175 + syscall_exit_to_user_mode_prepare(regs); 160 176 } 161 177 162 178 /**
+63 -12
include/linux/irq-entry-common.h
··· 2 2 #ifndef __LINUX_IRQENTRYCOMMON_H 3 3 #define __LINUX_IRQENTRYCOMMON_H 4 4 5 + #include <linux/context_tracking.h> 6 + #include <linux/kmsan.h> 7 + #include <linux/rseq_entry.h> 5 8 #include <linux/static_call_types.h> 6 9 #include <linux/syscalls.h> 7 - #include <linux/context_tracking.h> 8 10 #include <linux/tick.h> 9 - #include <linux/kmsan.h> 10 11 #include <linux/unwind_deferred.h> 11 12 12 13 #include <asm/entry-common.h> ··· 30 29 #define EXIT_TO_USER_MODE_WORK \ 31 30 (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ 32 31 _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ 33 - _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ 32 + _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ 34 33 ARCH_EXIT_TO_USER_MODE_WORK) 35 34 36 35 /** ··· 68 67 69 68 /** 70 69 * enter_from_user_mode - Establish state when coming from user mode 70 + * @regs: Pointer to currents pt_regs 71 71 * 72 72 * Syscall/interrupt entry disables interrupts, but user mode is traced as 73 73 * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. 
··· 197 195 */ 198 196 void arch_do_signal_or_restart(struct pt_regs *regs); 199 197 200 - /** 201 - * exit_to_user_mode_loop - do any pending work before leaving to user space 202 - */ 203 - unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 204 - unsigned long ti_work); 198 + /* Handle pending TIF work */ 199 + unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work); 205 200 206 201 /** 207 - * exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 202 + * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 208 203 * @regs: Pointer to pt_regs on entry stack 209 204 * 210 205 * 1) check that interrupts are disabled ··· 209 210 * 3) call exit_to_user_mode_loop() if any flags from 210 211 * EXIT_TO_USER_MODE_WORK are set 211 212 * 4) check that interrupts are still disabled 213 + * 214 + * Don't invoke directly, use the syscall/irqentry_ prefixed variants below 212 215 */ 213 - static __always_inline void exit_to_user_mode_prepare(struct pt_regs *regs) 216 + static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) 214 217 { 215 218 unsigned long ti_work; 216 219 ··· 226 225 ti_work = exit_to_user_mode_loop(regs, ti_work); 227 226 228 227 arch_exit_to_user_mode_prepare(regs, ti_work); 228 + } 229 229 230 + static __always_inline void __exit_to_user_mode_validate(void) 231 + { 230 232 /* Ensure that kernel state is sane for a return to userspace */ 231 233 kmap_assert_nomap(); 232 234 lockdep_assert_irqs_disabled(); 233 235 lockdep_sys_exit(); 236 + } 237 + 238 + /* Temporary workaround to keep ARM64 alive */ 239 + static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) 240 + { 241 + __exit_to_user_mode_prepare(regs); 242 + rseq_exit_to_user_mode_legacy(); 243 + __exit_to_user_mode_validate(); 244 + } 245 + 246 + /** 247 + * syscall_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 248 + * @regs: Pointer to pt_regs on entry stack 249 
+ * 250 + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for 251 + * syscalls and interrupts. 252 + */ 253 + static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) 254 + { 255 + __exit_to_user_mode_prepare(regs); 256 + rseq_syscall_exit_to_user_mode(); 257 + __exit_to_user_mode_validate(); 258 + } 259 + 260 + /** 261 + * irqentry_exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required 262 + * @regs: Pointer to pt_regs on entry stack 263 + * 264 + * Wrapper around __exit_to_user_mode_prepare() to separate the exit work for 265 + * syscalls and interrupts. 266 + */ 267 + static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) 268 + { 269 + __exit_to_user_mode_prepare(regs); 270 + rseq_irqentry_exit_to_user_mode(); 271 + __exit_to_user_mode_validate(); 234 272 } 235 273 236 274 /** ··· 314 274 * 315 275 * The function establishes state (lockdep, RCU (context tracking), tracing) 316 276 */ 317 - void irqentry_enter_from_user_mode(struct pt_regs *regs); 277 + static __always_inline void irqentry_enter_from_user_mode(struct pt_regs *regs) 278 + { 279 + enter_from_user_mode(regs); 280 + rseq_note_user_irq_entry(); 281 + } 318 282 319 283 /** 320 284 * irqentry_exit_to_user_mode - Interrupt exit work ··· 333 289 * Interrupt exit is not invoking #1 which is the syscall specific one time 334 290 * work. 335 291 */ 336 - void irqentry_exit_to_user_mode(struct pt_regs *regs); 292 + static __always_inline void irqentry_exit_to_user_mode(struct pt_regs *regs) 293 + { 294 + instrumentation_begin(); 295 + irqentry_exit_to_user_mode_prepare(regs); 296 + instrumentation_end(); 297 + exit_to_user_mode(); 298 + } 337 299 338 300 #ifndef irqentry_state 339 301 /** ··· 404 354 * Conditional reschedule with additional sanity checks. 
405 355 */ 406 356 void raw_irqentry_exit_cond_resched(void); 357 + 407 358 #ifdef CONFIG_PREEMPT_DYNAMIC 408 359 #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) 409 360 #define irqentry_exit_cond_resched_dynamic_enabled raw_irqentry_exit_cond_resched
+2 -7
include/linux/irq_work.h
··· 2 2 #ifndef _LINUX_IRQ_WORK_H 3 3 #define _LINUX_IRQ_WORK_H 4 4 5 - #include <linux/smp_types.h> 5 + #include <linux/irq_work_types.h> 6 6 #include <linux/rcuwait.h> 7 + #include <linux/smp_types.h> 7 8 8 9 /* 9 10 * An entry can be in one of four states: ··· 14 13 * pending next, 3 -> {busy} : queued, pending callback 15 14 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 16 15 */ 17 - 18 - struct irq_work { 19 - struct __call_single_node node; 20 - void (*func)(struct irq_work *); 21 - struct rcuwait irqwait; 22 - }; 23 16 24 17 #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ 25 18 .node = { .u_flags = (_flags), }, \
+14
include/linux/irq_work_types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_IRQ_WORK_TYPES_H 3 + #define _LINUX_IRQ_WORK_TYPES_H 4 + 5 + #include <linux/smp_types.h> 6 + #include <linux/types.h> 7 + 8 + struct irq_work { 9 + struct __call_single_node node; 10 + void (*func)(struct irq_work *); 11 + struct rcuwait irqwait; 12 + }; 13 + 14 + #endif
-25
include/linux/mm.h
··· 2408 2408 /* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */ 2409 2409 #define ZAP_FLAG_UNMAP ((__force zap_flags_t) BIT(1)) 2410 2410 2411 - #ifdef CONFIG_SCHED_MM_CID 2412 - void sched_mm_cid_before_execve(struct task_struct *t); 2413 - void sched_mm_cid_after_execve(struct task_struct *t); 2414 - void sched_mm_cid_fork(struct task_struct *t); 2415 - void sched_mm_cid_exit_signals(struct task_struct *t); 2416 - static inline int task_mm_cid(struct task_struct *t) 2417 - { 2418 - return t->mm_cid; 2419 - } 2420 - #else 2421 - static inline void sched_mm_cid_before_execve(struct task_struct *t) { } 2422 - static inline void sched_mm_cid_after_execve(struct task_struct *t) { } 2423 - static inline void sched_mm_cid_fork(struct task_struct *t) { } 2424 - static inline void sched_mm_cid_exit_signals(struct task_struct *t) { } 2425 - static inline int task_mm_cid(struct task_struct *t) 2426 - { 2427 - /* 2428 - * Use the processor id as a fall-back when the mm cid feature is 2429 - * disabled. This provides functional per-cpu data structure accesses 2430 - * in user-space, althrough it won't provide the memory usage benefits. 2431 - */ 2432 - return raw_smp_processor_id(); 2433 - } 2434 - #endif 2435 - 2436 2411 #ifdef CONFIG_MMU 2437 2412 extern bool can_do_mlock(void); 2438 2413 #else
+13 -115
include/linux/mm_types.h
··· 20 20 #include <linux/seqlock.h> 21 21 #include <linux/percpu_counter.h> 22 22 #include <linux/types.h> 23 + #include <linux/rseq_types.h> 23 24 #include <linux/bitmap.h> 24 25 25 26 #include <asm/mmu.h> ··· 923 922 #define vma_policy(vma) NULL 924 923 #endif 925 924 926 - #ifdef CONFIG_SCHED_MM_CID 927 - struct mm_cid { 928 - u64 time; 929 - int cid; 930 - int recent_cid; 931 - }; 932 - #endif 933 - 934 925 /* 935 926 * Opaque type representing current mm_struct flag state. Must be accessed via 936 927 * mm_flags_xxx() helper functions. ··· 984 991 */ 985 992 atomic_t mm_users; 986 993 987 - #ifdef CONFIG_SCHED_MM_CID 988 - /** 989 - * @pcpu_cid: Per-cpu current cid. 990 - * 991 - * Keep track of the currently allocated mm_cid for each cpu. 992 - * The per-cpu mm_cid values are serialized by their respective 993 - * runqueue locks. 994 - */ 995 - struct mm_cid __percpu *pcpu_cid; 996 - /* 997 - * @mm_cid_next_scan: Next mm_cid scan (in jiffies). 998 - * 999 - * When the next mm_cid scan is due (in jiffies). 1000 - */ 1001 - unsigned long mm_cid_next_scan; 1002 - /** 1003 - * @nr_cpus_allowed: Number of CPUs allowed for mm. 1004 - * 1005 - * Number of CPUs allowed in the union of all mm's 1006 - * threads allowed CPUs. 1007 - */ 1008 - unsigned int nr_cpus_allowed; 1009 - /** 1010 - * @max_nr_cid: Maximum number of allowed concurrency 1011 - * IDs allocated. 1012 - * 1013 - * Track the highest number of allowed concurrency IDs 1014 - * allocated for the mm. 1015 - */ 1016 - atomic_t max_nr_cid; 1017 - /** 1018 - * @cpus_allowed_lock: Lock protecting mm cpus_allowed. 1019 - * 1020 - * Provide mutual exclusion for mm cpus_allowed and 1021 - * mm nr_cpus_allowed updates. 
1022 - */ 1023 - raw_spinlock_t cpus_allowed_lock; 1024 - #endif 994 + /* MM CID related storage */ 995 + struct mm_mm_cid mm_cid; 996 + 1025 997 #ifdef CONFIG_MMU 1026 998 atomic_long_t pgtables_bytes; /* size of all page tables */ 1027 999 #endif ··· 1328 1370 } 1329 1371 1330 1372 #ifdef CONFIG_SCHED_MM_CID 1331 - 1332 - enum mm_cid_state { 1333 - MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */ 1334 - MM_CID_LAZY_PUT = (1U << 31), 1335 - }; 1336 - 1337 - static inline bool mm_cid_is_unset(int cid) 1338 - { 1339 - return cid == MM_CID_UNSET; 1340 - } 1341 - 1342 - static inline bool mm_cid_is_lazy_put(int cid) 1343 - { 1344 - return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT); 1345 - } 1346 - 1347 - static inline bool mm_cid_is_valid(int cid) 1348 - { 1349 - return !(cid & MM_CID_LAZY_PUT); 1350 - } 1351 - 1352 - static inline int mm_cid_set_lazy_put(int cid) 1353 - { 1354 - return cid | MM_CID_LAZY_PUT; 1355 - } 1356 - 1357 - static inline int mm_cid_clear_lazy_put(int cid) 1358 - { 1359 - return cid & ~MM_CID_LAZY_PUT; 1360 - } 1361 - 1362 1373 /* 1363 1374 * mm_cpus_allowed: Union of all mm's threads allowed CPUs. 1364 1375 */ ··· 1342 1415 } 1343 1416 1344 1417 /* Accessor for struct mm_struct's cidmask. 
*/ 1345 - static inline cpumask_t *mm_cidmask(struct mm_struct *mm) 1418 + static inline unsigned long *mm_cidmask(struct mm_struct *mm) 1346 1419 { 1347 1420 unsigned long cid_bitmap = (unsigned long)mm_cpus_allowed(mm); 1348 1421 1349 1422 /* Skip mm_cpus_allowed */ 1350 1423 cid_bitmap += cpumask_size(); 1351 - return (struct cpumask *)cid_bitmap; 1424 + return (unsigned long *)cid_bitmap; 1352 1425 } 1353 1426 1354 - static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 1355 - { 1356 - int i; 1357 - 1358 - for_each_possible_cpu(i) { 1359 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i); 1360 - 1361 - pcpu_cid->cid = MM_CID_UNSET; 1362 - pcpu_cid->recent_cid = MM_CID_UNSET; 1363 - pcpu_cid->time = 0; 1364 - } 1365 - mm->nr_cpus_allowed = p->nr_cpus_allowed; 1366 - atomic_set(&mm->max_nr_cid, 0); 1367 - raw_spin_lock_init(&mm->cpus_allowed_lock); 1368 - cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 1369 - cpumask_clear(mm_cidmask(mm)); 1370 - } 1427 + void mm_init_cid(struct mm_struct *mm, struct task_struct *p); 1371 1428 1372 1429 static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p) 1373 1430 { 1374 - mm->pcpu_cid = alloc_percpu_noprof(struct mm_cid); 1375 - if (!mm->pcpu_cid) 1431 + mm->mm_cid.pcpu = alloc_percpu_noprof(struct mm_cid_pcpu); 1432 + if (!mm->mm_cid.pcpu) 1376 1433 return -ENOMEM; 1377 1434 mm_init_cid(mm, p); 1378 1435 return 0; ··· 1365 1454 1366 1455 static inline void mm_destroy_cid(struct mm_struct *mm) 1367 1456 { 1368 - free_percpu(mm->pcpu_cid); 1369 - mm->pcpu_cid = NULL; 1457 + free_percpu(mm->mm_cid.pcpu); 1458 + mm->mm_cid.pcpu = NULL; 1370 1459 } 1371 1460 1372 1461 static inline unsigned int mm_cid_size(void) 1373 1462 { 1374 - return 2 * cpumask_size(); /* mm_cpus_allowed(), mm_cidmask(). */ 1463 + /* mm_cpus_allowed(), mm_cidmask(). 
*/ 1464 + return cpumask_size() + bitmap_size(num_possible_cpus()); 1375 1465 } 1376 1466 1377 - static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) 1378 - { 1379 - struct cpumask *mm_allowed = mm_cpus_allowed(mm); 1380 - 1381 - if (!mm) 1382 - return; 1383 - /* The mm_cpus_allowed is the union of each thread allowed CPUs masks. */ 1384 - raw_spin_lock(&mm->cpus_allowed_lock); 1385 - cpumask_or(mm_allowed, mm_allowed, cpumask); 1386 - WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed)); 1387 - raw_spin_unlock(&mm->cpus_allowed_lock); 1388 - } 1389 1467 #else /* CONFIG_SCHED_MM_CID */ 1390 1468 static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { } 1391 1469 static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; } 1392 1470 static inline void mm_destroy_cid(struct mm_struct *mm) { } 1393 - 1394 1471 static inline unsigned int mm_cid_size(void) 1395 1472 { 1396 1473 return 0; 1397 1474 } 1398 - static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } 1399 1475 #endif /* CONFIG_SCHED_MM_CID */ 1400 1476 1401 1477 struct mmu_gather;
+1 -1
include/linux/resume_user_mode.h
··· 59 59 mem_cgroup_handle_over_high(GFP_KERNEL); 60 60 blkcg_maybe_throttle_current(); 61 61 62 - rseq_handle_notify_resume(NULL, regs); 62 + rseq_handle_slowpath(regs); 63 63 } 64 64 65 65 #endif /* LINUX_RESUME_USER_MODE_H */
+127 -97
include/linux/rseq.h
··· 3 3 #define _LINUX_RSEQ_H 4 4 5 5 #ifdef CONFIG_RSEQ 6 - 7 - #include <linux/preempt.h> 8 6 #include <linux/sched.h> 9 7 10 - #ifdef CONFIG_MEMBARRIER 11 - # define RSEQ_EVENT_GUARD irq 12 - #else 13 - # define RSEQ_EVENT_GUARD preempt 14 - #endif 8 + #include <uapi/linux/rseq.h> 9 + 10 + void __rseq_handle_slowpath(struct pt_regs *regs); 11 + 12 + /* Invoked from resume_user_mode_work() */ 13 + static inline void rseq_handle_slowpath(struct pt_regs *regs) 14 + { 15 + if (IS_ENABLED(CONFIG_GENERIC_ENTRY)) { 16 + if (current->rseq.event.slowpath) 17 + __rseq_handle_slowpath(regs); 18 + } else { 19 + /* '&' is intentional to spare one conditional branch */ 20 + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) 21 + __rseq_handle_slowpath(regs); 22 + } 23 + } 24 + 25 + void __rseq_signal_deliver(int sig, struct pt_regs *regs); 15 26 16 27 /* 17 - * Map the event mask on the user-space ABI enum rseq_cs_flags 18 - * for direct mask checks. 28 + * Invoked from signal delivery to fixup based on the register context before 29 + * switching to the signal delivery context. 
19 30 */ 20 - enum rseq_event_mask_bits { 21 - RSEQ_EVENT_PREEMPT_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT_BIT, 22 - RSEQ_EVENT_SIGNAL_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL_BIT, 23 - RSEQ_EVENT_MIGRATE_BIT = RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE_BIT, 24 - }; 25 - 26 - enum rseq_event_mask { 27 - RSEQ_EVENT_PREEMPT = (1U << RSEQ_EVENT_PREEMPT_BIT), 28 - RSEQ_EVENT_SIGNAL = (1U << RSEQ_EVENT_SIGNAL_BIT), 29 - RSEQ_EVENT_MIGRATE = (1U << RSEQ_EVENT_MIGRATE_BIT), 30 - }; 31 - 32 - static inline void rseq_set_notify_resume(struct task_struct *t) 31 + static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) 33 32 { 34 - if (t->rseq) 35 - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 33 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 34 + /* '&' is intentional to spare one conditional branch */ 35 + if (current->rseq.event.has_rseq & current->rseq.event.user_irq) 36 + __rseq_signal_deliver(ksig->sig, regs); 37 + } else { 38 + if (current->rseq.event.has_rseq) 39 + __rseq_signal_deliver(ksig->sig, regs); 40 + } 36 41 } 37 42 38 - void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 39 - 40 - static inline void rseq_handle_notify_resume(struct ksignal *ksig, 41 - struct pt_regs *regs) 43 + static inline void rseq_raise_notify_resume(struct task_struct *t) 42 44 { 43 - if (current->rseq) 44 - __rseq_handle_notify_resume(ksig, regs); 45 + set_tsk_thread_flag(t, TIF_RSEQ); 45 46 } 46 47 47 - static inline void rseq_signal_deliver(struct ksignal *ksig, 48 - struct pt_regs *regs) 48 + /* Invoked from context switch to force evaluation on exit to user */ 49 + static __always_inline void rseq_sched_switch_event(struct task_struct *t) 49 50 { 50 - scoped_guard(RSEQ_EVENT_GUARD) 51 - __set_bit(RSEQ_EVENT_SIGNAL_BIT, &current->rseq_event_mask); 52 - rseq_handle_notify_resume(ksig, regs); 51 + struct rseq_event *ev = &t->rseq.event; 52 + 53 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 54 + /* 55 + * Avoid a boat load of conditionals by 
using simple logic 56 + * to determine whether NOTIFY_RESUME needs to be raised. 57 + * 58 + * It's required when the CPU or MM CID has changed or 59 + * the entry was from user space. 60 + */ 61 + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; 62 + 63 + if (raise) { 64 + ev->sched_switch = true; 65 + rseq_raise_notify_resume(t); 66 + } 67 + } else { 68 + if (ev->has_rseq) { 69 + t->rseq.event.sched_switch = true; 70 + rseq_raise_notify_resume(t); 71 + } 72 + } 53 73 } 54 74 55 - /* rseq_preempt() requires preemption to be disabled. */ 56 - static inline void rseq_preempt(struct task_struct *t) 75 + /* 76 + * Invoked from __set_task_cpu() when a task migrates or from 77 + * mm_cid_schedin() when the CID changes to enforce an IDs update. 78 + * 79 + * This does not raise TIF_NOTIFY_RESUME as that happens in 80 + * rseq_sched_switch_event(). 81 + */ 82 + static __always_inline void rseq_sched_set_ids_changed(struct task_struct *t) 57 83 { 58 - __set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask); 59 - rseq_set_notify_resume(t); 84 + t->rseq.event.ids_changed = true; 60 85 } 61 86 62 - /* rseq_migrate() requires preemption to be disabled. */ 63 - static inline void rseq_migrate(struct task_struct *t) 87 + /* Enforce a full update after RSEQ registration and when execve() failed */ 88 + static inline void rseq_force_update(void) 64 89 { 65 - __set_bit(RSEQ_EVENT_MIGRATE_BIT, &t->rseq_event_mask); 66 - rseq_set_notify_resume(t); 90 + if (current->rseq.event.has_rseq) { 91 + current->rseq.event.ids_changed = true; 92 + current->rseq.event.sched_switch = true; 93 + rseq_raise_notify_resume(current); 94 + } 95 + } 96 + 97 + /* 98 + * KVM/HYPERV invoke resume_user_mode_work() before entering guest mode, 99 + * which clears TIF_NOTIFY_RESUME on architectures that don't use the 100 + * generic TIF bits and therefore can't provide a separate TIF_RSEQ flag. 
101 + * 102 + * To avoid updating user space RSEQ in that case just to do it eventually 103 + * again before returning to user space, __rseq_handle_slowpath() 104 + * does nothing when invoked with NULL register state. 105 + * 106 + * After returning from guest mode, before exiting to userspace, hypervisors 107 + * must invoke this function to re-raise TIF_NOTIFY_RESUME if necessary. 108 + */ 109 + static inline void rseq_virt_userspace_exit(void) 110 + { 111 + /* 112 + * The generic optimization for deferring RSEQ updates until the next 113 + * exit relies on having a dedicated TIF_RSEQ. 114 + */ 115 + if (!IS_ENABLED(CONFIG_HAVE_GENERIC_TIF_BITS) && 116 + current->rseq.event.sched_switch) 117 + rseq_raise_notify_resume(current); 118 + } 119 + 120 + static inline void rseq_reset(struct task_struct *t) 121 + { 122 + memset(&t->rseq, 0, sizeof(t->rseq)); 123 + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; 124 + } 125 + 126 + static inline void rseq_execve(struct task_struct *t) 127 + { 128 + rseq_reset(t); 67 129 } 68 130 69 131 /* 70 132 * If parent process has a registered restartable sequences area, the 71 133 * child inherits. Unregister rseq for a clone with CLONE_VM set. 134 + * 135 + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault 136 + * on the COW page on exit to user space, when the child stays on the same 137 + * CPU as the parent. That's obviously not guaranteed, but in overcommit 138 + * scenarios it is more likely and optimizes for the fork/exec case without 139 + * taking the fault. 
72 140 */ 73 141 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) 74 142 { 75 - if (clone_flags & CLONE_VM) { 76 - t->rseq = NULL; 77 - t->rseq_len = 0; 78 - t->rseq_sig = 0; 79 - t->rseq_event_mask = 0; 80 - } else { 143 + if (clone_flags & CLONE_VM) 144 + rseq_reset(t); 145 + else 81 146 t->rseq = current->rseq; 82 - t->rseq_len = current->rseq_len; 83 - t->rseq_sig = current->rseq_sig; 84 - t->rseq_event_mask = current->rseq_event_mask; 85 - } 86 147 } 87 148 88 - static inline void rseq_execve(struct task_struct *t) 89 - { 90 - t->rseq = NULL; 91 - t->rseq_len = 0; 92 - t->rseq_sig = 0; 93 - t->rseq_event_mask = 0; 94 - } 95 - 96 - #else 97 - 98 - static inline void rseq_set_notify_resume(struct task_struct *t) 99 - { 100 - } 101 - static inline void rseq_handle_notify_resume(struct ksignal *ksig, 102 - struct pt_regs *regs) 103 - { 104 - } 105 - static inline void rseq_signal_deliver(struct ksignal *ksig, 106 - struct pt_regs *regs) 107 - { 108 - } 109 - static inline void rseq_preempt(struct task_struct *t) 110 - { 111 - } 112 - static inline void rseq_migrate(struct task_struct *t) 113 - { 114 - } 115 - static inline void rseq_fork(struct task_struct *t, u64 clone_flags) 116 - { 117 - } 118 - static inline void rseq_execve(struct task_struct *t) 119 - { 120 - } 121 - 122 - #endif 149 + #else /* CONFIG_RSEQ */ 150 + static inline void rseq_handle_slowpath(struct pt_regs *regs) { } 151 + static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 152 + static inline void rseq_sched_switch_event(struct task_struct *t) { } 153 + static inline void rseq_sched_set_ids_changed(struct task_struct *t) { } 154 + static inline void rseq_force_update(void) { } 155 + static inline void rseq_virt_userspace_exit(void) { } 156 + static inline void rseq_fork(struct task_struct *t, u64 clone_flags) { } 157 + static inline void rseq_execve(struct task_struct *t) { } 158 + #endif /* !CONFIG_RSEQ */ 123 159 124 160 #ifdef 
CONFIG_DEBUG_RSEQ 125 - 126 161 void rseq_syscall(struct pt_regs *regs); 127 - 128 - #else 129 - 130 - static inline void rseq_syscall(struct pt_regs *regs) 131 - { 132 - } 133 - 134 - #endif 162 + #else /* CONFIG_DEBUG_RSEQ */ 163 + static inline void rseq_syscall(struct pt_regs *regs) { } 164 + #endif /* !CONFIG_DEBUG_RSEQ */ 135 165 136 166 #endif /* _LINUX_RSEQ_H */
+616
include/linux/rseq_entry.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_RSEQ_ENTRY_H 3 + #define _LINUX_RSEQ_ENTRY_H 4 + 5 + /* Must be outside the CONFIG_RSEQ guard to resolve the stubs */ 6 + #ifdef CONFIG_RSEQ_STATS 7 + #include <linux/percpu.h> 8 + 9 + struct rseq_stats { 10 + unsigned long exit; 11 + unsigned long signal; 12 + unsigned long slowpath; 13 + unsigned long fastpath; 14 + unsigned long ids; 15 + unsigned long cs; 16 + unsigned long clear; 17 + unsigned long fixup; 18 + }; 19 + 20 + DECLARE_PER_CPU(struct rseq_stats, rseq_stats); 21 + 22 + /* 23 + * Slow path has interrupts and preemption enabled, but the fast path 24 + * runs with interrupts disabled so there is no point in having the 25 + * preemption checks implied in __this_cpu_inc() for every operation. 26 + */ 27 + #ifdef RSEQ_BUILD_SLOW_PATH 28 + #define rseq_stat_inc(which) this_cpu_inc((which)) 29 + #else 30 + #define rseq_stat_inc(which) raw_cpu_inc((which)) 31 + #endif 32 + 33 + #else /* CONFIG_RSEQ_STATS */ 34 + #define rseq_stat_inc(x) do { } while (0) 35 + #endif /* !CONFIG_RSEQ_STATS */ 36 + 37 + #ifdef CONFIG_RSEQ 38 + #include <linux/jump_label.h> 39 + #include <linux/rseq.h> 40 + #include <linux/uaccess.h> 41 + 42 + #include <linux/tracepoint-defs.h> 43 + 44 + #ifdef CONFIG_TRACEPOINTS 45 + DECLARE_TRACEPOINT(rseq_update); 46 + DECLARE_TRACEPOINT(rseq_ip_fixup); 47 + void __rseq_trace_update(struct task_struct *t); 48 + void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 49 + unsigned long offset, unsigned long abort_ip); 50 + 51 + static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) 52 + { 53 + if (tracepoint_enabled(rseq_update) && ids) 54 + __rseq_trace_update(t); 55 + } 56 + 57 + static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 58 + unsigned long offset, unsigned long abort_ip) 59 + { 60 + if (tracepoint_enabled(rseq_ip_fixup)) 61 + __rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 62 + } 63 + 64 + 
#else /* CONFIG_TRACEPOINTS */ 65 + static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { } 66 + static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 67 + unsigned long offset, unsigned long abort_ip) { } 68 + #endif /* !CONFIG_TRACEPOINTS */ 69 + 70 + DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 71 + 72 + #ifdef RSEQ_BUILD_SLOW_PATH 73 + #define rseq_inline 74 + #else 75 + #define rseq_inline __always_inline 76 + #endif 77 + 78 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); 79 + bool rseq_debug_validate_ids(struct task_struct *t); 80 + 81 + static __always_inline void rseq_note_user_irq_entry(void) 82 + { 83 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) 84 + current->rseq.event.user_irq = true; 85 + } 86 + 87 + /* 88 + * Check whether there is a valid critical section and whether the 89 + * instruction pointer in @regs is inside the critical section. 90 + * 91 + * - If the critical section is invalid, terminate the task. 92 + * 93 + * - If valid and the instruction pointer is inside, set it to the abort IP. 94 + * 95 + * - If valid and the instruction pointer is outside, clear the critical 96 + * section address. 97 + * 98 + * Returns true, if the section was valid and either fixup or clear was 99 + * done, false otherwise. 100 + * 101 + * In the failure case task::rseq_event::fatal is set when an invalid 102 + * section was found. It's clear when the failure was an unresolved page 103 + * fault. 104 + * 105 + * If inlined into the exit to user path with interrupts disabled, the 106 + * caller has to protect against page faults with pagefault_disable(). 107 + * 108 + * In preemptible task context this would be counterproductive as the page 109 + * faults could not be fully resolved. As a consequence unresolved page 110 + * faults in task context are fatal too. 
111 + */ 112 + 113 + #ifdef RSEQ_BUILD_SLOW_PATH 114 + /* 115 + * The debug version is put out of line, but kept here so the code stays 116 + * together. 117 + * 118 + * @csaddr has already been checked by the caller to be in user space 119 + */ 120 + bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, 121 + unsigned long csaddr) 122 + { 123 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 124 + u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE; 125 + unsigned long ip = instruction_pointer(regs); 126 + u64 __user *uc_head = (u64 __user *) ucs; 127 + u32 usig, __user *uc_sig; 128 + 129 + scoped_user_rw_access(ucs, efault) { 130 + /* 131 + * Evaluate the user pile and exit if one of the conditions 132 + * is not fulfilled. 133 + */ 134 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 135 + if (unlikely(start_ip >= tasksize)) 136 + goto die; 137 + /* If outside, just clear the critical section. */ 138 + if (ip < start_ip) 139 + goto clear; 140 + 141 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 142 + cs_end = start_ip + offset; 143 + /* Check for overflow and wraparound */ 144 + if (unlikely(cs_end >= tasksize || cs_end < start_ip)) 145 + goto die; 146 + 147 + /* If not inside, clear it. */ 148 + if (ip >= cs_end) 149 + goto clear; 150 + 151 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 152 + /* Ensure it's "valid" */ 153 + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) 154 + goto die; 155 + /* Validate that the abort IP is not in the critical section */ 156 + if (unlikely(abort_ip - start_ip < offset)) 157 + goto die; 158 + 159 + /* 160 + * Check version and flags for 0. No point in emitting 161 + * deprecated warnings before dying. That could be done in 162 + * the slow path eventually, but *shrug*. 163 + */ 164 + unsafe_get_user(head, uc_head, efault); 165 + if (unlikely(head)) 166 + goto die; 167 + 168 + /* abort_ip - 4 is >= 0. 
See abort_ip check above */ 169 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 170 + unsafe_get_user(usig, uc_sig, efault); 171 + if (unlikely(usig != t->rseq.sig)) 172 + goto die; 173 + 174 + /* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */ 175 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 176 + /* If not in interrupt from user context, let it die */ 177 + if (unlikely(!t->rseq.event.user_irq)) 178 + goto die; 179 + } 180 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 181 + instruction_pointer_set(regs, (unsigned long)abort_ip); 182 + rseq_stat_inc(rseq_stats.fixup); 183 + break; 184 + clear: 185 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 186 + rseq_stat_inc(rseq_stats.clear); 187 + abort_ip = 0ULL; 188 + } 189 + 190 + if (unlikely(abort_ip)) 191 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 192 + return true; 193 + die: 194 + t->rseq.event.fatal = true; 195 + efault: 196 + return false; 197 + } 198 + 199 + /* 200 + * On debug kernels validate that user space did not mess with it if the 201 + * debug branch is enabled. 202 + */ 203 + bool rseq_debug_validate_ids(struct task_struct *t) 204 + { 205 + struct rseq __user *rseq = t->rseq.usrptr; 206 + u32 cpu_id, uval, node_id; 207 + 208 + /* 209 + * On the first exit after registering the rseq region CPU ID is 210 + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! 211 + */ 212 + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 
213 + cpu_to_node(t->rseq.ids.cpu_id) : 0; 214 + 215 + scoped_user_read_access(rseq, efault) { 216 + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); 217 + if (cpu_id != t->rseq.ids.cpu_id) 218 + goto die; 219 + unsafe_get_user(uval, &rseq->cpu_id, efault); 220 + if (uval != cpu_id) 221 + goto die; 222 + unsafe_get_user(uval, &rseq->node_id, efault); 223 + if (uval != node_id) 224 + goto die; 225 + unsafe_get_user(uval, &rseq->mm_cid, efault); 226 + if (uval != t->rseq.ids.mm_cid) 227 + goto die; 228 + } 229 + return true; 230 + die: 231 + t->rseq.event.fatal = true; 232 + efault: 233 + return false; 234 + } 235 + 236 + #endif /* RSEQ_BUILD_SLOW_PATH */ 237 + 238 + /* 239 + * This only ensures that abort_ip is in the user address space and 240 + * validates that it is preceded by the signature. 241 + * 242 + * No other sanity checks are done here, that's what the debug code is for. 243 + */ 244 + static rseq_inline bool 245 + rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr) 246 + { 247 + struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr; 248 + unsigned long ip = instruction_pointer(regs); 249 + unsigned long tasksize = TASK_SIZE; 250 + u64 start_ip, abort_ip, offset; 251 + u32 usig, __user *uc_sig; 252 + 253 + rseq_stat_inc(rseq_stats.cs); 254 + 255 + if (unlikely(csaddr >= tasksize)) { 256 + t->rseq.event.fatal = true; 257 + return false; 258 + } 259 + 260 + if (static_branch_unlikely(&rseq_debug_enabled)) 261 + return rseq_debug_update_user_cs(t, regs, csaddr); 262 + 263 + scoped_user_rw_access(ucs, efault) { 264 + unsafe_get_user(start_ip, &ucs->start_ip, efault); 265 + unsafe_get_user(offset, &ucs->post_commit_offset, efault); 266 + unsafe_get_user(abort_ip, &ucs->abort_ip, efault); 267 + 268 + /* 269 + * No sanity checks. If user space screwed it up, it can 270 + * keep the pieces. That's what debug code is for. 271 + * 272 + * If outside, just clear the critical section. 
273 + */ 274 + if (ip - start_ip >= offset) 275 + goto clear; 276 + 277 + /* 278 + * Two requirements for @abort_ip: 279 + * - Must be in user space as x86 IRET would happily return to 280 + * the kernel. 281 + * - The four bytes preceding the instruction at @abort_ip must 282 + * contain the signature. 283 + * 284 + * The latter protects against the following attack vector: 285 + * 286 + * An attacker with limited abilities to write, creates a critical 287 + * section descriptor, sets the abort IP to a library function or 288 + * some other ROP gadget and stores the address of the descriptor 289 + * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP 290 + * protection. 291 + */ 292 + if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig))) 293 + goto die; 294 + 295 + /* The address is guaranteed to be >= 0 and < TASK_SIZE */ 296 + uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig)); 297 + unsafe_get_user(usig, uc_sig, efault); 298 + if (unlikely(usig != t->rseq.sig)) 299 + goto die; 300 + 301 + /* Invalidate the critical section */ 302 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 303 + /* Update the instruction pointer */ 304 + instruction_pointer_set(regs, (unsigned long)abort_ip); 305 + rseq_stat_inc(rseq_stats.fixup); 306 + break; 307 + clear: 308 + unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault); 309 + rseq_stat_inc(rseq_stats.clear); 310 + abort_ip = 0ULL; 311 + } 312 + 313 + if (unlikely(abort_ip)) 314 + rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip); 315 + return true; 316 + die: 317 + t->rseq.event.fatal = true; 318 + efault: 319 + return false; 320 + } 321 + 322 + /* 323 + * Updates CPU ID, Node ID and MM CID and reads the critical section 324 + * address, when @csaddr != NULL. This allows to put the ID update and the 325 + * read under the same uaccess region to spare a separate begin/end. 
326 + * 327 + * As this is either invoked from a C wrapper with @csaddr = NULL or from 328 + * the fast path code with a valid pointer, a clever compiler should be 329 + * able to optimize the read out. Spares a duplicate implementation. 330 + * 331 + * Returns true, if the operation was successful, false otherwise. 332 + * 333 + * In the failure case task::rseq_event::fatal is set when invalid data 334 + * was found on debug kernels. It's clear when the failure was an unresolved page 335 + * fault. 336 + * 337 + * If inlined into the exit to user path with interrupts disabled, the 338 + * caller has to protect against page faults with pagefault_disable(). 339 + * 340 + * In preemptible task context this would be counterproductive as the page 341 + * faults could not be fully resolved. As a consequence unresolved page 342 + * faults in task context are fatal too. 343 + */ 344 + static rseq_inline 345 + bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, 346 + u32 node_id, u64 *csaddr) 347 + { 348 + struct rseq __user *rseq = t->rseq.usrptr; 349 + 350 + if (static_branch_unlikely(&rseq_debug_enabled)) { 351 + if (!rseq_debug_validate_ids(t)) 352 + return false; 353 + } 354 + 355 + scoped_user_rw_access(rseq, efault) { 356 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); 357 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); 358 + unsafe_put_user(node_id, &rseq->node_id, efault); 359 + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); 360 + if (csaddr) 361 + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); 362 + } 363 + 364 + /* Cache the new values */ 365 + t->rseq.ids.cpu_cid = ids->cpu_cid; 366 + rseq_stat_inc(rseq_stats.ids); 367 + rseq_trace_update(t, ids); 368 + return true; 369 + efault: 370 + return false; 371 + } 372 + 373 + /* 374 + * Update user space with new IDs and conditionally check whether the task 375 + * is in a critical section. 
376 + */ 377 + static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, 378 + struct rseq_ids *ids, u32 node_id) 379 + { 380 + u64 csaddr; 381 + 382 + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) 383 + return false; 384 + 385 + /* 386 + * On architectures which utilize the generic entry code this 387 + * allows to skip the critical section when the entry was not from 388 + * a user space interrupt, unless debug mode is enabled. 389 + */ 390 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 391 + if (!static_branch_unlikely(&rseq_debug_enabled)) { 392 + if (likely(!t->rseq.event.user_irq)) 393 + return true; 394 + } 395 + } 396 + if (likely(!csaddr)) 397 + return true; 398 + /* Sigh, this really needs to do work */ 399 + return rseq_update_user_cs(t, regs, csaddr); 400 + } 401 + 402 + /* 403 + * If you want to use this then convert your architecture to the generic 404 + * entry code. I'm tired of building workarounds for people who can't be 405 + * bothered to make the maintenance of generic infrastructure less 406 + * burdensome. Just sucking everything into the architecture code and 407 + * thereby making others chase the horrible hacks and keep them working is 408 + * neither acceptable nor sustainable. 
409 + */ 410 + #ifdef CONFIG_GENERIC_ENTRY 411 + 412 + /* 413 + * This is inlined into the exit path because: 414 + * 415 + * 1) It's a one time comparison in the fast path when there is no event to 416 + * handle 417 + * 418 + * 2) The access to the user space rseq memory (TLS) is unlikely to fault 419 + * so the straight inline operation is: 420 + * 421 + * - Four 32-bit stores only if CPU ID/MM CID need to be updated 422 + * - One 64-bit load to retrieve the critical section address 423 + * 424 + * 3) In the unlikely case that the critical section address is != NULL: 425 + * 426 + * - One 64-bit load to retrieve the start IP 427 + * - One 64-bit load to retrieve the offset for calculating the end 428 + * - One 64-bit load to retrieve the abort IP 429 + * - One 32-bit load to retrieve the signature 430 + * - One store to clear the critical section address 431 + * 432 + * The non-debug case implements only the minimal required checking. It 433 + * provides protection against a rogue abort IP in kernel space, which 434 + * would be exploitable at least on x86, and also against a rogue CS 435 + * descriptor by checking the signature at the abort IP. Any fallout from 436 + * invalid critical section descriptors is a user space problem. The debug 437 + * case provides the full set of checks and terminates the task if a 438 + * condition is not met. 439 + * 440 + * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and 441 + * tells the caller to loop back into exit_to_user_mode_loop(). The rseq 442 + * slow path there will handle the failure. 
443 + */ 444 + static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t) 445 + { 446 + /* 447 + * Page faults need to be disabled as this is called with 448 + * interrupts disabled 449 + */ 450 + guard(pagefault)(); 451 + if (likely(!t->rseq.event.ids_changed)) { 452 + struct rseq __user *rseq = t->rseq.usrptr; 453 + /* 454 + * If IDs have not changed rseq_event::user_irq must be true 455 + * See rseq_sched_switch_event(). 456 + */ 457 + u64 csaddr; 458 + 459 + if (unlikely(get_user_inline(csaddr, &rseq->rseq_cs))) 460 + return false; 461 + 462 + if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) { 463 + if (unlikely(!rseq_update_user_cs(t, regs, csaddr))) 464 + return false; 465 + } 466 + return true; 467 + } 468 + 469 + struct rseq_ids ids = { 470 + .cpu_id = task_cpu(t), 471 + .mm_cid = task_mm_cid(t), 472 + }; 473 + u32 node_id = cpu_to_node(ids.cpu_id); 474 + 475 + return rseq_update_usr(t, regs, &ids, node_id); 476 + } 477 + 478 + static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs) 479 + { 480 + struct task_struct *t = current; 481 + 482 + /* 483 + * If the task did not go through schedule or got the flag enforced 484 + * by the rseq syscall or execve, then nothing to do here. 485 + * 486 + * CPU ID and MM CID can only change when going through a context 487 + * switch. 488 + * 489 + * rseq_sched_switch_event() sets the rseq_event::sched_switch bit 490 + * only when rseq_event::has_rseq is true. That conditional is 491 + * required to avoid setting the TIF bit if RSEQ is not registered 492 + * for a task. rseq_event::sched_switch is cleared when RSEQ is 493 + * unregistered by a task so it's sufficient to check for the 494 + * sched_switch bit alone. 495 + * 496 + * A sane compiler requires three instructions for the nothing to do 497 + * case including clearing the events, but your mileage might vary. 
498 + */ 499 + if (unlikely((t->rseq.event.sched_switch))) { 500 + rseq_stat_inc(rseq_stats.fastpath); 501 + 502 + if (unlikely(!rseq_exit_user_update(regs, t))) 503 + return true; 504 + } 505 + /* Clear state so next entry starts from a clean slate */ 506 + t->rseq.event.events = 0; 507 + return false; 508 + } 509 + 510 + /* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */ 511 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 512 + static __always_inline bool test_tif_rseq(unsigned long ti_work) 513 + { 514 + return ti_work & _TIF_RSEQ; 515 + } 516 + 517 + static __always_inline void clear_tif_rseq(void) 518 + { 519 + static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME); 520 + clear_thread_flag(TIF_RSEQ); 521 + } 522 + #else 523 + static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; } 524 + static __always_inline void clear_tif_rseq(void) { } 525 + #endif 526 + 527 + static __always_inline bool 528 + rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 529 + { 530 + if (likely(!test_tif_rseq(ti_work))) 531 + return false; 532 + 533 + if (unlikely(__rseq_exit_to_user_mode_restart(regs))) { 534 + current->rseq.event.slowpath = true; 535 + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 536 + return true; 537 + } 538 + 539 + clear_tif_rseq(); 540 + return false; 541 + } 542 + 543 + #else /* CONFIG_GENERIC_ENTRY */ 544 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 545 + { 546 + return false; 547 + } 548 + #endif /* !CONFIG_GENERIC_ENTRY */ 549 + 550 + static __always_inline void rseq_syscall_exit_to_user_mode(void) 551 + { 552 + struct rseq_event *ev = &current->rseq.event; 553 + 554 + rseq_stat_inc(rseq_stats.exit); 555 + 556 + /* Needed to remove the store for the !lockdep case */ 557 + if (IS_ENABLED(CONFIG_LOCKDEP)) { 558 + WARN_ON_ONCE(ev->sched_switch); 559 + ev->events = 0; 560 + } 561 + } 562 + 563 + static __always_inline void 
rseq_irqentry_exit_to_user_mode(void) 564 + { 565 + struct rseq_event *ev = &current->rseq.event; 566 + 567 + rseq_stat_inc(rseq_stats.exit); 568 + 569 + lockdep_assert_once(!ev->sched_switch); 570 + 571 + /* 572 + * Ensure that event (especially user_irq) is cleared when the 573 + * interrupt did not result in a schedule and therefore the 574 + * rseq processing could not clear it. 575 + */ 576 + ev->events = 0; 577 + } 578 + 579 + /* Required to keep ARM64 working */ 580 + static __always_inline void rseq_exit_to_user_mode_legacy(void) 581 + { 582 + struct rseq_event *ev = &current->rseq.event; 583 + 584 + rseq_stat_inc(rseq_stats.exit); 585 + 586 + if (static_branch_unlikely(&rseq_debug_enabled)) 587 + WARN_ON_ONCE(ev->sched_switch); 588 + 589 + /* 590 + * Ensure that event (especially user_irq) is cleared when the 591 + * interrupt did not result in a schedule and therefore the 592 + * rseq processing did not clear it. 593 + */ 594 + ev->events = 0; 595 + } 596 + 597 + void __rseq_debug_syscall_return(struct pt_regs *regs); 598 + 599 + static inline void rseq_debug_syscall_return(struct pt_regs *regs) 600 + { 601 + if (static_branch_unlikely(&rseq_debug_enabled)) 602 + __rseq_debug_syscall_return(regs); 603 + } 604 + #else /* CONFIG_RSEQ */ 605 + static inline void rseq_note_user_irq_entry(void) { } 606 + static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work) 607 + { 608 + return false; 609 + } 610 + static inline void rseq_syscall_exit_to_user_mode(void) { } 611 + static inline void rseq_irqentry_exit_to_user_mode(void) { } 612 + static inline void rseq_exit_to_user_mode_legacy(void) { } 613 + static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } 614 + #endif /* !CONFIG_RSEQ */ 615 + 616 + #endif /* _LINUX_RSEQ_ENTRY_H */
+164
include/linux/rseq_types.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_RSEQ_TYPES_H 3 + #define _LINUX_RSEQ_TYPES_H 4 + 5 + #include <linux/irq_work_types.h> 6 + #include <linux/types.h> 7 + #include <linux/workqueue_types.h> 8 + 9 + #ifdef CONFIG_RSEQ 10 + struct rseq; 11 + 12 + /** 13 + * struct rseq_event - Storage for rseq related event management 14 + * @all: Compound to initialize and clear the data efficiently 15 + * @events: Compound to access events with a single load/store 16 + * @sched_switch: True if the task was scheduled and needs update on 17 + * exit to user 18 + * @ids_changed: Indicator that IDs need to be updated 19 + * @user_irq: True on interrupt entry from user mode 20 + * @has_rseq: True if the task has a rseq pointer installed 21 + * @error: Compound error code for the slow path to analyze 22 + * @fatal: User space data corrupted or invalid 23 + * @slowpath: Indicator that slow path processing via TIF_NOTIFY_RESUME 24 + * is required 25 + * 26 + * @sched_switch and @ids_changed must be adjacent and the combo must be 27 + * 16bit aligned to allow a single store, when both are set at the same 28 + * time in the scheduler. 29 + */ 30 + struct rseq_event { 31 + union { 32 + u64 all; 33 + struct { 34 + union { 35 + u32 events; 36 + struct { 37 + u8 sched_switch; 38 + u8 ids_changed; 39 + u8 user_irq; 40 + }; 41 + }; 42 + 43 + u8 has_rseq; 44 + u8 __pad; 45 + union { 46 + u16 error; 47 + struct { 48 + u8 fatal; 49 + u8 slowpath; 50 + }; 51 + }; 52 + }; 53 + }; 54 + }; 55 + 56 + /** 57 + * struct rseq_ids - Cache for ids, which need to be updated 58 + * @cpu_cid: Compound of @cpu_id and @mm_cid to make the 59 + * compiler emit a single compare on 64-bit 60 + * @cpu_id: The CPU ID which was written last to user space 61 + * @mm_cid: The MM CID which was written last to user space 62 + * 63 + * @cpu_id and @mm_cid are updated when the data is written to user space. 
64 + */ 65 + struct rseq_ids { 66 + union { 67 + u64 cpu_cid; 68 + struct { 69 + u32 cpu_id; 70 + u32 mm_cid; 71 + }; 72 + }; 73 + }; 74 + 75 + /** 76 + * struct rseq_data - Storage for all rseq related data 77 + * @usrptr: Pointer to the registered user space RSEQ memory 78 + * @len: Length of the RSEQ region 79 + * @sig: Signature of critial section abort IPs 80 + * @event: Storage for event management 81 + * @ids: Storage for cached CPU ID and MM CID 82 + */ 83 + struct rseq_data { 84 + struct rseq __user *usrptr; 85 + u32 len; 86 + u32 sig; 87 + struct rseq_event event; 88 + struct rseq_ids ids; 89 + }; 90 + 91 + #else /* CONFIG_RSEQ */ 92 + struct rseq_data { }; 93 + #endif /* !CONFIG_RSEQ */ 94 + 95 + #ifdef CONFIG_SCHED_MM_CID 96 + 97 + #define MM_CID_UNSET BIT(31) 98 + #define MM_CID_ONCPU BIT(30) 99 + #define MM_CID_TRANSIT BIT(29) 100 + 101 + /** 102 + * struct sched_mm_cid - Storage for per task MM CID data 103 + * @active: MM CID is active for the task 104 + * @cid: The CID associated to the task either permanently or 105 + * borrowed from the CPU 106 + */ 107 + struct sched_mm_cid { 108 + unsigned int active; 109 + unsigned int cid; 110 + }; 111 + 112 + /** 113 + * struct mm_cid_pcpu - Storage for per CPU MM_CID data 114 + * @cid: The CID associated to the CPU either permanently or 115 + * while a task with a CID is running 116 + */ 117 + struct mm_cid_pcpu { 118 + unsigned int cid; 119 + }____cacheline_aligned_in_smp; 120 + 121 + /** 122 + * struct mm_mm_cid - Storage for per MM CID data 123 + * @pcpu: Per CPU storage for CIDs associated to a CPU 124 + * @percpu: Set, when CIDs are in per CPU mode 125 + * @transit: Set to MM_CID_TRANSIT during a mode change transition phase 126 + * @max_cids: The exclusive maximum CID value for allocation and convergence 127 + * @irq_work: irq_work to handle the affinity mode change case 128 + * @work: Regular work to handle the affinity mode change case 129 + * @lock: Spinlock to protect against affinity setting 
which can't take @mutex 130 + * @mutex: Mutex to serialize forks and exits related to this mm 131 + * @nr_cpus_allowed: The number of CPUs in the per MM allowed CPUs map. The map 132 + * is growth only. 133 + * @users: The number of tasks sharing this MM. Separate from mm::mm_users 134 + * as that is modified by mmget()/mmput() by other entities which 135 + * do not actually share the MM. 136 + * @pcpu_thrs: Threshold for switching back from per CPU mode 137 + * @update_deferred: A deferred switch back to per task mode is pending. 138 + */ 139 + struct mm_mm_cid { 140 + /* Hotpath read mostly members */ 141 + struct mm_cid_pcpu __percpu *pcpu; 142 + unsigned int percpu; 143 + unsigned int transit; 144 + unsigned int max_cids; 145 + 146 + /* Rarely used. Moves @lock and @mutex into the second cacheline */ 147 + struct irq_work irq_work; 148 + struct work_struct work; 149 + 150 + raw_spinlock_t lock; 151 + struct mutex mutex; 152 + 153 + /* Low frequency modified */ 154 + unsigned int nr_cpus_allowed; 155 + unsigned int users; 156 + unsigned int pcpu_thrs; 157 + unsigned int update_deferred; 158 + }____cacheline_aligned_in_smp; 159 + #else /* CONFIG_SCHED_MM_CID */ 160 + struct mm_mm_cid { }; 161 + struct sched_mm_cid { }; 162 + #endif /* !CONFIG_SCHED_MM_CID */ 163 + 164 + #endif
+29 -28
include/linux/sched.h
··· 41 41 #include <linux/task_io_accounting.h> 42 42 #include <linux/posix-timers_types.h> 43 43 #include <linux/restart_block.h> 44 - #include <uapi/linux/rseq.h> 44 + #include <linux/rseq_types.h> 45 45 #include <linux/seqlock_types.h> 46 46 #include <linux/kcsan.h> 47 47 #include <linux/rv.h> ··· 1406 1406 unsigned long numa_pages_migrated; 1407 1407 #endif /* CONFIG_NUMA_BALANCING */ 1408 1408 1409 - #ifdef CONFIG_RSEQ 1410 - struct rseq __user *rseq; 1411 - u32 rseq_len; 1412 - u32 rseq_sig; 1413 - /* 1414 - * RmW on rseq_event_mask must be performed atomically 1415 - * with respect to preemption. 1416 - */ 1417 - unsigned long rseq_event_mask; 1418 - # ifdef CONFIG_DEBUG_RSEQ 1419 - /* 1420 - * This is a place holder to save a copy of the rseq fields for 1421 - * validation of read-only fields. The struct rseq has a 1422 - * variable-length array at the end, so it cannot be used 1423 - * directly. Reserve a size large enough for the known fields. 1424 - */ 1425 - char rseq_fields[sizeof(struct rseq)]; 1426 - # endif 1427 - #endif 1428 - 1429 - #ifdef CONFIG_SCHED_MM_CID 1430 - int mm_cid; /* Current cid in mm */ 1431 - int last_mm_cid; /* Most recent cid in mm */ 1432 - int migrate_from_cpu; 1433 - int mm_cid_active; /* Whether cid bitmap is active */ 1434 - struct callback_head cid_work; 1435 - #endif 1409 + struct rseq_data rseq; 1410 + struct sched_mm_cid mm_cid; 1436 1411 1437 1412 struct tlbflush_unmap_batch tlb_ubc; 1438 1413 ··· 2298 2323 #else 2299 2324 #define alloc_tag_save(_tag) NULL 2300 2325 #define alloc_tag_restore(_tag, _old) do {} while (0) 2326 + #endif 2327 + 2328 + /* Avoids recursive inclusion hell */ 2329 + #ifdef CONFIG_SCHED_MM_CID 2330 + void sched_mm_cid_before_execve(struct task_struct *t); 2331 + void sched_mm_cid_after_execve(struct task_struct *t); 2332 + void sched_mm_cid_fork(struct task_struct *t); 2333 + void sched_mm_cid_exit(struct task_struct *t); 2334 + static __always_inline int task_mm_cid(struct task_struct *t) 2335 + 
{ 2336 + return t->mm_cid.cid & ~(MM_CID_ONCPU | MM_CID_TRANSIT); 2337 + } 2338 + #else 2339 + static inline void sched_mm_cid_before_execve(struct task_struct *t) { } 2340 + static inline void sched_mm_cid_after_execve(struct task_struct *t) { } 2341 + static inline void sched_mm_cid_fork(struct task_struct *t) { } 2342 + static inline void sched_mm_cid_exit(struct task_struct *t) { } 2343 + static __always_inline int task_mm_cid(struct task_struct *t) 2344 + { 2345 + /* 2346 + * Use the processor id as a fall-back when the mm cid feature is 2347 + * disabled. This provides functional per-cpu data structure accesses 2348 + * in user-space, although it won't provide the memory usage benefits. 2349 + */ 2350 + return task_cpu(t); 2351 + } 2301 2352 #endif 2302 2353 2303 2354 #ifndef MODULE
+5
include/linux/thread_info.h
··· 67 67 #define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED 68 68 #endif 69 69 70 + #ifndef TIF_RSEQ 71 + # define TIF_RSEQ TIF_NOTIFY_RESUME 72 + # define _TIF_RSEQ _TIF_NOTIFY_RESUME 73 + #endif 74 + 70 75 #ifdef __KERNEL__ 71 76 72 77 #ifndef arch_set_restart_data
+2 -2
include/trace/events/rseq.h
··· 21 21 ), 22 22 23 23 TP_fast_assign( 24 - __entry->cpu_id = raw_smp_processor_id(); 24 + __entry->cpu_id = t->rseq.ids.cpu_id; 25 25 __entry->node_id = cpu_to_node(__entry->cpu_id); 26 - __entry->mm_cid = task_mm_cid(t); 26 + __entry->mm_cid = t->rseq.ids.mm_cid; 27 27 ), 28 28 29 29 TP_printk("cpu_id=%d node_id=%d mm_cid=%d", __entry->cpu_id,
+7 -14
include/uapi/linux/rseq.h
··· 114 114 /* 115 115 * Restartable sequences flags field. 116 116 * 117 - * This field should only be updated by the thread which 118 - * registered this data structure. Read by the kernel. 119 - * Mainly used for single-stepping through rseq critical sections 120 - * with debuggers. 121 - * 122 - * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 123 - * Inhibit instruction sequence block restart on preemption 124 - * for this thread. 125 - * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 126 - * Inhibit instruction sequence block restart on signal 127 - * delivery for this thread. 128 - * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 129 - * Inhibit instruction sequence block restart on migration for 130 - * this thread. 117 + * This field was initially intended to allow event masking for 118 + * single-stepping through rseq critical sections with debuggers. 119 + * The kernel does not support this anymore and the relevant bits 120 + * are checked for being always false: 121 + * - RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT 122 + * - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL 123 + * - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE 131 124 */ 132 125 __u32 flags; 133 126
+27 -1
init/Kconfig
··· 1913 1913 1914 1914 If unsure, say Y. 1915 1915 1916 + config RSEQ_STATS 1917 + default n 1918 + bool "Enable lightweight statistics of restartable sequences" if EXPERT 1919 + depends on RSEQ && DEBUG_FS 1920 + help 1921 + Enable lightweight counters which expose information about the 1922 + frequency of RSEQ operations via debugfs. Mostly interesting for 1923 + kernel debugging or performance analysis. While lightweight it's 1924 + still adding code into the user/kernel mode transitions. 1925 + 1926 + If unsure, say N. 1927 + 1928 + config RSEQ_DEBUG_DEFAULT_ENABLE 1929 + default n 1930 + bool "Enable restartable sequences debug mode by default" if EXPERT 1931 + depends on RSEQ 1932 + help 1933 + This enables the static branch for debug mode of restartable 1934 + sequences. 1935 + 1936 + This also can be controlled on the kernel command line via the 1937 + command line parameter "rseq_debug=0/1" and through debugfs. 1938 + 1939 + If unsure, say N. 1940 + 1916 1941 config DEBUG_RSEQ 1917 1942 default n 1918 1943 bool "Enable debugging of rseq() system call" if EXPERT 1919 - depends on RSEQ && DEBUG_KERNEL 1944 + depends on RSEQ && DEBUG_KERNEL && !GENERIC_ENTRY 1945 + select RSEQ_DEBUG_DEFAULT_ENABLE 1920 1946 help 1921 1947 Enable extra debugging checks for the rseq system call. 1922 1948
+3
init/init_task.c
··· 250 250 #ifdef CONFIG_SECCOMP_FILTER 251 251 .seccomp = { .filter_count = ATOMIC_INIT(0) }, 252 252 #endif 253 + #ifdef CONFIG_SCHED_MM_CID 254 + .mm_cid = { .cid = MM_CID_UNSET, }, 255 + #endif 253 256 }; 254 257 EXPORT_SYMBOL(init_task); 255 258
+19
kernel/cpu.c
··· 3085 3085 #ifdef CONFIG_INIT_ALL_POSSIBLE 3086 3086 struct cpumask __cpu_possible_mask __ro_after_init 3087 3087 = {CPU_BITS_ALL}; 3088 + unsigned int __num_possible_cpus __ro_after_init = NR_CPUS; 3088 3089 #else 3089 3090 struct cpumask __cpu_possible_mask __ro_after_init; 3091 + unsigned int __num_possible_cpus __ro_after_init; 3090 3092 #endif 3091 3093 EXPORT_SYMBOL(__cpu_possible_mask); 3094 + EXPORT_SYMBOL(__num_possible_cpus); 3092 3095 3093 3096 struct cpumask __cpu_online_mask __read_mostly; 3094 3097 EXPORT_SYMBOL(__cpu_online_mask); ··· 3119 3116 void init_cpu_possible(const struct cpumask *src) 3120 3117 { 3121 3118 cpumask_copy(&__cpu_possible_mask, src); 3119 + __num_possible_cpus = cpumask_weight(&__cpu_possible_mask); 3122 3120 } 3123 3121 3124 3122 void set_cpu_online(unsigned int cpu, bool online) ··· 3140 3136 } else { 3141 3137 if (cpumask_test_and_clear_cpu(cpu, &__cpu_online_mask)) 3142 3138 atomic_dec(&__num_online_cpus); 3139 + } 3140 + } 3141 + 3142 + /* 3143 + * This should be marked __init, but there is a boatload of call sites 3144 + * which need to be fixed up to do so. Sigh... 3145 + */ 3146 + void set_cpu_possible(unsigned int cpu, bool possible) 3147 + { 3148 + if (possible) { 3149 + if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask)) 3150 + __num_possible_cpus++; 3151 + } else { 3152 + if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask)) 3153 + __num_possible_cpus--; 3143 3154 } 3144 3155 } 3145 3156
+22 -17
kernel/entry/common.c
··· 11 11 /* Workaround to allow gradual conversion of architecture code */ 12 12 void __weak arch_do_signal_or_restart(struct pt_regs *regs) { } 13 13 14 - /** 15 - * exit_to_user_mode_loop - do any pending work before leaving to user space 16 - * @regs: Pointer to pt_regs on entry stack 17 - * @ti_work: TIF work flags as read by the caller 18 - */ 19 - __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 20 - unsigned long ti_work) 14 + #ifdef CONFIG_HAVE_GENERIC_TIF_BITS 15 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ) 16 + #else 17 + #define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK) 18 + #endif 19 + 20 + static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs, 21 + unsigned long ti_work) 21 22 { 22 23 /* 23 24 * Before returning to user space ensure that all pending work 24 25 * items have been completed. 25 26 */ 26 - while (ti_work & EXIT_TO_USER_MODE_WORK) { 27 + while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) { 27 28 28 29 local_irq_enable_exit_to_user(ti_work); 29 30 ··· 63 62 return ti_work; 64 63 } 65 64 66 - noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) 65 + /** 66 + * exit_to_user_mode_loop - do any pending work before leaving to user space 67 + * @regs: Pointer to pt_regs on entry stack 68 + * @ti_work: TIF work flags as read by the caller 69 + */ 70 + __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, 71 + unsigned long ti_work) 67 72 { 68 - enter_from_user_mode(regs); 69 - } 73 + for (;;) { 74 + ti_work = __exit_to_user_mode_loop(regs, ti_work); 70 75 71 - noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) 72 - { 73 - instrumentation_begin(); 74 - exit_to_user_mode_prepare(regs); 75 - instrumentation_end(); 76 - exit_to_user_mode(); 76 + if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work))) 77 + return ti_work; 78 + ti_work = read_thread_flags(); 79 + } 77 80 } 78 81 79 82 noinstr irqentry_state_t 
irqentry_enter(struct pt_regs *regs)
-8
kernel/entry/syscall-common.c
··· 63 63 return ret ? : syscall; 64 64 } 65 65 66 - noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) 67 - { 68 - enter_from_user_mode(regs); 69 - instrumentation_begin(); 70 - local_irq_enable(); 71 - instrumentation_end(); 72 - } 73 - 74 66 /* 75 67 * If SYSCALL_EMU is set, then the only reason to report is when 76 68 * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
+1
kernel/exit.c
··· 911 911 user_events_exit(tsk); 912 912 913 913 io_uring_files_cancel(); 914 + sched_mm_cid_exit(tsk); 914 915 exit_signals(tsk); /* sets PF_EXITING */ 915 916 916 917 seccomp_filter_release(tsk);
+3 -4
kernel/fork.c
··· 955 955 #endif 956 956 957 957 #ifdef CONFIG_SCHED_MM_CID 958 - tsk->mm_cid = -1; 959 - tsk->last_mm_cid = -1; 960 - tsk->mm_cid_active = 0; 961 - tsk->migrate_from_cpu = -1; 958 + tsk->mm_cid.cid = MM_CID_UNSET; 959 + tsk->mm_cid.active = 0; 962 960 #endif 963 961 return tsk; 964 962 ··· 2454 2456 exit_nsproxy_namespaces(p); 2455 2457 bad_fork_cleanup_mm: 2456 2458 if (p->mm) { 2459 + sched_mm_cid_exit(p); 2457 2460 mm_clear_owner(p->mm, p); 2458 2461 mmput(p->mm); 2459 2462 }
+3 -3
kernel/ptrace.c
··· 793 793 unsigned long size, void __user *data) 794 794 { 795 795 struct ptrace_rseq_configuration conf = { 796 - .rseq_abi_pointer = (u64)(uintptr_t)task->rseq, 797 - .rseq_abi_size = task->rseq_len, 798 - .signature = task->rseq_sig, 796 + .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr, 797 + .rseq_abi_size = task->rseq.len, 798 + .signature = task->rseq.sig, 799 799 .flags = 0, 800 800 }; 801 801
+289 -384
kernel/rseq.c
··· 8 8 * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> 9 9 */ 10 10 11 - #include <linux/sched.h> 12 - #include <linux/uaccess.h> 13 - #include <linux/syscalls.h> 14 - #include <linux/rseq.h> 15 - #include <linux/types.h> 16 - #include <linux/ratelimit.h> 17 - #include <asm/ptrace.h> 18 - 19 - #define CREATE_TRACE_POINTS 20 - #include <trace/events/rseq.h> 21 - 22 - /* The original rseq structure size (including padding) is 32 bytes. */ 23 - #define ORIG_RSEQ_SIZE 32 24 - 25 - #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ 26 - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ 27 - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) 28 - 29 - #ifdef CONFIG_DEBUG_RSEQ 30 - static struct rseq *rseq_kernel_fields(struct task_struct *t) 31 - { 32 - return (struct rseq *) t->rseq_fields; 33 - } 34 - 35 - static int rseq_validate_ro_fields(struct task_struct *t) 36 - { 37 - static DEFINE_RATELIMIT_STATE(_rs, 38 - DEFAULT_RATELIMIT_INTERVAL, 39 - DEFAULT_RATELIMIT_BURST); 40 - u32 cpu_id_start, cpu_id, node_id, mm_cid; 41 - struct rseq __user *rseq = t->rseq; 42 - 43 - /* 44 - * Validate fields which are required to be read-only by 45 - * user-space. 
46 - */ 47 - if (!user_read_access_begin(rseq, t->rseq_len)) 48 - goto efault; 49 - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); 50 - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); 51 - unsafe_get_user(node_id, &rseq->node_id, efault_end); 52 - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); 53 - user_read_access_end(); 54 - 55 - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || 56 - cpu_id != rseq_kernel_fields(t)->cpu_id || 57 - node_id != rseq_kernel_fields(t)->node_id || 58 - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { 59 - 60 - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" 61 - "\tcpu_id_start: %u ?= %u\n" 62 - "\tcpu_id: %u ?= %u\n" 63 - "\tnode_id: %u ?= %u\n" 64 - "\tmm_cid: %u ?= %u\n", 65 - t->pid, t->comm, 66 - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, 67 - cpu_id, rseq_kernel_fields(t)->cpu_id, 68 - node_id, rseq_kernel_fields(t)->node_id, 69 - mm_cid, rseq_kernel_fields(t)->mm_cid); 70 - } 71 - 72 - /* For now, only print a console warning on mismatch. */ 73 - return 0; 74 - 75 - efault_end: 76 - user_read_access_end(); 77 - efault: 78 - return -EFAULT; 79 - } 80 - 81 11 /* 82 - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent 83 - * state. 84 - */ 85 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 86 - do { \ 87 - unsafe_put_user(value, &t->rseq->field, error_label); \ 88 - rseq_kernel_fields(t)->field = value; \ 89 - } while (0) 90 - 91 - #else 92 - static int rseq_validate_ro_fields(struct task_struct *t) 93 - { 94 - return 0; 95 - } 96 - 97 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 98 - unsafe_put_user(value, &t->rseq->field, error_label) 99 - #endif 100 - 101 - /* 102 - * 103 12 * Restartable sequences are a lightweight interface that allows 104 13 * user-level code to be executed atomically relative to scheduler 105 14 * preemption and signal delivery. Typically used for implementing ··· 67 158 * F1. 
<failure> 68 159 */ 69 160 70 - static int rseq_update_cpu_node_id(struct task_struct *t) 161 + /* Required to select the proper per_cpu ops for rseq_stats_inc() */ 162 + #define RSEQ_BUILD_SLOW_PATH 163 + 164 + #include <linux/debugfs.h> 165 + #include <linux/ratelimit.h> 166 + #include <linux/rseq_entry.h> 167 + #include <linux/sched.h> 168 + #include <linux/syscalls.h> 169 + #include <linux/uaccess.h> 170 + #include <linux/types.h> 171 + #include <asm/ptrace.h> 172 + 173 + #define CREATE_TRACE_POINTS 174 + #include <trace/events/rseq.h> 175 + 176 + DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 177 + 178 + static inline void rseq_control_debug(bool on) 71 179 { 72 - struct rseq __user *rseq = t->rseq; 73 - u32 cpu_id = raw_smp_processor_id(); 74 - u32 node_id = cpu_to_node(cpu_id); 75 - u32 mm_cid = task_mm_cid(t); 180 + if (on) 181 + static_branch_enable(&rseq_debug_enabled); 182 + else 183 + static_branch_disable(&rseq_debug_enabled); 184 + } 76 185 77 - /* 78 - * Validate read-only rseq fields. 79 - */ 80 - if (rseq_validate_ro_fields(t)) 81 - goto efault; 82 - WARN_ON_ONCE((int) mm_cid < 0); 83 - if (!user_write_access_begin(rseq, t->rseq_len)) 84 - goto efault; 186 + static int __init rseq_setup_debug(char *str) 187 + { 188 + bool on; 85 189 86 - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); 87 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 88 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 89 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 190 + if (kstrtobool(str, &on)) 191 + return -EINVAL; 192 + rseq_control_debug(on); 193 + return 1; 194 + } 195 + __setup("rseq_debug=", rseq_setup_debug); 90 196 91 - /* 92 - * Additional feature fields added after ORIG_RSEQ_SIZE 93 - * need to be conditionally updated only if 94 - * t->rseq_len != ORIG_RSEQ_SIZE. 
95 - */ 96 - user_write_access_end(); 197 + #ifdef CONFIG_TRACEPOINTS 198 + /* 199 + * Out of line, so the actual update functions can be in a header to be 200 + * inlined into the exit to user code. 201 + */ 202 + void __rseq_trace_update(struct task_struct *t) 203 + { 97 204 trace_rseq_update(t); 98 - return 0; 99 - 100 - efault_end: 101 - user_write_access_end(); 102 - efault: 103 - return -EFAULT; 104 205 } 105 206 106 - static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) 207 + void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip, 208 + unsigned long offset, unsigned long abort_ip) 107 209 { 108 - struct rseq __user *rseq = t->rseq; 109 - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, 110 - mm_cid = 0; 210 + trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip); 211 + } 212 + #endif /* CONFIG_TRACEPOINTS */ 111 213 112 - /* 113 - * Validate read-only rseq fields. 114 - */ 115 - if (rseq_validate_ro_fields(t)) 116 - goto efault; 214 + #ifdef CONFIG_DEBUG_FS 215 + #ifdef CONFIG_RSEQ_STATS 216 + DEFINE_PER_CPU(struct rseq_stats, rseq_stats); 117 217 118 - if (!user_write_access_begin(rseq, t->rseq_len)) 119 - goto efault; 218 + static int rseq_stats_show(struct seq_file *m, void *p) 219 + { 220 + struct rseq_stats stats = { }; 221 + unsigned int cpu; 120 222 121 - /* 122 - * Reset all fields to their initial state. 123 - * 124 - * All fields have an initial state of 0 except cpu_id which is set to 125 - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after 126 - * unregistration can figure out that rseq needs to be registered 127 - * again. 
128 - */ 129 - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); 130 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 131 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 132 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 223 + for_each_possible_cpu(cpu) { 224 + stats.exit += data_race(per_cpu(rseq_stats.exit, cpu)); 225 + stats.signal += data_race(per_cpu(rseq_stats.signal, cpu)); 226 + stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu)); 227 + stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu)); 228 + stats.ids += data_race(per_cpu(rseq_stats.ids, cpu)); 229 + stats.cs += data_race(per_cpu(rseq_stats.cs, cpu)); 230 + stats.clear += data_race(per_cpu(rseq_stats.clear, cpu)); 231 + stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu)); 232 + } 133 233 134 - /* 135 - * Additional feature fields added after ORIG_RSEQ_SIZE 136 - * need to be conditionally reset only if 137 - * t->rseq_len != ORIG_RSEQ_SIZE. 138 - */ 139 - user_write_access_end(); 234 + seq_printf(m, "exit: %16lu\n", stats.exit); 235 + seq_printf(m, "signal: %16lu\n", stats.signal); 236 + seq_printf(m, "slowp: %16lu\n", stats.slowpath); 237 + seq_printf(m, "fastp: %16lu\n", stats.fastpath); 238 + seq_printf(m, "ids: %16lu\n", stats.ids); 239 + seq_printf(m, "cs: %16lu\n", stats.cs); 240 + seq_printf(m, "clear: %16lu\n", stats.clear); 241 + seq_printf(m, "fixup: %16lu\n", stats.fixup); 140 242 return 0; 141 - 142 - efault_end: 143 - user_write_access_end(); 144 - efault: 145 - return -EFAULT; 146 243 } 147 244 148 - /* 149 - * Get the user-space pointer value stored in the 'rseq_cs' field. 
150 - */ 151 - static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs) 245 + static int rseq_stats_open(struct inode *inode, struct file *file) 152 246 { 153 - if (!rseq_cs) 154 - return -EFAULT; 247 + return single_open(file, rseq_stats_show, inode->i_private); 248 + } 155 249 156 - #ifdef CONFIG_64BIT 157 - if (get_user(*rseq_cs, &rseq->rseq_cs)) 158 - return -EFAULT; 250 + static const struct file_operations stat_ops = { 251 + .open = rseq_stats_open, 252 + .read = seq_read, 253 + .llseek = seq_lseek, 254 + .release = single_release, 255 + }; 256 + 257 + static int __init rseq_stats_init(struct dentry *root_dir) 258 + { 259 + debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops); 260 + return 0; 261 + } 159 262 #else 160 - if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs))) 161 - return -EFAULT; 162 - #endif 263 + static inline void rseq_stats_init(struct dentry *root_dir) { } 264 + #endif /* CONFIG_RSEQ_STATS */ 163 265 266 + static int rseq_debug_show(struct seq_file *m, void *p) 267 + { 268 + bool on = static_branch_unlikely(&rseq_debug_enabled); 269 + 270 + seq_printf(m, "%d\n", on); 164 271 return 0; 165 272 } 166 273 167 - /* 168 - * If the rseq_cs field of 'struct rseq' contains a valid pointer to 169 - * user-space, copy 'struct rseq_cs' from user-space and validate its fields. 170 - */ 171 - static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) 274 + static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf, 275 + size_t count, loff_t *ppos) 172 276 { 173 - struct rseq_cs __user *urseq_cs; 174 - u64 ptr; 175 - u32 __user *usig; 176 - u32 sig; 177 - int ret; 277 + bool on; 178 278 179 - ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr); 180 - if (ret) 181 - return ret; 182 - 183 - /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. 
*/ 184 - if (!ptr) { 185 - memset(rseq_cs, 0, sizeof(*rseq_cs)); 186 - return 0; 187 - } 188 - /* Check that the pointer value fits in the user-space process space. */ 189 - if (ptr >= TASK_SIZE) 190 - return -EINVAL; 191 - urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; 192 - if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) 193 - return -EFAULT; 194 - 195 - if (rseq_cs->start_ip >= TASK_SIZE || 196 - rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || 197 - rseq_cs->abort_ip >= TASK_SIZE || 198 - rseq_cs->version > 0) 199 - return -EINVAL; 200 - /* Check for overflow. */ 201 - if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) 202 - return -EINVAL; 203 - /* Ensure that abort_ip is not in the critical section. */ 204 - if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) 279 + if (kstrtobool_from_user(ubuf, count, &on)) 205 280 return -EINVAL; 206 281 207 - usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); 208 - ret = get_user(sig, usig); 209 - if (ret) 210 - return ret; 282 + rseq_control_debug(on); 283 + return count; 284 + } 211 285 212 - if (current->rseq_sig != sig) { 213 - printk_ratelimited(KERN_WARNING 214 - "Possible attack attempt. 
Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", 215 - sig, current->rseq_sig, current->pid, usig); 216 - return -EINVAL; 217 - } 286 + static int rseq_debug_open(struct inode *inode, struct file *file) 287 + { 288 + return single_open(file, rseq_debug_show, inode->i_private); 289 + } 290 + 291 + static const struct file_operations debug_ops = { 292 + .open = rseq_debug_open, 293 + .read = seq_read, 294 + .write = rseq_debug_write, 295 + .llseek = seq_lseek, 296 + .release = single_release, 297 + }; 298 + 299 + static int __init rseq_debugfs_init(void) 300 + { 301 + struct dentry *root_dir = debugfs_create_dir("rseq", NULL); 302 + 303 + debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops); 304 + rseq_stats_init(root_dir); 218 305 return 0; 219 306 } 307 + __initcall(rseq_debugfs_init); 308 + #endif /* CONFIG_DEBUG_FS */ 220 309 221 - static bool rseq_warn_flags(const char *str, u32 flags) 310 + static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) 222 311 { 223 - u32 test_flags; 224 - 225 - if (!flags) 226 - return false; 227 - test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; 228 - if (test_flags) 229 - pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); 230 - test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; 231 - if (test_flags) 232 - pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); 233 - return true; 312 + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); 234 313 } 235 314 236 - static int rseq_need_restart(struct task_struct *t, u32 cs_flags) 315 + static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) 237 316 { 238 - u32 flags, event_mask; 239 - int ret; 317 + struct rseq __user *urseq = t->rseq.usrptr; 318 + u64 csaddr; 240 319 241 - if (rseq_warn_flags("rseq_cs", cs_flags)) 242 - return -EINVAL; 243 - 244 - /* Get thread flags. 
*/ 245 - ret = get_user(flags, &t->rseq->flags); 246 - if (ret) 247 - return ret; 248 - 249 - if (rseq_warn_flags("rseq", flags)) 250 - return -EINVAL; 251 - 252 - /* 253 - * Load and clear event mask atomically with respect to 254 - * scheduler preemption and membarrier IPIs. 255 - */ 256 - scoped_guard(RSEQ_EVENT_GUARD) { 257 - event_mask = t->rseq_event_mask; 258 - t->rseq_event_mask = 0; 259 - } 260 - 261 - return !!event_mask; 320 + scoped_user_read_access(urseq, efault) 321 + unsafe_get_user(csaddr, &urseq->rseq_cs, efault); 322 + if (likely(!csaddr)) 323 + return true; 324 + return rseq_update_user_cs(t, regs, csaddr); 325 + efault: 326 + return false; 262 327 } 263 328 264 - static int clear_rseq_cs(struct rseq __user *rseq) 329 + static void rseq_slowpath_update_usr(struct pt_regs *regs) 265 330 { 266 331 /* 267 - * The rseq_cs field is set to NULL on preemption or signal 268 - * delivery on top of rseq assembly block, as well as on top 269 - * of code outside of the rseq assembly block. This performs 270 - * a lazy clear of the rseq_cs field. 271 - * 272 - * Set rseq_cs to NULL. 332 + * Preserve rseq state and user_irq state. The generic entry code 333 + * clears user_irq on the way out, the non-generic entry 334 + * architectures are not having user_irq. 273 335 */ 274 - #ifdef CONFIG_64BIT 275 - return put_user(0UL, &rseq->rseq_cs); 276 - #else 277 - if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs))) 278 - return -EFAULT; 279 - return 0; 280 - #endif 281 - } 282 - 283 - /* 284 - * Unsigned comparison will be true when ip >= start_ip, and when 285 - * ip < start_ip + post_commit_offset. 
286 - */ 287 - static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) 288 - { 289 - return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; 290 - } 291 - 292 - static int rseq_ip_fixup(struct pt_regs *regs) 293 - { 294 - unsigned long ip = instruction_pointer(regs); 336 + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; 295 337 struct task_struct *t = current; 296 - struct rseq_cs rseq_cs; 297 - int ret; 298 - 299 - ret = rseq_get_rseq_cs(t, &rseq_cs); 300 - if (ret) 301 - return ret; 302 - 303 - /* 304 - * Handle potentially not being within a critical section. 305 - * If not nested over a rseq critical section, restart is useless. 306 - * Clear the rseq_cs pointer and return. 307 - */ 308 - if (!in_rseq_cs(ip, &rseq_cs)) 309 - return clear_rseq_cs(t->rseq); 310 - ret = rseq_need_restart(t, rseq_cs.flags); 311 - if (ret <= 0) 312 - return ret; 313 - ret = clear_rseq_cs(t->rseq); 314 - if (ret) 315 - return ret; 316 - trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, 317 - rseq_cs.abort_ip); 318 - instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); 319 - return 0; 320 - } 321 - 322 - /* 323 - * This resume handler must always be executed between any of: 324 - * - preemption, 325 - * - signal delivery, 326 - * and return to user-space. 327 - * 328 - * This is how we can ensure that the entire rseq critical section 329 - * will issue the commit instruction only if executed atomically with 330 - * respect to other threads scheduled on the same CPU, and with respect 331 - * to signal handlers. 332 - */ 333 - void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) 334 - { 335 - struct task_struct *t = current; 336 - int ret, sig; 338 + struct rseq_ids ids; 339 + u32 node_id; 340 + bool event; 337 341 338 342 if (unlikely(t->flags & PF_EXITING)) 339 343 return; 340 344 341 - /* 342 - * regs is NULL if and only if the caller is in a syscall path. 
Skip 343 - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and 344 - * kill a misbehaving userspace on debug kernels. 345 - */ 346 - if (regs) { 347 - ret = rseq_ip_fixup(regs); 348 - if (unlikely(ret < 0)) 349 - goto error; 350 - } 351 - if (unlikely(rseq_update_cpu_node_id(t))) 352 - goto error; 353 - return; 345 + rseq_stat_inc(rseq_stats.slowpath); 354 346 355 - error: 356 - sig = ksig ? ksig->sig : 0; 357 - force_sigsegv(sig); 347 + /* 348 + * Read and clear the event pending bit first. If the task 349 + * was not preempted or migrated or a signal is on the way, 350 + * there is no point in doing any of the heavy lifting here 351 + * on production kernels. In that case TIF_NOTIFY_RESUME 352 + * was raised by some other functionality. 353 + * 354 + * This is correct because the read/clear operation is 355 + * guarded against scheduler preemption, which makes it CPU 356 + * local atomic. If the task is preempted right after 357 + * re-enabling preemption then TIF_NOTIFY_RESUME is set 358 + * again and this function is invoked another time _before_ 359 + * the task is able to return to user mode. 360 + * 361 + * On a debug kernel, invoke the fixup code unconditionally 362 + * with the result handed in to allow the detection of 363 + * inconsistencies. 364 + */ 365 + scoped_guard(irq) { 366 + event = t->rseq.event.sched_switch; 367 + t->rseq.event.all &= evt_mask.all; 368 + ids.cpu_id = task_cpu(t); 369 + ids.mm_cid = task_mm_cid(t); 370 + } 371 + 372 + if (!event) 373 + return; 374 + 375 + node_id = cpu_to_node(ids.cpu_id); 376 + 377 + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { 378 + /* 379 + * Clear the errors just in case this might survive magically, but 380 + * leave the rest intact. 
381 + */ 382 + t->rseq.event.error = 0; 383 + force_sig(SIGSEGV); 384 + } 358 385 } 359 386 360 - #ifdef CONFIG_DEBUG_RSEQ 387 + void __rseq_handle_slowpath(struct pt_regs *regs) 388 + { 389 + /* 390 + * If invoked from hypervisors before entering the guest via 391 + * resume_user_mode_work(), then @regs is a NULL pointer. 392 + * 393 + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises 394 + * it before returning from the ioctl() to user space when 395 + * rseq_event.sched_switch is set. 396 + * 397 + * So it's safe to ignore here instead of pointlessly updating it 398 + * in the vcpu_run() loop. 399 + */ 400 + if (!regs) 401 + return; 402 + 403 + rseq_slowpath_update_usr(regs); 404 + } 405 + 406 + void __rseq_signal_deliver(int sig, struct pt_regs *regs) 407 + { 408 + rseq_stat_inc(rseq_stats.signal); 409 + /* 410 + * Don't update IDs, they are handled on exit to user if 411 + * necessary. The important thing is to abort a critical section of 412 + * the interrupted context as after this point the instruction 413 + * pointer in @regs points to the signal handler. 414 + */ 415 + if (unlikely(!rseq_handle_cs(current, regs))) { 416 + /* 417 + * Clear the errors just in case this might survive 418 + * magically, but leave the rest intact. 419 + */ 420 + current->rseq.event.error = 0; 421 + force_sigsegv(sig); 422 + } 423 + } 361 424 362 425 /* 363 426 * Terminate the process if a syscall is issued within a restartable 364 427 * sequence. 
365 428 */ 366 - void rseq_syscall(struct pt_regs *regs) 429 + void __rseq_debug_syscall_return(struct pt_regs *regs) 367 430 { 368 - unsigned long ip = instruction_pointer(regs); 369 431 struct task_struct *t = current; 370 - struct rseq_cs rseq_cs; 432 + u64 csaddr; 371 433 372 - if (!t->rseq) 434 + if (!t->rseq.event.has_rseq) 373 435 return; 374 - if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) 375 - force_sig(SIGSEGV); 436 + if (get_user(csaddr, &t->rseq.usrptr->rseq_cs)) 437 + goto fail; 438 + if (likely(!csaddr)) 439 + return; 440 + if (unlikely(csaddr >= TASK_SIZE)) 441 + goto fail; 442 + if (rseq_debug_update_user_cs(t, regs, csaddr)) 443 + return; 444 + fail: 445 + force_sig(SIGSEGV); 376 446 } 377 447 448 + #ifdef CONFIG_DEBUG_RSEQ 449 + /* Kept around to keep GENERIC_ENTRY=n architectures supported. */ 450 + void rseq_syscall(struct pt_regs *regs) 451 + { 452 + __rseq_debug_syscall_return(regs); 453 + } 378 454 #endif 455 + 456 + static bool rseq_reset_ids(void) 457 + { 458 + struct rseq_ids ids = { 459 + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, 460 + .mm_cid = 0, 461 + }; 462 + 463 + /* 464 + * If this fails, terminate it because this leaves the kernel in 465 + * stupid state as exit to user space will try to fixup the ids 466 + * again. 467 + */ 468 + if (rseq_set_ids(current, &ids, 0)) 469 + return true; 470 + 471 + force_sig(SIGSEGV); 472 + return false; 473 + } 474 + 475 + /* The original rseq structure size (including padding) is 32 bytes. */ 476 + #define ORIG_RSEQ_SIZE 32 379 477 380 478 /* 381 479 * sys_rseq - setup restartable sequences for caller thread. 
382 480 */ 383 - SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, 384 - int, flags, u32, sig) 481 + SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) 385 482 { 386 - int ret; 387 - u64 rseq_cs; 388 - 389 483 if (flags & RSEQ_FLAG_UNREGISTER) { 390 484 if (flags & ~RSEQ_FLAG_UNREGISTER) 391 485 return -EINVAL; 392 486 /* Unregister rseq for current thread. */ 393 - if (current->rseq != rseq || !current->rseq) 487 + if (current->rseq.usrptr != rseq || !current->rseq.usrptr) 394 488 return -EINVAL; 395 - if (rseq_len != current->rseq_len) 489 + if (rseq_len != current->rseq.len) 396 490 return -EINVAL; 397 - if (current->rseq_sig != sig) 491 + if (current->rseq.sig != sig) 398 492 return -EPERM; 399 - ret = rseq_reset_rseq_cpu_node_id(current); 400 - if (ret) 401 - return ret; 402 - current->rseq = NULL; 403 - current->rseq_sig = 0; 404 - current->rseq_len = 0; 493 + if (!rseq_reset_ids()) 494 + return -EFAULT; 495 + rseq_reset(current); 405 496 return 0; 406 497 } 407 498 408 499 if (unlikely(flags)) 409 500 return -EINVAL; 410 501 411 - if (current->rseq) { 502 + if (current->rseq.usrptr) { 412 503 /* 413 504 * If rseq is already registered, check whether 414 505 * the provided address differs from the prior 415 506 * one. 416 507 */ 417 - if (current->rseq != rseq || rseq_len != current->rseq_len) 508 + if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len) 418 509 return -EINVAL; 419 - if (current->rseq_sig != sig) 510 + if (current->rseq.sig != sig) 420 511 return -EPERM; 421 512 /* Already registered. */ 422 513 return -EBUSY; ··· 440 531 if (!access_ok(rseq, rseq_len)) 441 532 return -EFAULT; 442 533 443 - /* 444 - * If the rseq_cs pointer is non-NULL on registration, clear it to 445 - * avoid a potential segfault on return to user-space. 
The proper thing 446 - * to do would have been to fail the registration but this would break 447 - * older libcs that reuse the rseq area for new threads without 448 - * clearing the fields. 449 - */ 450 - if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs)) 451 - return -EFAULT; 452 - if (rseq_cs && clear_rseq_cs(rseq)) 453 - return -EFAULT; 534 + scoped_user_write_access(rseq, efault) { 535 + /* 536 + * If the rseq_cs pointer is non-NULL on registration, clear it to 537 + * avoid a potential segfault on return to user-space. The proper thing 538 + * to do would have been to fail the registration but this would break 539 + * older libcs that reuse the rseq area for new threads without 540 + * clearing the fields. Don't bother reading it, just reset it. 541 + */ 542 + unsafe_put_user(0UL, &rseq->rseq_cs, efault); 543 + /* Initialize IDs in user space */ 544 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); 545 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); 546 + unsafe_put_user(0U, &rseq->node_id, efault); 547 + unsafe_put_user(0U, &rseq->mm_cid, efault); 548 + } 454 549 455 - #ifdef CONFIG_DEBUG_RSEQ 456 - /* 457 - * Initialize the in-kernel rseq fields copy for validation of 458 - * read-only fields. 459 - */ 460 - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || 461 - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || 462 - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || 463 - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) 464 - return -EFAULT; 465 - #endif 466 550 /* 467 551 * Activate the registration by setting the rseq area address, length 468 552 * and signature in the task struct. 
469 553 */ 470 - current->rseq = rseq; 471 - current->rseq_len = rseq_len; 472 - current->rseq_sig = sig; 554 + current->rseq.usrptr = rseq; 555 + current->rseq.len = rseq_len; 556 + current->rseq.sig = sig; 473 557 474 558 /* 475 559 * If rseq was previously inactive, and has just been 476 560 * registered, ensure the cpu_id_start and cpu_id fields 477 561 * are updated before returning to user-space. 478 562 */ 479 - rseq_set_notify_resume(current); 480 - 563 + current->rseq.event.has_rseq = true; 564 + rseq_force_update(); 481 565 return 0; 566 + 567 + efault: 568 + return -EFAULT; 482 569 }
+442 -472
kernel/sched/core.c
··· 2131 2131 { 2132 2132 if (task_on_rq_migrating(p)) 2133 2133 flags |= ENQUEUE_MIGRATED; 2134 - if (flags & ENQUEUE_MIGRATED) 2135 - sched_mm_cid_migrate_to(rq, p); 2136 2134 2137 2135 enqueue_task(rq, p, flags); 2138 2136 ··· 2641 2643 return 0; 2642 2644 } 2643 2645 2646 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask); 2647 + 2644 2648 /* 2645 2649 * sched_class::set_cpus_allowed must do the below, but is not required to 2646 2650 * actually call this function. ··· 2656 2656 2657 2657 cpumask_copy(&p->cpus_mask, ctx->new_mask); 2658 2658 p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); 2659 + mm_update_cpus_allowed(p->mm, ctx->new_mask); 2659 2660 2660 2661 /* 2661 2662 * Swap in a new user_cpus_ptr if SCA_USER flag set ··· 2668 2667 static void 2669 2668 do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) 2670 2669 { 2671 - scoped_guard (sched_change, p, DEQUEUE_SAVE) { 2670 + scoped_guard (sched_change, p, DEQUEUE_SAVE) 2672 2671 p->sched_class->set_cpus_allowed(p, ctx); 2673 - mm_set_cpus_allowed(p->mm, ctx->new_mask); 2674 - } 2675 2672 } 2676 2673 2677 2674 /* ··· 3262 3263 if (p->sched_class->migrate_task_rq) 3263 3264 p->sched_class->migrate_task_rq(p, new_cpu); 3264 3265 p->se.nr_migrations++; 3265 - rseq_migrate(p); 3266 - sched_mm_cid_migrate_from(p); 3267 3266 perf_event_task_migrate(p); 3268 3267 } 3269 3268 ··· 4412 4415 init_numa_balancing(clone_flags, p); 4413 4416 p->wake_entry.u_flags = CSD_TYPE_TTWU; 4414 4417 p->migration_pending = NULL; 4415 - init_sched_mm_cid(p); 4416 4418 } 4417 4419 4418 4420 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); ··· 4687 4691 p->sched_task_group = tg; 4688 4692 } 4689 4693 #endif 4690 - rseq_migrate(p); 4691 4694 /* 4692 4695 * We're setting the CPU for the first time, we don't migrate, 4693 4696 * so use __set_task_cpu(). ··· 4750 4755 * as we're not fully set-up yet. 
4751 4756 */ 4752 4757 p->recent_used_cpu = task_cpu(p); 4753 - rseq_migrate(p); 4754 4758 __set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags)); 4755 4759 rq = __task_rq_lock(p, &rf); 4756 4760 update_rq_clock(rq); ··· 5043 5049 kcov_prepare_switch(prev); 5044 5050 sched_info_switch(rq, prev, next); 5045 5051 perf_event_task_sched_out(prev, next); 5046 - rseq_preempt(prev); 5047 5052 fire_sched_out_preempt_notifiers(prev, next); 5048 5053 kmap_local_sched_out(); 5049 5054 prepare_task(next); ··· 5205 5212 * 5206 5213 * kernel -> user switch + mmdrop_lazy_tlb() active 5207 5214 * user -> user switch 5208 - * 5209 - * switch_mm_cid() needs to be updated if the barriers provided 5210 - * by context_switch() are modified. 5211 5215 */ 5212 - if (!next->mm) { // to kernel 5216 + if (!next->mm) { // to kernel 5213 5217 enter_lazy_tlb(prev->active_mm, next); 5214 5218 5215 5219 next->active_mm = prev->active_mm; 5216 - if (prev->mm) // from user 5220 + if (prev->mm) // from user 5217 5221 mmgrab_lazy_tlb(prev->active_mm); 5218 5222 else 5219 5223 prev->active_mm = NULL; 5220 - } else { // to user 5224 + } else { // to user 5221 5225 membarrier_switch_mm(rq, prev->active_mm, next->mm); 5222 5226 /* 5223 5227 * sys_membarrier() requires an smp_mb() between setting ··· 5227 5237 switch_mm_irqs_off(prev->active_mm, next->mm, next); 5228 5238 lru_gen_use_mm(next->mm); 5229 5239 5230 - if (!prev->mm) { // from kernel 5240 + if (!prev->mm) { // from kernel 5231 5241 /* will mmdrop_lazy_tlb() in finish_task_switch(). */ 5232 5242 rq->prev_mm = prev->active_mm; 5233 5243 prev->active_mm = NULL; 5234 5244 } 5235 5245 } 5236 5246 5237 - /* switch_mm_cid() requires the memory barriers above. */ 5238 - switch_mm_cid(rq, prev, next); 5247 + mm_cid_switch_to(prev, next); 5248 + 5249 + /* 5250 + * Tell rseq that the task was scheduled in. Must be after 5251 + * mm_cid_switch_to() to get the TIF flag set. 
5252 + */ 5253 + rseq_sched_switch_event(next); 5239 5254 5240 5255 prepare_lock_switch(rq, next, rf); 5241 5256 ··· 5525 5530 resched_latency = cpu_resched_latency(rq); 5526 5531 calc_global_load_tick(rq); 5527 5532 sched_core_tick(rq); 5528 - task_tick_mm_cid(rq, donor); 5529 5533 scx_tick(rq); 5530 5534 5531 5535 rq_unlock(rq, &rf); ··· 10254 10260 } 10255 10261 10256 10262 #ifdef CONFIG_SCHED_MM_CID 10257 - 10258 10263 /* 10259 - * @cid_lock: Guarantee forward-progress of cid allocation. 10264 + * Concurrency IDentifier management 10260 10265 * 10261 - * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock 10262 - * is only used when contention is detected by the lock-free allocation so 10263 - * forward progress can be guaranteed. 10264 - */ 10265 - DEFINE_RAW_SPINLOCK(cid_lock); 10266 - 10267 - /* 10268 - * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock. 10266 + * Serialization rules: 10269 10267 * 10270 - * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is 10271 - * detected, it is set to 1 to ensure that all newly coming allocations are 10272 - * serialized by @cid_lock until the allocation which detected contention 10273 - * completes and sets @use_cid_lock back to 0. This guarantees forward progress 10274 - * of a cid allocation. 10275 - */ 10276 - int use_cid_lock; 10277 - 10278 - /* 10279 - * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid 10280 - * concurrently with respect to the execution of the source runqueue context 10281 - * switch. 10268 + * mm::mm_cid::mutex: Serializes fork() and exit() and therefore 10269 + * protects mm::mm_cid::users. 10282 10270 * 10283 - * There is one basic properties we want to guarantee here: 10271 + * mm::mm_cid::lock: Serializes mm_update_max_cids() and 10272 + * mm_update_cpus_allowed(). Nests in mm_cid::mutex 10273 + * and runqueue lock. 
10284 10274 * 10285 - * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively 10286 - * used by a task. That would lead to concurrent allocation of the cid and 10287 - * userspace corruption. 10275 + * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks 10276 + * and can only be modified with atomic operations. 10288 10277 * 10289 - * Provide this guarantee by introducing a Dekker memory ordering to guarantee 10290 - * that a pair of loads observe at least one of a pair of stores, which can be 10291 - * shown as: 10278 + * The mm::mm_cid::pcpu per CPU storage is protected by the CPU's runqueue 10279 + * lock. 10292 10280 * 10293 - * X = Y = 0 10281 + * CID ownership: 10294 10282 * 10295 - * w[X]=1 w[Y]=1 10296 - * MB MB 10297 - * r[Y]=y r[X]=x 10283 + * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or 10284 + * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the 10285 + * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode, 10286 + * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the 10287 + * task needs to drop the CID into the pool when scheduling out. Both bits 10288 + * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is 10289 + * actually handed over to user space in the RSEQ memory. 10298 10290 * 10299 - * Which guarantees that x==0 && y==0 is impossible. But rather than using 10300 - * values 0 and 1, this algorithm cares about specific state transitions of the 10301 - * runqueue current task (as updated by the scheduler context switch), and the 10302 - * per-mm/cpu cid value. 10291 + * Mode switching: 10303 10292 * 10304 - * Let's introduce task (Y) which has task->mm == mm and task (N) which has 10305 - * task->mm != mm for the rest of the discussion. 
There are two scheduler state 10306 - * transitions on context switch we care about: 10293 + * Switching to per CPU mode happens when the user count becomes greater 10294 + * than the maximum number of CIDs, which is calculated by: 10307 10295 * 10308 - * (TSA) Store to rq->curr with transition from (N) to (Y) 10296 + * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users); 10297 + * max_cids = min(1.25 * opt_cids, num_possible_cpus()); 10309 10298 * 10310 - * (TSB) Store to rq->curr with transition from (Y) to (N) 10299 + * The +25% allowance is useful for tight CPU masks in scenarios where only 10300 + * a few threads are created and destroyed to avoid frequent mode 10301 + * switches. Though this allowance shrinks, the closer opt_cids becomes to 10302 + * num_possible_cpus(), which is the (unfortunate) hard ABI limit. 10311 10303 * 10312 - * On the remote-clear side, there is one transition we care about: 10304 + * At the point of switching to per CPU mode the new user is not yet 10305 + * visible in the system, so the task which initiated the fork() runs the 10306 + * fixup function: mm_cid_fixup_tasks_to_cpus() walks the thread list and 10307 + * either transfers each task's owned CID to the CPU the task runs on or 10308 + * drops it into the CID pool if a task is not on a CPU at that point in 10309 + * time. Tasks which schedule in before the task walk reaches them do the 10310 + * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes 10311 + * it's guaranteed that no task related to that MM owns a CID anymore. 10313 10312 * 10314 - * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag 10313 + * Switching back to task mode happens when the user count goes below the 10314 + * threshold which was recorded on the per CPU mode switch: 10315 10315 * 10316 - * There is also a transition to UNSET state which can be performed from all 10317 - * sides (scheduler, remote-clear). 
It is always performed with a cmpxchg which 10318 - * guarantees that only a single thread will succeed: 10316 + * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2); 10319 10317 * 10320 - * (TMB) cmpxchg to *pcpu_cid to mark UNSET 10318 + * This threshold is updated when an affinity change increases the number of 10319 + * allowed CPUs for the MM, which might cause a switch back to per task 10320 + * mode. 10321 10321 * 10322 - * Just to be clear, what we do _not_ want to happen is a transition to UNSET 10323 - * when a thread is actively using the cid (property (1)). 10322 + * If the switch back was initiated by an exiting task, then that task runs 10323 + * the fixup function. If it was initiated by an affinity change, then it's 10324 + * run either in the deferred update function in context of a workqueue or 10325 + * by a task which forks a new one or by a task which exits, whichever 10326 + * happens first. mm_cid_fixup_cpus_to_tasks() walks through the possible 10327 + * CPUs and either transfers the CPU owned CIDs to a related task which 10328 + * runs on the CPU or drops it into the pool. Tasks which schedule in on a 10329 + * CPU which the walk did not cover yet do the handover themselves. 10324 10330 * 10325 - * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions. 10331 + * This transition from CPU to per task ownership happens in two phases: 10326 10332 * 10327 - * Scenario A) (TSA)+(TMA) (from next task perspective) 10333 + * 1) mm::mm_cid.transit contains MM_CID_TRANSIT. This is OR'ed on the task 10334 + * CID and denotes that the CID is only temporarily owned by the 10335 + * task. When it schedules out the task drops the CID back into the 10336 + * pool if this bit is set. 10328 10337 * 10329 - * CPU0 CPU1 10338 + * 2) The initiating context walks the per CPU space and after completion 10339 + * clears mm::mm_cid.transit. So after that point the CIDs are strictly 10340 + * task owned again. 
10330 10341 * 10331 - * Context switch CS-1 Remote-clear 10332 - * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA) 10333 - * (implied barrier after cmpxchg) 10334 - * - switch_mm_cid() 10335 - * - memory barrier (see switch_mm_cid() 10336 - * comment explaining how this barrier 10337 - * is combined with other scheduler 10338 - * barriers) 10339 - * - mm_cid_get (next) 10340 - * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr) 10342 + * This two phase transition is required to prevent CID space exhaustion 10343 + * during the transition as a direct transfer of ownership would fail if 10344 + * two tasks are scheduled in on the same CPU before the fixup freed per 10345 + * CPU CIDs. 10341 10346 * 10342 - * This Dekker ensures that either task (Y) is observed by the 10343 - * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are 10344 - * observed. 10345 - * 10346 - * If task (Y) store is observed by rcu_dereference(), it means that there is 10347 - * still an active task on the cpu. Remote-clear will therefore not transition 10348 - * to UNSET, which fulfills property (1). 10349 - * 10350 - * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(), 10351 - * it will move its state to UNSET, which clears the percpu cid perhaps 10352 - * uselessly (which is not an issue for correctness). Because task (Y) is not 10353 - * observed, CPU1 can move ahead to set the state to UNSET. Because moving 10354 - * state to UNSET is done with a cmpxchg expecting that the old state has the 10355 - * LAZY flag set, only one thread will successfully UNSET. 10356 - * 10357 - * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0 10358 - * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and 10359 - * CPU1 will observe task (Y) and do nothing more, which is fine. 
10360 - * 10361 - * What we are effectively preventing with this Dekker is a scenario where 10362 - * neither LAZY flag nor store (Y) are observed, which would fail property (1) 10363 - * because this would UNSET a cid which is actively used. 10347 + * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID 10348 + * related to that MM is owned by a CPU anymore. 10364 10349 */ 10365 10350 10366 - void sched_mm_cid_migrate_from(struct task_struct *t) 10351 + /* 10352 + * Update the CID range properties when the constraints change. Invoked via 10353 + * fork(), exit() and affinity changes 10354 + */ 10355 + static void __mm_update_max_cids(struct mm_mm_cid *mc) 10367 10356 { 10368 - t->migrate_from_cpu = task_cpu(t); 10357 + unsigned int opt_cids, max_cids; 10358 + 10359 + /* Calculate the new optimal constraint */ 10360 + opt_cids = min(mc->nr_cpus_allowed, mc->users); 10361 + 10362 + /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */ 10363 + max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus()); 10364 + WRITE_ONCE(mc->max_cids, max_cids); 10369 10365 } 10370 10366 10371 - static 10372 - int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, 10373 - struct task_struct *t, 10374 - struct mm_cid *src_pcpu_cid) 10367 + static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc) 10375 10368 { 10376 - struct mm_struct *mm = t->mm; 10377 - struct task_struct *src_task; 10378 - int src_cid, last_mm_cid; 10369 + unsigned int opt_cids; 10379 10370 10380 - if (!mm) 10381 - return -1; 10371 + opt_cids = min(mc->nr_cpus_allowed, mc->users); 10372 + /* Has to be at least 1 because 0 indicates PCPU mode off */ 10373 + return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1); 10374 + } 10382 10375 10383 - last_mm_cid = t->last_mm_cid; 10384 - /* 10385 - * If the migrated task has no last cid, or if the current 10386 - * task on src rq uses the cid, it means the source cid does not need 10387 - * to be 
moved to the destination cpu. 10388 - */ 10389 - if (last_mm_cid == -1) 10390 - return -1; 10391 - src_cid = READ_ONCE(src_pcpu_cid->cid); 10392 - if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid) 10393 - return -1; 10376 + static bool mm_update_max_cids(struct mm_struct *mm) 10377 + { 10378 + struct mm_mm_cid *mc = &mm->mm_cid; 10394 10379 10395 - /* 10396 - * If we observe an active task using the mm on this rq, it means we 10397 - * are not the last task to be migrated from this cpu for this mm, so 10398 - * there is no need to move src_cid to the destination cpu. 10399 - */ 10400 - guard(rcu)(); 10401 - src_task = rcu_dereference(src_rq->curr); 10402 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10403 - t->last_mm_cid = -1; 10404 - return -1; 10380 + lockdep_assert_held(&mm->mm_cid.lock); 10381 + 10382 + /* Clear deferred mode switch flag. A change is handled by the caller */ 10383 + mc->update_deferred = false; 10384 + __mm_update_max_cids(mc); 10385 + 10386 + /* Check whether owner mode must be changed */ 10387 + if (!mc->percpu) { 10388 + /* Enable per CPU mode when the number of users is above max_cids */ 10389 + if (mc->users > mc->max_cids) 10390 + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10391 + } else { 10392 + /* Switch back to per task if user count under threshold */ 10393 + if (mc->users < mc->pcpu_thrs) 10394 + mc->pcpu_thrs = 0; 10405 10395 } 10406 10396 10407 - return src_cid; 10397 + /* Mode change required? 
*/ 10398 + if (!!mc->percpu == !!mc->pcpu_thrs) 10399 + return false; 10400 + /* When switching back to per TASK mode, set the transition flag */ 10401 + if (!mc->pcpu_thrs) 10402 + WRITE_ONCE(mc->transit, MM_CID_TRANSIT); 10403 + WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs); 10404 + return true; 10408 10405 } 10409 10406 10410 - static 10411 - int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, 10412 - struct task_struct *t, 10413 - struct mm_cid *src_pcpu_cid, 10414 - int src_cid) 10407 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) 10415 10408 { 10416 - struct task_struct *src_task; 10417 - struct mm_struct *mm = t->mm; 10418 - int lazy_cid; 10409 + struct cpumask *mm_allowed; 10410 + struct mm_mm_cid *mc; 10411 + unsigned int weight; 10419 10412 10420 - if (src_cid == -1) 10421 - return -1; 10422 - 10413 + if (!mm || !READ_ONCE(mm->mm_cid.users)) 10414 + return; 10423 10415 /* 10424 - * Attempt to clear the source cpu cid to move it to the destination 10425 - * cpu. 10416 + * mm::mm_cid::mm_cpus_allowed is the superset of each threads 10417 + * allowed CPUs mask which means it can only grow. 10426 10418 */ 10427 - lazy_cid = mm_cid_set_lazy_put(src_cid); 10428 - if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid)) 10429 - return -1; 10419 + mc = &mm->mm_cid; 10420 + guard(raw_spinlock)(&mc->lock); 10421 + mm_allowed = mm_cpus_allowed(mm); 10422 + weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk); 10423 + if (weight == mc->nr_cpus_allowed) 10424 + return; 10430 10425 10431 - /* 10432 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10433 - * rq->curr->mm matches the scheduler barrier in context_switch() 10434 - * between store to rq->curr and load of prev and next task's 10435 - * per-mm/cpu cid. 
10436 - * 10437 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10438 - * rq->curr->mm_cid_active matches the barrier in 10439 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10440 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10441 - * load of per-mm/cpu cid. 10442 - */ 10426 + WRITE_ONCE(mc->nr_cpus_allowed, weight); 10427 + __mm_update_max_cids(mc); 10428 + if (!mc->percpu) 10429 + return; 10443 10430 10444 - /* 10445 - * If we observe an active task using the mm on this rq after setting 10446 - * the lazy-put flag, this task will be responsible for transitioning 10447 - * from lazy-put flag set to MM_CID_UNSET. 10448 - */ 10449 - scoped_guard (rcu) { 10450 - src_task = rcu_dereference(src_rq->curr); 10451 - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { 10431 + /* Adjust the threshold to the wider set */ 10432 + mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc); 10433 + /* Switch back to per task mode? */ 10434 + if (mc->users >= mc->pcpu_thrs) 10435 + return; 10436 + 10437 + /* Don't queue twice */ 10438 + if (mc->update_deferred) 10439 + return; 10440 + 10441 + /* Queue the irq work, which schedules the real work */ 10442 + mc->update_deferred = true; 10443 + irq_work_queue(&mc->irq_work); 10444 + } 10445 + 10446 + static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp) 10447 + { 10448 + if (cid_on_cpu(t->mm_cid.cid)) { 10449 + unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid); 10450 + 10451 + t->mm_cid.cid = cid_to_transit_cid(cid); 10452 + pcp->cid = t->mm_cid.cid; 10453 + } 10454 + } 10455 + 10456 + static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm) 10457 + { 10458 + unsigned int cpu; 10459 + 10460 + /* Walk the CPUs and fixup all stale CIDs */ 10461 + for_each_possible_cpu(cpu) { 10462 + struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu); 10463 + struct rq *rq = cpu_rq(cpu); 10464 + 10465 + /* Remote access to mm::mm_cid::pcpu 
requires rq_lock */ 10466 + guard(rq_lock_irq)(rq); 10467 + /* Is the CID still owned by the CPU? */ 10468 + if (cid_on_cpu(pcp->cid)) { 10452 10469 /* 10453 - * We observed an active task for this mm, there is therefore 10454 - * no point in moving this cid to the destination cpu. 10470 + * If rq->curr has @mm, transfer it with the 10471 + * transition bit set. Otherwise drop it. 10455 10472 */ 10456 - t->last_mm_cid = -1; 10457 - return -1; 10473 + if (rq->curr->mm == mm && rq->curr->mm_cid.active) 10474 + mm_cid_transit_to_task(rq->curr, pcp); 10475 + else 10476 + mm_drop_cid_on_cpu(mm, pcp); 10477 + 10478 + } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) { 10479 + unsigned int cid = rq->curr->mm_cid.cid; 10480 + 10481 + /* Ensure it has the transition bit set */ 10482 + if (!cid_in_transit(cid)) { 10483 + cid = cid_to_transit_cid(cid); 10484 + rq->curr->mm_cid.cid = cid; 10485 + pcp->cid = cid; 10486 + } 10458 10487 } 10459 10488 } 10460 - 10461 - /* 10462 - * The src_cid is unused, so it can be unset. 10463 - */ 10464 - if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10465 - return -1; 10466 - WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET); 10467 - return src_cid; 10489 + /* Clear the transition bit */ 10490 + WRITE_ONCE(mm->mm_cid.transit, 0); 10468 10491 } 10469 10492 10470 - /* 10471 - * Migration to dst cpu. Called with dst_rq lock held. 10472 - * Interrupts are disabled, which keeps the window of cid ownership without the 10473 - * source rq lock held small. 
10474 - */ 10475 - void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) 10493 + static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp) 10476 10494 { 10477 - struct mm_cid *src_pcpu_cid, *dst_pcpu_cid; 10478 - struct mm_struct *mm = t->mm; 10479 - int src_cid, src_cpu; 10480 - bool dst_cid_is_set; 10481 - struct rq *src_rq; 10482 - 10483 - lockdep_assert_rq_held(dst_rq); 10484 - 10485 - if (!mm) 10486 - return; 10487 - src_cpu = t->migrate_from_cpu; 10488 - if (src_cpu == -1) { 10489 - t->last_mm_cid = -1; 10490 - return; 10491 - } 10492 - /* 10493 - * Move the src cid if the dst cid is unset. This keeps id 10494 - * allocation closest to 0 in cases where few threads migrate around 10495 - * many CPUs. 10496 - * 10497 - * If destination cid or recent cid is already set, we may have 10498 - * to just clear the src cid to ensure compactness in frequent 10499 - * migrations scenarios. 10500 - * 10501 - * It is not useful to clear the src cid when the number of threads is 10502 - * greater or equal to the number of allowed CPUs, because user-space 10503 - * can expect that the number of allowed cids can reach the number of 10504 - * allowed CPUs. 
10505 - */ 10506 - dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq)); 10507 - dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) || 10508 - !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid)); 10509 - if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed)) 10510 - return; 10511 - src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu); 10512 - src_rq = cpu_rq(src_cpu); 10513 - src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid); 10514 - if (src_cid == -1) 10515 - return; 10516 - src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid, 10517 - src_cid); 10518 - if (src_cid == -1) 10519 - return; 10520 - if (dst_cid_is_set) { 10521 - __mm_cid_put(mm, src_cid); 10522 - return; 10523 - } 10524 - /* Move src_cid to dst cpu. */ 10525 - mm_cid_snapshot_time(dst_rq, mm); 10526 - WRITE_ONCE(dst_pcpu_cid->cid, src_cid); 10527 - WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid); 10528 - } 10529 - 10530 - static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid, 10531 - int cpu) 10532 - { 10533 - struct rq *rq = cpu_rq(cpu); 10534 - struct task_struct *t; 10535 - int cid, lazy_cid; 10536 - 10537 - cid = READ_ONCE(pcpu_cid->cid); 10538 - if (!mm_cid_is_valid(cid)) 10539 - return; 10540 - 10541 - /* 10542 - * Clear the cpu cid if it is set to keep cid allocation compact. If 10543 - * there happens to be other tasks left on the source cpu using this 10544 - * mm, the next task using this mm will reallocate its cid on context 10545 - * switch. 10546 - */ 10547 - lazy_cid = mm_cid_set_lazy_put(cid); 10548 - if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid)) 10549 - return; 10550 - 10551 - /* 10552 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10553 - * rq->curr->mm matches the scheduler barrier in context_switch() 10554 - * between store to rq->curr and load of prev and next task's 10555 - * per-mm/cpu cid. 
10556 - * 10557 - * The implicit barrier after cmpxchg per-mm/cpu cid before loading 10558 - * rq->curr->mm_cid_active matches the barrier in 10559 - * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and 10560 - * sched_mm_cid_after_execve() between store to t->mm_cid_active and 10561 - * load of per-mm/cpu cid. 10562 - */ 10563 - 10564 - /* 10565 - * If we observe an active task using the mm on this rq after setting 10566 - * the lazy-put flag, that task will be responsible for transitioning 10567 - * from lazy-put flag set to MM_CID_UNSET. 10568 - */ 10569 - scoped_guard (rcu) { 10570 - t = rcu_dereference(rq->curr); 10571 - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) 10572 - return; 10573 - } 10574 - 10575 - /* 10576 - * The cid is unused, so it can be unset. 10577 - * Disable interrupts to keep the window of cid ownership without rq 10578 - * lock small. 10579 - */ 10580 - scoped_guard (irqsave) { 10581 - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) 10582 - __mm_cid_put(mm, cid); 10495 + if (cid_on_task(t->mm_cid.cid)) { 10496 + t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid); 10497 + pcp->cid = t->mm_cid.cid; 10583 10498 } 10584 10499 } 10585 10500 10586 - static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) 10501 + static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm) 10587 10502 { 10588 - struct rq *rq = cpu_rq(cpu); 10589 - struct mm_cid *pcpu_cid; 10590 - struct task_struct *curr; 10591 - u64 rq_clock; 10592 - 10593 - /* 10594 - * rq->clock load is racy on 32-bit but one spurious clear once in a 10595 - * while is irrelevant. 10596 - */ 10597 - rq_clock = READ_ONCE(rq->clock); 10598 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10599 - 10600 - /* 10601 - * In order to take care of infrequently scheduled tasks, bump the time 10602 - * snapshot associated with this cid if an active task using the mm is 10603 - * observed on this rq. 
10604 - */ 10605 - scoped_guard (rcu) { 10606 - curr = rcu_dereference(rq->curr); 10607 - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { 10608 - WRITE_ONCE(pcpu_cid->time, rq_clock); 10609 - return; 10610 - } 10611 - } 10612 - 10613 - if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) 10614 - return; 10615 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10616 - } 10617 - 10618 - static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu, 10619 - int weight) 10620 - { 10621 - struct mm_cid *pcpu_cid; 10622 - int cid; 10623 - 10624 - pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu); 10625 - cid = READ_ONCE(pcpu_cid->cid); 10626 - if (!mm_cid_is_valid(cid) || cid < weight) 10627 - return; 10628 - sched_mm_cid_remote_clear(mm, pcpu_cid, cpu); 10629 - } 10630 - 10631 - static void task_mm_cid_work(struct callback_head *work) 10632 - { 10633 - unsigned long now = jiffies, old_scan, next_scan; 10634 - struct task_struct *t = current; 10635 - struct cpumask *cidmask; 10636 - struct mm_struct *mm; 10637 - int weight, cpu; 10638 - 10639 - WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work)); 10640 - 10641 - work->next = work; /* Prevent double-add */ 10642 - if (t->flags & PF_EXITING) 10643 - return; 10644 - mm = t->mm; 10645 - if (!mm) 10646 - return; 10647 - old_scan = READ_ONCE(mm->mm_cid_next_scan); 10648 - next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10649 - if (!old_scan) { 10650 - unsigned long res; 10651 - 10652 - res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan); 10653 - if (res != old_scan) 10654 - old_scan = res; 10503 + /* Remote access to mm::mm_cid::pcpu requires rq_lock */ 10504 + guard(task_rq_lock)(t); 10505 + /* If the task is not active it is not in the users count */ 10506 + if (!t->mm_cid.active) 10507 + return false; 10508 + if (cid_on_task(t->mm_cid.cid)) { 10509 + /* If running on the CPU, transfer the CID, otherwise drop it */ 10510 + if (task_rq(t)->curr == t) 10511 + mm_cid_transfer_to_cpu(t, 
per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t))); 10655 10512 else 10656 - old_scan = next_scan; 10513 + mm_unset_cid_on_task(t); 10657 10514 } 10658 - if (time_before(now, old_scan)) 10659 - return; 10660 - if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan)) 10661 - return; 10662 - cidmask = mm_cidmask(mm); 10663 - /* Clear cids that were not recently used. */ 10664 - for_each_possible_cpu(cpu) 10665 - sched_mm_cid_remote_clear_old(mm, cpu); 10666 - weight = cpumask_weight(cidmask); 10667 - /* 10668 - * Clear cids that are greater or equal to the cidmask weight to 10669 - * recompact it. 10670 - */ 10671 - for_each_possible_cpu(cpu) 10672 - sched_mm_cid_remote_clear_weight(mm, cpu, weight); 10515 + return true; 10673 10516 } 10674 10517 10675 - void init_sched_mm_cid(struct task_struct *t) 10518 + static void mm_cid_fixup_tasks_to_cpus(void) 10676 10519 { 10677 - struct mm_struct *mm = t->mm; 10678 - int mm_users = 0; 10520 + struct mm_struct *mm = current->mm; 10521 + struct task_struct *p, *t; 10522 + unsigned int users; 10679 10523 10680 - if (mm) { 10681 - mm_users = atomic_read(&mm->mm_users); 10682 - if (mm_users == 1) 10683 - mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY); 10524 + /* 10525 + * This can obviously race with a concurrent affinity change, which 10526 + * increases the number of allowed CPUs for this mm, but that does 10527 + * not affect the mode and only changes the CID constraints. A 10528 + * possible switch back to per task mode happens either in the 10529 + * deferred handler function or in the next fork()/exit(). 10530 + * 10531 + * The caller has already transferred. The newly incoming task is 10532 + * already accounted for, but not yet visible. 
10533 + */ 10534 + users = mm->mm_cid.users - 2; 10535 + if (!users) 10536 + return; 10537 + 10538 + guard(rcu)(); 10539 + for_other_threads(current, t) { 10540 + if (mm_cid_fixup_task_to_cpu(t, mm)) 10541 + users--; 10684 10542 } 10685 - t->cid_work.next = &t->cid_work; /* Protect against double add */ 10686 - init_task_work(&t->cid_work, task_mm_cid_work); 10687 - } 10688 10543 10689 - void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) 10690 - { 10691 - struct callback_head *work = &curr->cid_work; 10692 - unsigned long now = jiffies; 10693 - 10694 - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) || 10695 - work->next != work) 10696 - return; 10697 - if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan))) 10544 + if (!users) 10698 10545 return; 10699 10546 10700 - /* No page allocation under rq lock */ 10701 - task_work_add(curr, work, TWA_RESUME); 10702 - } 10703 - 10704 - void sched_mm_cid_exit_signals(struct task_struct *t) 10705 - { 10706 - struct mm_struct *mm = t->mm; 10707 - struct rq *rq; 10708 - 10709 - if (!mm) 10710 - return; 10711 - 10712 - preempt_disable(); 10713 - rq = this_rq(); 10714 - guard(rq_lock_irqsave)(rq); 10715 - preempt_enable_no_resched(); /* holding spinlock */ 10716 - WRITE_ONCE(t->mm_cid_active, 0); 10717 - /* 10718 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10719 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10720 - */ 10721 - smp_mb(); 10722 - mm_cid_put(mm); 10723 - t->last_mm_cid = t->mm_cid = -1; 10724 - } 10725 - 10726 - void sched_mm_cid_before_execve(struct task_struct *t) 10727 - { 10728 - struct mm_struct *mm = t->mm; 10729 - struct rq *rq; 10730 - 10731 - if (!mm) 10732 - return; 10733 - 10734 - preempt_disable(); 10735 - rq = this_rq(); 10736 - guard(rq_lock_irqsave)(rq); 10737 - preempt_enable_no_resched(); /* holding spinlock */ 10738 - WRITE_ONCE(t->mm_cid_active, 0); 10739 - /* 10740 - * Store t->mm_cid_active before loading per-mm/cpu cid. 
10741 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10742 - */ 10743 - smp_mb(); 10744 - mm_cid_put(mm); 10745 - t->last_mm_cid = t->mm_cid = -1; 10746 - } 10747 - 10748 - void sched_mm_cid_after_execve(struct task_struct *t) 10749 - { 10750 - struct mm_struct *mm = t->mm; 10751 - struct rq *rq; 10752 - 10753 - if (!mm) 10754 - return; 10755 - 10756 - preempt_disable(); 10757 - rq = this_rq(); 10758 - scoped_guard (rq_lock_irqsave, rq) { 10759 - preempt_enable_no_resched(); /* holding spinlock */ 10760 - WRITE_ONCE(t->mm_cid_active, 1); 10761 - /* 10762 - * Store t->mm_cid_active before loading per-mm/cpu cid. 10763 - * Matches barrier in sched_mm_cid_remote_clear_old(). 10764 - */ 10765 - smp_mb(); 10766 - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm); 10547 + /* Happens only for VM_CLONE processes. */ 10548 + for_each_process_thread(p, t) { 10549 + if (t == current || t->mm != mm) 10550 + continue; 10551 + if (mm_cid_fixup_task_to_cpu(t, mm)) { 10552 + if (--users == 0) 10553 + return; 10554 + } 10767 10555 } 10556 + } 10557 + 10558 + static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm) 10559 + { 10560 + t->mm_cid.active = 1; 10561 + mm->mm_cid.users++; 10562 + return mm_update_max_cids(mm); 10768 10563 } 10769 10564 10770 10565 void sched_mm_cid_fork(struct task_struct *t) 10771 10566 { 10772 - WARN_ON_ONCE(!t->mm || t->mm_cid != -1); 10773 - t->mm_cid_active = 1; 10567 + struct mm_struct *mm = t->mm; 10568 + bool percpu; 10569 + 10570 + WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET); 10571 + 10572 + guard(mutex)(&mm->mm_cid.mutex); 10573 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10574 + struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu); 10575 + 10576 + /* First user ? 
*/ 10577 + if (!mm->mm_cid.users) { 10578 + sched_mm_cid_add_user(t, mm); 10579 + t->mm_cid.cid = mm_get_cid(mm); 10580 + /* Required for execve() */ 10581 + pcp->cid = t->mm_cid.cid; 10582 + return; 10583 + } 10584 + 10585 + if (!sched_mm_cid_add_user(t, mm)) { 10586 + if (!mm->mm_cid.percpu) 10587 + t->mm_cid.cid = mm_get_cid(mm); 10588 + return; 10589 + } 10590 + 10591 + /* Handle the mode change and transfer current's CID */ 10592 + percpu = !!mm->mm_cid.percpu; 10593 + if (!percpu) 10594 + mm_cid_transit_to_task(current, pcp); 10595 + else 10596 + mm_cid_transfer_to_cpu(current, pcp); 10597 + } 10598 + 10599 + if (percpu) { 10600 + mm_cid_fixup_tasks_to_cpus(); 10601 + } else { 10602 + mm_cid_fixup_cpus_to_tasks(mm); 10603 + t->mm_cid.cid = mm_get_cid(mm); 10604 + } 10774 10605 } 10775 - #endif /* CONFIG_SCHED_MM_CID */ 10606 + 10607 + static bool sched_mm_cid_remove_user(struct task_struct *t) 10608 + { 10609 + t->mm_cid.active = 0; 10610 + scoped_guard(preempt) { 10611 + /* Clear the transition bit */ 10612 + t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid); 10613 + mm_unset_cid_on_task(t); 10614 + } 10615 + t->mm->mm_cid.users--; 10616 + return mm_update_max_cids(t->mm); 10617 + } 10618 + 10619 + static bool __sched_mm_cid_exit(struct task_struct *t) 10620 + { 10621 + struct mm_struct *mm = t->mm; 10622 + 10623 + if (!sched_mm_cid_remove_user(t)) 10624 + return false; 10625 + /* 10626 + * Contrary to fork() this only deals with a switch back to per 10627 + * task mode either because the above decreased users or an 10628 + * affinity change increased the number of allowed CPUs and the 10629 + * deferred fixup did not run yet. 10630 + */ 10631 + if (WARN_ON_ONCE(mm->mm_cid.percpu)) 10632 + return false; 10633 + /* 10634 + * A failed fork(2) cleanup never gets here, so @current must have 10635 + * the same MM as @t. That's true for exit() and the failed 10636 + * pthread_create() cleanup case. 
10637 + */ 10638 + if (WARN_ON_ONCE(current->mm != mm)) 10639 + return false; 10640 + return true; 10641 + } 10642 + 10643 + /* 10644 + * When a task exits, the MM CID held by the task is no longer required as 10645 + * the task cannot return to user space. 10646 + */ 10647 + void sched_mm_cid_exit(struct task_struct *t) 10648 + { 10649 + struct mm_struct *mm = t->mm; 10650 + 10651 + if (!mm || !t->mm_cid.active) 10652 + return; 10653 + /* 10654 + * Ensure that only one instance is doing MM CID operations within 10655 + * a MM. The common case is uncontended. The rare fixup case adds 10656 + * some overhead. 10657 + */ 10658 + scoped_guard(mutex, &mm->mm_cid.mutex) { 10659 + /* mm_cid::mutex is sufficient to protect mm_cid::users */ 10660 + if (likely(mm->mm_cid.users > 1)) { 10661 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10662 + if (!__sched_mm_cid_exit(t)) 10663 + return; 10664 + /* Mode change required. Transfer current's CID */ 10665 + mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu)); 10666 + } 10667 + mm_cid_fixup_cpus_to_tasks(mm); 10668 + return; 10669 + } 10670 + /* Last user */ 10671 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10672 + /* Required across execve() */ 10673 + if (t == current) 10674 + mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu)); 10675 + /* Ignore mode change. There is nothing to do. */ 10676 + sched_mm_cid_remove_user(t); 10677 + } 10678 + } 10679 + 10680 + /* 10681 + * As this is the last user (execve(), process exit or failed 10682 + * fork(2)) there is no concurrency anymore. 10683 + * 10684 + * Synchronize eventually pending work to ensure that there are no 10685 + * dangling references left. @t->mm_cid.users is zero so nothing 10686 + * can queue this work anymore. 
10687 + */ 10688 + irq_work_sync(&mm->mm_cid.irq_work); 10689 + cancel_work_sync(&mm->mm_cid.work); 10690 + } 10691 + 10692 + /* Deactivate MM CID allocation across execve() */ 10693 + void sched_mm_cid_before_execve(struct task_struct *t) 10694 + { 10695 + sched_mm_cid_exit(t); 10696 + } 10697 + 10698 + /* Reactivate MM CID after successful execve() */ 10699 + void sched_mm_cid_after_execve(struct task_struct *t) 10700 + { 10701 + sched_mm_cid_fork(t); 10702 + } 10703 + 10704 + static void mm_cid_work_fn(struct work_struct *work) 10705 + { 10706 + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work); 10707 + 10708 + guard(mutex)(&mm->mm_cid.mutex); 10709 + /* Did the last user task exit already? */ 10710 + if (!mm->mm_cid.users) 10711 + return; 10712 + 10713 + scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) { 10714 + /* Have fork() or exit() handled it already? */ 10715 + if (!mm->mm_cid.update_deferred) 10716 + return; 10717 + /* This clears mm_cid::update_deferred */ 10718 + if (!mm_update_max_cids(mm)) 10719 + return; 10720 + /* Affinity changes can only switch back to task mode */ 10721 + if (WARN_ON_ONCE(mm->mm_cid.percpu)) 10722 + return; 10723 + } 10724 + mm_cid_fixup_cpus_to_tasks(mm); 10725 + } 10726 + 10727 + static void mm_cid_irq_work(struct irq_work *work) 10728 + { 10729 + struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work); 10730 + 10731 + /* 10732 + * Needs to be unconditional because mm_cid::lock cannot be held 10733 + * when scheduling work as mm_update_cpus_allowed() nests inside 10734 + * rq::lock and schedule_work() might end up in wakeup... 
10735 + */ 10736 + schedule_work(&mm->mm_cid.work); 10737 + } 10738 + 10739 + void mm_init_cid(struct mm_struct *mm, struct task_struct *p) 10740 + { 10741 + mm->mm_cid.max_cids = 0; 10742 + mm->mm_cid.percpu = 0; 10743 + mm->mm_cid.transit = 0; 10744 + mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed; 10745 + mm->mm_cid.users = 0; 10746 + mm->mm_cid.pcpu_thrs = 0; 10747 + mm->mm_cid.update_deferred = 0; 10748 + raw_spin_lock_init(&mm->mm_cid.lock); 10749 + mutex_init(&mm->mm_cid.mutex); 10750 + mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work); 10751 + INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn); 10752 + cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask); 10753 + bitmap_zero(mm_cidmask(mm), num_possible_cpus()); 10754 + } 10755 + #else /* CONFIG_SCHED_MM_CID */ 10756 + static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { } 10757 + #endif /* !CONFIG_SCHED_MM_CID */ 10776 10758 10777 10759 static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); 10778 10760
+4 -4
kernel/sched/membarrier.c
··· 199 199 * is negligible. 200 200 */ 201 201 smp_mb(); 202 - rseq_preempt(current); 202 + rseq_sched_switch_event(current); 203 203 } 204 204 205 205 static void ipi_sync_rq_state(void *info) ··· 407 407 * membarrier, we will end up with some thread in the mm 408 408 * running without a core sync. 409 409 * 410 - * For RSEQ, don't rseq_preempt() the caller. User code 411 - * is not supposed to issue syscalls at all from inside an 412 - * rseq critical section. 410 + * For RSEQ, don't invoke rseq_sched_switch_event() on the 411 + * caller. User code is not supposed to issue syscalls at 412 + * all from inside an rseq critical section. 413 413 */ 414 414 if (flags != MEMBARRIER_FLAG_SYNC_CORE) { 415 415 preempt_disable();
+193 -263
kernel/sched/sched.h
··· 2223 2223 smp_wmb(); 2224 2224 WRITE_ONCE(task_thread_info(p)->cpu, cpu); 2225 2225 p->wake_cpu = cpu; 2226 + rseq_sched_set_ids_changed(p); 2226 2227 #endif /* CONFIG_SMP */ 2227 2228 } 2228 2229 ··· 3680 3679 3681 3680 #ifdef CONFIG_SCHED_MM_CID 3682 3681 3683 - #define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */ 3684 - #define MM_CID_SCAN_DELAY 100 /* 100ms */ 3685 - 3686 - extern raw_spinlock_t cid_lock; 3687 - extern int use_cid_lock; 3688 - 3689 - extern void sched_mm_cid_migrate_from(struct task_struct *t); 3690 - extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t); 3691 - extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr); 3692 - extern void init_sched_mm_cid(struct task_struct *t); 3693 - 3694 - static inline void __mm_cid_put(struct mm_struct *mm, int cid) 3682 + static __always_inline bool cid_on_cpu(unsigned int cid) 3695 3683 { 3696 - if (cid < 0) 3697 - return; 3698 - cpumask_clear_cpu(cid, mm_cidmask(mm)); 3684 + return cid & MM_CID_ONCPU; 3699 3685 } 3700 3686 3701 - /* 3702 - * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to 3703 - * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to 3704 - * be held to transition to other states. 3705 - * 3706 - * State transitions synchronized with cmpxchg or try_cmpxchg need to be 3707 - * consistent across CPUs, which prevents use of this_cpu_cmpxchg. 
3708 - */ 3709 - static inline void mm_cid_put_lazy(struct task_struct *t) 3687 + static __always_inline bool cid_in_transit(unsigned int cid) 3710 3688 { 3689 + return cid & MM_CID_TRANSIT; 3690 + } 3691 + 3692 + static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid) 3693 + { 3694 + return cid & ~MM_CID_ONCPU; 3695 + } 3696 + 3697 + static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid) 3698 + { 3699 + return cid | MM_CID_ONCPU; 3700 + } 3701 + 3702 + static __always_inline unsigned int cid_to_transit_cid(unsigned int cid) 3703 + { 3704 + return cid | MM_CID_TRANSIT; 3705 + } 3706 + 3707 + static __always_inline unsigned int cid_from_transit_cid(unsigned int cid) 3708 + { 3709 + return cid & ~MM_CID_TRANSIT; 3710 + } 3711 + 3712 + static __always_inline bool cid_on_task(unsigned int cid) 3713 + { 3714 + /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */ 3715 + return cid < MM_CID_TRANSIT; 3716 + } 3717 + 3718 + static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid) 3719 + { 3720 + clear_bit(cid, mm_cidmask(mm)); 3721 + } 3722 + 3723 + static __always_inline void mm_unset_cid_on_task(struct task_struct *t) 3724 + { 3725 + unsigned int cid = t->mm_cid.cid; 3726 + 3727 + t->mm_cid.cid = MM_CID_UNSET; 3728 + if (cid_on_task(cid)) 3729 + mm_drop_cid(t->mm, cid); 3730 + } 3731 + 3732 + static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp) 3733 + { 3734 + /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */ 3735 + pcp->cid = cpu_cid_to_cid(pcp->cid); 3736 + mm_drop_cid(mm, pcp->cid); 3737 + } 3738 + 3739 + static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids) 3740 + { 3741 + unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids); 3742 + 3743 + if (cid >= max_cids) 3744 + return MM_CID_UNSET; 3745 + if (test_and_set_bit(cid, mm_cidmask(mm))) 3746 + return MM_CID_UNSET; 3747 + return cid; 3748 
+ } 3749 + 3750 + static inline unsigned int mm_get_cid(struct mm_struct *mm) 3751 + { 3752 + unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids)); 3753 + 3754 + while (cid == MM_CID_UNSET) { 3755 + cpu_relax(); 3756 + cid = __mm_get_cid(mm, num_possible_cpus()); 3757 + } 3758 + return cid; 3759 + } 3760 + 3761 + static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid, 3762 + unsigned int max_cids) 3763 + { 3764 + unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid); 3765 + 3766 + /* Is it in the optimal CID space? */ 3767 + if (likely(cid < max_cids)) 3768 + return orig_cid; 3769 + 3770 + /* Try to find one in the optimal space. Otherwise keep the provided. */ 3771 + new_cid = __mm_get_cid(mm, max_cids); 3772 + if (new_cid != MM_CID_UNSET) { 3773 + mm_drop_cid(mm, cid); 3774 + /* Preserve the ONCPU mode of the original CID */ 3775 + return new_cid | (orig_cid & MM_CID_ONCPU); 3776 + } 3777 + return orig_cid; 3778 + } 3779 + 3780 + static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid) 3781 + { 3782 + if (t->mm_cid.cid != cid) { 3783 + t->mm_cid.cid = cid; 3784 + rseq_sched_set_ids_changed(t); 3785 + } 3786 + } 3787 + 3788 + static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid) 3789 + { 3790 + __this_cpu_write(mm->mm_cid.pcpu->cid, cid); 3791 + } 3792 + 3793 + static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid) 3794 + { 3795 + unsigned int max_cids, tcid = t->mm_cid.cid; 3711 3796 struct mm_struct *mm = t->mm; 3712 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3713 - int cid; 3714 3797 3715 - lockdep_assert_irqs_disabled(); 3716 - cid = __this_cpu_read(pcpu_cid->cid); 3717 - if (!mm_cid_is_lazy_put(cid) || 3718 - !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3719 - return; 3720 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3721 - } 3722 - 3723 - static inline int 
mm_cid_pcpu_unset(struct mm_struct *mm) 3724 - { 3725 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3726 - int cid, res; 3727 - 3728 - lockdep_assert_irqs_disabled(); 3729 - cid = __this_cpu_read(pcpu_cid->cid); 3730 - for (;;) { 3731 - if (mm_cid_is_unset(cid)) 3732 - return MM_CID_UNSET; 3733 - /* 3734 - * Attempt transition from valid or lazy-put to unset. 3735 - */ 3736 - res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET); 3737 - if (res == cid) 3738 - break; 3739 - cid = res; 3740 - } 3741 - return cid; 3742 - } 3743 - 3744 - static inline void mm_cid_put(struct mm_struct *mm) 3745 - { 3746 - int cid; 3747 - 3748 - lockdep_assert_irqs_disabled(); 3749 - cid = mm_cid_pcpu_unset(mm); 3750 - if (cid == MM_CID_UNSET) 3751 - return; 3752 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3753 - } 3754 - 3755 - static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) 3756 - { 3757 - struct cpumask *cidmask = mm_cidmask(mm); 3758 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3759 - int cid, max_nr_cid, allowed_max_nr_cid; 3760 - 3761 - /* 3762 - * After shrinking the number of threads or reducing the number 3763 - * of allowed cpus, reduce the value of max_nr_cid so expansion 3764 - * of cid allocation will preserve cache locality if the number 3765 - * of threads or allowed cpus increase again. 3766 - */ 3767 - max_nr_cid = atomic_read(&mm->max_nr_cid); 3768 - while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), 3769 - atomic_read(&mm->mm_users))), 3770 - max_nr_cid > allowed_max_nr_cid) { 3771 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. 
*/ 3772 - if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { 3773 - max_nr_cid = allowed_max_nr_cid; 3774 - break; 3798 + max_cids = READ_ONCE(mm->mm_cid.max_cids); 3799 + /* Optimize for the common case where both have the ONCPU bit set */ 3800 + if (likely(cid_on_cpu(cpu_cid & tcid))) { 3801 + if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) { 3802 + mm_cid_update_task_cid(t, cpu_cid); 3803 + return; 3775 3804 } 3776 - } 3777 - /* Try to re-use recent cid. This improves cache locality. */ 3778 - cid = __this_cpu_read(pcpu_cid->recent_cid); 3779 - if (!mm_cid_is_unset(cid) && cid < max_nr_cid && 3780 - !cpumask_test_and_set_cpu(cid, cidmask)) 3781 - return cid; 3782 - /* 3783 - * Expand cid allocation if the maximum number of concurrency 3784 - * IDs allocated (max_nr_cid) is below the number cpus allowed 3785 - * and number of threads. Expanding cid allocation as much as 3786 - * possible improves cache locality. 3787 - */ 3788 - cid = max_nr_cid; 3789 - while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { 3790 - /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ 3791 - if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) 3792 - continue; 3793 - if (!cpumask_test_and_set_cpu(cid, cidmask)) 3794 - return cid; 3795 - } 3796 - /* 3797 - * Find the first available concurrency id. 3798 - * Retry finding first zero bit if the mask is temporarily 3799 - * filled. This only happens during concurrent remote-clear 3800 - * which owns a cid without holding a rq lock. 3801 - */ 3802 - for (;;) { 3803 - cid = cpumask_first_zero(cidmask); 3804 - if (cid < READ_ONCE(mm->nr_cpus_allowed)) 3805 - break; 3806 - cpu_relax(); 3807 - } 3808 - if (cpumask_test_and_set_cpu(cid, cidmask)) 3809 - return -1; 3810 - 3811 - return cid; 3812 - } 3813 - 3814 - /* 3815 - * Save a snapshot of the current runqueue time of this cpu 3816 - * with the per-cpu cid value, allowing to estimate how recently it was used. 
3817 - */ 3818 - static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm) 3819 - { 3820 - struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq)); 3821 - 3822 - lockdep_assert_rq_held(rq); 3823 - WRITE_ONCE(pcpu_cid->time, rq->clock); 3824 - } 3825 - 3826 - static inline int __mm_cid_get(struct rq *rq, struct task_struct *t, 3827 - struct mm_struct *mm) 3828 - { 3829 - int cid; 3830 - 3831 - /* 3832 - * All allocations (even those using the cid_lock) are lock-free. If 3833 - * use_cid_lock is set, hold the cid_lock to perform cid allocation to 3834 - * guarantee forward progress. 3835 - */ 3836 - if (!READ_ONCE(use_cid_lock)) { 3837 - cid = __mm_cid_try_get(t, mm); 3838 - if (cid >= 0) 3839 - goto end; 3840 - raw_spin_lock(&cid_lock); 3805 + /* Try to converge into the optimal CID space */ 3806 + cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids); 3841 3807 } else { 3842 - raw_spin_lock(&cid_lock); 3843 - cid = __mm_cid_try_get(t, mm); 3844 - if (cid >= 0) 3845 - goto unlock; 3846 - } 3847 - 3848 - /* 3849 - * cid concurrently allocated. Retry while forcing following 3850 - * allocations to use the cid_lock to ensure forward progress. 3851 - */ 3852 - WRITE_ONCE(use_cid_lock, 1); 3853 - /* 3854 - * Set use_cid_lock before allocation. Only care about program order 3855 - * because this is only required for forward progress. 3856 - */ 3857 - barrier(); 3858 - /* 3859 - * Retry until it succeeds. It is guaranteed to eventually succeed once 3860 - * all newcoming allocations observe the use_cid_lock flag set. 3861 - */ 3862 - do { 3863 - cid = __mm_cid_try_get(t, mm); 3864 - cpu_relax(); 3865 - } while (cid < 0); 3866 - /* 3867 - * Allocate before clearing use_cid_lock. Only care about 3868 - * program order because this is for forward progress. 
3869 - */ 3870 - barrier(); 3871 - WRITE_ONCE(use_cid_lock, 0); 3872 - unlock: 3873 - raw_spin_unlock(&cid_lock); 3874 - end: 3875 - mm_cid_snapshot_time(rq, mm); 3876 - 3877 - return cid; 3878 - } 3879 - 3880 - static inline int mm_cid_get(struct rq *rq, struct task_struct *t, 3881 - struct mm_struct *mm) 3882 - { 3883 - struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; 3884 - int cid; 3885 - 3886 - lockdep_assert_rq_held(rq); 3887 - cid = __this_cpu_read(pcpu_cid->cid); 3888 - if (mm_cid_is_valid(cid)) { 3889 - mm_cid_snapshot_time(rq, mm); 3890 - return cid; 3891 - } 3892 - if (mm_cid_is_lazy_put(cid)) { 3893 - if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET)) 3894 - __mm_cid_put(mm, mm_cid_clear_lazy_put(cid)); 3895 - } 3896 - cid = __mm_cid_get(rq, t, mm); 3897 - __this_cpu_write(pcpu_cid->cid, cid); 3898 - __this_cpu_write(pcpu_cid->recent_cid, cid); 3899 - 3900 - return cid; 3901 - } 3902 - 3903 - static inline void switch_mm_cid(struct rq *rq, 3904 - struct task_struct *prev, 3905 - struct task_struct *next) 3906 - { 3907 - /* 3908 - * Provide a memory barrier between rq->curr store and load of 3909 - * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition. 3910 - * 3911 - * Should be adapted if context_switch() is modified. 3912 - */ 3913 - if (!next->mm) { // to kernel 3914 - /* 3915 - * user -> kernel transition does not guarantee a barrier, but 3916 - * we can use the fact that it performs an atomic operation in 3917 - * mmgrab(). 3918 - */ 3919 - if (prev->mm) // from user 3920 - smp_mb__after_mmgrab(); 3921 - /* 3922 - * kernel -> kernel transition does not change rq->curr->mm 3923 - * state. It stays NULL. 3924 - */ 3925 - } else { // to user 3926 - /* 3927 - * kernel -> user transition does not provide a barrier 3928 - * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu]. 3929 - * Provide it here. 
3930 - */ 3931 - if (!prev->mm) { // from kernel 3932 - smp_mb(); 3933 - } else { // from user 3934 - /* 3935 - * user->user transition relies on an implicit 3936 - * memory barrier in switch_mm() when 3937 - * current->mm changes. If the architecture 3938 - * switch_mm() does not have an implicit memory 3939 - * barrier, it is emitted here. If current->mm 3940 - * is unchanged, no barrier is needed. 3941 - */ 3942 - smp_mb__after_switch_mm(); 3808 + /* Hand over or drop the task owned CID */ 3809 + if (cid_on_task(tcid)) { 3810 + if (cid_on_cpu(cpu_cid)) 3811 + mm_unset_cid_on_task(t); 3812 + else 3813 + cpu_cid = cid_to_cpu_cid(tcid); 3943 3814 } 3815 + /* Still nothing, allocate a new one */ 3816 + if (!cid_on_cpu(cpu_cid)) 3817 + cpu_cid = cid_to_cpu_cid(mm_get_cid(mm)); 3944 3818 } 3945 - if (prev->mm_cid_active) { 3946 - mm_cid_snapshot_time(rq, prev->mm); 3947 - mm_cid_put_lazy(prev); 3948 - prev->mm_cid = -1; 3819 + mm_cid_update_pcpu_cid(mm, cpu_cid); 3820 + mm_cid_update_task_cid(t, cpu_cid); 3821 + } 3822 + 3823 + static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid) 3824 + { 3825 + unsigned int max_cids, tcid = t->mm_cid.cid; 3826 + struct mm_struct *mm = t->mm; 3827 + 3828 + max_cids = READ_ONCE(mm->mm_cid.max_cids); 3829 + /* Optimize for the common case, where both have the ONCPU bit clear */ 3830 + if (likely(cid_on_task(tcid | cpu_cid))) { 3831 + if (likely(tcid < max_cids)) { 3832 + mm_cid_update_pcpu_cid(mm, tcid); 3833 + return; 3834 + } 3835 + /* Try to converge into the optimal CID space */ 3836 + tcid = mm_cid_converge(mm, tcid, max_cids); 3837 + } else { 3838 + /* Hand over or drop the CPU owned CID */ 3839 + if (cid_on_cpu(cpu_cid)) { 3840 + if (cid_on_task(tcid)) 3841 + mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu)); 3842 + else 3843 + tcid = cpu_cid_to_cid(cpu_cid); 3844 + } 3845 + /* Still nothing, allocate a new one */ 3846 + if (!cid_on_task(tcid)) 3847 + tcid = mm_get_cid(mm); 3848 + /* Set 
the transition mode flag if required */ 3849 + tcid |= READ_ONCE(mm->mm_cid.transit); 3949 3850 } 3950 - if (next->mm_cid_active) 3951 - next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3851 + mm_cid_update_pcpu_cid(mm, tcid); 3852 + mm_cid_update_task_cid(t, tcid); 3853 + } 3854 + 3855 + static __always_inline void mm_cid_schedin(struct task_struct *next) 3856 + { 3857 + struct mm_struct *mm = next->mm; 3858 + unsigned int cpu_cid; 3859 + 3860 + if (!next->mm_cid.active) 3861 + return; 3862 + 3863 + cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid); 3864 + if (likely(!READ_ONCE(mm->mm_cid.percpu))) 3865 + mm_cid_from_task(next, cpu_cid); 3866 + else 3867 + mm_cid_from_cpu(next, cpu_cid); 3868 + } 3869 + 3870 + static __always_inline void mm_cid_schedout(struct task_struct *prev) 3871 + { 3872 + /* During mode transitions CIDs are temporary and need to be dropped */ 3873 + if (likely(!cid_in_transit(prev->mm_cid.cid))) 3874 + return; 3875 + 3876 + mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid)); 3877 + prev->mm_cid.cid = MM_CID_UNSET; 3878 + } 3879 + 3880 + static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) 3881 + { 3882 + mm_cid_schedout(prev); 3883 + mm_cid_schedin(next); 3952 3884 } 3953 3885 3954 3886 #else /* !CONFIG_SCHED_MM_CID: */ 3955 - static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { } 3956 - static inline void sched_mm_cid_migrate_from(struct task_struct *t) { } 3957 - static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { } 3958 - static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } 3959 - static inline void init_sched_mm_cid(struct task_struct *t) { } 3887 + static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { } 3960 3888 #endif /* !CONFIG_SCHED_MM_CID */ 3961 3889 3962 3890 extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
-2
kernel/signal.c
··· 3125 3125 cgroup_threadgroup_change_begin(tsk); 3126 3126 3127 3127 if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) { 3128 - sched_mm_cid_exit_signals(tsk); 3129 3128 tsk->flags |= PF_EXITING; 3130 3129 cgroup_threadgroup_change_end(tsk); 3131 3130 return; ··· 3135 3136 * From now this task is not visible for group-wide signals, 3136 3137 * see wants_signal(), do_signal_stop(). 3137 3138 */ 3138 - sched_mm_cid_exit_signals(tsk); 3139 3139 tsk->flags |= PF_EXITING; 3140 3140 3141 3141 cgroup_threadgroup_change_end(tsk);
+6
lib/bitmap.c
··· 355 355 } 356 356 EXPORT_SYMBOL(__bitmap_weight_andnot); 357 357 358 + unsigned int __bitmap_weighted_or(unsigned long *dst, const unsigned long *bitmap1, 359 + const unsigned long *bitmap2, unsigned int bits) 360 + { 361 + return BITMAP_WEIGHT(({dst[idx] = bitmap1[idx] | bitmap2[idx]; dst[idx]; }), bits); 362 + } 363 + 358 364 void __bitmap_set(unsigned long *map, unsigned int start, int len) 359 365 { 360 366 unsigned long *p = map + BIT_WORD(start);
+7
virt/kvm/kvm_main.c
··· 49 49 #include <linux/lockdep.h> 50 50 #include <linux/kthread.h> 51 51 #include <linux/suspend.h> 52 + #include <linux/rseq.h> 52 53 53 54 #include <asm/processor.h> 54 55 #include <asm/ioctl.h> ··· 4476 4475 vcpu->wants_to_run = !READ_ONCE(vcpu->run->immediate_exit__unsafe); 4477 4476 r = kvm_arch_vcpu_ioctl_run(vcpu); 4478 4477 vcpu->wants_to_run = false; 4478 + 4479 + /* 4480 + * FIXME: Remove this hack once all KVM architectures 4481 + * support the generic TIF bits, i.e. a dedicated TIF_RSEQ. 4482 + */ 4483 + rseq_virt_userspace_exit(); 4479 4484 4480 4485 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 4481 4486 break;