Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Implement syscall entry work for time slice extensions

The kernel sets SYSCALL_WORK_SYSCALL_RSEQ_SLICE when it grants a time slice
extension. This allows handling the rseq_slice_yield() syscall, which is
used by user space to relinquish the CPU after finishing the critical
section for which it requested an extension.

In case the kernel state is still GRANTED, the kernel resets both kernel
and user space state with a set of sanity checks. If the kernel state is
already cleared, then the syscall raced against the timer or some other
interrupt, and the work code just clears the work bit.

Doing it in syscall entry work allows catching misbehaving user space,
which issues an arbitrary syscall, i.e. not rseq_slice_yield(), from the
critical section. Contrary to the initial strict requirement to use
rseq_slice_yield(), arbitrary syscalls are no longer considered a violation
of the ABI contract. This allows onion architecture applications, which
cannot control the code inside a critical section, to utilize this as well.

If the code detects inconsistent user space state, that results in a
SIGSEGV for the application.

If the grant was still active and the task was not preempted yet, the work
code reschedules immediately before continuing through the syscall.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://patch.msgid.link/20251215155709.005777059@linutronix.de

authored by

Thomas Gleixner and committed by
Peter Zijlstra
dd0a0460 99d25920

+112 -10
+1 -1
include/linux/entry-common.h
··· 36 36 SYSCALL_WORK_SYSCALL_EMU | \ 37 37 SYSCALL_WORK_SYSCALL_AUDIT | \ 38 38 SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ 39 + SYSCALL_WORK_SYSCALL_RSEQ_SLICE | \ 39 40 ARCH_SYSCALL_WORK_ENTER) 40 - 41 41 #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ 42 42 SYSCALL_WORK_SYSCALL_TRACE | \ 43 43 SYSCALL_WORK_SYSCALL_AUDIT | \
+2
include/linux/rseq.h
··· 164 164 #endif /* !CONFIG_DEBUG_RSEQ */ 165 165 166 166 #ifdef CONFIG_RSEQ_SLICE_EXTENSION 167 + void rseq_syscall_enter_work(long syscall); 167 168 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3); 168 169 #else /* CONFIG_RSEQ_SLICE_EXTENSION */ 170 + static inline void rseq_syscall_enter_work(long syscall) { } 169 171 static inline int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) 170 172 { 171 173 return -ENOTSUPP;
+9 -7
include/linux/thread_info.h
··· 46 46 SYSCALL_WORK_BIT_SYSCALL_AUDIT, 47 47 SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, 48 48 SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, 49 + SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE, 49 50 }; 50 51 51 - #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) 52 - #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) 53 - #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) 54 - #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) 55 - #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) 56 - #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) 57 - #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) 52 + #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) 53 + #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) 54 + #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) 55 + #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) 56 + #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) 57 + #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) 58 + #define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) 59 + #define SYSCALL_WORK_SYSCALL_RSEQ_SLICE BIT(SYSCALL_WORK_BIT_SYSCALL_RSEQ_SLICE) 58 60 #endif 59 61 60 62 #include <asm/thread_info.h>
+9 -2
kernel/entry/syscall-common.c
··· 17 17 } 18 18 } 19 19 20 - long syscall_trace_enter(struct pt_regs *regs, long syscall, 21 - unsigned long work) 20 + long syscall_trace_enter(struct pt_regs *regs, long syscall, unsigned long work) 22 21 { 23 22 long ret = 0; 24 23 ··· 30 31 if (syscall_user_dispatch(regs)) 31 32 return -1L; 32 33 } 34 + 35 + /* 36 + * User space got a time slice extension granted and relinquishes 37 + * the CPU. The work stops the slice timer to avoid an extra round 38 + * through hrtimer_interrupt(). 39 + */ 40 + if (work & SYSCALL_WORK_SYSCALL_RSEQ_SLICE) 41 + rseq_syscall_enter_work(syscall); 33 42 34 43 /* Handle ptrace */ 35 44 if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
+91
kernel/rseq.c
··· 502 502 #ifdef CONFIG_RSEQ_SLICE_EXTENSION 503 503 DEFINE_STATIC_KEY_TRUE(rseq_slice_extension_key); 504 504 505 + static inline void rseq_slice_set_need_resched(struct task_struct *curr) 506 + { 507 + /* 508 + * The interrupt guard is required to prevent inconsistent state in 509 + * this case: 510 + * 511 + * set_tsk_need_resched() 512 + * --> Interrupt 513 + * wakeup() 514 + * set_tsk_need_resched() 515 + * set_preempt_need_resched() 516 + * schedule_on_return() 517 + * clear_tsk_need_resched() 518 + * clear_preempt_need_resched() 519 + * set_preempt_need_resched() <- Inconsistent state 520 + * 521 + * This is safe vs. a remote set of TIF_NEED_RESCHED because that 522 + * only sets the already set bit and does not create inconsistent 523 + * state. 524 + */ 525 + scoped_guard(irq) 526 + set_need_resched_current(); 527 + } 528 + 529 + static void rseq_slice_validate_ctrl(u32 expected) 530 + { 531 + u32 __user *sctrl = &current->rseq.usrptr->slice_ctrl.all; 532 + u32 uval; 533 + 534 + if (get_user(uval, sctrl) || uval != expected) 535 + force_sig(SIGSEGV); 536 + } 537 + 538 + /* 539 + * Invoked from syscall entry if a time slice extension was granted and the 540 + * kernel did not clear it before user space left the critical section. 541 + * 542 + * While the recommended way to relinquish the CPU side effect free is 543 + * rseq_slice_yield(2), any syscall within a granted slice terminates the 544 + * grant and immediately reschedules if required. This supports onion layer 545 + * applications, where the code requesting the grant cannot control the 546 + * code within the critical section. 
547 + */ 548 + void rseq_syscall_enter_work(long syscall) 549 + { 550 + struct task_struct *curr = current; 551 + struct rseq_slice_ctrl ctrl = { .granted = curr->rseq.slice.state.granted }; 552 + 553 + clear_task_syscall_work(curr, SYSCALL_RSEQ_SLICE); 554 + 555 + if (static_branch_unlikely(&rseq_debug_enabled)) 556 + rseq_slice_validate_ctrl(ctrl.all); 557 + 558 + /* 559 + * The kernel might have raced, revoked the grant and updated 560 + * userspace, but kept the SLICE work set. 561 + */ 562 + if (!ctrl.granted) 563 + return; 564 + 565 + /* 566 + * Required to make set_tsk_need_resched() correct on PREEMPT[RT] 567 + * kernels. Leaving the scope will reschedule on preemption models 568 + * FULL, LAZY and RT if necessary. 569 + */ 570 + scoped_guard(preempt) { 571 + /* 572 + * Now that preemption is disabled, quickly check whether 573 + * the task was already rescheduled before arriving here. 574 + */ 575 + if (!curr->rseq.event.sched_switch) { 576 + rseq_slice_set_need_resched(curr); 577 + 578 + if (syscall == __NR_rseq_slice_yield) { 579 + rseq_stat_inc(rseq_stats.s_yielded); 580 + /* Update the yielded state for syscall return */ 581 + curr->rseq.slice.yielded = 1; 582 + } else { 583 + rseq_stat_inc(rseq_stats.s_aborted); 584 + } 585 + } 586 + } 587 + /* Reschedule on NONE/VOLUNTARY preemption models */ 588 + cond_resched(); 589 + 590 + /* Clear the grant in kernel state and user space */ 591 + curr->rseq.slice.state.granted = false; 592 + if (put_user(0U, &curr->rseq.usrptr->slice_ctrl.all)) 593 + force_sig(SIGSEGV); 594 + } 595 + 505 596 int rseq_slice_extension_prctl(unsigned long arg2, unsigned long arg3) 506 597 { 507 598 switch (arg2) {