Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Avoid pointless evaluation in __rseq_notify_resume()

The RSEQ critical section mechanism only clears the event mask when a
critical section is registered. Otherwise the mask goes stale and keeps
collecting bits.

That means once a critical section is installed the first invocation of
that code when TIF_NOTIFY_RESUME is set will abort the critical section,
even when the TIF bit was not raised by the rseq preempt/migrate/signal
helpers.

This also has a performance implication because TIF_NOTIFY_RESUME is a
multiplexing TIF bit, which is utilized by quite a lot of infrastructure. That
means every invocation of __rseq_notify_resume() goes unconditionally
through the heavy lifting of user space access and consistency checks even
if there is no reason to do so.

Keeping the stale event mask around when exiting to user space also
prevents it from being utilized by the upcoming time slice extension
mechanism.

Avoid this by reading and clearing the event mask, with interrupts or
preemption disabled, before doing the user space critical section access.
Disabling interrupts or preemption ensures that the read and clear
operation is CPU local atomic versus scheduling and the membarrier IPI.

This is correct as after re-enabling interrupts/preemption any relevant
event will set the bit again and raise TIF_NOTIFY_RESUME, which makes the
user space exit code take another round of TIF bit clearing.

If the event mask was non-zero, invoke the slow path. On debug kernels the
slow path is invoked unconditionally and the result of the event mask
evaluation is handed in.

Add an exit path check after the TIF bit loop, which validates on debug
kernels that the event mask is zero before exiting to user space.

While at it reword the convoluted comment explaining why the pt_regs pointer can be
NULL under certain circumstances.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084306.022571576@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
3ca59da7 3ce17e69

+58 -25
+5 -2
include/linux/irq-entry-common.h
··· 2 2 #ifndef __LINUX_IRQENTRYCOMMON_H 3 3 #define __LINUX_IRQENTRYCOMMON_H 4 4 5 + #include <linux/context_tracking.h> 6 + #include <linux/kmsan.h> 7 + #include <linux/rseq.h> 5 8 #include <linux/static_call_types.h> 6 9 #include <linux/syscalls.h> 7 - #include <linux/context_tracking.h> 8 10 #include <linux/tick.h> 9 - #include <linux/kmsan.h> 10 11 #include <linux/unwind_deferred.h> 11 12 12 13 #include <asm/entry-common.h> ··· 226 225 ti_work = exit_to_user_mode_loop(regs, ti_work); 227 226 228 227 arch_exit_to_user_mode_prepare(regs, ti_work); 228 + 229 + rseq_exit_to_user_mode(); 229 230 230 231 /* Ensure that kernel state is sane for a return to userspace */ 231 232 kmap_assert_nomap();
+9 -1
include/linux/rseq.h
··· 66 66 rseq_set_notify_resume(t); 67 67 } 68 68 69 + static __always_inline void rseq_exit_to_user_mode(void) 70 + { 71 + if (IS_ENABLED(CONFIG_DEBUG_RSEQ)) { 72 + if (WARN_ON_ONCE(current->rseq && current->rseq_event_mask)) 73 + current->rseq_event_mask = 0; 74 + } 75 + } 76 + 69 77 /* 70 78 * If parent process has a registered restartable sequences area, the 71 79 * child inherits. Unregister rseq for a clone with CLONE_VM set. ··· 126 118 static inline void rseq_execve(struct task_struct *t) 127 119 { 128 120 } 129 - 121 + static inline void rseq_exit_to_user_mode(void) { } 130 122 #endif 131 123 132 124 #ifdef CONFIG_DEBUG_RSEQ
+44 -22
kernel/rseq.c
··· 324 324 return true; 325 325 } 326 326 327 - static int rseq_need_restart(struct task_struct *t, u32 cs_flags) 327 + static int rseq_check_flags(struct task_struct *t, u32 cs_flags) 328 328 { 329 - u32 flags, event_mask; 329 + u32 flags; 330 330 int ret; 331 331 332 332 if (rseq_warn_flags("rseq_cs", cs_flags)) ··· 339 339 340 340 if (rseq_warn_flags("rseq", flags)) 341 341 return -EINVAL; 342 - 343 - /* 344 - * Load and clear event mask atomically with respect to 345 - * scheduler preemption and membarrier IPIs. 346 - */ 347 - scoped_guard(RSEQ_EVENT_GUARD) { 348 - event_mask = t->rseq_event_mask; 349 - t->rseq_event_mask = 0; 350 - } 351 - 352 - return !!event_mask; 342 + return 0; 353 343 } 354 344 355 345 static int clear_rseq_cs(struct rseq __user *rseq) ··· 370 380 return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; 371 381 } 372 382 373 - static int rseq_ip_fixup(struct pt_regs *regs) 383 + static int rseq_ip_fixup(struct pt_regs *regs, bool abort) 374 384 { 375 385 unsigned long ip = instruction_pointer(regs); 376 386 struct task_struct *t = current; ··· 388 398 */ 389 399 if (!in_rseq_cs(ip, &rseq_cs)) 390 400 return clear_rseq_cs(t->rseq); 391 - ret = rseq_need_restart(t, rseq_cs.flags); 392 - if (ret <= 0) 401 + ret = rseq_check_flags(t, rseq_cs.flags); 402 + if (ret < 0) 393 403 return ret; 404 + if (!abort) 405 + return 0; 394 406 ret = clear_rseq_cs(t->rseq); 395 407 if (ret) 396 408 return ret; ··· 422 430 return; 423 431 424 432 /* 425 - * regs is NULL if and only if the caller is in a syscall path. Skip 426 - * fixup and leave rseq_cs as is so that rseq_sycall() will detect and 427 - * kill a misbehaving userspace on debug kernels. 433 + * If invoked from hypervisors or IO-URING, then @regs is a NULL 434 + * pointer, so fixup cannot be done. 
If the syscall which led to 435 + * this invocation was invoked inside a critical section, then it 436 + * will either end up in this code again or a possible violation of 437 + * a syscall inside a critical region can only be detected by the 438 + * debug code in rseq_syscall() in a debug enabled kernel. 428 439 */ 429 440 if (regs) { 430 - ret = rseq_ip_fixup(regs); 431 - if (unlikely(ret < 0)) 432 - goto error; 441 + /* 442 + * Read and clear the event mask first. If the task was not 443 + * preempted or migrated or a signal is on the way, there 444 + * is no point in doing any of the heavy lifting here on 445 + * production kernels. In that case TIF_NOTIFY_RESUME was 446 + * raised by some other functionality. 447 + * 448 + * This is correct because the read/clear operation is 449 + * guarded against scheduler preemption, which makes it CPU 450 + * local atomic. If the task is preempted right after 451 + * re-enabling preemption then TIF_NOTIFY_RESUME is set 452 + * again and this function is invoked another time _before_ 453 + * the task is able to return to user mode. 454 + * 455 + * On a debug kernel, invoke the fixup code unconditionally 456 + * with the result handed in to allow the detection of 457 + * inconsistencies. 458 + */ 459 + u32 event_mask; 460 + 461 + scoped_guard(RSEQ_EVENT_GUARD) { 462 + event_mask = t->rseq_event_mask; 463 + t->rseq_event_mask = 0; 464 + } 465 + 466 + if (IS_ENABLED(CONFIG_DEBUG_RSEQ) || event_mask) { 467 + ret = rseq_ip_fixup(regs, !!event_mask); 468 + if (unlikely(ret < 0)) 469 + goto error; 470 + } 433 471 } 434 472 if (unlikely(rseq_update_cpu_node_id(t))) 435 473 goto error;