Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Rework the TIF_NOTIFY handler

Replace the whole logic with a new implementation, which is shared with
signal delivery and the upcoming exit fast path.

Contrary to the original implementation, this ignores invocations from
KVM/IO-uring, which invoke resume_user_mode_work() with the @regs argument
set to NULL.

The original implementation updated the CPU/Node/MM CID fields, but that
was just a side effect, which was addressing the problem that this
invocation cleared TIF_NOTIFY_RESUME, which in turn could cause an update
on return to user space to be lost.

This problem has been addressed differently, so that it's no longer
required to do that update before entering the guest.

That might be considered a user-visible change when the host's thread TLS
memory is mapped into the guest, but as this was never intentionally
supported, this abuse of kernel internal implementation details is not
considered an ABI break.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.517640811@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
e2d4f422 9f6ffd4c

+62 -43
+29
include/linux/rseq_entry.h
··· 368 368 return false; 369 369 } 370 370 371 + /* 372 + * Update user space with new IDs and conditionally check whether the task 373 + * is in a critical section. 374 + */ 375 + static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs, 376 + struct rseq_ids *ids, u32 node_id) 377 + { 378 + u64 csaddr; 379 + 380 + if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr)) 381 + return false; 382 + 383 + /* 384 + * On architectures which utilize the generic entry code this 385 + * allows to skip the critical section when the entry was not from 386 + * a user space interrupt, unless debug mode is enabled. 387 + */ 388 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 389 + if (!static_branch_unlikely(&rseq_debug_enabled)) { 390 + if (likely(!t->rseq.event.user_irq)) 391 + return true; 392 + } 393 + } 394 + if (likely(!csaddr)) 395 + return true; 396 + /* Sigh, this really needs to do work */ 397 + return rseq_update_user_cs(t, regs, csaddr); 398 + } 399 + 371 400 static __always_inline void rseq_exit_to_user_mode(void) 372 401 { 373 402 struct rseq_event *ev = &current->rseq.event;
+33 -43
kernel/rseq.c
··· 82 82 #define CREATE_TRACE_POINTS 83 83 #include <trace/events/rseq.h> 84 84 85 - #ifdef CONFIG_MEMBARRIER 86 - # define RSEQ_EVENT_GUARD irq 87 - #else 88 - # define RSEQ_EVENT_GUARD preempt 89 - #endif 90 - 91 85 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 92 86 93 87 static inline void rseq_control_debug(bool on) ··· 233 239 return false; 234 240 } 235 241 236 - /* 237 - * This resume handler must always be executed between any of: 238 - * - preemption, 239 - * - signal delivery, 240 - * and return to user-space. 241 - * 242 - * This is how we can ensure that the entire rseq critical section 243 - * will issue the commit instruction only if executed atomically with 244 - * respect to other threads scheduled on the same CPU, and with respect 245 - * to signal handlers. 246 - */ 247 - void __rseq_handle_notify_resume(struct pt_regs *regs) 242 + static void rseq_slowpath_update_usr(struct pt_regs *regs) 248 243 { 244 + /* Preserve rseq state and user_irq state for exit to user */ 245 + const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, }; 249 246 struct task_struct *t = current; 250 247 struct rseq_ids ids; 251 248 u32 node_id; 252 249 bool event; 253 - 254 - /* 255 - * If invoked from hypervisors before entering the guest via 256 - * resume_user_mode_work(), then @regs is a NULL pointer. 257 - * 258 - * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises 259 - * it before returning from the ioctl() to user space when 260 - * rseq_event.sched_switch is set. 261 - * 262 - * So it's safe to ignore here instead of pointlessly updating it 263 - * in the vcpu_run() loop. 264 - */ 265 - if (!regs) 266 - return; 267 250 268 251 if (unlikely(t->flags & PF_EXITING)) 269 252 return; ··· 265 294 * with the result handed in to allow the detection of 266 295 * inconsistencies. 
267 296 */ 268 - scoped_guard(RSEQ_EVENT_GUARD) { 297 + scoped_guard(irq) { 269 298 event = t->rseq.event.sched_switch; 270 - t->rseq.event.sched_switch = false; 299 + t->rseq.event.all &= evt_mask.all; 271 300 ids.cpu_id = task_cpu(t); 272 301 ids.mm_cid = task_mm_cid(t); 273 302 } 274 303 275 - if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) 304 + if (!event) 276 305 return; 277 306 278 - if (!rseq_handle_cs(t, regs)) 279 - goto error; 280 - 281 307 node_id = cpu_to_node(ids.cpu_id); 282 - if (!rseq_set_ids(t, &ids, node_id)) 283 - goto error; 284 - return; 285 308 286 - error: 287 - force_sig(SIGSEGV); 309 + if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) { 310 + /* 311 + * Clear the errors just in case this might survive magically, but 312 + * leave the rest intact. 313 + */ 314 + t->rseq.event.error = 0; 315 + force_sig(SIGSEGV); 316 + } 317 + } 318 + 319 + void __rseq_handle_notify_resume(struct pt_regs *regs) 320 + { 321 + /* 322 + * If invoked from hypervisors before entering the guest via 323 + * resume_user_mode_work(), then @regs is a NULL pointer. 324 + * 325 + * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises 326 + * it before returning from the ioctl() to user space when 327 + * rseq_event.sched_switch is set. 328 + * 329 + * So it's safe to ignore here instead of pointlessly updating it 330 + * in the vcpu_run() loop. 331 + */ 332 + if (!regs) 333 + return; 334 + 335 + rseq_slowpath_update_usr(regs); 288 336 } 289 337 290 338 void __rseq_signal_deliver(int sig, struct pt_regs *regs)