Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Provide and use rseq_set_ids()

Provide a new and straightforward implementation to set the IDs (CPU ID,
Node ID and MM CID), which can later be inlined into the fast path.

It does all operations in one scoped_user_rw_access() section and also
retrieves the critical section member (rseq::rseq_cs) from user space to avoid
another user_*_access_begin()/user_*_access_end() pair. This is in preparation
for optimizing the fast path to avoid extra work when not required.

On rseq registration set the CPU ID fields to RSEQ_CPU_ID_UNINITIALIZED and
node and MM CID to zero. That's the same as the kernel internal reset
values. That makes the debug validation in the exit code work correctly on
the first exit to user space.

Use it to replace the whole related zoo in rseq.c.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.393972266@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
0f085b41 eaa9088d

+151 -202
+1 -1
fs/binfmt_elf.c
··· 46 46 #include <linux/cred.h> 47 47 #include <linux/dax.h> 48 48 #include <linux/uaccess.h> 49 - #include <linux/rseq.h> 49 + #include <uapi/linux/rseq.h> 50 50 #include <asm/param.h> 51 51 #include <asm/page.h> 52 52
+11 -5
include/linux/rseq.h
··· 5 5 #ifdef CONFIG_RSEQ 6 6 #include <linux/sched.h> 7 7 8 + #include <uapi/linux/rseq.h> 9 + 8 10 void __rseq_handle_notify_resume(struct ksignal *sig, struct pt_regs *regs); 9 11 10 12 static inline void rseq_handle_notify_resume(struct pt_regs *regs) ··· 50 48 static inline void rseq_reset(struct task_struct *t) 51 49 { 52 50 memset(&t->rseq, 0, sizeof(t->rseq)); 53 - t->rseq.ids.cpu_cid = ~0ULL; 51 + t->rseq.ids.cpu_id = RSEQ_CPU_ID_UNINITIALIZED; 54 52 } 55 53 56 54 static inline void rseq_execve(struct task_struct *t) ··· 61 59 /* 62 60 * If parent process has a registered restartable sequences area, the 63 61 * child inherits. Unregister rseq for a clone with CLONE_VM set. 62 + * 63 + * On fork, keep the IDs (CPU, MMCID) of the parent, which avoids a fault 64 + * on the COW page on exit to user space, when the child stays on the same 65 + * CPU as the parent. That's obviously not guaranteed, but in overcommit 66 + * scenarios it is more likely and optimizes for the fork/exec case without 67 + * taking the fault. 64 68 */ 65 69 static inline void rseq_fork(struct task_struct *t, u64 clone_flags) 66 70 { 67 - if (clone_flags & CLONE_VM) { 71 + if (clone_flags & CLONE_VM) 68 72 rseq_reset(t); 69 - } else { 73 + else 70 74 t->rseq = current->rseq; 71 - t->rseq.ids.cpu_cid = ~0ULL; 72 - } 73 75 } 74 76 75 77 #else /* CONFIG_RSEQ */
+89
include/linux/rseq_entry.h
··· 75 75 #endif 76 76 77 77 bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); 78 + bool rseq_debug_validate_ids(struct task_struct *t); 78 79 79 80 static __always_inline void rseq_note_user_irq_entry(void) 80 81 { ··· 195 194 return false; 196 195 } 197 196 197 + /* 198 + * On debug kernels validate that user space did not mess with it if the 199 + * debug branch is enabled. 200 + */ 201 + bool rseq_debug_validate_ids(struct task_struct *t) 202 + { 203 + struct rseq __user *rseq = t->rseq.usrptr; 204 + u32 cpu_id, uval, node_id; 205 + 206 + /* 207 + * On the first exit after registering the rseq region CPU ID is 208 + * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0! 209 + */ 210 + node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ? 211 + cpu_to_node(t->rseq.ids.cpu_id) : 0; 212 + 213 + scoped_user_read_access(rseq, efault) { 214 + unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault); 215 + if (cpu_id != t->rseq.ids.cpu_id) 216 + goto die; 217 + unsafe_get_user(uval, &rseq->cpu_id, efault); 218 + if (uval != cpu_id) 219 + goto die; 220 + unsafe_get_user(uval, &rseq->node_id, efault); 221 + if (uval != node_id) 222 + goto die; 223 + unsafe_get_user(uval, &rseq->mm_cid, efault); 224 + if (uval != t->rseq.ids.mm_cid) 225 + goto die; 226 + } 227 + return true; 228 + die: 229 + t->rseq.event.fatal = true; 230 + efault: 231 + return false; 232 + } 233 + 198 234 #endif /* RSEQ_BUILD_SLOW_PATH */ 199 235 200 236 /* ··· 313 275 return true; 314 276 die: 315 277 t->rseq.event.fatal = true; 278 + efault: 279 + return false; 280 + } 281 + 282 + /* 283 + * Updates CPU ID, Node ID and MM CID and reads the critical section 284 + * address, when @csaddr != NULL. This allows to put the ID update and the 285 + * read under the same uaccess region to spare a separate begin/end. 
286 + * 287 + * As this is either invoked from a C wrapper with @csaddr = NULL or from 288 + * the fast path code with a valid pointer, a clever compiler should be 289 + * able to optimize the read out. Spares a duplicate implementation. 290 + * 291 + * Returns true, if the operation was successful, false otherwise. 292 + * 293 + * In the failure case task::rseq_event::fatal is set when invalid data 294 + * was found on debug kernels. It's clear when the failure was an unresolved page 295 + * fault. 296 + * 297 + * If inlined into the exit to user path with interrupts disabled, the 298 + * caller has to protect against page faults with pagefault_disable(). 299 + * 300 + * In preemptible task context this would be counterproductive as the page 301 + * faults could not be fully resolved. As a consequence unresolved page 302 + * faults in task context are fatal too. 303 + */ 304 + static rseq_inline 305 + bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids, 306 + u32 node_id, u64 *csaddr) 307 + { 308 + struct rseq __user *rseq = t->rseq.usrptr; 309 + 310 + if (static_branch_unlikely(&rseq_debug_enabled)) { 311 + if (!rseq_debug_validate_ids(t)) 312 + return false; 313 + } 314 + 315 + scoped_user_rw_access(rseq, efault) { 316 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault); 317 + unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault); 318 + unsafe_put_user(node_id, &rseq->node_id, efault); 319 + unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault); 320 + if (csaddr) 321 + unsafe_get_user(*csaddr, &rseq->rseq_cs, efault); 322 + } 323 + 324 + /* Cache the new values */ 325 + t->rseq.ids.cpu_cid = ids->cpu_cid; 326 + rseq_stat_inc(rseq_stats.ids); 327 + rseq_trace_update(t, ids); 328 + return true; 316 329 efault: 317 330 return false; 318 331 }
-10
include/linux/sched.h
··· 42 42 #include <linux/posix-timers_types.h> 43 43 #include <linux/restart_block.h> 44 44 #include <linux/rseq_types.h> 45 - #include <uapi/linux/rseq.h> 46 45 #include <linux/seqlock_types.h> 47 46 #include <linux/kcsan.h> 48 47 #include <linux/rv.h> ··· 1407 1408 #endif /* CONFIG_NUMA_BALANCING */ 1408 1409 1409 1410 struct rseq_data rseq; 1410 - #ifdef CONFIG_DEBUG_RSEQ 1411 - /* 1412 - * This is a place holder to save a copy of the rseq fields for 1413 - * validation of read-only fields. The struct rseq has a 1414 - * variable-length array at the end, so it cannot be used 1415 - * directly. Reserve a size large enough for the known fields. 1416 - */ 1417 - char rseq_fields[sizeof(struct rseq)]; 1418 - #endif 1419 1411 1420 1412 #ifdef CONFIG_SCHED_MM_CID 1421 1413 int mm_cid; /* Current cid in mm */
+50 -186
kernel/rseq.c
··· 88 88 # define RSEQ_EVENT_GUARD preempt 89 89 #endif 90 90 91 - /* The original rseq structure size (including padding) is 32 bytes. */ 92 - #define ORIG_RSEQ_SIZE 32 93 - 94 - #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ 95 - RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ 96 - RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) 97 - 98 91 DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled); 99 92 100 93 static inline void rseq_control_debug(bool on) ··· 220 227 __initcall(rseq_debugfs_init); 221 228 #endif /* CONFIG_DEBUG_FS */ 222 229 223 - #ifdef CONFIG_DEBUG_RSEQ 224 - static struct rseq *rseq_kernel_fields(struct task_struct *t) 230 + static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id) 225 231 { 226 - return (struct rseq *) t->rseq_fields; 227 - } 228 - 229 - static int rseq_validate_ro_fields(struct task_struct *t) 230 - { 231 - static DEFINE_RATELIMIT_STATE(_rs, 232 - DEFAULT_RATELIMIT_INTERVAL, 233 - DEFAULT_RATELIMIT_BURST); 234 - u32 cpu_id_start, cpu_id, node_id, mm_cid; 235 - struct rseq __user *rseq = t->rseq.usrptr; 236 - 237 - /* 238 - * Validate fields which are required to be read-only by 239 - * user-space. 
240 - */ 241 - if (!user_read_access_begin(rseq, t->rseq.len)) 242 - goto efault; 243 - unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end); 244 - unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end); 245 - unsafe_get_user(node_id, &rseq->node_id, efault_end); 246 - unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end); 247 - user_read_access_end(); 248 - 249 - if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start || 250 - cpu_id != rseq_kernel_fields(t)->cpu_id || 251 - node_id != rseq_kernel_fields(t)->node_id || 252 - mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) { 253 - 254 - pr_warn("Detected rseq corruption for pid: %d, name: %s\n" 255 - "\tcpu_id_start: %u ?= %u\n" 256 - "\tcpu_id: %u ?= %u\n" 257 - "\tnode_id: %u ?= %u\n" 258 - "\tmm_cid: %u ?= %u\n", 259 - t->pid, t->comm, 260 - cpu_id_start, rseq_kernel_fields(t)->cpu_id_start, 261 - cpu_id, rseq_kernel_fields(t)->cpu_id, 262 - node_id, rseq_kernel_fields(t)->node_id, 263 - mm_cid, rseq_kernel_fields(t)->mm_cid); 264 - } 265 - 266 - /* For now, only print a console warning on mismatch. */ 267 - return 0; 268 - 269 - efault_end: 270 - user_read_access_end(); 271 - efault: 272 - return -EFAULT; 273 - } 274 - 275 - /* 276 - * Update an rseq field and its in-kernel copy in lock-step to keep a coherent 277 - * state. 
278 - */ 279 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 280 - do { \ 281 - unsafe_put_user(value, &t->rseq.usrptr->field, error_label); \ 282 - rseq_kernel_fields(t)->field = value; \ 283 - } while (0) 284 - 285 - #else 286 - static int rseq_validate_ro_fields(struct task_struct *t) 287 - { 288 - return 0; 289 - } 290 - 291 - #define rseq_unsafe_put_user(t, value, field, error_label) \ 292 - unsafe_put_user(value, &t->rseq.usrptr->field, error_label) 293 - #endif 294 - 295 - static int rseq_update_cpu_node_id(struct task_struct *t) 296 - { 297 - struct rseq __user *rseq = t->rseq.usrptr; 298 - u32 cpu_id = raw_smp_processor_id(); 299 - u32 node_id = cpu_to_node(cpu_id); 300 - u32 mm_cid = task_mm_cid(t); 301 - 302 - rseq_stat_inc(rseq_stats.ids); 303 - 304 - /* Validate read-only rseq fields on debug kernels */ 305 - if (rseq_validate_ro_fields(t)) 306 - goto efault; 307 - WARN_ON_ONCE((int) mm_cid < 0); 308 - 309 - if (!user_write_access_begin(rseq, t->rseq.len)) 310 - goto efault; 311 - 312 - rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end); 313 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 314 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 315 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 316 - 317 - /* Cache the user space values */ 318 - t->rseq.ids.cpu_id = cpu_id; 319 - t->rseq.ids.mm_cid = mm_cid; 320 - 321 - /* 322 - * Additional feature fields added after ORIG_RSEQ_SIZE 323 - * need to be conditionally updated only if 324 - * t->rseq_len != ORIG_RSEQ_SIZE. 
325 - */ 326 - user_write_access_end(); 327 - trace_rseq_update(t); 328 - return 0; 329 - 330 - efault_end: 331 - user_write_access_end(); 332 - efault: 333 - return -EFAULT; 334 - } 335 - 336 - static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) 337 - { 338 - struct rseq __user *rseq = t->rseq.usrptr; 339 - u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, 340 - mm_cid = 0; 341 - 342 - /* 343 - * Validate read-only rseq fields. 344 - */ 345 - if (rseq_validate_ro_fields(t)) 346 - goto efault; 347 - 348 - if (!user_write_access_begin(rseq, t->rseq.len)) 349 - goto efault; 350 - 351 - /* 352 - * Reset all fields to their initial state. 353 - * 354 - * All fields have an initial state of 0 except cpu_id which is set to 355 - * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after 356 - * unregistration can figure out that rseq needs to be registered 357 - * again. 358 - */ 359 - rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end); 360 - rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end); 361 - rseq_unsafe_put_user(t, node_id, node_id, efault_end); 362 - rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end); 363 - 364 - /* 365 - * Additional feature fields added after ORIG_RSEQ_SIZE 366 - * need to be conditionally reset only if 367 - * t->rseq_len != ORIG_RSEQ_SIZE. 
368 - */ 369 - user_write_access_end(); 370 - return 0; 371 - 372 - efault_end: 373 - user_write_access_end(); 374 - efault: 375 - return -EFAULT; 232 + return rseq_set_ids_get_csaddr(t, ids, node_id, NULL); 376 233 } 377 234 378 235 static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs) ··· 253 410 void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) 254 411 { 255 412 struct task_struct *t = current; 413 + struct rseq_ids ids; 414 + u32 node_id; 256 415 bool event; 257 416 int sig; 258 417 ··· 301 456 scoped_guard(RSEQ_EVENT_GUARD) { 302 457 event = t->rseq.event.sched_switch; 303 458 t->rseq.event.sched_switch = false; 459 + ids.cpu_id = task_cpu(t); 460 + ids.mm_cid = task_mm_cid(t); 304 461 } 305 462 306 463 if (!IS_ENABLED(CONFIG_DEBUG_RSEQ) && !event) ··· 311 464 if (!rseq_handle_cs(t, regs)) 312 465 goto error; 313 466 314 - if (unlikely(rseq_update_cpu_node_id(t))) 467 + node_id = cpu_to_node(ids.cpu_id); 468 + if (!rseq_set_ids(t, &ids, node_id)) 315 469 goto error; 316 470 return; 317 471 ··· 352 504 } 353 505 #endif 354 506 507 + static bool rseq_reset_ids(void) 508 + { 509 + struct rseq_ids ids = { 510 + .cpu_id = RSEQ_CPU_ID_UNINITIALIZED, 511 + .mm_cid = 0, 512 + }; 513 + 514 + /* 515 + * If this fails, terminate it because this leaves the kernel in 516 + * stupid state as exit to user space will try to fixup the ids 517 + * again. 518 + */ 519 + if (rseq_set_ids(current, &ids, 0)) 520 + return true; 521 + 522 + force_sig(SIGSEGV); 523 + return false; 524 + } 525 + 526 + /* The original rseq structure size (including padding) is 32 bytes. */ 527 + #define ORIG_RSEQ_SIZE 32 528 + 355 529 /* 356 530 * sys_rseq - setup restartable sequences for caller thread. 
357 531 */ 358 532 SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) 359 533 { 360 - int ret; 361 - 362 534 if (flags & RSEQ_FLAG_UNREGISTER) { 363 535 if (flags & ~RSEQ_FLAG_UNREGISTER) 364 536 return -EINVAL; ··· 389 521 return -EINVAL; 390 522 if (current->rseq.sig != sig) 391 523 return -EPERM; 392 - ret = rseq_reset_rseq_cpu_node_id(current); 393 - if (ret) 394 - return ret; 524 + if (!rseq_reset_ids()) 525 + return -EFAULT; 395 526 rseq_reset(current); 396 527 return 0; 397 528 } ··· 430 563 if (!access_ok(rseq, rseq_len)) 431 564 return -EFAULT; 432 565 433 - /* 434 - * If the rseq_cs pointer is non-NULL on registration, clear it to 435 - * avoid a potential segfault on return to user-space. The proper thing 436 - * to do would have been to fail the registration but this would break 437 - * older libcs that reuse the rseq area for new threads without 438 - * clearing the fields. Don't bother reading it, just reset it. 439 - */ 440 - if (put_user(0UL, &rseq->rseq_cs)) 441 - return -EFAULT; 566 + scoped_user_write_access(rseq, efault) { 567 + /* 568 + * If the rseq_cs pointer is non-NULL on registration, clear it to 569 + * avoid a potential segfault on return to user-space. The proper thing 570 + * to do would have been to fail the registration but this would break 571 + * older libcs that reuse the rseq area for new threads without 572 + * clearing the fields. Don't bother reading it, just reset it. 573 + */ 574 + unsafe_put_user(0UL, &rseq->rseq_cs, efault); 575 + /* Initialize IDs in user space */ 576 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault); 577 + unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault); 578 + unsafe_put_user(0U, &rseq->node_id, efault); 579 + unsafe_put_user(0U, &rseq->mm_cid, efault); 580 + } 442 581 443 - #ifdef CONFIG_DEBUG_RSEQ 444 - /* 445 - * Initialize the in-kernel rseq fields copy for validation of 446 - * read-only fields. 
447 - */ 448 - if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) || 449 - get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) || 450 - get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) || 451 - get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid)) 452 - return -EFAULT; 453 - #endif 454 582 /* 455 583 * Activate the registration by setting the rseq area address, length 456 584 * and signature in the task struct. ··· 461 599 */ 462 600 current->rseq.event.has_rseq = true; 463 601 rseq_sched_switch_event(current); 464 - 465 602 return 0; 603 + 604 + efault: 605 + return -EFAULT; 466 606 }