Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rseq: Optimize event setting

After removing the various condition bits earlier it turns out that one
extra piece of information is needed to avoid setting event::sched_switch
and TIF_NOTIFY_RESUME unconditionally on every context switch.

The update of the RSEQ user space memory is only required when either

the task was interrupted in user space and schedules

or

the CPU or MM CID changes in schedule() independent of the entry mode

Right now only the interrupt from user information is available.

Add an event flag, which is set when the CPU or MM CID or both change.

Evaluate this event in the scheduler to decide whether the sched_switch
event and the TIF bit need to be set.

It's an extra conditional in context_switch(), but the downside of
unconditionally handling RSEQ after a context switch to user is way more
significant. The utilized boolean logic minimizes this to a single
conditional branch.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Link: https://patch.msgid.link/20251027084307.578058898@linutronix.de

authored by

Thomas Gleixner and committed by
Ingo Molnar
39a16756 e2d4f422

+95 -13
+1 -1
fs/exec.c
··· 1775 1775 force_fatal_sig(SIGSEGV); 1776 1776 1777 1777 sched_mm_cid_after_execve(current); 1778 - rseq_sched_switch_event(current); 1778 + rseq_force_update(); 1779 1779 current->in_execve = 0; 1780 1780 1781 1781 return retval;
+74 -7
include/linux/rseq.h
··· 11 11 12 12 static inline void rseq_handle_notify_resume(struct pt_regs *regs) 13 13 { 14 - if (current->rseq.event.has_rseq) 14 + /* '&' is intentional to spare one conditional branch */ 15 + if (current->rseq.event.sched_switch & current->rseq.event.has_rseq) 15 16 __rseq_handle_notify_resume(regs); 16 17 } 17 18 ··· 34 33 } 35 34 } 36 35 37 - /* Raised from context switch and exevce to force evaluation on exit to user */ 38 - static inline void rseq_sched_switch_event(struct task_struct *t) 36 + static inline void rseq_raise_notify_resume(struct task_struct *t) 39 37 { 40 - if (t->rseq.event.has_rseq) { 41 - t->rseq.event.sched_switch = true; 42 - set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 38 + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); 39 + } 40 + 41 + /* Invoked from context switch to force evaluation on exit to user */ 42 + static __always_inline void rseq_sched_switch_event(struct task_struct *t) 43 + { 44 + struct rseq_event *ev = &t->rseq.event; 45 + 46 + if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) { 47 + /* 48 + * Avoid a boat load of conditionals by using simple logic 49 + * to determine whether NOTIFY_RESUME needs to be raised. 50 + * 51 + * It's required when the CPU or MM CID has changed or 52 + * the entry was from user space. 53 + */ 54 + bool raise = (ev->user_irq | ev->ids_changed) & ev->has_rseq; 55 + 56 + if (raise) { 57 + ev->sched_switch = true; 58 + rseq_raise_notify_resume(t); 59 + } 60 + } else { 61 + if (ev->has_rseq) { 62 + t->rseq.event.sched_switch = true; 63 + rseq_raise_notify_resume(t); 64 + } 65 + } 66 + } 67 + 68 + /* 69 + * Invoked from __set_task_cpu() when a task migrates to enforce an IDs 70 + * update. 71 + * 72 + * This does not raise TIF_NOTIFY_RESUME as that happens in 73 + * rseq_sched_switch_event(). 
74 + */ 75 + static __always_inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) 76 + { 77 + t->rseq.event.ids_changed = true; 78 + } 79 + 80 + /* 81 + * Invoked from switch_mm_cid() in context switch when the task gets a MM 82 + * CID assigned. 83 + * 84 + * This does not raise TIF_NOTIFY_RESUME as that happens in 85 + * rseq_sched_switch_event(). 86 + */ 87 + static __always_inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) 88 + { 89 + /* 90 + * Requires a comparison as the switch_mm_cid() code does not 91 + * provide a conditional for it readily. So avoid excessive updates 92 + * when nothing changes. 93 + */ 94 + if (t->rseq.ids.mm_cid != cid) 95 + t->rseq.event.ids_changed = true; 96 + } 97 + 98 + /* Enforce a full update after RSEQ registration and when execve() failed */ 99 + static inline void rseq_force_update(void) 100 + { 101 + if (current->rseq.event.has_rseq) { 102 + current->rseq.event.ids_changed = true; 103 + current->rseq.event.sched_switch = true; 104 + rseq_raise_notify_resume(current); 43 105 } 44 106 } 45 107 ··· 119 55 static inline void rseq_virt_userspace_exit(void) 120 56 { 121 57 if (current->rseq.event.sched_switch) 122 - set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); 58 + rseq_raise_notify_resume(current); 123 59 } 124 60 125 61 static inline void rseq_reset(struct task_struct *t) ··· 155 91 static inline void rseq_handle_notify_resume(struct pt_regs *regs) { } 156 92 static inline void rseq_signal_deliver(struct ksignal *ksig, struct pt_regs *regs) { } 157 93 static inline void rseq_sched_switch_event(struct task_struct *t) { } 94 + static inline void rseq_sched_set_task_cpu(struct task_struct *t, unsigned int cpu) { } 95 + static inline void rseq_sched_set_task_mm_cid(struct task_struct *t, unsigned int cid) { } 96 + static inline void rseq_force_update(void) { } 158 97 static inline void rseq_virt_userspace_exit(void) { } 159 98 static inline void rseq_fork(struct task_struct 
*t, u64 clone_flags) { } 160 99 static inline void rseq_execve(struct task_struct *t) { }
+9 -2
include/linux/rseq_types.h
··· 11 11 * struct rseq_event - Storage for rseq related event management 12 12 * @all: Compound to initialize and clear the data efficiently 13 13 * @events: Compound to access events with a single load/store 14 - * @sched_switch: True if the task was scheduled out 14 + * @sched_switch: True if the task was scheduled and needs update on 15 + * exit to user 16 + * @ids_changed: Indicator that IDs need to be updated 15 17 * @user_irq: True on interrupt entry from user mode 16 18 * @has_rseq: True if the task has a rseq pointer installed 17 19 * @error: Compound error code for the slow path to analyze 18 20 * @fatal: User space data corrupted or invalid 21 + * 22 + * @sched_switch and @ids_changed must be adjacent and the combo must be 23 + * 16bit aligned to allow a single store, when both are set at the same 24 + * time in the scheduler. 19 25 */ 20 26 struct rseq_event { 21 27 union { 22 28 u64 all; 23 29 struct { 24 30 union { 25 - u16 events; 31 + u32 events; 26 32 struct { 27 33 u8 sched_switch; 34 + u8 ids_changed; 28 35 u8 user_irq; 29 36 }; 30 37 };
+1 -1
kernel/rseq.c
··· 464 464 * are updated before returning to user-space. 465 465 */ 466 466 current->rseq.event.has_rseq = true; 467 - rseq_sched_switch_event(current); 467 + rseq_force_update(); 468 468 return 0; 469 469 470 470 efault:
+6 -1
kernel/sched/core.c
··· 5118 5118 kcov_prepare_switch(prev); 5119 5119 sched_info_switch(rq, prev, next); 5120 5120 perf_event_task_sched_out(prev, next); 5121 - rseq_sched_switch_event(prev); 5122 5121 fire_sched_out_preempt_notifiers(prev, next); 5123 5122 kmap_local_sched_out(); 5124 5123 prepare_task(next); ··· 5314 5315 5315 5316 /* switch_mm_cid() requires the memory barriers above. */ 5316 5317 switch_mm_cid(rq, prev, next); 5318 + 5319 + /* 5320 + * Tell rseq that the task was scheduled in. Must be after 5321 + * switch_mm_cid() to get the TIF flag set. 5322 + */ 5323 + rseq_sched_switch_event(next); 5317 5324 5318 5325 prepare_lock_switch(rq, next, rf); 5319 5326
+4 -1
kernel/sched/sched.h
··· 2209 2209 smp_wmb(); 2210 2210 WRITE_ONCE(task_thread_info(p)->cpu, cpu); 2211 2211 p->wake_cpu = cpu; 2212 + rseq_sched_set_task_cpu(p, cpu); 2212 2213 #endif /* CONFIG_SMP */ 2213 2214 } 2214 2215 ··· 3808 3807 mm_cid_put_lazy(prev); 3809 3808 prev->mm_cid = -1; 3810 3809 } 3811 - if (next->mm_cid_active) 3810 + if (next->mm_cid_active) { 3812 3811 next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm); 3812 + rseq_sched_set_task_mm_cid(next, next->mm_cid); 3813 + } 3813 3814 } 3814 3815 3815 3816 #else /* !CONFIG_SCHED_MM_CID: */