Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at 703ccb63ae9f7444d6ff876d024e17f628103c69 · 784 lines · 24 kB

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RSEQ_ENTRY_H
#define _LINUX_RSEQ_ENTRY_H

/* Must be outside the CONFIG_RSEQ guard to resolve the stubs */
#ifdef CONFIG_RSEQ_STATS
#include <linux/percpu.h>

struct rseq_stats {
	unsigned long	exit;
	unsigned long	signal;
	unsigned long	slowpath;
	unsigned long	fastpath;
	unsigned long	ids;
	unsigned long	cs;
	unsigned long	clear;
	unsigned long	fixup;
	unsigned long	s_granted;
	unsigned long	s_expired;
	unsigned long	s_revoked;
	unsigned long	s_yielded;
	unsigned long	s_aborted;
};

DECLARE_PER_CPU(struct rseq_stats, rseq_stats);

/*
 * Slow path has interrupts and preemption enabled, but the fast path
 * runs with interrupts disabled so there is no point in having the
 * preemption checks implied in __this_cpu_inc() for every operation.
 */
#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_stat_inc(which)	this_cpu_inc((which))
#else
#define rseq_stat_inc(which)	raw_cpu_inc((which))
#endif

#else /* CONFIG_RSEQ_STATS */
#define rseq_stat_inc(x)	do { } while (0)
#endif /* !CONFIG_RSEQ_STATS */

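/*
 * Usage sketch (illustrative): the counters are bumped from the entry/exit
 * paths below, e.g.
 *
 *	rseq_stat_inc(rseq_stats.exit);
 *
 * which expands to this_cpu_inc()/raw_cpu_inc() on CONFIG_RSEQ_STATS=y
 * kernels and compiles to nothing otherwise.
 */
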
#ifdef CONFIG_RSEQ
#include <linux/jump_label.h>
#include <linux/rseq.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>

#include <linux/tracepoint-defs.h>

#ifdef CONFIG_TRACEPOINTS
DECLARE_TRACEPOINT(rseq_update);
DECLARE_TRACEPOINT(rseq_ip_fixup);
void __rseq_trace_update(struct task_struct *t);
void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
			   unsigned long offset, unsigned long abort_ip);

static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids)
{
	if (tracepoint_enabled(rseq_update) && ids)
		__rseq_trace_update(t);
}

static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip)
{
	if (tracepoint_enabled(rseq_ip_fixup))
		__rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
}

#else /* CONFIG_TRACEPOINTS */
static inline void rseq_trace_update(struct task_struct *t, struct rseq_ids *ids) { }
static inline void rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
				       unsigned long offset, unsigned long abort_ip) { }
#endif /* !CONFIG_TRACEPOINTS */

DECLARE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);

#ifdef RSEQ_BUILD_SLOW_PATH
#define rseq_inline
#else
#define rseq_inline __always_inline
#endif

#ifdef CONFIG_RSEQ_SLICE_EXTENSION
DECLARE_STATIC_KEY_TRUE(rseq_slice_extension_key);

static __always_inline bool rseq_slice_extension_enabled(void)
{
	return static_branch_likely(&rseq_slice_extension_key);
}

extern unsigned int rseq_slice_ext_nsecs;
bool __rseq_arm_slice_extension_timer(void);

static __always_inline bool rseq_arm_slice_extension_timer(void)
{
	if (!rseq_slice_extension_enabled())
		return false;

	if (likely(!current->rseq.slice.state.granted))
		return false;

	return __rseq_arm_slice_extension_timer();
}

static __always_inline void rseq_slice_clear_grant(struct task_struct *t)
{
	if (IS_ENABLED(CONFIG_RSEQ_STATS) && t->rseq.slice.state.granted)
		rseq_stat_inc(rseq_stats.s_revoked);
	t->rseq.slice.state.granted = false;
}

static __always_inline bool rseq_grant_slice_extension(bool work_pending)
{
	struct task_struct *curr = current;
	struct rseq_slice_ctrl usr_ctrl;
	union rseq_slice_state state;
	struct rseq __user *rseq;

	if (!rseq_slice_extension_enabled())
		return false;

	/* If not enabled or not a return from interrupt, nothing to do. */
	state = curr->rseq.slice.state;
	state.enabled &= curr->rseq.event.user_irq;
	if (likely(!state.state))
		return false;

	rseq = curr->rseq.usrptr;
	scoped_user_rw_access(rseq, efault) {

		/*
		 * Quick check for conditions where a grant is not possible
		 * or needs to be revoked:
		 *
		 * 1) Any TIF bit which needs to do extra work aside from
		 *    rescheduling prevents a grant.
		 *
		 * 2) A previous rescheduling request resulted in a slice
		 *    extension grant.
		 */
		if (unlikely(work_pending || state.granted)) {
			/* Clear user control unconditionally. No point in checking. */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			rseq_slice_clear_grant(curr);
			return false;
		}

		unsafe_get_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
		if (likely(!(usr_ctrl.request)))
			return false;

		/* Grant the slice extension */
		usr_ctrl.request = 0;
		usr_ctrl.granted = 1;
		unsafe_put_user(usr_ctrl.all, &rseq->slice_ctrl.all, efault);
	}

	rseq_stat_inc(rseq_stats.s_granted);

	curr->rseq.slice.state.granted = true;
	/* Store expiry time for arming the timer on the way out */
	curr->rseq.slice.expires = data_race(rseq_slice_ext_nsecs) + ktime_get_mono_fast_ns();
	/*
	 * This is racy against a remote CPU setting TIF_NEED_RESCHED in
	 * several ways:
	 *
	 * 1)
	 *	CPU0				CPU1
	 *	clear_tsk()
	 *					set_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		-> Folds correctly
	 * 2)
	 *	CPU0				CPU1
	 *					set_tsk()
	 *	clear_tsk()
	 *	clear_preempt()
	 *					Raise scheduler IPI on CPU0
	 *	--> IPI
	 *	    fold_need_resched()		<- NOOP as TIF_NEED_RESCHED is false
	 *
	 * #1 is not any different from a regular remote reschedule as it
	 * sets the previously not set bit and then raises the IPI which
	 * folds it into the preempt counter.
	 *
	 * #2 is obviously incorrect from a scheduler POV, but it's no more
	 * incorrect than the code below clearing the reschedule request
	 * with the safety net of the timer.
	 *
	 * The important part is that the clearing is protected against the
	 * scheduler IPI and also against any other interrupt which might
	 * end up waking up a task and setting the bits in the middle of
	 * the operation:
	 *
	 *	clear_tsk()
	 *	    ---> Interrupt
	 *		    wakeup_on_this_cpu()
	 *			set_tsk()
	 *			set_preempt()
	 *	clear_preempt()
	 *
	 * which would be inconsistent state.
	 */
	scoped_guard(irq) {
		clear_tsk_need_resched(curr);
		clear_preempt_need_resched();
	}
	return true;

efault:
	force_sig(SIGSEGV);
	return false;
}

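/*
 * Illustrative user space view of the grant protocol above. This is a
 * sketch based on the fields accessed in rseq_grant_slice_extension();
 * the authoritative contract is the uapi rseq definitions, and the
 * mechanism for relinquishing a grant lives outside this header:
 *
 *	rseq->slice_ctrl.request = 1;
 *	// short, preemption-sensitive work
 *	rseq->slice_ctrl.request = 0;
 *	if (rseq->slice_ctrl.granted)
 *		// give the CPU back promptly
 *
 * The kernel grants only on return from an interrupt which hit user space
 * and only when no other exit work is pending. The grant is bounded by a
 * timer (rseq_slice_ext_nsecs), and slice_ctrl plus the grant are cleared
 * again on the next exit-path update; see rseq_set_ids_get_csaddr() and
 * rseq_exit_user_update() below.
 */
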
#else /* CONFIG_RSEQ_SLICE_EXTENSION */
static __always_inline bool rseq_slice_extension_enabled(void) { return false; }
static __always_inline bool rseq_arm_slice_extension_timer(void) { return false; }
static __always_inline void rseq_slice_clear_grant(struct task_struct *t) { }
static __always_inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ_SLICE_EXTENSION */

bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr);
bool rseq_debug_validate_ids(struct task_struct *t);

static __always_inline void rseq_note_user_irq_entry(void)
{
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY))
		current->rseq.event.user_irq = true;
}

/*
 * Check whether there is a valid critical section and whether the
 * instruction pointer in @regs is inside the critical section.
 *
 * - If the critical section is invalid, terminate the task.
 *
 * - If valid and the instruction pointer is inside, set it to the abort IP.
 *
 * - If valid and the instruction pointer is outside, clear the critical
 *   section address.
 *
 * Returns true if the section was valid and either fixup or clear was
 * done, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when an invalid
 * section was found. It's clear when the failure was an unresolved page
 * fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */

#ifdef RSEQ_BUILD_SLOW_PATH
/*
 * The debug version is put out of line, but kept here so the code stays
 * together.
 *
 * @csaddr has already been checked by the caller to be in user space.
 */
bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs,
			       unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	u64 start_ip, abort_ip, offset, cs_end, head, tasksize = TASK_SIZE;
	unsigned long ip = instruction_pointer(regs);
	u64 __user *uc_head = (u64 __user *) ucs;
	u32 usig, __user *uc_sig;

	scoped_user_rw_access(ucs, efault) {
		/*
		 * Evaluate the user pile and exit if one of the conditions
		 * is not fulfilled.
		 */
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		if (unlikely(start_ip >= tasksize))
			goto die;
		/* If outside, just clear the critical section. */
		if (ip < start_ip)
			goto clear;

		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		cs_end = start_ip + offset;
		/* Check for overflow and wraparound */
		if (unlikely(cs_end >= tasksize || cs_end < start_ip))
			goto die;

		/* If not inside, clear it. */
		if (ip >= cs_end)
			goto clear;

		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);
		/* Ensure it's "valid" */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;
		/* Validate that the abort IP is not in the critical section */
		if (unlikely(abort_ip - start_ip < offset))
			goto die;

		/*
		 * Check version and flags for 0. No point in emitting
		 * deprecated warnings before dying. That could be done in
		 * the slow path eventually, but *shrug*.
		 */
		unsafe_get_user(head, uc_head, efault);
		if (unlikely(head))
			goto die;

		/* abort_ip - 4 is >= 0. See abort_ip check above */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* rseq_event.user_irq is only valid if CONFIG_GENERIC_IRQ_ENTRY=y */
		if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
			/* If not in interrupt from user context, let it die */
			if (unlikely(!t->rseq.event.user_irq))
				goto die;
		}
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

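/*
 * For orientation, the user space layout which the checks above expect.
 * This is a sketch; the uapi struct rseq_cs is the authoritative
 * definition:
 *
 *	struct rseq_cs {
 *		u32 version;		// 'head' check: must be 0
 *		u32 flags;		// 'head' check: must be 0
 *		u64 start_ip;		// below TASK_SIZE
 *		u64 post_commit_offset;	// start_ip + offset below TASK_SIZE
 *		u64 abort_ip;		// outside [start_ip, start_ip + offset)
 *	};
 *
 * and in the user text segment:
 *
 *	.long	<signature>	// four bytes, must match t->rseq.sig
 * abort_ip:
 *	// abort handler
 */
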
/*
 * On debug kernels validate that user space did not mess with the IDs if
 * the debug branch is enabled.
 */
bool rseq_debug_validate_ids(struct task_struct *t)
{
	struct rseq __user *rseq = t->rseq.usrptr;
	u32 cpu_id, uval, node_id;

	/*
	 * On the first exit after registering the rseq region CPU ID is
	 * RSEQ_CPU_ID_UNINITIALIZED and node_id in user space is 0!
	 */
	node_id = t->rseq.ids.cpu_id != RSEQ_CPU_ID_UNINITIALIZED ?
		  cpu_to_node(t->rseq.ids.cpu_id) : 0;

	scoped_user_read_access(rseq, efault) {
		unsafe_get_user(cpu_id, &rseq->cpu_id_start, efault);
		if (cpu_id != t->rseq.ids.cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->cpu_id, efault);
		if (uval != cpu_id)
			goto die;
		unsafe_get_user(uval, &rseq->node_id, efault);
		if (uval != node_id)
			goto die;
		unsafe_get_user(uval, &rseq->mm_cid, efault);
		if (uval != t->rseq.ids.mm_cid)
			goto die;
	}
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

#endif /* RSEQ_BUILD_SLOW_PATH */

/*
 * This only ensures that abort_ip is in the user address space and
 * validates that it is preceded by the signature.
 *
 * No other sanity checks are done here, that's what the debug code is for.
 */
static rseq_inline bool
rseq_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr)
{
	struct rseq_cs __user *ucs = (struct rseq_cs __user *)(unsigned long)csaddr;
	unsigned long ip = instruction_pointer(regs);
	unsigned long tasksize = TASK_SIZE;
	u64 start_ip, abort_ip, offset;
	u32 usig, __user *uc_sig;

	rseq_stat_inc(rseq_stats.cs);

	if (unlikely(csaddr >= tasksize)) {
		t->rseq.event.fatal = true;
		return false;
	}

	if (static_branch_unlikely(&rseq_debug_enabled))
		return rseq_debug_update_user_cs(t, regs, csaddr);

	scoped_user_rw_access(ucs, efault) {
		unsafe_get_user(start_ip, &ucs->start_ip, efault);
		unsafe_get_user(offset, &ucs->post_commit_offset, efault);
		unsafe_get_user(abort_ip, &ucs->abort_ip, efault);

		/*
		 * No sanity checks. If user space screwed it up, it can
		 * keep the pieces. That's what the debug code is for.
		 *
		 * If outside, just clear the critical section.
		 */
		if (ip - start_ip >= offset)
			goto clear;

		/*
		 * Two requirements for @abort_ip:
		 * - Must be in user space as x86 IRET would happily return to
		 *   the kernel.
		 * - The four bytes preceding the instruction at @abort_ip must
		 *   contain the signature.
		 *
		 * The latter protects against the following attack vector:
		 *
		 * An attacker with limited abilities to write creates a critical
		 * section descriptor, sets the abort IP to a library function or
		 * some other ROP gadget and stores the address of the descriptor
		 * in TLS::rseq::rseq_cs. An RSEQ abort would then evade ROP
		 * protection.
		 */
		if (unlikely(abort_ip >= tasksize || abort_ip < sizeof(*uc_sig)))
			goto die;

		/* The address is guaranteed to be >= 0 and < TASK_SIZE */
		uc_sig = (u32 __user *)(unsigned long)(abort_ip - sizeof(*uc_sig));
		unsafe_get_user(usig, uc_sig, efault);
		if (unlikely(usig != t->rseq.sig))
			goto die;

		/* Invalidate the critical section */
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		/* Update the instruction pointer */
		instruction_pointer_set(regs, (unsigned long)abort_ip);
		rseq_stat_inc(rseq_stats.fixup);
		break;
	clear:
		unsafe_put_user(0ULL, &t->rseq.usrptr->rseq_cs, efault);
		rseq_stat_inc(rseq_stats.clear);
		abort_ip = 0ULL;
	}

	if (unlikely(abort_ip))
		rseq_trace_ip_fixup(ip, start_ip, offset, abort_ip);
	return true;
die:
	t->rseq.event.fatal = true;
efault:
	return false;
}

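/*
 * After a successful fixup user space resumes at the abort IP. A typical
 * abort handler (illustrative, nothing here enforces it) re-reads the IDs
 * from the rseq TLS area and retries the critical section from the start.
 */
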
/*
 * Updates CPU ID, Node ID and MM CID and reads the critical section
 * address, when @csaddr != NULL. This allows putting the ID update and the
 * read under the same uaccess region to spare a separate begin/end.
 *
 * As this is either invoked from a C wrapper with @csaddr = NULL or from
 * the fast path code with a valid pointer, a clever compiler should be
 * able to optimize the read out. Spares a duplicate implementation.
 *
 * Returns true if the operation was successful, false otherwise.
 *
 * In the failure case task::rseq_event::fatal is set when invalid data
 * was found on debug kernels. It's clear when the failure was an
 * unresolved page fault.
 *
 * If inlined into the exit to user path with interrupts disabled, the
 * caller has to protect against page faults with pagefault_disable().
 *
 * In preemptible task context this would be counterproductive as the page
 * faults could not be fully resolved. As a consequence unresolved page
 * faults in task context are fatal too.
 */
static rseq_inline
bool rseq_set_ids_get_csaddr(struct task_struct *t, struct rseq_ids *ids,
			     u32 node_id, u64 *csaddr)
{
	struct rseq __user *rseq = t->rseq.usrptr;

	if (static_branch_unlikely(&rseq_debug_enabled)) {
		if (!rseq_debug_validate_ids(t))
			return false;
	}

	scoped_user_rw_access(rseq, efault) {
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id_start, efault);
		unsafe_put_user(ids->cpu_id, &rseq->cpu_id, efault);
		unsafe_put_user(node_id, &rseq->node_id, efault);
		unsafe_put_user(ids->mm_cid, &rseq->mm_cid, efault);
		if (csaddr)
			unsafe_get_user(*csaddr, &rseq->rseq_cs, efault);

		/* Open coded, so it's in the same user access region */
		if (rseq_slice_extension_enabled()) {
			/* Unconditionally clear it, no point in conditionals */
			unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
		}
	}

	rseq_slice_clear_grant(t);
	/* Cache the new values */
	t->rseq.ids.cpu_cid = ids->cpu_cid;
	rseq_stat_inc(rseq_stats.ids);
	rseq_trace_update(t, ids);
	return true;
efault:
	return false;
}

/*
 * Update user space with new IDs and conditionally check whether the task
 * is in a critical section.
 */
static rseq_inline bool rseq_update_usr(struct task_struct *t, struct pt_regs *regs,
					struct rseq_ids *ids, u32 node_id)
{
	u64 csaddr;

	if (!rseq_set_ids_get_csaddr(t, ids, node_id, &csaddr))
		return false;

	/*
	 * On architectures which utilize the generic entry code this
	 * allows skipping the critical section check when the entry was
	 * not from a user space interrupt, unless debug mode is enabled.
	 */
	if (IS_ENABLED(CONFIG_GENERIC_IRQ_ENTRY)) {
		if (!static_branch_unlikely(&rseq_debug_enabled)) {
			if (likely(!t->rseq.event.user_irq))
				return true;
		}
	}
	if (likely(!csaddr))
		return true;
	/* Sigh, this really needs to do work */
	return rseq_update_user_cs(t, regs, csaddr);
}

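/*
 * Usage sketch, taken from rseq_exit_user_update() below: the caller fills
 * struct rseq_ids from the task state and derives the node ID from the
 * CPU ID:
 *
 *	struct rseq_ids ids = {
 *		.cpu_id	= task_cpu(t),
 *		.mm_cid	= task_mm_cid(t),
 *	};
 *	u32 node_id = cpu_to_node(ids.cpu_id);
 *
 *	rseq_update_usr(t, regs, &ids, node_id);
 */
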
/*
 * If you want to use this then convert your architecture to the generic
 * entry code. I'm tired of building workarounds for people who can't be
 * bothered to make the maintenance of generic infrastructure less
 * burdensome. Just sucking everything into the architecture code and
 * thereby making others chase the horrible hacks and keep them working is
 * neither acceptable nor sustainable.
 */
#ifdef CONFIG_GENERIC_ENTRY

/*
 * This is inlined into the exit path because:
 *
 * 1) It's a one-time comparison in the fast path when there is no event to
 *    handle
 *
 * 2) The access to the user space rseq memory (TLS) is unlikely to fault
 *    so the straight inline operation is:
 *
 *	- Four 32-bit stores only if CPU ID / MM CID need to be updated
 *	- One 64-bit load to retrieve the critical section address
 *
 * 3) In the unlikely case that the critical section address is != NULL:
 *
 *	- One 64-bit load to retrieve the start IP
 *	- One 64-bit load to retrieve the offset for calculating the end
 *	- One 64-bit load to retrieve the abort IP
 *	- One 64-bit load to retrieve the signature
 *	- One store to clear the critical section address
 *
 * The non-debug case implements only the minimal required checking. It
 * provides protection against a rogue abort IP in kernel space, which
 * would be exploitable at least on x86, and also against a rogue CS
 * descriptor by checking the signature at the abort IP. Any fallout from
 * invalid critical section descriptors is a user space problem. The debug
 * case provides the full set of checks and terminates the task if a
 * condition is not met.
 *
 * In case of a fault or an invalid value, this sets TIF_NOTIFY_RESUME and
 * tells the caller to loop back into exit_to_user_mode_loop(). The rseq
 * slow path there will handle the failure.
 */
static __always_inline bool rseq_exit_user_update(struct pt_regs *regs, struct task_struct *t)
{
	/*
	 * Page faults need to be disabled as this is called with
	 * interrupts disabled.
	 */
	guard(pagefault)();
	if (likely(!t->rseq.event.ids_changed)) {
		struct rseq __user *rseq = t->rseq.usrptr;
		/*
		 * If IDs have not changed, rseq_event::user_irq must be true.
		 * See rseq_sched_switch_event().
		 */
		u64 csaddr;

		scoped_user_rw_access(rseq, efault) {
			unsafe_get_user(csaddr, &rseq->rseq_cs, efault);

			/* Open coded, so it's in the same user access region */
			if (rseq_slice_extension_enabled()) {
				/* Unconditionally clear it, no point in conditionals */
				unsafe_put_user(0U, &rseq->slice_ctrl.all, efault);
			}
		}

		rseq_slice_clear_grant(t);

		if (static_branch_unlikely(&rseq_debug_enabled) || unlikely(csaddr)) {
			if (unlikely(!rseq_update_user_cs(t, regs, csaddr)))
				return false;
		}
		return true;
	}

	struct rseq_ids ids = {
		.cpu_id	= task_cpu(t),
		.mm_cid	= task_mm_cid(t),
	};
	u32 node_id = cpu_to_node(ids.cpu_id);

	return rseq_update_usr(t, regs, &ids, node_id);
efault:
	return false;
}

static __always_inline bool __rseq_exit_to_user_mode_restart(struct pt_regs *regs)
{
	struct task_struct *t = current;

	/*
	 * If the task neither went through schedule nor got the flag
	 * enforced by the rseq syscall or execve, then there is nothing to
	 * do here.
	 *
	 * CPU ID and MM CID can only change when going through a context
	 * switch.
	 *
	 * rseq_sched_switch_event() sets the rseq_event::sched_switch bit
	 * only when rseq_event::has_rseq is true. That conditional is
	 * required to avoid setting the TIF bit if RSEQ is not registered
	 * for a task. rseq_event::sched_switch is cleared when RSEQ is
	 * unregistered by a task so it's sufficient to check for the
	 * sched_switch bit alone.
	 *
	 * A sane compiler requires three instructions for the nothing-to-do
	 * case including clearing the events, but your mileage might vary.
	 */
	if (unlikely((t->rseq.event.sched_switch))) {
		rseq_stat_inc(rseq_stats.fastpath);

		if (unlikely(!rseq_exit_user_update(regs, t)))
			return true;
	}
	/* Clear state so next entry starts from a clean slate */
	t->rseq.event.events = 0;
	return false;
}

/* Required to allow conversion to GENERIC_ENTRY w/o GENERIC_TIF_BITS */
#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
static __always_inline bool test_tif_rseq(unsigned long ti_work)
{
	return ti_work & _TIF_RSEQ;
}

static __always_inline void clear_tif_rseq(void)
{
	static_assert(TIF_RSEQ != TIF_NOTIFY_RESUME);
	clear_thread_flag(TIF_RSEQ);
}
#else
static __always_inline bool test_tif_rseq(unsigned long ti_work) { return true; }
static __always_inline void clear_tif_rseq(void) { }
#endif

static __always_inline bool
rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	if (unlikely(test_tif_rseq(ti_work))) {
		if (unlikely(__rseq_exit_to_user_mode_restart(regs))) {
			current->rseq.event.slowpath = true;
			set_tsk_thread_flag(current, TIF_NOTIFY_RESUME);
			return true;
		}
		clear_tif_rseq();
	}
	/*
	 * Arm the slice extension timer if nothing to do anymore and the
	 * task really goes out to user space.
	 */
	return rseq_arm_slice_extension_timer();
}

#else /* CONFIG_GENERIC_ENTRY */
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
#endif /* !CONFIG_GENERIC_ENTRY */

static __always_inline void rseq_syscall_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	/* Needed to remove the store for the !lockdep case */
	if (IS_ENABLED(CONFIG_LOCKDEP)) {
		WARN_ON_ONCE(ev->sched_switch);
		ev->events = 0;
	}
}

static __always_inline void rseq_irqentry_exit_to_user_mode(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	lockdep_assert_once(!ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing could not clear it.
	 */
	ev->events = 0;
}

/* Required to keep ARM64 working */
static __always_inline void rseq_exit_to_user_mode_legacy(void)
{
	struct rseq_event *ev = &current->rseq.event;

	rseq_stat_inc(rseq_stats.exit);

	if (static_branch_unlikely(&rseq_debug_enabled))
		WARN_ON_ONCE(ev->sched_switch);

	/*
	 * Ensure that event (especially user_irq) is cleared when the
	 * interrupt did not result in a schedule and therefore the
	 * rseq processing did not clear it.
	 */
	ev->events = 0;
}

void __rseq_debug_syscall_return(struct pt_regs *regs);

static __always_inline void rseq_debug_syscall_return(struct pt_regs *regs)
{
	if (static_branch_unlikely(&rseq_debug_enabled))
		__rseq_debug_syscall_return(regs);
}
#else /* CONFIG_RSEQ */
static inline void rseq_note_user_irq_entry(void) { }
static inline bool rseq_exit_to_user_mode_restart(struct pt_regs *regs, unsigned long ti_work)
{
	return false;
}
static inline void rseq_syscall_exit_to_user_mode(void) { }
static inline void rseq_irqentry_exit_to_user_mode(void) { }
static inline void rseq_exit_to_user_mode_legacy(void) { }
static inline void rseq_debug_syscall_return(struct pt_regs *regs) { }
static inline bool rseq_grant_slice_extension(bool work_pending) { return false; }
#endif /* !CONFIG_RSEQ */

#endif /* _LINUX_RSEQ_ENTRY_H */