Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

uprobes/x86: Add uprobe syscall to speed up uprobe

Adding a new uprobe syscall that calls the uprobe handlers for a given
'breakpoint' address.

The idea is that the 'breakpoint' address calls the user space
trampoline which executes the uprobe syscall.

The syscall handler reads the return address of the initial call
to retrieve the original 'breakpoint' address. With this address
we find the related uprobe object and call its consumers.

Adding the arch_uprobe_trampoline_mapping function that provides
uprobe trampoline mapping. This mapping is backed with one global
page initialized at __init time and shared by all the mapping
instances.

We do not allow executing the uprobe syscall if the caller is not
in the uprobe trampoline mapping.

The uprobe syscall ensures the consumer (bpf program) sees register
values in the state they had before the trampoline was called.

Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
Link: https://lore.kernel.org/r/20250720112133.244369-10-jolsa@kernel.org

authored by

Jiri Olsa and committed by
Peter Zijlstra
56101b69 91440ff4

+161
+1
arch/x86/entry/syscalls/syscall_64.tbl
··· 345 345 333 common io_pgetevents sys_io_pgetevents 346 346 334 common rseq sys_rseq 347 347 335 common uretprobe sys_uretprobe 348 + 336 common uprobe sys_uprobe 348 349 # don't use numbers 387 through 423, add new calls after the last 349 350 # 'common' entry 350 351 424 common pidfd_send_signal sys_pidfd_send_signal
+139
arch/x86/kernel/uprobes.c
··· 752 752 hlist_for_each_entry_safe(tramp, n, &state->head_tramps, node) 753 753 destroy_uprobe_trampoline(tramp); 754 754 } 755 + 756 + static bool __in_uprobe_trampoline(unsigned long ip) 757 + { 758 + struct vm_area_struct *vma = vma_lookup(current->mm, ip); 759 + 760 + return vma && vma_is_special_mapping(vma, &tramp_mapping); 761 + } 762 + 763 + static bool in_uprobe_trampoline(unsigned long ip) 764 + { 765 + struct mm_struct *mm = current->mm; 766 + bool found, retry = true; 767 + unsigned int seq; 768 + 769 + rcu_read_lock(); 770 + if (mmap_lock_speculate_try_begin(mm, &seq)) { 771 + found = __in_uprobe_trampoline(ip); 772 + retry = mmap_lock_speculate_retry(mm, seq); 773 + } 774 + rcu_read_unlock(); 775 + 776 + if (retry) { 777 + mmap_read_lock(mm); 778 + found = __in_uprobe_trampoline(ip); 779 + mmap_read_unlock(mm); 780 + } 781 + return found; 782 + } 783 + 784 + /* 785 + * See uprobe syscall trampoline; the call to the trampoline will push 786 + * the return address on the stack, the trampoline itself then pushes 787 + * cx, r11 and ax. 788 + */ 789 + struct uprobe_syscall_args { 790 + unsigned long ax; 791 + unsigned long r11; 792 + unsigned long cx; 793 + unsigned long retaddr; 794 + }; 795 + 796 + SYSCALL_DEFINE0(uprobe) 797 + { 798 + struct pt_regs *regs = task_pt_regs(current); 799 + struct uprobe_syscall_args args; 800 + unsigned long ip, sp; 801 + int err; 802 + 803 + /* Allow execution only from uprobe trampolines. 
*/ 804 + if (!in_uprobe_trampoline(regs->ip)) 805 + goto sigill; 806 + 807 + err = copy_from_user(&args, (void __user *)regs->sp, sizeof(args)); 808 + if (err) 809 + goto sigill; 810 + 811 + ip = regs->ip; 812 + 813 + /* 814 + * expose the "right" values of ax/r11/cx/ip/sp to uprobe_consumer/s, plus: 815 + * - adjust ip to the probe address, call saved next instruction address 816 + * - adjust sp to the probe's stack frame (check trampoline code) 817 + */ 818 + regs->ax = args.ax; 819 + regs->r11 = args.r11; 820 + regs->cx = args.cx; 821 + regs->ip = args.retaddr - 5; 822 + regs->sp += sizeof(args); 823 + regs->orig_ax = -1; 824 + 825 + sp = regs->sp; 826 + 827 + handle_syscall_uprobe(regs, regs->ip); 828 + 829 + /* 830 + * Some of the uprobe consumers has changed sp, we can do nothing, 831 + * just return via iret. 832 + */ 833 + if (regs->sp != sp) { 834 + /* skip the trampoline call */ 835 + if (args.retaddr - 5 == regs->ip) 836 + regs->ip += 5; 837 + return regs->ax; 838 + } 839 + 840 + regs->sp -= sizeof(args); 841 + 842 + /* for the case uprobe_consumer has changed ax/r11/cx */ 843 + args.ax = regs->ax; 844 + args.r11 = regs->r11; 845 + args.cx = regs->cx; 846 + 847 + /* keep return address unless we are instructed otherwise */ 848 + if (args.retaddr - 5 != regs->ip) 849 + args.retaddr = regs->ip; 850 + 851 + regs->ip = ip; 852 + 853 + err = copy_to_user((void __user *)regs->sp, &args, sizeof(args)); 854 + if (err) 855 + goto sigill; 856 + 857 + /* ensure sysret, see do_syscall_64() */ 858 + regs->r11 = regs->flags; 859 + regs->cx = regs->ip; 860 + return 0; 861 + 862 + sigill: 863 + force_sig(SIGILL); 864 + return -1; 865 + } 866 + 867 + asm ( 868 + ".pushsection .rodata\n" 869 + ".balign " __stringify(PAGE_SIZE) "\n" 870 + "uprobe_trampoline_entry:\n" 871 + "push %rcx\n" 872 + "push %r11\n" 873 + "push %rax\n" 874 + "movq $" __stringify(__NR_uprobe) ", %rax\n" 875 + "syscall\n" 876 + "pop %rax\n" 877 + "pop %r11\n" 878 + "pop %rcx\n" 879 + "ret\n" 880 + 
".balign " __stringify(PAGE_SIZE) "\n" 881 + ".popsection\n" 882 + ); 883 + 884 + extern u8 uprobe_trampoline_entry[]; 885 + 886 + static int __init arch_uprobes_init(void) 887 + { 888 + tramp_mapping_pages[0] = virt_to_page(uprobe_trampoline_entry); 889 + return 0; 890 + } 891 + 892 + late_initcall(arch_uprobes_init); 893 + 755 894 #else /* 32-bit: */ 756 895 /* 757 896 * No RIP-relative addressing on 32-bit
+2
include/linux/syscalls.h
··· 1005 1005 1006 1006 asmlinkage long sys_uretprobe(void); 1007 1007 1008 + asmlinkage long sys_uprobe(void); 1009 + 1008 1010 /* pciconfig: alpha, arm, arm64, ia64, sparc */ 1009 1011 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, 1010 1012 unsigned long off, unsigned long len,
+1
include/linux/uprobes.h
··· 239 239 extern void uprobe_copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len); 240 240 extern void arch_uprobe_clear_state(struct mm_struct *mm); 241 241 extern void arch_uprobe_init_state(struct mm_struct *mm); 242 + extern void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr); 242 243 #else /* !CONFIG_UPROBES */ 243 244 struct uprobes_state { 244 245 };
+17
kernel/events/uprobes.c
··· 2771 2771 rcu_read_unlock_trace(); 2772 2772 } 2773 2773 2774 + void handle_syscall_uprobe(struct pt_regs *regs, unsigned long bp_vaddr) 2775 + { 2776 + struct uprobe *uprobe; 2777 + int is_swbp; 2778 + 2779 + guard(rcu_tasks_trace)(); 2780 + 2781 + uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); 2782 + if (!uprobe) 2783 + return; 2784 + if (!get_utask()) 2785 + return; 2786 + if (arch_uprobe_ignore(&uprobe->arch, regs)) 2787 + return; 2788 + handler_chain(uprobe, regs); 2789 + } 2790 + 2774 2791 /* 2775 2792 * Perform required fix-ups and disable singlestep. 2776 2793 * Allow pending signals to take effect.
+1
kernel/sys_ni.c
··· 392 392 COND_SYSCALL(rseq); 393 393 394 394 COND_SYSCALL(uretprobe); 395 + COND_SYSCALL(uprobe);