Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull performance events updates from Ingo Molnar:
"Uprobes:
- Add BPF session support (Jiri Olsa)
- Switch to RCU Tasks Trace flavor for better performance (Andrii
  Nakryiko)
- Massively increase uretprobe SMP scalability by SRCU-protecting
  the uretprobe lifetime (Andrii Nakryiko)
- Kill xol_area->slot_count (Oleg Nesterov)

Core facilities:
- Implement targeted high-frequency profiling by adding the ability
  for an event to "pause" or "resume" AUX area tracing (Adrian
  Hunter)

VM profiling/sampling:
- Correct perf sampling with guest VMs (Colton Lewis)

New hardware support:
- x86/intel: Add PMU support for Intel ArrowLake-H CPUs (Dapeng Mi)

Misc fixes and enhancements:
- x86/intel/pt: Fix buffer full but size is 0 case (Adrian Hunter)
- x86/amd: Warn only on new bits set (Breno Leitao)
- x86/amd/uncore: Avoid a false positive warning about snprintf
  truncation in amd_uncore_umc_ctx_init (Jean Delvare)
- uprobes: Re-order struct uprobe_task to save some space
  (Christophe JAILLET)
- x86/rapl: Move the pmu allocation out of CPU hotplug (Kan Liang)
- x86/rapl: Clean up cpumask and hotplug (Kan Liang)
- uprobes: Deuglify xol_get_insn_slot/xol_free_insn_slot paths (Oleg
  Nesterov)"

* tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (32 commits)
perf/core: Correct perf sampling with guest VMs
perf/x86: Refactor misc flag assignments
perf/powerpc: Use perf_arch_instruction_pointer()
perf/core: Hoist perf_instruction_pointer() and perf_misc_flags()
perf/arm: Drop unused functions
uprobes: Re-order struct uprobe_task to save some space
perf/x86/amd/uncore: Avoid a false positive warning about snprintf truncation in amd_uncore_umc_ctx_init
perf/x86/intel: Do not enable large PEBS for events with aux actions or aux sampling
perf/x86/intel/pt: Add support for pause / resume
perf/core: Add aux_pause, aux_resume, aux_start_paused
perf/x86/intel/pt: Fix buffer full but size is 0 case
uprobes: SRCU-protect uretprobe lifetime (with timeout)
uprobes: allow put_uprobe() from non-sleepable softirq context
perf/x86/rapl: Clean up cpumask and hotplug
perf/x86/rapl: Move the pmu allocation out of CPU hotplug
uprobe: Add support for session consumer
uprobe: Add data pointer to consumer handlers
perf/x86/amd: Warn only on new bits set
uprobes: fold xol_take_insn_slot() into xol_get_insn_slot()
uprobes: kill xol_area->slot_count
...

+1073 -422
+1
arch/Kconfig
··· 135 135 config UPROBES 136 136 def_bool n 137 137 depends on ARCH_SUPPORTS_UPROBES 138 + select TASKS_TRACE_RCU 138 139 help 139 140 Uprobes is the user-space counterpart to kprobes: they 140 141 enable instrumentation applications (such as 'perf probe')
-7
arch/arm/include/asm/perf_event.h
··· 8 8 #ifndef __ARM_PERF_EVENT_H__ 9 9 #define __ARM_PERF_EVENT_H__ 10 10 11 - #ifdef CONFIG_PERF_EVENTS 12 - struct pt_regs; 13 - extern unsigned long perf_instruction_pointer(struct pt_regs *regs); 14 - extern unsigned long perf_misc_flags(struct pt_regs *regs); 15 - #define perf_misc_flags(regs) perf_misc_flags(regs) 16 - #endif 17 - 18 11 #define perf_arch_fetch_caller_regs(regs, __ip) { \ 19 12 (regs)->ARM_pc = (__ip); \ 20 13 frame_pointer((regs)) = (unsigned long) __builtin_frame_address(0); \
-17
arch/arm/kernel/perf_callchain.c
··· 96 96 arm_get_current_stackframe(regs, &fr); 97 97 walk_stackframe(&fr, callchain_trace, entry); 98 98 } 99 - 100 - unsigned long perf_instruction_pointer(struct pt_regs *regs) 101 - { 102 - return instruction_pointer(regs); 103 - } 104 - 105 - unsigned long perf_misc_flags(struct pt_regs *regs) 106 - { 107 - int misc = 0; 108 - 109 - if (user_mode(regs)) 110 - misc |= PERF_RECORD_MISC_USER; 111 - else 112 - misc |= PERF_RECORD_MISC_KERNEL; 113 - 114 - return misc; 115 - }
-4
arch/arm64/include/asm/perf_event.h
··· 10 10 #include <asm/ptrace.h> 11 11 12 12 #ifdef CONFIG_PERF_EVENTS 13 - struct pt_regs; 14 - extern unsigned long perf_instruction_pointer(struct pt_regs *regs); 15 - extern unsigned long perf_misc_flags(struct pt_regs *regs); 16 - #define perf_misc_flags(regs) perf_misc_flags(regs) 17 13 #define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs 18 14 #endif 19 15
-28
arch/arm64/kernel/perf_callchain.c
··· 38 38 39 39 arch_stack_walk(callchain_trace, entry, current, regs); 40 40 } 41 - 42 - unsigned long perf_instruction_pointer(struct pt_regs *regs) 43 - { 44 - if (perf_guest_state()) 45 - return perf_guest_get_ip(); 46 - 47 - return instruction_pointer(regs); 48 - } 49 - 50 - unsigned long perf_misc_flags(struct pt_regs *regs) 51 - { 52 - unsigned int guest_state = perf_guest_state(); 53 - int misc = 0; 54 - 55 - if (guest_state) { 56 - if (guest_state & PERF_GUEST_USER) 57 - misc |= PERF_RECORD_MISC_GUEST_USER; 58 - else 59 - misc |= PERF_RECORD_MISC_GUEST_KERNEL; 60 - } else { 61 - if (user_mode(regs)) 62 - misc |= PERF_RECORD_MISC_USER; 63 - else 64 - misc |= PERF_RECORD_MISC_KERNEL; 65 - } 66 - 67 - return misc; 68 - }
+3 -3
arch/powerpc/include/asm/perf_event_server.h
··· 102 102 int __init register_power_pmu(struct power_pmu *pmu); 103 103 104 104 struct pt_regs; 105 - extern unsigned long perf_misc_flags(struct pt_regs *regs); 106 - extern unsigned long perf_instruction_pointer(struct pt_regs *regs); 105 + extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); 106 + extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); 107 107 extern unsigned long int read_bhrb(int n); 108 108 109 109 /* ··· 111 111 * if we have hardware PMU support. 112 112 */ 113 113 #ifdef CONFIG_PPC_PERF_CTRS 114 - #define perf_misc_flags(regs) perf_misc_flags(regs) 114 + #define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) 115 115 #endif 116 116 117 117 /*
+1 -1
arch/powerpc/perf/callchain.c
··· 51 51 52 52 lr = regs->link; 53 53 sp = regs->gpr[1]; 54 - perf_callchain_store(entry, perf_instruction_pointer(regs)); 54 + perf_callchain_store(entry, perf_arch_instruction_pointer(regs)); 55 55 56 56 if (!validate_sp(sp, current)) 57 57 return;
+1 -1
arch/powerpc/perf/callchain_32.c
··· 139 139 long level = 0; 140 140 unsigned int __user *fp, *uregs; 141 141 142 - next_ip = perf_instruction_pointer(regs); 142 + next_ip = perf_arch_instruction_pointer(regs); 143 143 lr = regs->link; 144 144 sp = regs->gpr[1]; 145 145 perf_callchain_store(entry, next_ip);
+1 -1
arch/powerpc/perf/callchain_64.c
··· 74 74 struct signal_frame_64 __user *sigframe; 75 75 unsigned long __user *fp, *uregs; 76 76 77 - next_ip = perf_instruction_pointer(regs); 77 + next_ip = perf_arch_instruction_pointer(regs); 78 78 lr = regs->link; 79 79 sp = regs->gpr[1]; 80 80 perf_callchain_store(entry, next_ip);
+2 -2
arch/powerpc/perf/core-book3s.c
··· 2332 2332 * Called from generic code to get the misc flags (i.e. processor mode) 2333 2333 * for an event_id. 2334 2334 */ 2335 - unsigned long perf_misc_flags(struct pt_regs *regs) 2335 + unsigned long perf_arch_misc_flags(struct pt_regs *regs) 2336 2336 { 2337 2337 u32 flags = perf_get_misc_flags(regs); 2338 2338 ··· 2346 2346 * Called from generic code to get the instruction pointer 2347 2347 * for an event_id. 2348 2348 */ 2349 - unsigned long perf_instruction_pointer(struct pt_regs *regs) 2349 + unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) 2350 2350 { 2351 2351 unsigned long siar = mfspr(SPRN_SIAR); 2352 2352
+3 -3
arch/s390/include/asm/perf_event.h
··· 37 37 38 38 /* Perf callbacks */ 39 39 struct pt_regs; 40 - extern unsigned long perf_instruction_pointer(struct pt_regs *regs); 41 - extern unsigned long perf_misc_flags(struct pt_regs *regs); 42 - #define perf_misc_flags(regs) perf_misc_flags(regs) 40 + extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); 41 + extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); 42 + #define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) 43 43 #define perf_arch_bpf_user_pt_regs(regs) &regs->user_regs 44 44 45 45 /* Perf pt_regs extension for sample-data-entry indicators */
+2 -2
arch/s390/kernel/perf_event.c
··· 57 57 return sie_block(regs)->gpsw.addr; 58 58 } 59 59 60 - unsigned long perf_instruction_pointer(struct pt_regs *regs) 60 + unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) 61 61 { 62 62 return is_in_guest(regs) ? instruction_pointer_guest(regs) 63 63 : instruction_pointer(regs); ··· 84 84 return flags; 85 85 } 86 86 87 - unsigned long perf_misc_flags(struct pt_regs *regs) 87 + unsigned long perf_arch_misc_flags(struct pt_regs *regs) 88 88 { 89 89 /* Check if the cpum_sf PMU has created the pt_regs structure. 90 90 * In this case, perf misc flags can be easily extracted. Otherwise,
+8 -2
arch/x86/events/amd/core.c
··· 943 943 static int amd_pmu_v2_handle_irq(struct pt_regs *regs) 944 944 { 945 945 struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); 946 + static atomic64_t status_warned = ATOMIC64_INIT(0); 947 + u64 reserved, status, mask, new_bits, prev_bits; 946 948 struct perf_sample_data data; 947 949 struct hw_perf_event *hwc; 948 950 struct perf_event *event; 949 951 int handled = 0, idx; 950 - u64 reserved, status, mask; 951 952 bool pmu_enabled; 952 953 953 954 /* ··· 1013 1012 * the corresponding PMCs are expected to be inactive according to the 1014 1013 * active_mask 1015 1014 */ 1016 - WARN_ON(status > 0); 1015 + if (status > 0) { 1016 + prev_bits = atomic64_fetch_or(status, &status_warned); 1017 + // A new bit was set for the very first time. 1018 + new_bits = status & ~prev_bits; 1019 + WARN(new_bits, "New overflows for inactive PMCs: %llx\n", new_bits); 1020 + } 1017 1021 1018 1022 /* Clear overflow and freeze bits */ 1019 1023 amd_pmu_ack_global_status(~status);
+3 -2
arch/x86/events/amd/uncore.c
··· 916 916 u8 group_num_pmcs[UNCORE_GROUP_MAX] = { 0 }; 917 917 union amd_uncore_info info; 918 918 struct amd_uncore_pmu *pmu; 919 - int index = 0, gid, i; 919 + int gid, i; 920 + u16 index = 0; 920 921 921 922 if (pmu_version < 2) 922 923 return 0; ··· 949 948 for_each_set_bit(gid, gmask, UNCORE_GROUP_MAX) { 950 949 for (i = 0; i < group_num_pmus[gid]; i++) { 951 950 pmu = &uncore->pmus[index]; 952 - snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%d", index); 951 + snprintf(pmu->name, sizeof(pmu->name), "amd_umc_%hu", index); 953 952 pmu->num_counters = group_num_pmcs[gid] / group_num_pmus[gid]; 954 953 pmu->msr_base = MSR_F19H_UMC_PERF_CTL + i * pmu->num_counters * 2; 955 954 pmu->rdpmc_base = -1;
+44 -22
arch/x86/events/core.c
··· 3003 3003 return 0; 3004 3004 } 3005 3005 3006 - unsigned long perf_instruction_pointer(struct pt_regs *regs) 3006 + unsigned long perf_arch_instruction_pointer(struct pt_regs *regs) 3007 3007 { 3008 - if (perf_guest_state()) 3009 - return perf_guest_get_ip(); 3010 - 3011 3008 return regs->ip + code_segment_base(regs); 3012 3009 } 3013 3010 3014 - unsigned long perf_misc_flags(struct pt_regs *regs) 3011 + static unsigned long common_misc_flags(struct pt_regs *regs) 3015 3012 { 3016 - unsigned int guest_state = perf_guest_state(); 3017 - int misc = 0; 3018 - 3019 - if (guest_state) { 3020 - if (guest_state & PERF_GUEST_USER) 3021 - misc |= PERF_RECORD_MISC_GUEST_USER; 3022 - else 3023 - misc |= PERF_RECORD_MISC_GUEST_KERNEL; 3024 - } else { 3025 - if (user_mode(regs)) 3026 - misc |= PERF_RECORD_MISC_USER; 3027 - else 3028 - misc |= PERF_RECORD_MISC_KERNEL; 3029 - } 3030 - 3031 3013 if (regs->flags & PERF_EFLAGS_EXACT) 3032 - misc |= PERF_RECORD_MISC_EXACT_IP; 3014 + return PERF_RECORD_MISC_EXACT_IP; 3033 3015 3034 - return misc; 3016 + return 0; 3017 + } 3018 + 3019 + static unsigned long guest_misc_flags(struct pt_regs *regs) 3020 + { 3021 + unsigned long guest_state = perf_guest_state(); 3022 + 3023 + if (!(guest_state & PERF_GUEST_ACTIVE)) 3024 + return 0; 3025 + 3026 + if (guest_state & PERF_GUEST_USER) 3027 + return PERF_RECORD_MISC_GUEST_USER; 3028 + else 3029 + return PERF_RECORD_MISC_GUEST_KERNEL; 3030 + 3031 + } 3032 + 3033 + static unsigned long host_misc_flags(struct pt_regs *regs) 3034 + { 3035 + if (user_mode(regs)) 3036 + return PERF_RECORD_MISC_USER; 3037 + else 3038 + return PERF_RECORD_MISC_KERNEL; 3039 + } 3040 + 3041 + unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) 3042 + { 3043 + unsigned long flags = common_misc_flags(regs); 3044 + 3045 + flags |= guest_misc_flags(regs); 3046 + 3047 + return flags; 3048 + } 3049 + 3050 + unsigned long perf_arch_misc_flags(struct pt_regs *regs) 3051 + { 3052 + unsigned long flags = 
common_misc_flags(regs); 3053 + 3054 + flags |= host_misc_flags(regs); 3055 + 3056 + return flags; 3035 3057 } 3036 3058 3037 3059 void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap)
+123 -14
arch/x86/events/intel/core.c
··· 3962 3962 3963 3963 if (!(event->attr.freq || (event->attr.wakeup_events && !event->attr.watermark))) { 3964 3964 event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; 3965 - if (!(event->attr.sample_type & 3966 - ~intel_pmu_large_pebs_flags(event))) { 3965 + if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event)) && 3966 + !has_aux_action(event)) { 3967 3967 event->hw.flags |= PERF_X86_EVENT_LARGE_PEBS; 3968 3968 event->attach_state |= PERF_ATTACH_SCHED_CB; 3969 3969 } ··· 4599 4599 X86_CONFIG(.event=0xc0, .umask=0x01); 4600 4600 } 4601 4601 4602 + static struct event_constraint * 4603 + arl_h_get_event_constraints(struct cpu_hw_events *cpuc, int idx, 4604 + struct perf_event *event) 4605 + { 4606 + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); 4607 + 4608 + if (pmu->pmu_type == hybrid_tiny) 4609 + return cmt_get_event_constraints(cpuc, idx, event); 4610 + 4611 + return mtl_get_event_constraints(cpuc, idx, event); 4612 + } 4613 + 4614 + static int arl_h_hw_config(struct perf_event *event) 4615 + { 4616 + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); 4617 + 4618 + if (pmu->pmu_type == hybrid_tiny) 4619 + return intel_pmu_hw_config(event); 4620 + 4621 + return adl_hw_config(event); 4622 + } 4623 + 4602 4624 /* 4603 4625 * The HSW11 requires a period larger than 100 which is the same as the BDM11. 4604 4626 * A minimum period of 128 is enforced as well for the INST_RETIRED.ALL. ··· 4946 4924 4947 4925 /* 4948 4926 * This essentially just maps between the 'hybrid_cpu_type' 4949 - * and 'hybrid_pmu_type' enums: 4927 + * and 'hybrid_pmu_type' enums except for ARL-H processor 4928 + * which needs to compare atom uarch native id since ARL-H 4929 + * contains two different atom uarchs. 
4950 4930 */ 4951 4931 for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) { 4952 4932 enum hybrid_pmu_type pmu_type = x86_pmu.hybrid_pmu[i].pmu_type; 4933 + u32 native_id; 4953 4934 4954 - if (cpu_type == HYBRID_INTEL_CORE && 4955 - pmu_type == hybrid_big) 4935 + if (cpu_type == HYBRID_INTEL_CORE && pmu_type == hybrid_big) 4956 4936 return &x86_pmu.hybrid_pmu[i]; 4957 - if (cpu_type == HYBRID_INTEL_ATOM && 4958 - pmu_type == hybrid_small) 4959 - return &x86_pmu.hybrid_pmu[i]; 4937 + if (cpu_type == HYBRID_INTEL_ATOM) { 4938 + if (x86_pmu.num_hybrid_pmus == 2 && pmu_type == hybrid_small) 4939 + return &x86_pmu.hybrid_pmu[i]; 4940 + 4941 + native_id = get_this_hybrid_cpu_native_id(); 4942 + if (native_id == skt_native_id && pmu_type == hybrid_small) 4943 + return &x86_pmu.hybrid_pmu[i]; 4944 + if (native_id == cmt_native_id && pmu_type == hybrid_tiny) 4945 + return &x86_pmu.hybrid_pmu[i]; 4946 + } 4960 4947 } 4961 4948 4962 4949 return NULL; ··· 5996 5965 NULL 5997 5966 }; 5998 5967 5968 + /* The event string must be in PMU IDX order. 
*/ 5969 + EVENT_ATTR_STR_HYBRID(topdown-retiring, 5970 + td_retiring_arl_h, 5971 + "event=0xc2,umask=0x02;event=0x00,umask=0x80;event=0xc2,umask=0x0", 5972 + hybrid_big_small_tiny); 5973 + EVENT_ATTR_STR_HYBRID(topdown-bad-spec, 5974 + td_bad_spec_arl_h, 5975 + "event=0x73,umask=0x0;event=0x00,umask=0x81;event=0x73,umask=0x0", 5976 + hybrid_big_small_tiny); 5977 + EVENT_ATTR_STR_HYBRID(topdown-fe-bound, 5978 + td_fe_bound_arl_h, 5979 + "event=0x9c,umask=0x01;event=0x00,umask=0x82;event=0x71,umask=0x0", 5980 + hybrid_big_small_tiny); 5981 + EVENT_ATTR_STR_HYBRID(topdown-be-bound, 5982 + td_be_bound_arl_h, 5983 + "event=0xa4,umask=0x02;event=0x00,umask=0x83;event=0x74,umask=0x0", 5984 + hybrid_big_small_tiny); 5985 + 5986 + static struct attribute *arl_h_hybrid_events_attrs[] = { 5987 + EVENT_PTR(slots_adl), 5988 + EVENT_PTR(td_retiring_arl_h), 5989 + EVENT_PTR(td_bad_spec_arl_h), 5990 + EVENT_PTR(td_fe_bound_arl_h), 5991 + EVENT_PTR(td_be_bound_arl_h), 5992 + EVENT_PTR(td_heavy_ops_adl), 5993 + EVENT_PTR(td_br_mis_adl), 5994 + EVENT_PTR(td_fetch_lat_adl), 5995 + EVENT_PTR(td_mem_bound_adl), 5996 + NULL, 5997 + }; 5998 + 5999 5999 /* Must be in IDX order */ 6000 6000 EVENT_ATTR_STR_HYBRID(mem-loads, mem_ld_adl, "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3", hybrid_big_small); 6001 6001 EVENT_ATTR_STR_HYBRID(mem-stores, mem_st_adl, "event=0xd0,umask=0x6;event=0xcd,umask=0x2", hybrid_big_small); ··· 6043 5981 EVENT_PTR(mem_ld_adl), 6044 5982 EVENT_PTR(mem_st_adl), 6045 5983 NULL 5984 + }; 5985 + 5986 + EVENT_ATTR_STR_HYBRID(mem-loads, 5987 + mem_ld_arl_h, 5988 + "event=0xd0,umask=0x5,ldlat=3;event=0xcd,umask=0x1,ldlat=3;event=0xd0,umask=0x5,ldlat=3", 5989 + hybrid_big_small_tiny); 5990 + EVENT_ATTR_STR_HYBRID(mem-stores, 5991 + mem_st_arl_h, 5992 + "event=0xd0,umask=0x6;event=0xcd,umask=0x2;event=0xd0,umask=0x6", 5993 + hybrid_big_small_tiny); 5994 + 5995 + static struct attribute *arl_h_hybrid_mem_attrs[] = { 5996 + EVENT_PTR(mem_ld_arl_h), 5997 + 
EVENT_PTR(mem_st_arl_h), 5998 + NULL, 6046 5999 }; 6047 6000 6048 6001 EVENT_ATTR_STR_HYBRID(tx-start, tx_start_adl, "event=0xc9,umask=0x1", hybrid_big); ··· 6083 6006 6084 6007 FORMAT_ATTR_HYBRID(in_tx, hybrid_big); 6085 6008 FORMAT_ATTR_HYBRID(in_tx_cp, hybrid_big); 6086 - FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small); 6087 - FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small); 6009 + FORMAT_ATTR_HYBRID(offcore_rsp, hybrid_big_small_tiny); 6010 + FORMAT_ATTR_HYBRID(ldlat, hybrid_big_small_tiny); 6088 6011 FORMAT_ATTR_HYBRID(frontend, hybrid_big); 6089 6012 6090 6013 #define ADL_HYBRID_RTM_FORMAT_ATTR \ ··· 6107 6030 NULL 6108 6031 }; 6109 6032 6110 - FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small); 6033 + FORMAT_ATTR_HYBRID(snoop_rsp, hybrid_small_tiny); 6111 6034 6112 6035 static struct attribute *mtl_hybrid_extra_attr_rtm[] = { 6113 6036 ADL_HYBRID_RTM_FORMAT_ATTR, ··· 6315 6238 } 6316 6239 6317 6240 static const struct { enum hybrid_pmu_type id; char *name; } intel_hybrid_pmu_type_map[] __initconst = { 6318 - { hybrid_small, "cpu_atom" }, 6319 - { hybrid_big, "cpu_core" }, 6241 + { hybrid_small, "cpu_atom" }, 6242 + { hybrid_big, "cpu_core" }, 6243 + { hybrid_tiny, "cpu_lowpower" }, 6320 6244 }; 6321 6245 6322 6246 static __always_inline int intel_pmu_init_hybrid(enum hybrid_pmu_type pmus) ··· 6350 6272 0, x86_pmu_num_counters(&pmu->pmu), 0, 0); 6351 6273 6352 6274 pmu->intel_cap.capabilities = x86_pmu.intel_cap.capabilities; 6353 - if (pmu->pmu_type & hybrid_small) { 6275 + if (pmu->pmu_type & hybrid_small_tiny) { 6354 6276 pmu->intel_cap.perf_metrics = 0; 6355 6277 pmu->intel_cap.pebs_output_pt_available = 1; 6356 6278 pmu->mid_ack = true; ··· 7187 7109 intel_pmu_pebs_data_source_lnl(); 7188 7110 pr_cont("Lunarlake Hybrid events, "); 7189 7111 name = "lunarlake_hybrid"; 7112 + break; 7113 + 7114 + case INTEL_ARROWLAKE_H: 7115 + intel_pmu_init_hybrid(hybrid_big_small_tiny); 7116 + 7117 + x86_pmu.pebs_latency_data = arl_h_latency_data; 7118 + 
x86_pmu.get_event_constraints = arl_h_get_event_constraints; 7119 + x86_pmu.hw_config = arl_h_hw_config; 7120 + 7121 + td_attr = arl_h_hybrid_events_attrs; 7122 + mem_attr = arl_h_hybrid_mem_attrs; 7123 + tsx_attr = adl_hybrid_tsx_attrs; 7124 + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? 7125 + mtl_hybrid_extra_attr_rtm : mtl_hybrid_extra_attr; 7126 + 7127 + /* Initialize big core specific PerfMon capabilities. */ 7128 + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_CORE_IDX]; 7129 + intel_pmu_init_lnc(&pmu->pmu); 7130 + 7131 + /* Initialize Atom core specific PerfMon capabilities. */ 7132 + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX]; 7133 + intel_pmu_init_skt(&pmu->pmu); 7134 + 7135 + /* Initialize Lower Power Atom specific PerfMon capabilities. */ 7136 + pmu = &x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX]; 7137 + intel_pmu_init_grt(&pmu->pmu); 7138 + pmu->extra_regs = intel_cmt_extra_regs; 7139 + 7140 + intel_pmu_pebs_data_source_arl_h(); 7141 + pr_cont("ArrowLake-H Hybrid events, "); 7142 + name = "arrowlake_h_hybrid"; 7190 7143 break; 7191 7144 7192 7145 default:
+21
arch/x86/events/intel/ds.c
··· 177 177 __intel_pmu_pebs_data_source_cmt(data_source); 178 178 } 179 179 180 + void __init intel_pmu_pebs_data_source_arl_h(void) 181 + { 182 + u64 *data_source; 183 + 184 + intel_pmu_pebs_data_source_lnl(); 185 + 186 + data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_TINY_IDX].pebs_data_source; 187 + memcpy(data_source, pebs_data_source, sizeof(pebs_data_source)); 188 + __intel_pmu_pebs_data_source_cmt(data_source); 189 + } 190 + 180 191 void __init intel_pmu_pebs_data_source_cmt(void) 181 192 { 182 193 __intel_pmu_pebs_data_source_cmt(pebs_data_source); ··· 397 386 return cmt_latency_data(event, status); 398 387 399 388 return lnc_latency_data(event, status); 389 + } 390 + 391 + u64 arl_h_latency_data(struct perf_event *event, u64 status) 392 + { 393 + struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu); 394 + 395 + if (pmu->pmu_type == hybrid_tiny) 396 + return cmt_latency_data(event, status); 397 + 398 + return lnl_latency_data(event, status); 400 399 } 401 400 402 401 static u64 load_latency_data(struct perf_event *event, u64 status)
+78 -6
arch/x86/events/intel/pt.c
··· 418 418 struct pt *pt = this_cpu_ptr(&pt_ctx); 419 419 u64 ctl = event->hw.aux_config; 420 420 421 + if (READ_ONCE(event->hw.aux_paused)) 422 + return; 423 + 421 424 ctl |= RTIT_CTL_TRACEEN; 422 425 if (READ_ONCE(pt->vmx_on)) 423 426 perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_PARTIAL); ··· 537 534 reg |= (event->attr.config & PT_CONFIG_MASK); 538 535 539 536 event->hw.aux_config = reg; 537 + 538 + /* 539 + * Allow resume before starting so as not to overwrite a value set by a 540 + * PMI. 541 + */ 542 + barrier(); 543 + WRITE_ONCE(pt->resume_allowed, 1); 544 + /* Configuration is complete, it is now OK to handle an NMI */ 545 + barrier(); 546 + WRITE_ONCE(pt->handle_nmi, 1); 547 + barrier(); 540 548 pt_config_start(event); 549 + barrier(); 550 + /* 551 + * Allow pause after starting so its pt_config_stop() doesn't race with 552 + * pt_config_start(). 553 + */ 554 + WRITE_ONCE(pt->pause_allowed, 1); 541 555 } 542 556 543 557 static void pt_config_stop(struct perf_event *event) ··· 848 828 buf->cur_idx++; 849 829 850 830 if (buf->cur_idx == buf->cur->last) { 851 - if (buf->cur == buf->last) 831 + if (buf->cur == buf->last) { 852 832 buf->cur = buf->first; 853 - else 833 + buf->wrapped = true; 834 + } else { 854 835 buf->cur = list_entry(buf->cur->list.next, struct topa, 855 836 list); 837 + } 856 838 buf->cur_idx = 0; 857 839 } 858 840 } ··· 868 846 static void pt_update_head(struct pt *pt) 869 847 { 870 848 struct pt_buffer *buf = perf_get_aux(&pt->handle); 849 + bool wrapped = buf->wrapped; 871 850 u64 topa_idx, base, old; 851 + 852 + buf->wrapped = false; 872 853 873 854 if (buf->single) { 874 855 local_set(&buf->data_size, buf->output_off); ··· 890 865 } else { 891 866 old = (local64_xchg(&buf->head, base) & 892 867 ((buf->nr_pages << PAGE_SHIFT) - 1)); 893 - if (base < old) 868 + if (base < old || (base == old && wrapped)) 894 869 base += buf->nr_pages << PAGE_SHIFT; 895 870 896 871 local_add(base - old, &buf->data_size); ··· 1536 1511 buf = 
perf_aux_output_begin(&pt->handle, event); 1537 1512 if (!buf) { 1538 1513 event->hw.state = PERF_HES_STOPPED; 1514 + WRITE_ONCE(pt->resume_allowed, 0); 1539 1515 return; 1540 1516 } 1541 1517 ··· 1545 1519 ret = pt_buffer_reset_markers(buf, &pt->handle); 1546 1520 if (ret) { 1547 1521 perf_aux_output_end(&pt->handle, 0); 1522 + WRITE_ONCE(pt->resume_allowed, 0); 1548 1523 return; 1549 1524 } 1550 1525 ··· 1600 1573 struct pt *pt = this_cpu_ptr(&pt_ctx); 1601 1574 struct pt_buffer *buf; 1602 1575 1576 + if (mode & PERF_EF_RESUME) { 1577 + if (READ_ONCE(pt->resume_allowed)) { 1578 + u64 status; 1579 + 1580 + /* 1581 + * Only if the trace is not active and the error and 1582 + * stopped bits are clear, is it safe to start, but a 1583 + * PMI might have just cleared these, so resume_allowed 1584 + * must be checked again also. 1585 + */ 1586 + rdmsrl(MSR_IA32_RTIT_STATUS, status); 1587 + if (!(status & (RTIT_STATUS_TRIGGEREN | 1588 + RTIT_STATUS_ERROR | 1589 + RTIT_STATUS_STOPPED)) && 1590 + READ_ONCE(pt->resume_allowed)) 1591 + pt_config_start(event); 1592 + } 1593 + return; 1594 + } 1595 + 1603 1596 buf = perf_aux_output_begin(&pt->handle, event); 1604 1597 if (!buf) 1605 1598 goto fail_stop; ··· 1630 1583 goto fail_end_stop; 1631 1584 } 1632 1585 1633 - WRITE_ONCE(pt->handle_nmi, 1); 1634 1586 hwc->state = 0; 1635 1587 1636 1588 pt_config_buffer(buf); ··· 1647 1601 { 1648 1602 struct pt *pt = this_cpu_ptr(&pt_ctx); 1649 1603 1604 + if (mode & PERF_EF_PAUSE) { 1605 + if (READ_ONCE(pt->pause_allowed)) 1606 + pt_config_stop(event); 1607 + return; 1608 + } 1609 + 1650 1610 /* 1651 1611 * Protect against the PMI racing with disabling wrmsr, 1652 1612 * see comment in intel_pt_interrupt(). 1653 1613 */ 1654 1614 WRITE_ONCE(pt->handle_nmi, 0); 1615 + barrier(); 1616 + 1617 + /* 1618 + * Prevent a resume from attempting to restart tracing, or a pause 1619 + * during a subsequent start. 
Do this after clearing handle_nmi so that 1620 + * pt_event_snapshot_aux() will not re-allow them. 1621 + */ 1622 + WRITE_ONCE(pt->pause_allowed, 0); 1623 + WRITE_ONCE(pt->resume_allowed, 0); 1655 1624 barrier(); 1656 1625 1657 1626 pt_config_stop(event); ··· 1718 1657 if (WARN_ON_ONCE(!buf->snapshot)) 1719 1658 return 0; 1720 1659 1660 + /* Prevent pause/resume from attempting to start/stop tracing */ 1661 + WRITE_ONCE(pt->pause_allowed, 0); 1662 + WRITE_ONCE(pt->resume_allowed, 0); 1663 + barrier(); 1721 1664 /* 1722 1665 * There is no PT interrupt in this mode, so stop the trace and it will 1723 1666 * remain stopped while the buffer is copied. ··· 1741 1676 * Here, handle_nmi tells us if the tracing was on. 1742 1677 * If the tracing was on, restart it. 1743 1678 */ 1744 - if (READ_ONCE(pt->handle_nmi)) 1679 + if (READ_ONCE(pt->handle_nmi)) { 1680 + WRITE_ONCE(pt->resume_allowed, 1); 1681 + barrier(); 1745 1682 pt_config_start(event); 1683 + barrier(); 1684 + WRITE_ONCE(pt->pause_allowed, 1); 1685 + } 1746 1686 1747 1687 return ret; 1748 1688 } ··· 1863 1793 if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) 1864 1794 pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; 1865 1795 1866 - pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; 1796 + pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | 1797 + PERF_PMU_CAP_ITRACE | 1798 + PERF_PMU_CAP_AUX_PAUSE; 1867 1799 pt_pmu.pmu.attr_groups = pt_attr_groups; 1868 1800 pt_pmu.pmu.task_ctx_nr = perf_sw_context; 1869 1801 pt_pmu.pmu.event_init = pt_event_init;
+6
arch/x86/events/intel/pt.h
··· 65 65 * @head: logical write offset inside the buffer 66 66 * @snapshot: if this is for a snapshot/overwrite counter 67 67 * @single: use Single Range Output instead of ToPA 68 + * @wrapped: buffer advance wrapped back to the first topa table 68 69 * @stop_pos: STOP topa entry index 69 70 * @intr_pos: INT topa entry index 70 71 * @stop_te: STOP topa entry pointer ··· 83 82 local64_t head; 84 83 bool snapshot; 85 84 bool single; 85 + bool wrapped; 86 86 long stop_pos, intr_pos; 87 87 struct topa_entry *stop_te, *intr_te; 88 88 void **data_pages; ··· 119 117 * @filters: last configured filters 120 118 * @handle_nmi: do handle PT PMI on this cpu, there's an active event 121 119 * @vmx_on: 1 if VMX is ON on this cpu 120 + * @pause_allowed: PERF_EF_PAUSE is allowed to stop tracing 121 + * @resume_allowed: PERF_EF_RESUME is allowed to start tracing 122 122 * @output_base: cached RTIT_OUTPUT_BASE MSR value 123 123 * @output_mask: cached RTIT_OUTPUT_MASK MSR value 124 124 */ ··· 129 125 struct pt_filters filters; 130 126 int handle_nmi; 131 127 int vmx_on; 128 + int pause_allowed; 129 + int resume_allowed; 132 130 u64 output_base; 133 131 u64 output_mask; 134 132 };
+27 -9
arch/x86/events/perf_event.h
··· 668 668 #define PERF_PEBS_DATA_SOURCE_GRT_MAX 0x10 669 669 #define PERF_PEBS_DATA_SOURCE_GRT_MASK (PERF_PEBS_DATA_SOURCE_GRT_MAX - 1) 670 670 671 + /* 672 + * CPUID.1AH.EAX[31:0] uniquely identifies the microarchitecture 673 + * of the core. Bits 31-24 indicates its core type (Core or Atom) 674 + * and Bits [23:0] indicates the native model ID of the core. 675 + * Core type and native model ID are defined in below enumerations. 676 + */ 671 677 enum hybrid_cpu_type { 672 678 HYBRID_INTEL_NONE, 673 679 HYBRID_INTEL_ATOM = 0x20, 674 680 HYBRID_INTEL_CORE = 0x40, 675 681 }; 676 682 677 - enum hybrid_pmu_type { 678 - not_hybrid, 679 - hybrid_small = BIT(0), 680 - hybrid_big = BIT(1), 681 - 682 - hybrid_big_small = hybrid_big | hybrid_small, /* only used for matching */ 683 - }; 684 - 685 683 #define X86_HYBRID_PMU_ATOM_IDX 0 686 684 #define X86_HYBRID_PMU_CORE_IDX 1 685 + #define X86_HYBRID_PMU_TINY_IDX 2 687 686 688 - #define X86_HYBRID_NUM_PMUS 2 687 + enum hybrid_pmu_type { 688 + not_hybrid, 689 + hybrid_small = BIT(X86_HYBRID_PMU_ATOM_IDX), 690 + hybrid_big = BIT(X86_HYBRID_PMU_CORE_IDX), 691 + hybrid_tiny = BIT(X86_HYBRID_PMU_TINY_IDX), 692 + 693 + /* The belows are only used for matching */ 694 + hybrid_big_small = hybrid_big | hybrid_small, 695 + hybrid_small_tiny = hybrid_small | hybrid_tiny, 696 + hybrid_big_small_tiny = hybrid_big | hybrid_small_tiny, 697 + }; 698 + 699 + enum atom_native_id { 700 + cmt_native_id = 0x2, /* Crestmont */ 701 + skt_native_id = 0x3, /* Skymont */ 702 + }; 689 703 690 704 struct x86_hybrid_pmu { 691 705 struct pmu pmu; ··· 1592 1578 1593 1579 u64 lnl_latency_data(struct perf_event *event, u64 status); 1594 1580 1581 + u64 arl_h_latency_data(struct perf_event *event, u64 status); 1582 + 1595 1583 extern struct event_constraint intel_core2_pebs_event_constraints[]; 1596 1584 1597 1585 extern struct event_constraint intel_atom_pebs_event_constraints[]; ··· 1712 1696 void intel_pmu_pebs_data_source_grt(void); 1713 1697 1714 1698 
void intel_pmu_pebs_data_source_mtl(void); 1699 + 1700 + void intel_pmu_pebs_data_source_arl_h(void); 1715 1701 1716 1702 void intel_pmu_pebs_data_source_cmt(void); 1717 1703
+34 -96
arch/x86/events/rapl.c
··· 148 148 /* 1/2^hw_unit Joule */ 149 149 static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; 150 150 static struct rapl_pmus *rapl_pmus; 151 - static cpumask_t rapl_cpu_mask; 152 151 static unsigned int rapl_cntr_mask; 153 152 static u64 rapl_timer_ms; 154 153 static struct perf_msr *rapl_msrs; ··· 368 369 if (event->cpu < 0) 369 370 return -EINVAL; 370 371 371 - event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; 372 - 373 372 if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) 374 373 return -EINVAL; 375 374 ··· 386 389 pmu = cpu_to_rapl_pmu(event->cpu); 387 390 if (!pmu) 388 391 return -EINVAL; 389 - event->cpu = pmu->cpu; 390 392 event->pmu_private = pmu; 391 393 event->hw.event_base = rapl_msrs[bit].msr; 392 394 event->hw.config = cfg; ··· 398 402 { 399 403 rapl_event_update(event); 400 404 } 401 - 402 - static ssize_t rapl_get_attr_cpumask(struct device *dev, 403 - struct device_attribute *attr, char *buf) 404 - { 405 - return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); 406 - } 407 - 408 - static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); 409 - 410 - static struct attribute *rapl_pmu_attrs[] = { 411 - &dev_attr_cpumask.attr, 412 - NULL, 413 - }; 414 - 415 - static struct attribute_group rapl_pmu_attr_group = { 416 - .attrs = rapl_pmu_attrs, 417 - }; 418 405 419 406 RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); 420 407 RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); ··· 446 467 }; 447 468 448 469 static const struct attribute_group *rapl_attr_groups[] = { 449 - &rapl_pmu_attr_group, 450 470 &rapl_pmu_format_group, 451 471 &rapl_pmu_events_group, 452 472 NULL, ··· 548 570 [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, 549 571 }; 550 572 551 - static int rapl_cpu_offline(unsigned int cpu) 552 - { 553 - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); 554 - int target; 555 - 556 - /* Check if exiting cpu is used for collecting rapl events */ 557 - if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) 558 
- return 0; 559 - 560 - pmu->cpu = -1; 561 - /* Find a new cpu to collect rapl events */ 562 - target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); 563 - 564 - /* Migrate rapl events to the new target */ 565 - if (target < nr_cpu_ids) { 566 - cpumask_set_cpu(target, &rapl_cpu_mask); 567 - pmu->cpu = target; 568 - perf_pmu_migrate_context(pmu->pmu, cpu, target); 569 - } 570 - return 0; 571 - } 572 - 573 - static int rapl_cpu_online(unsigned int cpu) 574 - { 575 - s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); 576 - if (rapl_pmu_idx < 0) { 577 - pr_err("topology_logical_(package/die)_id() returned a negative value"); 578 - return -EINVAL; 579 - } 580 - struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); 581 - int target; 582 - 583 - if (!pmu) { 584 - pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); 585 - if (!pmu) 586 - return -ENOMEM; 587 - 588 - raw_spin_lock_init(&pmu->lock); 589 - INIT_LIST_HEAD(&pmu->active_list); 590 - pmu->pmu = &rapl_pmus->pmu; 591 - pmu->timer_interval = ms_to_ktime(rapl_timer_ms); 592 - rapl_hrtimer_init(pmu); 593 - 594 - rapl_pmus->pmus[rapl_pmu_idx] = pmu; 595 - } 596 - 597 - /* 598 - * Check if there is an online cpu in the package which collects rapl 599 - * events already. 
600 - */ 601 - target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); 602 - if (target < nr_cpu_ids) 603 - return 0; 604 - 605 - cpumask_set_cpu(cpu, &rapl_cpu_mask); 606 - pmu->cpu = cpu; 607 - return 0; 608 - } 609 - 610 573 static int rapl_check_hw_unit(struct rapl_model *rm) 611 574 { 612 575 u64 msr_rapl_power_unit_bits; ··· 626 707 NULL, 627 708 }; 628 709 710 + static int __init init_rapl_pmu(void) 711 + { 712 + struct rapl_pmu *pmu; 713 + int idx; 714 + 715 + for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { 716 + pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); 717 + if (!pmu) 718 + goto free; 719 + 720 + raw_spin_lock_init(&pmu->lock); 721 + INIT_LIST_HEAD(&pmu->active_list); 722 + pmu->pmu = &rapl_pmus->pmu; 723 + pmu->timer_interval = ms_to_ktime(rapl_timer_ms); 724 + rapl_hrtimer_init(pmu); 725 + 726 + rapl_pmus->pmus[idx] = pmu; 727 + } 728 + 729 + return 0; 730 + free: 731 + for (; idx > 0; idx--) 732 + kfree(rapl_pmus->pmus[idx - 1]); 733 + return -ENOMEM; 734 + } 735 + 629 736 static int __init init_rapl_pmus(void) 630 737 { 631 738 int nr_rapl_pmu = topology_max_packages(); 739 + int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; 632 740 633 - if (!rapl_pmu_is_pkg_scope()) 741 + if (!rapl_pmu_is_pkg_scope()) { 634 742 nr_rapl_pmu *= topology_max_dies_per_package(); 743 + rapl_pmu_scope = PERF_PMU_SCOPE_DIE; 744 + } 635 745 636 746 rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); 637 747 if (!rapl_pmus) ··· 676 728 rapl_pmus->pmu.start = rapl_pmu_event_start; 677 729 rapl_pmus->pmu.stop = rapl_pmu_event_stop; 678 730 rapl_pmus->pmu.read = rapl_pmu_event_read; 731 + rapl_pmus->pmu.scope = rapl_pmu_scope; 679 732 rapl_pmus->pmu.module = THIS_MODULE; 680 733 rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; 681 - return 0; 734 + 735 + return init_rapl_pmu(); 682 736 } 683 737 684 738 static struct rapl_model model_snb = { ··· 826 876 if (ret) 827 877 return ret; 828 878 829 - /* 830 - * Install callbacks. 
Core will call them for each online cpu. 831 - */ 832 - ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, 833 - "perf/x86/rapl:online", 834 - rapl_cpu_online, rapl_cpu_offline); 835 - if (ret) 836 - goto out; 837 - 838 879 ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); 839 880 if (ret) 840 - goto out1; 881 + goto out; 841 882 842 883 rapl_advertise(); 843 884 return 0; 844 885 845 - out1: 846 - cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); 847 886 out: 848 887 pr_warn("Initialization failed (%d), disabled\n", ret); 849 888 cleanup_rapl_pmus(); ··· 842 903 843 904 static void __exit intel_rapl_exit(void) 844 905 { 845 - cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); 846 906 perf_pmu_unregister(&rapl_pmus->pmu); 847 907 cleanup_rapl_pmus(); 848 908 }
+6
arch/x86/include/asm/cpu.h
··· 32 32 extern bool handle_guest_split_lock(unsigned long ip); 33 33 extern void handle_bus_lock(struct pt_regs *regs); 34 34 u8 get_this_hybrid_cpu_type(void); 35 + u32 get_this_hybrid_cpu_native_id(void); 35 36 #else 36 37 static inline void __init sld_setup(struct cpuinfo_x86 *c) {} 37 38 static inline bool handle_user_split_lock(struct pt_regs *regs, long error_code) ··· 48 47 static inline void handle_bus_lock(struct pt_regs *regs) {} 49 48 50 49 static inline u8 get_this_hybrid_cpu_type(void) 50 + { 51 + return 0; 52 + } 53 + 54 + static inline u32 get_this_hybrid_cpu_native_id(void) 51 55 { 52 56 return 0; 53 57 }
+7 -5
arch/x86/include/asm/perf_event.h
··· 536 536 u64 *xmm_regs; 537 537 }; 538 538 539 - extern unsigned long perf_instruction_pointer(struct pt_regs *regs); 540 - extern unsigned long perf_misc_flags(struct pt_regs *regs); 541 - #define perf_misc_flags(regs) perf_misc_flags(regs) 539 + extern unsigned long perf_arch_instruction_pointer(struct pt_regs *regs); 540 + extern unsigned long perf_arch_misc_flags(struct pt_regs *regs); 541 + extern unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs); 542 + #define perf_arch_misc_flags(regs) perf_arch_misc_flags(regs) 543 + #define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) 542 544 543 545 #include <asm/stacktrace.h> 544 546 545 547 /* 546 - * We abuse bit 3 from flags to pass exact information, see perf_misc_flags 547 - * and the comment with PERF_EFLAGS_EXACT. 548 + * We abuse bit 3 from flags to pass exact information, see 549 + * perf_arch_misc_flags() and the comment with PERF_EFLAGS_EXACT. 548 550 */ 549 551 #define perf_arch_fetch_caller_regs(regs, __ip) { \ 550 552 (regs)->ip = (__ip); \
+15
arch/x86/kernel/cpu/intel.c
··· 1299 1299 1300 1300 return cpuid_eax(0x0000001a) >> X86_HYBRID_CPU_TYPE_ID_SHIFT; 1301 1301 } 1302 + 1303 + /** 1304 + * get_this_hybrid_cpu_native_id() - Get the native id of this hybrid CPU 1305 + * 1306 + * Returns the uarch native ID [23:0] of a CPU in a hybrid processor. 1307 + * If the processor is not hybrid, returns 0. 1308 + */ 1309 + u32 get_this_hybrid_cpu_native_id(void) 1310 + { 1311 + if (!cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) 1312 + return 0; 1313 + 1314 + return cpuid_eax(0x0000001a) & 1315 + (BIT_ULL(X86_HYBRID_CPU_TYPE_ID_SHIFT) - 1); 1316 + }
-1
include/linux/cpuhotplug.h
··· 208 208 CPUHP_AP_PERF_X86_UNCORE_ONLINE, 209 209 CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, 210 210 CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, 211 - CPUHP_AP_PERF_X86_RAPL_ONLINE, 212 211 CPUHP_AP_PERF_S390_CF_ONLINE, 213 212 CPUHP_AP_PERF_S390_SF_ONLINE, 214 213 CPUHP_AP_PERF_ARM_CCI_ONLINE,
+51 -3
include/linux/perf_event.h
··· 170 170 }; 171 171 struct { /* aux / Intel-PT */ 172 172 u64 aux_config; 173 + /* 174 + * For AUX area events, aux_paused cannot be a state 175 + * flag because it can be updated asynchronously to 176 + * state. 177 + */ 178 + unsigned int aux_paused; 173 179 }; 174 180 struct { /* software */ 175 181 struct hrtimer hrtimer; ··· 300 294 #define PERF_PMU_CAP_NO_EXCLUDE 0x0040 301 295 #define PERF_PMU_CAP_AUX_OUTPUT 0x0080 302 296 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 297 + #define PERF_PMU_CAP_AUX_PAUSE 0x0200 303 298 304 299 /** 305 300 * pmu::scope ··· 391 384 #define PERF_EF_START 0x01 /* start the counter when adding */ 392 385 #define PERF_EF_RELOAD 0x02 /* reload the counter when starting */ 393 386 #define PERF_EF_UPDATE 0x04 /* update the counter when stopping */ 387 + #define PERF_EF_PAUSE 0x08 /* AUX area event, pause tracing */ 388 + #define PERF_EF_RESUME 0x10 /* AUX area event, resume tracing */ 394 389 395 390 /* 396 391 * Adds/Removes a counter to/from the PMU, can be done inside a ··· 432 423 * 433 424 * ->start() with PERF_EF_RELOAD will reprogram the counter 434 425 * value, must be preceded by a ->stop() with PERF_EF_UPDATE. 426 + * 427 + * ->stop() with PERF_EF_PAUSE will stop as simply as possible. Will not 428 + * overlap another ->stop() with PERF_EF_PAUSE nor ->start() with 429 + * PERF_EF_RESUME. 430 + * 431 + * ->start() with PERF_EF_RESUME will start as simply as possible but 432 + * only if the counter is not otherwise stopped. Will not overlap 433 + * another ->start() with PERF_EF_RESUME nor ->stop() with 434 + * PERF_EF_PAUSE. 435 + * 436 + * Notably, PERF_EF_PAUSE/PERF_EF_RESUME *can* be concurrent with other 437 + * ->stop()/->start() invocations, just not itself. 
435 438 */ 436 439 void (*start) (struct perf_event *event, int flags); 437 440 void (*stop) (struct perf_event *event, int flags); ··· 1676 1655 struct task_struct *task); 1677 1656 extern void perf_bp_event(struct perf_event *event, void *data); 1678 1657 1679 - #ifndef perf_misc_flags 1680 - # define perf_misc_flags(regs) \ 1658 + extern unsigned long perf_misc_flags(struct perf_event *event, struct pt_regs *regs); 1659 + extern unsigned long perf_instruction_pointer(struct perf_event *event, 1660 + struct pt_regs *regs); 1661 + 1662 + #ifndef perf_arch_misc_flags 1663 + # define perf_arch_misc_flags(regs) \ 1681 1664 (user_mode(regs) ? PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL) 1682 - # define perf_instruction_pointer(regs) instruction_pointer(regs) 1665 + # define perf_arch_instruction_pointer(regs) instruction_pointer(regs) 1683 1666 #endif 1684 1667 #ifndef perf_arch_bpf_user_pt_regs 1685 1668 # define perf_arch_bpf_user_pt_regs(regs) regs 1669 + #endif 1670 + 1671 + #ifndef perf_arch_guest_misc_flags 1672 + static inline unsigned long perf_arch_guest_misc_flags(struct pt_regs *regs) 1673 + { 1674 + unsigned long guest_state = perf_guest_state(); 1675 + 1676 + if (!(guest_state & PERF_GUEST_ACTIVE)) 1677 + return 0; 1678 + 1679 + if (guest_state & PERF_GUEST_USER) 1680 + return PERF_RECORD_MISC_GUEST_USER; 1681 + else 1682 + return PERF_RECORD_MISC_GUEST_KERNEL; 1683 + } 1684 + # define perf_arch_guest_misc_flags(regs) perf_arch_guest_misc_flags(regs) 1686 1685 #endif 1687 1686 1688 1687 static inline bool has_branch_stack(struct perf_event *event) ··· 1718 1677 static inline bool has_aux(struct perf_event *event) 1719 1678 { 1720 1679 return event->pmu->setup_aux; 1680 + } 1681 + 1682 + static inline bool has_aux_action(struct perf_event *event) 1683 + { 1684 + return event->attr.aux_sample_size || 1685 + event->attr.aux_pause || 1686 + event->attr.aux_resume; 1721 1687 } 1722 1688 1723 1689 static inline bool is_write_backward(struct perf_event 
*event)
+76 -7
include/linux/uprobes.h
··· 15 15 #include <linux/rbtree.h> 16 16 #include <linux/types.h> 17 17 #include <linux/wait.h> 18 + #include <linux/timer.h> 18 19 19 20 struct uprobe; 20 21 struct vm_area_struct; ··· 24 23 struct notifier_block; 25 24 struct page; 26 25 26 + /* 27 + * Allowed return values from uprobe consumer's handler callback 28 + * with following meaning: 29 + * 30 + * UPROBE_HANDLER_REMOVE 31 + * - Remove the uprobe breakpoint from current->mm. 32 + * UPROBE_HANDLER_IGNORE 33 + * - Ignore ret_handler callback for this consumer. 34 + */ 27 35 #define UPROBE_HANDLER_REMOVE 1 28 - #define UPROBE_HANDLER_MASK 1 36 + #define UPROBE_HANDLER_IGNORE 2 29 37 30 38 #define MAX_URETPROBE_DEPTH 64 31 39 ··· 47 37 * for the current process. If filter() is omitted or returns true, 48 38 * UPROBE_HANDLER_REMOVE is effectively ignored. 49 39 */ 50 - int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); 40 + int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs, __u64 *data); 51 41 int (*ret_handler)(struct uprobe_consumer *self, 52 42 unsigned long func, 53 - struct pt_regs *regs); 43 + struct pt_regs *regs, __u64 *data); 54 44 bool (*filter)(struct uprobe_consumer *self, struct mm_struct *mm); 55 45 56 46 struct list_head cons_node; 47 + 48 + __u64 id; /* set when uprobe_consumer is registered */ 57 49 }; 58 50 59 51 #ifdef CONFIG_UPROBES ··· 68 56 UTASK_SSTEP_TRAPPED, 69 57 }; 70 58 59 + /* The state of hybrid-lifetime uprobe inside struct return_instance */ 60 + enum hprobe_state { 61 + HPROBE_LEASED, /* uretprobes_srcu-protected uprobe */ 62 + HPROBE_STABLE, /* refcounted uprobe */ 63 + HPROBE_GONE, /* NULL uprobe, SRCU expired, refcount failed */ 64 + HPROBE_CONSUMED, /* uprobe "consumed" by uretprobe handler */ 65 + }; 66 + 67 + /* 68 + * Hybrid lifetime uprobe. 
Represents a uprobe instance that could be either 69 + * SRCU protected (with SRCU protection eventually potentially timing out), 70 + * refcounted using uprobe->ref, or there could be no valid uprobe (NULL). 71 + * 72 + * hprobe's internal state is set up such that background timer thread can 73 + * atomically "downgrade" temporarily RCU-protected uprobe into refcounted one 74 + * (or no uprobe, if refcounting failed). 75 + * 76 + * *stable* pointer always points to the uprobe (or could be NULL if there 77 + * was no valid underlying uprobe to begin with). 78 + * 79 + * *leased* pointer is the key to achieving race-free atomic lifetime state 80 + * transition and can have three possible states: 81 + * - either the same non-NULL value as *stable*, in which case uprobe is 82 + * SRCU-protected; 83 + * - NULL, in which case uprobe (if there is any) is refcounted; 84 + * - special __UPROBE_DEAD value, which represents an uprobe that was SRCU 85 + * protected initially, but SRCU period timed out and we attempted to 86 + * convert it to refcounted, but refcount_inc_not_zero() failed, because 87 + * uprobe effectively went away (the last consumer unsubscribed). In this 88 + * case it's important to know that *stable* pointer (which still has 89 + * non-NULL uprobe pointer) shouldn't be used, because lifetime of 90 + * underlying uprobe is not guaranteed anymore. __UPROBE_DEAD is just an 91 + * internal marker and is handled transparently by hprobe_fetch() helper. 92 + * 93 + * When uprobe is SRCU-protected, we also record srcu_idx value, necessary for 94 + * SRCU unlocking. 95 + * 96 + * See hprobe_expire() and hprobe_fetch() for details of race-free uprobe 97 + * state transitioning details. It all hinges on atomic xchg() over *leased* 98 + * pointer. *stable* pointer, once initially set, is not modified concurrently. 
99 + */ 100 + struct hprobe { 101 + enum hprobe_state state; 102 + int srcu_idx; 103 + struct uprobe *uprobe; 104 + }; 105 + 71 106 /* 72 107 * uprobe_task: Metadata of a task while it singlesteps. 73 108 */ 74 109 struct uprobe_task { 75 110 enum uprobe_task_state state; 111 + 112 + unsigned int depth; 113 + struct return_instance *return_instances; 76 114 77 115 union { 78 116 struct { ··· 137 75 }; 138 76 139 77 struct uprobe *active_uprobe; 78 + struct timer_list ri_timer; 140 79 unsigned long xol_vaddr; 141 80 142 81 struct arch_uprobe *auprobe; 82 + }; 143 83 144 - struct return_instance *return_instances; 145 - unsigned int depth; 84 + struct return_consumer { 85 + __u64 cookie; 86 + __u64 id; 146 87 }; 147 88 148 89 struct return_instance { 149 - struct uprobe *uprobe; 90 + struct hprobe hprobe; 150 91 unsigned long func; 151 92 unsigned long stack; /* stack pointer */ 152 93 unsigned long orig_ret_vaddr; /* original return address */ 153 94 bool chained; /* true, if instance is nested */ 95 + int consumers_cnt; 154 96 155 97 struct return_instance *next; /* keep as stack */ 156 - }; 98 + struct rcu_head rcu; 99 + 100 + struct return_consumer consumers[] __counted_by(consumers_cnt); 101 + } ____cacheline_aligned; 157 102 158 103 enum rp_check { 159 104 RP_CHECK_CALL,
+10 -1
include/uapi/linux/perf_event.h
··· 511 511 __u16 sample_max_stack; 512 512 __u16 __reserved_2; 513 513 __u32 aux_sample_size; 514 - __u32 __reserved_3; 514 + 515 + union { 516 + __u32 aux_action; 517 + struct { 518 + __u32 aux_start_paused : 1, /* start AUX area tracing paused */ 519 + aux_pause : 1, /* on overflow, pause AUX area tracing */ 520 + aux_resume : 1, /* on overflow, resume AUX area tracing */ 521 + __reserved_3 : 29; 522 + }; 523 + }; 515 524 516 525 /* 517 526 * User provided data if sigtrap=1, passed back to user via
+96 -6
kernel/events/core.c
··· 2142 2142 2143 2143 static bool perf_need_aux_event(struct perf_event *event) 2144 2144 { 2145 - return !!event->attr.aux_output || !!event->attr.aux_sample_size; 2145 + return event->attr.aux_output || has_aux_action(event); 2146 2146 } 2147 2147 2148 2148 static int perf_get_aux_event(struct perf_event *event, ··· 2165 2165 2166 2166 if (event->attr.aux_output && 2167 2167 !perf_aux_output_match(event, group_leader)) 2168 + return 0; 2169 + 2170 + if ((event->attr.aux_pause || event->attr.aux_resume) && 2171 + !(group_leader->pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) 2168 2172 return 0; 2169 2173 2170 2174 if (event->attr.aux_sample_size && !group_leader->pmu->snapshot_aux) ··· 7007 7003 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); 7008 7004 #endif 7009 7005 7006 + static bool should_sample_guest(struct perf_event *event) 7007 + { 7008 + return !event->attr.exclude_guest && perf_guest_state(); 7009 + } 7010 + 7011 + unsigned long perf_misc_flags(struct perf_event *event, 7012 + struct pt_regs *regs) 7013 + { 7014 + if (should_sample_guest(event)) 7015 + return perf_arch_guest_misc_flags(regs); 7016 + 7017 + return perf_arch_misc_flags(regs); 7018 + } 7019 + 7020 + unsigned long perf_instruction_pointer(struct perf_event *event, 7021 + struct pt_regs *regs) 7022 + { 7023 + if (should_sample_guest(event)) 7024 + return perf_guest_get_ip(); 7025 + 7026 + return perf_arch_instruction_pointer(regs); 7027 + } 7028 + 7010 7029 static void 7011 7030 perf_output_sample_regs(struct perf_output_handle *handle, 7012 7031 struct pt_regs *regs, u64 mask) ··· 7847 7820 __perf_event_header__init_id(data, event, filtered_sample_type); 7848 7821 7849 7822 if (filtered_sample_type & PERF_SAMPLE_IP) { 7850 - data->ip = perf_instruction_pointer(regs); 7823 + data->ip = perf_instruction_pointer(event, regs); 7851 7824 data->sample_flags |= PERF_SAMPLE_IP; 7852 7825 } 7853 7826 ··· 8011 7984 { 8012 7985 header->type = PERF_RECORD_SAMPLE; 8013 7986 header->size = 
perf_sample_data_size(data, event); 8014 - header->misc = perf_misc_flags(regs); 7987 + header->misc = perf_misc_flags(event, regs); 8015 7988 8016 7989 /* 8017 7990 * If you're adding more sample types here, you likely need to do ··· 8022 7995 * do here next. 8023 7996 */ 8024 7997 WARN_ON_ONCE(header->size & 7); 7998 + } 7999 + 8000 + static void __perf_event_aux_pause(struct perf_event *event, bool pause) 8001 + { 8002 + if (pause) { 8003 + if (!event->hw.aux_paused) { 8004 + event->hw.aux_paused = 1; 8005 + event->pmu->stop(event, PERF_EF_PAUSE); 8006 + } 8007 + } else { 8008 + if (event->hw.aux_paused) { 8009 + event->hw.aux_paused = 0; 8010 + event->pmu->start(event, PERF_EF_RESUME); 8011 + } 8012 + } 8013 + } 8014 + 8015 + static void perf_event_aux_pause(struct perf_event *event, bool pause) 8016 + { 8017 + struct perf_buffer *rb; 8018 + 8019 + if (WARN_ON_ONCE(!event)) 8020 + return; 8021 + 8022 + rb = ring_buffer_get(event); 8023 + if (!rb) 8024 + return; 8025 + 8026 + scoped_guard (irqsave) { 8027 + /* 8028 + * Guard against self-recursion here. Another event could trip 8029 + * this same from NMI context. 
8030 + */ 8031 + if (READ_ONCE(rb->aux_in_pause_resume)) 8032 + break; 8033 + 8034 + WRITE_ONCE(rb->aux_in_pause_resume, 1); 8035 + barrier(); 8036 + __perf_event_aux_pause(event, pause); 8037 + barrier(); 8038 + WRITE_ONCE(rb->aux_in_pause_resume, 0); 8039 + } 8040 + ring_buffer_put(rb); 8025 8041 } 8026 8042 8027 8043 static __always_inline int ··· 9869 9799 9870 9800 ret = __perf_event_account_interrupt(event, throttle); 9871 9801 9802 + if (event->attr.aux_pause) 9803 + perf_event_aux_pause(event->aux_event, true); 9804 + 9872 9805 if (event->prog && event->prog->type == BPF_PROG_TYPE_PERF_EVENT && 9873 9806 !bpf_overflow_handler(event, data, regs)) 9874 - return ret; 9807 + goto out; 9875 9808 9876 9809 /* 9877 9810 * XXX event_limit might not quite work as expected on inherited ··· 9936 9863 event->pending_wakeup = 1; 9937 9864 irq_work_queue(&event->pending_irq); 9938 9865 } 9866 + out: 9867 + if (event->attr.aux_resume) 9868 + perf_event_aux_pause(event->aux_event, false); 9939 9869 9940 9870 return ret; 9941 9871 } ··· 12330 12254 } 12331 12255 12332 12256 if (event->attr.aux_output && 12333 - !(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT)) { 12257 + (!(pmu->capabilities & PERF_PMU_CAP_AUX_OUTPUT) || 12258 + event->attr.aux_pause || event->attr.aux_resume)) { 12334 12259 err = -EOPNOTSUPP; 12335 12260 goto err_pmu; 12261 + } 12262 + 12263 + if (event->attr.aux_pause && event->attr.aux_resume) { 12264 + err = -EINVAL; 12265 + goto err_pmu; 12266 + } 12267 + 12268 + if (event->attr.aux_start_paused) { 12269 + if (!(pmu->capabilities & PERF_PMU_CAP_AUX_PAUSE)) { 12270 + err = -EOPNOTSUPP; 12271 + goto err_pmu; 12272 + } 12273 + event->hw.aux_paused = 1; 12336 12274 } 12337 12275 12338 12276 if (cgroup_fd != -1) { ··· 13142 13052 * Grouping is not supported for kernel events, neither is 'AUX', 13143 13053 * make sure the caller's intentions are adjusted. 
13144 13054 */ 13145 - if (attr->aux_output) 13055 + if (attr->aux_output || attr->aux_action) 13146 13056 return ERR_PTR(-EINVAL); 13147 13057 13148 13058 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
+1
kernel/events/internal.h
··· 52 52 void (*free_aux)(void *); 53 53 refcount_t aux_refcount; 54 54 int aux_in_sampling; 55 + int aux_in_pause_resume; 55 56 void **aux_pages; 56 57 void *aux_priv; 57 58
+440 -172
kernel/events/uprobes.c
··· 26 26 #include <linux/task_work.h> 27 27 #include <linux/shmem_fs.h> 28 28 #include <linux/khugepaged.h> 29 + #include <linux/rcupdate_trace.h> 30 + #include <linux/workqueue.h> 31 + #include <linux/srcu.h> 29 32 30 33 #include <linux/uprobes.h> 31 34 ··· 45 42 static DEFINE_RWLOCK(uprobes_treelock); /* serialize rbtree access */ 46 43 static seqcount_rwlock_t uprobes_seqcount = SEQCNT_RWLOCK_ZERO(uprobes_seqcount, &uprobes_treelock); 47 44 48 - DEFINE_STATIC_SRCU(uprobes_srcu); 49 - 50 45 #define UPROBES_HASH_SZ 13 51 46 /* serialize uprobe->pending_list */ 52 47 static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 53 48 #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 54 49 55 50 DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); 51 + 52 + /* Covers return_instance's uprobe lifetime. */ 53 + DEFINE_STATIC_SRCU(uretprobes_srcu); 56 54 57 55 /* Have a copy of original instruction */ 58 56 #define UPROBE_COPY_INSN 0 ··· 66 62 struct list_head pending_list; 67 63 struct list_head consumers; 68 64 struct inode *inode; /* Also hold a ref to inode */ 69 - struct rcu_head rcu; 65 + union { 66 + struct rcu_head rcu; 67 + struct work_struct work; 68 + }; 70 69 loff_t offset; 71 70 loff_t ref_ctr_offset; 72 - unsigned long flags; 71 + unsigned long flags; /* "unsigned long" so bitops work */ 73 72 74 73 /* 75 74 * The generic code assumes that it has two members of unknown type ··· 107 100 */ 108 101 struct xol_area { 109 102 wait_queue_head_t wq; /* if all slots are busy */ 110 - atomic_t slot_count; /* number of in-use slots */ 111 103 unsigned long *bitmap; /* 0 = free slot */ 112 104 113 105 struct page *page; ··· 626 620 return !RB_EMPTY_NODE(&uprobe->rb_node); 627 621 } 628 622 629 - static void uprobe_free_rcu(struct rcu_head *rcu) 623 + static void uprobe_free_rcu_tasks_trace(struct rcu_head *rcu) 630 624 { 631 625 struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); 632 626 633 627 kfree(uprobe); 634 628 } 635 629 
636 - static void put_uprobe(struct uprobe *uprobe) 630 + static void uprobe_free_srcu(struct rcu_head *rcu) 637 631 { 638 - if (!refcount_dec_and_test(&uprobe->ref)) 639 - return; 632 + struct uprobe *uprobe = container_of(rcu, struct uprobe, rcu); 633 + 634 + call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu_tasks_trace); 635 + } 636 + 637 + static void uprobe_free_deferred(struct work_struct *work) 638 + { 639 + struct uprobe *uprobe = container_of(work, struct uprobe, work); 640 640 641 641 write_lock(&uprobes_treelock); 642 642 ··· 663 651 delayed_uprobe_remove(uprobe, NULL); 664 652 mutex_unlock(&delayed_uprobe_lock); 665 653 666 - call_srcu(&uprobes_srcu, &uprobe->rcu, uprobe_free_rcu); 654 + /* start srcu -> rcu_tasks_trace -> kfree chain */ 655 + call_srcu(&uretprobes_srcu, &uprobe->rcu, uprobe_free_srcu); 656 + } 657 + 658 + static void put_uprobe(struct uprobe *uprobe) 659 + { 660 + if (!refcount_dec_and_test(&uprobe->ref)) 661 + return; 662 + 663 + INIT_WORK(&uprobe->work, uprobe_free_deferred); 664 + schedule_work(&uprobe->work); 665 + } 666 + 667 + /* Initialize hprobe as SRCU-protected "leased" uprobe */ 668 + static void hprobe_init_leased(struct hprobe *hprobe, struct uprobe *uprobe, int srcu_idx) 669 + { 670 + WARN_ON(!uprobe); 671 + hprobe->state = HPROBE_LEASED; 672 + hprobe->uprobe = uprobe; 673 + hprobe->srcu_idx = srcu_idx; 674 + } 675 + 676 + /* Initialize hprobe as refcounted ("stable") uprobe (uprobe can be NULL). */ 677 + static void hprobe_init_stable(struct hprobe *hprobe, struct uprobe *uprobe) 678 + { 679 + hprobe->state = uprobe ? HPROBE_STABLE : HPROBE_GONE; 680 + hprobe->uprobe = uprobe; 681 + hprobe->srcu_idx = -1; 682 + } 683 + 684 + /* 685 + * hprobe_consume() fetches hprobe's underlying uprobe and detects whether 686 + * uprobe is SRCU protected or is refcounted. hprobe_consume() can be 687 + * used only once for a given hprobe. 
688 + * 689 + * Caller has to call hprobe_finalize() and pass previous hprobe_state, so 690 + * that hprobe_finalize() can perform SRCU unlock or put uprobe, whichever 691 + * is appropriate. 692 + */ 693 + static inline struct uprobe *hprobe_consume(struct hprobe *hprobe, enum hprobe_state *hstate) 694 + { 695 + *hstate = xchg(&hprobe->state, HPROBE_CONSUMED); 696 + switch (*hstate) { 697 + case HPROBE_LEASED: 698 + case HPROBE_STABLE: 699 + return hprobe->uprobe; 700 + case HPROBE_GONE: /* uprobe is NULL, no SRCU */ 701 + case HPROBE_CONSUMED: /* uprobe was finalized already, do nothing */ 702 + return NULL; 703 + default: 704 + WARN(1, "hprobe invalid state %d", *hstate); 705 + return NULL; 706 + } 707 + } 708 + 709 + /* 710 + * Reset hprobe state and, if hprobe was LEASED, release SRCU lock. 711 + * hprobe_finalize() can only be used from current context after 712 + * hprobe_consume() call (which determines uprobe and hstate value). 713 + */ 714 + static void hprobe_finalize(struct hprobe *hprobe, enum hprobe_state hstate) 715 + { 716 + switch (hstate) { 717 + case HPROBE_LEASED: 718 + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); 719 + break; 720 + case HPROBE_STABLE: 721 + put_uprobe(hprobe->uprobe); 722 + break; 723 + case HPROBE_GONE: 724 + case HPROBE_CONSUMED: 725 + break; 726 + default: 727 + WARN(1, "hprobe invalid state %d", hstate); 728 + break; 729 + } 730 + } 731 + 732 + /* 733 + * Attempt to switch (atomically) uprobe from being SRCU protected (LEASED) 734 + * to refcounted (STABLE) state. Competes with hprobe_consume(); only one of 735 + * them can win the race to perform SRCU unlocking. Whoever wins must perform 736 + * SRCU unlock. 737 + * 738 + * Returns underlying valid uprobe or NULL, if there was no underlying uprobe 739 + * to begin with or we failed to bump its refcount and it's going away. 740 + * 741 + * Returned non-NULL uprobe can be still safely used within an ongoing SRCU 742 + * locked region. 
If `get` is true, it's guaranteed that non-NULL uprobe has 743 + * an extra refcount for caller to assume and use. Otherwise, it's not 744 + * guaranteed that returned uprobe has a positive refcount, so caller has to 745 + * attempt try_get_uprobe(), if it needs to preserve uprobe beyond current 746 + * SRCU lock region. See dup_utask(). 747 + */ 748 + static struct uprobe *hprobe_expire(struct hprobe *hprobe, bool get) 749 + { 750 + enum hprobe_state hstate; 751 + 752 + /* 753 + * return_instance's hprobe is protected by RCU. 754 + * Underlying uprobe is itself protected from reuse by SRCU. 755 + */ 756 + lockdep_assert(rcu_read_lock_held() && srcu_read_lock_held(&uretprobes_srcu)); 757 + 758 + hstate = READ_ONCE(hprobe->state); 759 + switch (hstate) { 760 + case HPROBE_STABLE: 761 + /* uprobe has positive refcount, bump refcount, if necessary */ 762 + return get ? get_uprobe(hprobe->uprobe) : hprobe->uprobe; 763 + case HPROBE_GONE: 764 + /* 765 + * SRCU was unlocked earlier and we didn't manage to take 766 + * uprobe refcnt, so it's effectively NULL 767 + */ 768 + return NULL; 769 + case HPROBE_CONSUMED: 770 + /* 771 + * uprobe was consumed, so it's effectively NULL as far as 772 + * uretprobe processing logic is concerned 773 + */ 774 + return NULL; 775 + case HPROBE_LEASED: { 776 + struct uprobe *uprobe = try_get_uprobe(hprobe->uprobe); 777 + /* 778 + * Try to switch hprobe state, guarding against 779 + * hprobe_consume() or another hprobe_expire() racing with us. 780 + * Note, if we failed to get uprobe refcount, we use special 781 + * HPROBE_GONE state to signal that hprobe->uprobe shouldn't 782 + * be used as it will be freed after SRCU is unlocked. 783 + */ 784 + if (try_cmpxchg(&hprobe->state, &hstate, uprobe ? HPROBE_STABLE : HPROBE_GONE)) { 785 + /* We won the race, we are the ones to unlock SRCU */ 786 + __srcu_read_unlock(&uretprobes_srcu, hprobe->srcu_idx); 787 + return get ? 
get_uprobe(uprobe) : uprobe; 788 + } 789 + 790 + /* 791 + * We lost the race, undo refcount bump (if it ever happened), 792 + * unless caller would like an extra refcount anyways. 793 + */ 794 + if (uprobe && !get) 795 + put_uprobe(uprobe); 796 + /* 797 + * Even if hprobe_consume() or another hprobe_expire() wins 798 + * the state update race and unlocks SRCU from under us, we 799 + * still have a guarantee that underyling uprobe won't be 800 + * freed due to ongoing caller's SRCU lock region, so we can 801 + * return it regardless. Also, if `get` was true, we also have 802 + * an extra ref for the caller to own. This is used in dup_utask(). 803 + */ 804 + return uprobe; 805 + } 806 + default: 807 + WARN(1, "unknown hprobe state %d", hstate); 808 + return NULL; 809 + } 667 810 } 668 811 669 812 static __always_inline ··· 873 706 struct rb_node *node; 874 707 unsigned int seq; 875 708 876 - lockdep_assert(srcu_read_lock_held(&uprobes_srcu)); 709 + lockdep_assert(rcu_read_lock_trace_held()); 877 710 878 711 do { 879 712 seq = read_seqcount_begin(&uprobes_seqcount); ··· 992 825 993 826 static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc) 994 827 { 828 + static atomic64_t id; 829 + 995 830 down_write(&uprobe->consumer_rwsem); 996 831 list_add_rcu(&uc->cons_node, &uprobe->consumers); 832 + uc->id = (__u64) atomic64_inc_return(&id); 997 833 up_write(&uprobe->consumer_rwsem); 998 834 } 999 835 ··· 1104 934 bool ret = false; 1105 935 1106 936 down_read(&uprobe->consumer_rwsem); 1107 - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 1108 - srcu_read_lock_held(&uprobes_srcu)) { 937 + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { 1109 938 ret = consumer_filter(uc, mm); 1110 939 if (ret) 1111 940 break; ··· 1325 1156 * unlucky enough caller can free consumer's memory and cause 1326 1157 * handler_chain() or handle_uretprobe_chain() to do an use-after-free. 
1327 1158 */ 1328 - synchronize_srcu(&uprobes_srcu); 1159 + synchronize_rcu_tasks_trace(); 1160 + synchronize_srcu(&uretprobes_srcu); 1329 1161 } 1330 1162 EXPORT_SYMBOL_GPL(uprobe_unregister_sync); 1331 1163 ··· 1410 1240 int uprobe_apply(struct uprobe *uprobe, struct uprobe_consumer *uc, bool add) 1411 1241 { 1412 1242 struct uprobe_consumer *con; 1413 - int ret = -ENOENT, srcu_idx; 1243 + int ret = -ENOENT; 1414 1244 1415 1245 down_write(&uprobe->register_rwsem); 1416 1246 1417 - srcu_idx = srcu_read_lock(&uprobes_srcu); 1418 - list_for_each_entry_srcu(con, &uprobe->consumers, cons_node, 1419 - srcu_read_lock_held(&uprobes_srcu)) { 1247 + rcu_read_lock_trace(); 1248 + list_for_each_entry_rcu(con, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { 1420 1249 if (con == uc) { 1421 1250 ret = register_for_each_vma(uprobe, add ? uc : NULL); 1422 1251 break; 1423 1252 } 1424 1253 } 1425 - srcu_read_unlock(&uprobes_srcu, srcu_idx); 1254 + rcu_read_unlock_trace(); 1426 1255 1427 1256 up_write(&uprobe->register_rwsem); 1428 1257 ··· 1644 1475 return 0; 1645 1476 } 1646 1477 1478 + static int xol_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma) 1479 + { 1480 + return -EPERM; 1481 + } 1482 + 1647 1483 static const struct vm_special_mapping xol_mapping = { 1648 1484 .name = "[uprobes]", 1649 1485 .fault = xol_fault, 1486 + .mremap = xol_mremap, 1650 1487 }; 1651 1488 1652 1489 /* Slot allocation for XOL */ ··· 1728 1553 init_waitqueue_head(&area->wq); 1729 1554 /* Reserve the 1st slot for get_trampoline_vaddr() */ 1730 1555 set_bit(0, area->bitmap); 1731 - atomic_set(&area->slot_count, 1); 1732 1556 insns = arch_uprobe_trampoline(&insns_size); 1733 1557 arch_uprobe_copy_ixol(area->page, 0, insns, insns_size); 1734 1558 ··· 1800 1626 } 1801 1627 } 1802 1628 1803 - /* 1804 - * - search for a free slot. 
1805 - */ 1806 - static unsigned long xol_take_insn_slot(struct xol_area *area) 1629 + static unsigned long xol_get_slot_nr(struct xol_area *area) 1807 1630 { 1808 - unsigned long slot_addr; 1809 - int slot_nr; 1631 + unsigned long slot_nr; 1810 1632 1811 - do { 1812 - slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); 1813 - if (slot_nr < UINSNS_PER_PAGE) { 1814 - if (!test_and_set_bit(slot_nr, area->bitmap)) 1815 - break; 1633 + slot_nr = find_first_zero_bit(area->bitmap, UINSNS_PER_PAGE); 1634 + if (slot_nr < UINSNS_PER_PAGE) { 1635 + if (!test_and_set_bit(slot_nr, area->bitmap)) 1636 + return slot_nr; 1637 + } 1816 1638 1817 - slot_nr = UINSNS_PER_PAGE; 1818 - continue; 1819 - } 1820 - wait_event(area->wq, (atomic_read(&area->slot_count) < UINSNS_PER_PAGE)); 1821 - } while (slot_nr >= UINSNS_PER_PAGE); 1822 - 1823 - slot_addr = area->vaddr + (slot_nr * UPROBE_XOL_SLOT_BYTES); 1824 - atomic_inc(&area->slot_count); 1825 - 1826 - return slot_addr; 1639 + return UINSNS_PER_PAGE; 1827 1640 } 1828 1641 1829 1642 /* 1830 1643 * xol_get_insn_slot - allocate a slot for xol. 1831 - * Returns the allocated slot address or 0. 
1832 1644 */ 1833 - static unsigned long xol_get_insn_slot(struct uprobe *uprobe) 1645 + static bool xol_get_insn_slot(struct uprobe *uprobe, struct uprobe_task *utask) 1834 1646 { 1835 - struct xol_area *area; 1836 - unsigned long xol_vaddr; 1647 + struct xol_area *area = get_xol_area(); 1648 + unsigned long slot_nr; 1837 1649 1838 - area = get_xol_area(); 1839 1650 if (!area) 1840 - return 0; 1651 + return false; 1841 1652 1842 - xol_vaddr = xol_take_insn_slot(area); 1843 - if (unlikely(!xol_vaddr)) 1844 - return 0; 1653 + wait_event(area->wq, (slot_nr = xol_get_slot_nr(area)) < UINSNS_PER_PAGE); 1845 1654 1846 - arch_uprobe_copy_ixol(area->page, xol_vaddr, 1655 + utask->xol_vaddr = area->vaddr + slot_nr * UPROBE_XOL_SLOT_BYTES; 1656 + arch_uprobe_copy_ixol(area->page, utask->xol_vaddr, 1847 1657 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1848 - 1849 - return xol_vaddr; 1658 + return true; 1850 1659 } 1851 1660 1852 1661 /* 1853 - * xol_free_insn_slot - If slot was earlier allocated by 1854 - * @xol_get_insn_slot(), make the slot available for 1855 - * subsequent requests. 
1662 + * xol_free_insn_slot - free the slot allocated by xol_get_insn_slot() 1856 1663 */ 1857 - static void xol_free_insn_slot(struct task_struct *tsk) 1664 + static void xol_free_insn_slot(struct uprobe_task *utask) 1858 1665 { 1859 - struct xol_area *area; 1860 - unsigned long vma_end; 1861 - unsigned long slot_addr; 1666 + struct xol_area *area = current->mm->uprobes_state.xol_area; 1667 + unsigned long offset = utask->xol_vaddr - area->vaddr; 1668 + unsigned int slot_nr; 1862 1669 1863 - if (!tsk->mm || !tsk->mm->uprobes_state.xol_area || !tsk->utask) 1670 + utask->xol_vaddr = 0; 1671 + /* xol_vaddr must fit into [area->vaddr, area->vaddr + PAGE_SIZE) */ 1672 + if (WARN_ON_ONCE(offset >= PAGE_SIZE)) 1864 1673 return; 1865 1674 1866 - slot_addr = tsk->utask->xol_vaddr; 1867 - if (unlikely(!slot_addr)) 1868 - return; 1869 - 1870 - area = tsk->mm->uprobes_state.xol_area; 1871 - vma_end = area->vaddr + PAGE_SIZE; 1872 - if (area->vaddr <= slot_addr && slot_addr < vma_end) { 1873 - unsigned long offset; 1874 - int slot_nr; 1875 - 1876 - offset = slot_addr - area->vaddr; 1877 - slot_nr = offset / UPROBE_XOL_SLOT_BYTES; 1878 - if (slot_nr >= UINSNS_PER_PAGE) 1879 - return; 1880 - 1881 - clear_bit(slot_nr, area->bitmap); 1882 - atomic_dec(&area->slot_count); 1883 - smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ 1884 - if (waitqueue_active(&area->wq)) 1885 - wake_up(&area->wq); 1886 - 1887 - tsk->utask->xol_vaddr = 0; 1888 - } 1675 + slot_nr = offset / UPROBE_XOL_SLOT_BYTES; 1676 + clear_bit(slot_nr, area->bitmap); 1677 + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ 1678 + if (waitqueue_active(&area->wq)) 1679 + wake_up(&area->wq); 1889 1680 } 1890 1681 1891 1682 void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, ··· 1889 1750 return instruction_pointer(regs); 1890 1751 } 1891 1752 1892 - static struct return_instance *free_ret_instance(struct return_instance *ri) 1753 + static struct return_instance 
*free_ret_instance(struct return_instance *ri, bool cleanup_hprobe) 1893 1754 { 1894 1755 struct return_instance *next = ri->next; 1895 - put_uprobe(ri->uprobe); 1896 - kfree(ri); 1756 + 1757 + if (cleanup_hprobe) { 1758 + enum hprobe_state hstate; 1759 + 1760 + (void)hprobe_consume(&ri->hprobe, &hstate); 1761 + hprobe_finalize(&ri->hprobe, hstate); 1762 + } 1763 + 1764 + kfree_rcu(ri, rcu); 1897 1765 return next; 1898 1766 } 1899 1767 ··· 1916 1770 if (!utask) 1917 1771 return; 1918 1772 1919 - if (utask->active_uprobe) 1920 - put_uprobe(utask->active_uprobe); 1773 + WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr); 1774 + 1775 + timer_delete_sync(&utask->ri_timer); 1921 1776 1922 1777 ri = utask->return_instances; 1923 1778 while (ri) 1924 - ri = free_ret_instance(ri); 1779 + ri = free_ret_instance(ri, true /* cleanup_hprobe */); 1925 1780 1926 - xol_free_insn_slot(t); 1927 1781 kfree(utask); 1928 1782 t->utask = NULL; 1783 + } 1784 + 1785 + #define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */ 1786 + 1787 + #define for_each_ret_instance_rcu(pos, head) \ 1788 + for (pos = rcu_dereference_raw(head); pos; pos = rcu_dereference_raw(pos->next)) 1789 + 1790 + static void ri_timer(struct timer_list *timer) 1791 + { 1792 + struct uprobe_task *utask = container_of(timer, struct uprobe_task, ri_timer); 1793 + struct return_instance *ri; 1794 + 1795 + /* SRCU protects uprobe from reuse for the cmpxchg() inside hprobe_expire(). */ 1796 + guard(srcu)(&uretprobes_srcu); 1797 + /* RCU protects return_instance from freeing. 
*/ 1798 + guard(rcu)(); 1799 + 1800 + for_each_ret_instance_rcu(ri, utask->return_instances) 1801 + hprobe_expire(&ri->hprobe, false); 1802 + } 1803 + 1804 + static struct uprobe_task *alloc_utask(void) 1805 + { 1806 + struct uprobe_task *utask; 1807 + 1808 + utask = kzalloc(sizeof(*utask), GFP_KERNEL); 1809 + if (!utask) 1810 + return NULL; 1811 + 1812 + timer_setup(&utask->ri_timer, ri_timer, 0); 1813 + 1814 + return utask; 1929 1815 } 1930 1816 1931 1817 /* ··· 1971 1793 static struct uprobe_task *get_utask(void) 1972 1794 { 1973 1795 if (!current->utask) 1974 - current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); 1796 + current->utask = alloc_utask(); 1975 1797 return current->utask; 1798 + } 1799 + 1800 + static size_t ri_size(int consumers_cnt) 1801 + { 1802 + struct return_instance *ri; 1803 + 1804 + return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt; 1805 + } 1806 + 1807 + #define DEF_CNT 4 1808 + 1809 + static struct return_instance *alloc_return_instance(void) 1810 + { 1811 + struct return_instance *ri; 1812 + 1813 + ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL); 1814 + if (!ri) 1815 + return ZERO_SIZE_PTR; 1816 + 1817 + ri->consumers_cnt = DEF_CNT; 1818 + return ri; 1819 + } 1820 + 1821 + static struct return_instance *dup_return_instance(struct return_instance *old) 1822 + { 1823 + size_t size = ri_size(old->consumers_cnt); 1824 + 1825 + return kmemdup(old, size, GFP_KERNEL); 1976 1826 } 1977 1827 1978 1828 static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) 1979 1829 { 1980 1830 struct uprobe_task *n_utask; 1981 1831 struct return_instance **p, *o, *n; 1832 + struct uprobe *uprobe; 1982 1833 1983 - n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL); 1834 + n_utask = alloc_utask(); 1984 1835 if (!n_utask) 1985 1836 return -ENOMEM; 1986 1837 t->utask = n_utask; 1987 1838 1839 + /* protect uprobes from freeing, we'll need try_get_uprobe() them */ 1840 + guard(srcu)(&uretprobes_srcu); 1841 + 1988 1842 p = 
&n_utask->return_instances; 1989 1843 for (o = o_utask->return_instances; o; o = o->next) { 1990 - n = kmalloc(sizeof(struct return_instance), GFP_KERNEL); 1844 + n = dup_return_instance(o); 1991 1845 if (!n) 1992 1846 return -ENOMEM; 1993 1847 1994 - *n = *o; 1995 - /* 1996 - * uprobe's refcnt has to be positive at this point, kept by 1997 - * utask->return_instances items; return_instances can't be 1998 - * removed right now, as task is blocked due to duping; so 1999 - * get_uprobe() is safe to use here. 2000 - */ 2001 - get_uprobe(n->uprobe); 2002 - n->next = NULL; 1848 + /* if uprobe is non-NULL, we'll have an extra refcount for uprobe */ 1849 + uprobe = hprobe_expire(&o->hprobe, true); 2003 1850 2004 - *p = n; 1851 + /* 1852 + * New utask will have stable properly refcounted uprobe or 1853 + * NULL. Even if we failed to get refcounted uprobe, we still 1854 + * need to preserve full set of return_instances for proper 1855 + * uretprobe handling and nesting in forked task. 1856 + */ 1857 + hprobe_init_stable(&n->hprobe, uprobe); 1858 + 1859 + n->next = NULL; 1860 + rcu_assign_pointer(*p, n); 2005 1861 p = &n->next; 1862 + 2006 1863 n_utask->depth++; 2007 1864 } 2008 1865 ··· 2113 1900 enum rp_check ctx = chained ? 
RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; 2114 1901 2115 1902 while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { 2116 - ri = free_ret_instance(ri); 1903 + ri = free_ret_instance(ri, true /* cleanup_hprobe */); 2117 1904 utask->depth--; 2118 1905 } 2119 - utask->return_instances = ri; 1906 + rcu_assign_pointer(utask->return_instances, ri); 2120 1907 } 2121 1908 2122 - static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) 1909 + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs, 1910 + struct return_instance *ri) 2123 1911 { 2124 - struct return_instance *ri; 2125 - struct uprobe_task *utask; 1912 + struct uprobe_task *utask = current->utask; 2126 1913 unsigned long orig_ret_vaddr, trampoline_vaddr; 2127 1914 bool chained; 1915 + int srcu_idx; 2128 1916 2129 1917 if (!get_xol_area()) 2130 - return; 2131 - 2132 - utask = get_utask(); 2133 - if (!utask) 2134 - return; 1918 + goto free; 2135 1919 2136 1920 if (utask->depth >= MAX_URETPROBE_DEPTH) { 2137 1921 printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" 2138 1922 " nestedness limit pid/tgid=%d/%d\n", 2139 1923 current->pid, current->tgid); 2140 - return; 1924 + goto free; 2141 1925 } 2142 - 2143 - /* we need to bump refcount to store uprobe in utask */ 2144 - if (!try_get_uprobe(uprobe)) 2145 - return; 2146 - 2147 - ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); 2148 - if (!ri) 2149 - goto fail; 2150 1926 2151 1927 trampoline_vaddr = uprobe_get_trampoline_vaddr(); 2152 1928 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); 2153 1929 if (orig_ret_vaddr == -1) 2154 - goto fail; 1930 + goto free; 2155 1931 2156 1932 /* drop the entries invalidated by longjmp() */ 2157 1933 chained = (orig_ret_vaddr == trampoline_vaddr); ··· 2158 1956 * attack from user-space. 
2159 1957 */ 2160 1958 uprobe_warn(current, "handle tail call"); 2161 - goto fail; 1959 + goto free; 2162 1960 } 2163 1961 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; 2164 1962 } 2165 - ri->uprobe = uprobe; 1963 + 1964 + /* __srcu_read_lock() because SRCU lock survives switch to user space */ 1965 + srcu_idx = __srcu_read_lock(&uretprobes_srcu); 1966 + 2166 1967 ri->func = instruction_pointer(regs); 2167 1968 ri->stack = user_stack_pointer(regs); 2168 1969 ri->orig_ret_vaddr = orig_ret_vaddr; 2169 1970 ri->chained = chained; 2170 1971 2171 1972 utask->depth++; 1973 + 1974 + hprobe_init_leased(&ri->hprobe, uprobe, srcu_idx); 2172 1975 ri->next = utask->return_instances; 2173 - utask->return_instances = ri; 1976 + rcu_assign_pointer(utask->return_instances, ri); 1977 + 1978 + mod_timer(&utask->ri_timer, jiffies + RI_TIMER_PERIOD); 2174 1979 2175 1980 return; 2176 - fail: 1981 + free: 2177 1982 kfree(ri); 2178 - put_uprobe(uprobe); 2179 1983 } 2180 1984 2181 1985 /* Prepare to single-step probed instruction out of line. 
*/ 2182 1986 static int 2183 1987 pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) 2184 1988 { 2185 - struct uprobe_task *utask; 2186 - unsigned long xol_vaddr; 1989 + struct uprobe_task *utask = current->utask; 2187 1990 int err; 2188 - 2189 - utask = get_utask(); 2190 - if (!utask) 2191 - return -ENOMEM; 2192 1991 2193 1992 if (!try_get_uprobe(uprobe)) 2194 1993 return -EINVAL; 2195 1994 2196 - xol_vaddr = xol_get_insn_slot(uprobe); 2197 - if (!xol_vaddr) { 1995 + if (!xol_get_insn_slot(uprobe, utask)) { 2198 1996 err = -ENOMEM; 2199 1997 goto err_out; 2200 1998 } 2201 1999 2202 - utask->xol_vaddr = xol_vaddr; 2203 2000 utask->vaddr = bp_vaddr; 2204 - 2205 2001 err = arch_uprobe_pre_xol(&uprobe->arch, regs); 2206 2002 if (unlikely(err)) { 2207 - xol_free_insn_slot(current); 2003 + xol_free_insn_slot(utask); 2208 2004 goto err_out; 2209 2005 } 2210 2006 ··· 2325 2125 return uprobe; 2326 2126 } 2327 2127 2128 + static struct return_instance* 2129 + push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie) 2130 + { 2131 + if (unlikely(ri == ZERO_SIZE_PTR)) 2132 + return ri; 2133 + 2134 + if (unlikely(idx >= ri->consumers_cnt)) { 2135 + struct return_instance *old_ri = ri; 2136 + 2137 + ri->consumers_cnt += DEF_CNT; 2138 + ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL); 2139 + if (!ri) { 2140 + kfree(old_ri); 2141 + return ZERO_SIZE_PTR; 2142 + } 2143 + } 2144 + 2145 + ri->consumers[idx].id = id; 2146 + ri->consumers[idx].cookie = cookie; 2147 + return ri; 2148 + } 2149 + 2150 + static struct return_consumer * 2151 + return_consumer_find(struct return_instance *ri, int *iter, int id) 2152 + { 2153 + struct return_consumer *ric; 2154 + int idx = *iter; 2155 + 2156 + for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) { 2157 + if (ric->id == id) { 2158 + *iter = idx + 1; 2159 + return ric; 2160 + } 2161 + } 2162 + return NULL; 2163 + } 2164 + 2165 + static bool ignore_ret_handler(int 
rc) 2166 + { 2167 + return rc == UPROBE_HANDLER_REMOVE || rc == UPROBE_HANDLER_IGNORE; 2168 + } 2169 + 2328 2170 static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 2329 2171 { 2330 2172 struct uprobe_consumer *uc; 2331 - int remove = UPROBE_HANDLER_REMOVE; 2332 - bool need_prep = false; /* prepare return uprobe, when needed */ 2333 - bool has_consumers = false; 2173 + bool has_consumers = false, remove = true; 2174 + struct return_instance *ri = NULL; 2175 + int push_idx = 0; 2334 2176 2335 2177 current->utask->auprobe = &uprobe->arch; 2336 2178 2337 - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 2338 - srcu_read_lock_held(&uprobes_srcu)) { 2179 + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { 2180 + bool session = uc->handler && uc->ret_handler; 2181 + __u64 cookie = 0; 2339 2182 int rc = 0; 2340 2183 2341 2184 if (uc->handler) { 2342 - rc = uc->handler(uc, regs); 2343 - WARN(rc & ~UPROBE_HANDLER_MASK, 2185 + rc = uc->handler(uc, regs, &cookie); 2186 + WARN(rc < 0 || rc > 2, 2344 2187 "bad rc=0x%x from %ps()\n", rc, uc->handler); 2345 2188 } 2346 2189 2347 - if (uc->ret_handler) 2348 - need_prep = true; 2349 - 2350 - remove &= rc; 2190 + remove &= rc == UPROBE_HANDLER_REMOVE; 2351 2191 has_consumers = true; 2192 + 2193 + if (!uc->ret_handler || ignore_ret_handler(rc)) 2194 + continue; 2195 + 2196 + if (!ri) 2197 + ri = alloc_return_instance(); 2198 + 2199 + if (session) 2200 + ri = push_consumer(ri, push_idx++, uc->id, cookie); 2352 2201 } 2353 2202 current->utask->auprobe = NULL; 2354 2203 2355 - if (need_prep && !remove) 2356 - prepare_uretprobe(uprobe, regs); /* put bp at return */ 2204 + if (!ZERO_OR_NULL_PTR(ri)) { 2205 + /* 2206 + * The push_idx value has the final number of return consumers, 2207 + * and ri->consumers_cnt has number of allocated consumers. 
2208 + */ 2209 + ri->consumers_cnt = push_idx; 2210 + prepare_uretprobe(uprobe, regs, ri); 2211 + } 2357 2212 2358 2213 if (remove && has_consumers) { 2359 2214 down_read(&uprobe->register_rwsem); ··· 2424 2169 } 2425 2170 2426 2171 static void 2427 - handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) 2172 + handle_uretprobe_chain(struct return_instance *ri, struct uprobe *uprobe, struct pt_regs *regs) 2428 2173 { 2429 - struct uprobe *uprobe = ri->uprobe; 2174 + struct return_consumer *ric; 2430 2175 struct uprobe_consumer *uc; 2431 - int srcu_idx; 2176 + int ric_idx = 0; 2432 2177 2433 - srcu_idx = srcu_read_lock(&uprobes_srcu); 2434 - list_for_each_entry_srcu(uc, &uprobe->consumers, cons_node, 2435 - srcu_read_lock_held(&uprobes_srcu)) { 2436 - if (uc->ret_handler) 2437 - uc->ret_handler(uc, ri->func, regs); 2178 + /* all consumers unsubscribed meanwhile */ 2179 + if (unlikely(!uprobe)) 2180 + return; 2181 + 2182 + rcu_read_lock_trace(); 2183 + list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) { 2184 + bool session = uc->handler && uc->ret_handler; 2185 + 2186 + if (uc->ret_handler) { 2187 + ric = return_consumer_find(ri, &ric_idx, uc->id); 2188 + if (!session || ric) 2189 + uc->ret_handler(uc, ri->func, regs, ric ? 
&ric->cookie : NULL); 2190 + } 2438 2191 } 2439 - srcu_read_unlock(&uprobes_srcu, srcu_idx); 2192 + rcu_read_unlock_trace(); 2440 2193 } 2441 2194 2442 2195 static struct return_instance *find_next_ret_chain(struct return_instance *ri) ··· 2463 2200 { 2464 2201 struct uprobe_task *utask; 2465 2202 struct return_instance *ri, *next; 2203 + struct uprobe *uprobe; 2204 + enum hprobe_state hstate; 2466 2205 bool valid; 2467 2206 2468 2207 utask = current->utask; ··· 2495 2230 * trampoline addresses on the stack are replaced with correct 2496 2231 * original return addresses 2497 2232 */ 2498 - utask->return_instances = ri->next; 2233 + rcu_assign_pointer(utask->return_instances, ri->next); 2234 + 2235 + uprobe = hprobe_consume(&ri->hprobe, &hstate); 2499 2236 if (valid) 2500 - handle_uretprobe_chain(ri, regs); 2501 - ri = free_ret_instance(ri); 2237 + handle_uretprobe_chain(ri, uprobe, regs); 2238 + hprobe_finalize(&ri->hprobe, hstate); 2239 + 2240 + /* We already took care of hprobe, no need to waste more time on that. 
*/ 2241 + ri = free_ret_instance(ri, false /* !cleanup_hprobe */); 2502 2242 utask->depth--; 2503 2243 } while (ri != next); 2504 2244 } while (!valid); 2505 2245 2506 - utask->return_instances = ri; 2507 2246 return; 2508 2247 2509 - sigill: 2248 + sigill: 2510 2249 uprobe_warn(current, "handle uretprobe, sending SIGILL."); 2511 2250 force_sig(SIGILL); 2512 - 2513 2251 } 2514 2252 2515 2253 bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) ··· 2534 2266 { 2535 2267 struct uprobe *uprobe; 2536 2268 unsigned long bp_vaddr; 2537 - int is_swbp, srcu_idx; 2269 + int is_swbp; 2538 2270 2539 2271 bp_vaddr = uprobe_get_swbp_addr(regs); 2540 2272 if (bp_vaddr == uprobe_get_trampoline_vaddr()) 2541 2273 return uprobe_handle_trampoline(regs); 2542 2274 2543 - srcu_idx = srcu_read_lock(&uprobes_srcu); 2275 + rcu_read_lock_trace(); 2544 2276 2545 2277 uprobe = find_active_uprobe_rcu(bp_vaddr, &is_swbp); 2546 2278 if (!uprobe) { ··· 2598 2330 2599 2331 out: 2600 2332 /* arch_uprobe_skip_sstep() succeeded, or restart if can't singlestep */ 2601 - srcu_read_unlock(&uprobes_srcu, srcu_idx); 2333 + rcu_read_unlock_trace(); 2602 2334 } 2603 2335 2604 2336 /* ··· 2621 2353 put_uprobe(uprobe); 2622 2354 utask->active_uprobe = NULL; 2623 2355 utask->state = UTASK_RUNNING; 2624 - xol_free_insn_slot(current); 2356 + xol_free_insn_slot(utask); 2625 2357 2626 2358 spin_lock_irq(&current->sighand->siglock); 2627 2359 recalc_sigpending(); /* see uprobe_deny_signal() */
+4 -2
kernel/trace/bpf_trace.c
··· 3240 3240 } 3241 3241 3242 3242 static int 3243 - uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs) 3243 + uprobe_multi_link_handler(struct uprobe_consumer *con, struct pt_regs *regs, 3244 + __u64 *data) 3244 3245 { 3245 3246 struct bpf_uprobe *uprobe; 3246 3247 ··· 3250 3249 } 3251 3250 3252 3251 static int 3253 - uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs) 3252 + uprobe_multi_link_ret_handler(struct uprobe_consumer *con, unsigned long func, struct pt_regs *regs, 3253 + __u64 *data) 3254 3254 { 3255 3255 struct bpf_uprobe *uprobe; 3256 3256
+8 -4
kernel/trace/trace_uprobe.c
··· 89 89 static int register_uprobe_event(struct trace_uprobe *tu); 90 90 static int unregister_uprobe_event(struct trace_uprobe *tu); 91 91 92 - static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 92 + static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, 93 + __u64 *data); 93 94 static int uretprobe_dispatcher(struct uprobe_consumer *con, 94 - unsigned long func, struct pt_regs *regs); 95 + unsigned long func, struct pt_regs *regs, 96 + __u64 *data); 95 97 96 98 #ifdef CONFIG_STACK_GROWSUP 97 99 static unsigned long adjust_stack_addr(unsigned long addr, unsigned int n) ··· 1524 1522 } 1525 1523 } 1526 1524 1527 - static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 1525 + static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs, 1526 + __u64 *data) 1528 1527 { 1529 1528 struct trace_uprobe *tu; 1530 1529 struct uprobe_dispatch_data udd; ··· 1556 1553 } 1557 1554 1558 1555 static int uretprobe_dispatcher(struct uprobe_consumer *con, 1559 - unsigned long func, struct pt_regs *regs) 1556 + unsigned long func, struct pt_regs *regs, 1557 + __u64 *data) 1560 1558 { 1561 1559 struct trace_uprobe *tu; 1562 1560 struct uprobe_dispatch_data udd;
+1 -1
tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
··· 461 461 462 462 static int 463 463 uprobe_ret_handler(struct uprobe_consumer *self, unsigned long func, 464 - struct pt_regs *regs) 464 + struct pt_regs *regs, __u64 *data) 465 465 466 466 { 467 467 regs->ax = 0x12345678deadbeef;