Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvm-x86-pmu-6.20' of https://github.com/kvm-x86/linux into HEAD

KVM mediated PMU support for 6.20

Add support for mediated PMUs, where KVM gives the guest full ownership of PMU
hardware (context switched around the fastpath run loop) and allows direct
access to data MSRs and PMCs (restricted by the vPMU model), but intercepts
access to control registers, e.g. to enforce event filtering and to prevent the
guest from profiling sensitive host state.
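
As a rough illustration of the direct PMC access described above, a guest with
a mediated PMU can read a counter via RDPMC without the access being
intercepted and emulated by KVM (subject to the RDPMC intercept checks added
in this series). The snippet below is only a guest-side sketch: it assumes the
guest kernel has set CR4.PCE so that RDPMC is usable from user space, and
counter index 0 is purely illustrative.

    #include <stdint.h>
    #include <stdio.h>

    /* Read a general-purpose PMC directly from within the guest. */
    static inline uint64_t rdpmc(uint32_t idx)
    {
            uint32_t lo, hi;

            asm volatile("rdpmc" : "=a"(lo), "=d"(hi) : "c"(idx));
            return ((uint64_t)hi << 32) | lo;
    }

    int main(void)
    {
            printf("PMC0 = %llu\n", (unsigned long long)rdpmc(0));
            return 0;
    }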

To keep overall complexity reasonable, mediated PMU usage is all or nothing
for a given instance of KVM (controlled via module param). The mediated PMU
is disabled by default, partly to maintain backwards compatibility for existing
setups, and partly because there are tradeoffs when running with a mediated PMU
that may be non-starters for some use cases, e.g. the host loses the ability to
profile guests with mediated PMUs, the fastpath run loop is also a blind spot,
entry/exit transitions are more expensive, etc.
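
For reference, the new knob is per vendor module (see the kernel-parameters.txt
hunk below); assuming a kernel with this series applied, a mediated PMU can be
requested either at boot or at module load time, and only takes effect if the
existing kvm.enable_pmu knob is left enabled:

    kvm-intel.enable_mediated_pmu=1          # kernel command line
    modprobe kvm_amd enable_mediated_pmu=1   # module load time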

Versus the emulated PMU, where KVM is "just another perf user", the mediated
PMU delivers more accurate profiling and monitoring (no risk of contention and
thus dropped events), with significantly less overhead (fewer exits and faster
emulation/programming of event selectors). E.g. when running SPECint-2017 on
a single-socket Sapphire Rapids with 56 cores and no-SMT, and using perf from
within the guest:

Perf commands:
a. basic-sampling: perf record -F 1000 -e 6-instructions -a --overwrite
b. multiplex-sampling: perf record -F 1000 -e 10-instructions -a --overwrite

Guest performance overhead:
---------------------------------------------------------------------------
| Test case          | emulated vPMU | all passthrough | passthrough with |
|                    |               |                 | event filters    |
---------------------------------------------------------------------------
| basic-sampling     | 33.62%        | 4.24%           | 6.21%            |
---------------------------------------------------------------------------
| multiplex-sampling | 79.32%        | 7.34%           | 10.45%           |
---------------------------------------------------------------------------

+1465 -314
+49
Documentation/admin-guide/kernel-parameters.txt
··· 3079 3079 3080 3080 Default is Y (on). 3081 3081 3082 + kvm.enable_pmu=[KVM,X86] 3083 + If enabled, KVM will virtualize PMU functionality based 3084 + on the virtual CPU model defined by userspace. This 3085 + can be overridden on a per-VM basis via 3086 + KVM_CAP_PMU_CAPABILITY. 3087 + 3088 + If disabled, KVM will not virtualize PMU functionality, 3089 + e.g. MSRs, PMCs, PMIs, etc., even if userspace defines 3090 + a virtual CPU model that contains PMU assets. 3091 + 3092 + Note, KVM's vPMU support implicitly requires running 3093 + with an in-kernel local APIC, e.g. to deliver PMIs to 3094 + the guest. Running without an in-kernel local APIC is 3095 + not supported, though KVM will allow such a combination 3096 + (with severely degraded functionality). 3097 + 3098 + See also enable_mediated_pmu. 3099 + 3100 + Default is Y (on). 3101 + 3082 3102 kvm.enable_virt_at_load=[KVM,ARM64,LOONGARCH,MIPS,RISCV,X86] 3083 3103 If enabled, KVM will enable virtualization in hardware 3084 3104 when KVM is loaded, and disable virtualization when KVM ··· 3144 3124 zap a portion (see ratio above) of the pages every N msecs. 3145 3125 If the value is 0 (the default), KVM will pick a period based 3146 3126 on the ratio, such that a page is zapped after 1 hour on average. 3127 + 3128 + kvm-{amd,intel}.enable_mediated_pmu=[KVM,AMD,INTEL] 3129 + If enabled, KVM will provide a mediated virtual PMU, 3130 + instead of the default perf-based virtual PMU (if 3131 + kvm.enable_pmu is true and PMU is enumerated via the 3132 + virtual CPU model). 3133 + 3134 + With a perf-based vPMU, KVM operates as a user of perf, 3135 + i.e. emulates guest PMU counters using perf events. 3136 + KVM-created perf events are managed by perf as regular 3137 + (guest-only) events, e.g. are scheduled in/out, contend 3138 + for hardware resources, etc. Using a perf-based vPMU 3139 + allows guest and host usage of the PMU to co-exist, but 3140 + incurs non-trivial overhead and can result in silently 3141 + dropped guest events (due to resource contention). 3142 + 3143 + With a mediated vPMU, hardware PMU state is context 3144 + switched around the world switch to/from the guest. 3145 + KVM mediates which events the guest can utilize, but 3146 + gives the guest direct access to all other PMU assets 3147 + when possible (KVM may intercept some accesses if the 3148 + virtual CPU model provides a subset of hardware PMU 3149 + functionality). Using a mediated vPMU significantly 3150 + reduces PMU virtualization overhead and eliminates lost 3151 + guest events, but is mutually exclusive with using perf 3152 + to profile KVM guests and adds latency to most VM-Exits 3153 + (to context switch PMU state). 3154 + 3155 + Default is N (off). 3147 3156 3148 3157 kvm-amd.nested= [KVM,AMD] Control nested virtualization feature in 3149 3158 KVM/SVM. Default is 1 (enabled).
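
The per-VM override mentioned above, KVM_CAP_PMU_CAPABILITY, is an ordinary
VM-scoped capability. A minimal userspace sketch of opting one VM out of PMU
virtualization (assuming kernel headers that expose KVM_CAP_PMU_CAPABILITY and
KVM_PMU_CAP_DISABLE, and that no vCPUs have been created yet):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
            int kvm = open("/dev/kvm", O_RDWR);
            int vm = ioctl(kvm, KVM_CREATE_VM, 0);

            /* Must be done before any vCPU is created. */
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_PMU_CAPABILITY,
                    .args = { KVM_PMU_CAP_DISABLE },
            };

            if (ioctl(vm, KVM_ENABLE_CAP, &cap))
                    perror("KVM_ENABLE_CAP(KVM_CAP_PMU_CAPABILITY)");

            return 0;
    }
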
+1 -1
arch/arm64/kvm/arm.c
··· 2413 2413 if (err) 2414 2414 goto out; 2415 2415 2416 - kvm_register_perf_callbacks(NULL); 2416 + kvm_register_perf_callbacks(); 2417 2417 2418 2418 out: 2419 2419 if (err)
+1 -1
arch/loongarch/kvm/main.c
··· 402 402 } 403 403 404 404 kvm_init_gcsr_flag(); 405 - kvm_register_perf_callbacks(NULL); 405 + kvm_register_perf_callbacks(); 406 406 407 407 /* Register LoongArch IPI interrupt controller interface. */ 408 408 ret = kvm_loongarch_register_ipi_device();
+1 -1
arch/riscv/kvm/main.c
··· 174 174 175 175 kvm_riscv_setup_vendor_features(); 176 176 177 - kvm_register_perf_callbacks(NULL); 177 + kvm_register_perf_callbacks(); 178 178 179 179 rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); 180 180 if (rc) {
+1
arch/x86/entry/entry_fred.c
··· 114 114 115 115 SYSVEC(IRQ_WORK_VECTOR, irq_work), 116 116 117 + SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, perf_guest_mediated_pmi_handler), 117 118 SYSVEC(POSTED_INTR_VECTOR, kvm_posted_intr_ipi), 118 119 SYSVEC(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi), 119 120 SYSVEC(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
+2
arch/x86/events/amd/core.c
··· 1439 1439 1440 1440 amd_pmu_global_cntr_mask = x86_pmu.cntr_mask64; 1441 1441 1442 + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; 1443 + 1442 1444 /* Update PMC handling functions */ 1443 1445 x86_pmu.enable_all = amd_pmu_v2_enable_all; 1444 1446 x86_pmu.disable_all = amd_pmu_v2_disable_all;
+36 -2
arch/x86/events/core.c
··· 30 30 #include <linux/device.h> 31 31 #include <linux/nospec.h> 32 32 #include <linux/static_call.h> 33 + #include <linux/kvm_types.h> 33 34 34 35 #include <asm/apic.h> 35 36 #include <asm/stacktrace.h> ··· 56 55 .enabled = 1, 57 56 .pmu = &pmu, 58 57 }; 58 + 59 + static DEFINE_PER_CPU(bool, guest_lvtpc_loaded); 59 60 60 61 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); 61 62 DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); ··· 1763 1760 apic_write(APIC_LVTPC, APIC_DM_NMI); 1764 1761 } 1765 1762 1763 + #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU 1764 + void perf_load_guest_lvtpc(u32 guest_lvtpc) 1765 + { 1766 + u32 masked = guest_lvtpc & APIC_LVT_MASKED; 1767 + 1768 + apic_write(APIC_LVTPC, 1769 + APIC_DM_FIXED | PERF_GUEST_MEDIATED_PMI_VECTOR | masked); 1770 + this_cpu_write(guest_lvtpc_loaded, true); 1771 + } 1772 + EXPORT_SYMBOL_FOR_KVM(perf_load_guest_lvtpc); 1773 + 1774 + void perf_put_guest_lvtpc(void) 1775 + { 1776 + this_cpu_write(guest_lvtpc_loaded, false); 1777 + apic_write(APIC_LVTPC, APIC_DM_NMI); 1778 + } 1779 + EXPORT_SYMBOL_FOR_KVM(perf_put_guest_lvtpc); 1780 + #endif /* CONFIG_PERF_GUEST_MEDIATED_PMU */ 1781 + 1766 1782 static int 1767 1783 perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) 1768 1784 { 1769 1785 u64 start_clock; 1770 1786 u64 finish_clock; 1771 1787 int ret; 1788 + 1789 + /* 1790 + * Ignore all NMIs when the CPU's LVTPC is configured to route PMIs to 1791 + * PERF_GUEST_MEDIATED_PMI_VECTOR, i.e. when an NMI time can't be due 1792 + * to a PMI. Attempting to handle a PMI while the guest's context is 1793 + * loaded will generate false positives and clobber guest state. Note, 1794 + * the LVTPC is switched to/from the dedicated mediated PMI IRQ vector 1795 + * while host events are quiesced. 1796 + */ 1797 + if (this_cpu_read(guest_lvtpc_loaded)) 1798 + return NMI_DONE; 1772 1799 1773 1800 /* 1774 1801 * All PMUs/events that share this PMI handler should make sure to ··· 3106 3073 cap->version = x86_pmu.version; 3107 3074 cap->num_counters_gp = x86_pmu_num_counters(NULL); 3108 3075 cap->num_counters_fixed = x86_pmu_num_counters_fixed(NULL); 3109 - cap->bit_width_gp = x86_pmu.cntval_bits; 3110 - cap->bit_width_fixed = x86_pmu.cntval_bits; 3076 + cap->bit_width_gp = cap->num_counters_gp ? x86_pmu.cntval_bits : 0; 3077 + cap->bit_width_fixed = cap->num_counters_fixed ? x86_pmu.cntval_bits : 0; 3111 3078 cap->events_mask = (unsigned int)x86_pmu.events_maskl; 3112 3079 cap->events_mask_len = x86_pmu.events_mask_len; 3113 3080 cap->pebs_ept = x86_pmu.pebs_ept; 3081 + cap->mediated = !!(pmu.capabilities & PERF_PMU_CAP_MEDIATED_VPMU); 3114 3082 } 3115 3083 EXPORT_SYMBOL_FOR_KVM(perf_get_x86_pmu_capability); 3116 3084
+6
arch/x86/events/intel/core.c
··· 5695 5695 else 5696 5696 pmu->intel_ctrl &= ~GLOBAL_CTRL_EN_PERF_METRICS; 5697 5697 5698 + pmu->pmu.capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; 5699 + 5698 5700 intel_pmu_check_event_constraints_all(&pmu->pmu); 5699 5701 5700 5702 intel_pmu_check_extra_regs(pmu->extra_regs); ··· 7316 7314 pr_cont(" AnyThread deprecated, "); 7317 7315 } 7318 7316 7317 + /* The perf side of core PMU is ready to support the mediated vPMU. */ 7318 + x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_MEDIATED_VPMU; 7319 + 7319 7320 /* 7320 7321 * Many features on and after V6 require dynamic constraint, 7321 7322 * e.g., Arch PEBS, ACR. ··· 7410 7405 case INTEL_ATOM_SILVERMONT_D: 7411 7406 case INTEL_ATOM_SILVERMONT_MID: 7412 7407 case INTEL_ATOM_AIRMONT: 7408 + case INTEL_ATOM_AIRMONT_NP: 7413 7409 case INTEL_ATOM_SILVERMONT_MID2: 7414 7410 memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, 7415 7411 sizeof(hw_cache_event_ids));
+26 -7
arch/x86/events/intel/cstate.c
··· 41 41 * MSR_CORE_C1_RES: CORE C1 Residency Counter 42 42 * perf code: 0x00 43 43 * Available model: SLM,AMT,GLM,CNL,ICX,TNT,ADL,RPL 44 - * MTL,SRF,GRR,ARL,LNL,PTL 44 + * MTL,SRF,GRR,ARL,LNL,PTL,WCL,NVL 45 45 * Scope: Core (each processor core has a MSR) 46 46 * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter 47 47 * perf code: 0x01 ··· 53 53 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, 54 54 * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, 55 55 * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, 56 - * GRR,ARL,LNL,PTL 56 + * GRR,ARL,LNL,PTL,WCL,NVL 57 57 * Scope: Core 58 58 * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter 59 59 * perf code: 0x03 60 60 * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML, 61 61 * ICL,TGL,RKL,ADL,RPL,MTL,ARL,LNL, 62 - * PTL 62 + * PTL,WCL,NVL 63 63 * Scope: Core 64 64 * MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter. 65 65 * perf code: 0x00 66 66 * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL, 67 67 * KBL,CML,ICL,ICX,TGL,TNT,RKL,ADL, 68 - * RPL,SPR,MTL,ARL,LNL,SRF,PTL 68 + * RPL,SPR,MTL,ARL,LNL,SRF,PTL,WCL, 69 + * NVL 69 70 * Scope: Package (physical package) 70 71 * MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter. 71 72 * perf code: 0x01 ··· 79 78 * Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW, 80 79 * SKL,KNL,GLM,CNL,KBL,CML,ICL,ICX, 81 80 * TGL,TNT,RKL,ADL,RPL,SPR,MTL,SRF, 82 - * ARL,LNL,PTL 81 + * ARL,LNL,PTL,WCL,NVL 83 82 * Scope: Package (physical package) 84 83 * MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter. 85 84 * perf code: 0x03 ··· 98 97 * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 99 98 * perf code: 0x06 100 99 * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL, 101 - * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL 100 + * TNT,RKL,ADL,RPL,MTL,ARL,LNL,PTL, 101 + * WCL,NVL 102 102 * Scope: Package (physical package) 103 103 * MSR_MODULE_C6_RES_MS: Module C6 Residency Counter. 
104 104 * perf code: 0x00 105 - * Available model: SRF,GRR 105 + * Available model: SRF,GRR,NVL 106 106 * Scope: A cluster of cores shared L2 cache 107 107 * 108 108 */ ··· 529 527 BIT(PERF_CSTATE_PKG_C10_RES), 530 528 }; 531 529 530 + static const struct cstate_model nvl_cstates __initconst = { 531 + .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | 532 + BIT(PERF_CSTATE_CORE_C6_RES) | 533 + BIT(PERF_CSTATE_CORE_C7_RES), 534 + 535 + .module_events = BIT(PERF_CSTATE_MODULE_C6_RES), 536 + 537 + .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) | 538 + BIT(PERF_CSTATE_PKG_C6_RES) | 539 + BIT(PERF_CSTATE_PKG_C10_RES), 540 + }; 541 + 532 542 static const struct cstate_model slm_cstates __initconst = { 533 543 .core_events = BIT(PERF_CSTATE_CORE_C1_RES) | 534 544 BIT(PERF_CSTATE_CORE_C6_RES), ··· 613 599 X86_MATCH_VFM(INTEL_ATOM_SILVERMONT, &slm_cstates), 614 600 X86_MATCH_VFM(INTEL_ATOM_SILVERMONT_D, &slm_cstates), 615 601 X86_MATCH_VFM(INTEL_ATOM_AIRMONT, &slm_cstates), 602 + X86_MATCH_VFM(INTEL_ATOM_AIRMONT_NP, &slm_cstates), 616 603 617 604 X86_MATCH_VFM(INTEL_BROADWELL, &snb_cstates), 618 605 X86_MATCH_VFM(INTEL_BROADWELL_D, &snb_cstates), ··· 653 638 X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &icx_cstates), 654 639 X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &icx_cstates), 655 640 X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &icx_cstates), 641 + X86_MATCH_VFM(INTEL_DIAMONDRAPIDS_X, &srf_cstates), 656 642 657 643 X86_MATCH_VFM(INTEL_TIGERLAKE_L, &icl_cstates), 658 644 X86_MATCH_VFM(INTEL_TIGERLAKE, &icl_cstates), ··· 670 654 X86_MATCH_VFM(INTEL_ARROWLAKE_U, &adl_cstates), 671 655 X86_MATCH_VFM(INTEL_LUNARLAKE_M, &lnl_cstates), 672 656 X86_MATCH_VFM(INTEL_PANTHERLAKE_L, &lnl_cstates), 657 + X86_MATCH_VFM(INTEL_WILDCATLAKE_L, &lnl_cstates), 658 + X86_MATCH_VFM(INTEL_NOVALAKE, &nvl_cstates), 659 + X86_MATCH_VFM(INTEL_NOVALAKE_L, &nvl_cstates), 673 660 { }, 674 661 }; 675 662 MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
+1
arch/x86/events/msr.c
··· 78 78 case INTEL_ATOM_SILVERMONT: 79 79 case INTEL_ATOM_SILVERMONT_D: 80 80 case INTEL_ATOM_AIRMONT: 81 + case INTEL_ATOM_AIRMONT_NP: 81 82 82 83 case INTEL_ATOM_GOLDMONT: 83 84 case INTEL_ATOM_GOLDMONT_D:
+3
arch/x86/include/asm/hardirq.h
··· 19 19 unsigned int kvm_posted_intr_wakeup_ipis; 20 20 unsigned int kvm_posted_intr_nested_ipis; 21 21 #endif 22 + #ifdef CONFIG_GUEST_PERF_EVENTS 23 + unsigned int perf_guest_mediated_pmis; 24 + #endif 22 25 unsigned int x86_platform_ipis; /* arch dependent */ 23 26 unsigned int apic_perf_irqs; 24 27 unsigned int apic_irq_work_irqs;
+6
arch/x86/include/asm/idtentry.h
··· 746 746 # define fred_sysvec_kvm_posted_intr_nested_ipi NULL 747 747 #endif 748 748 749 + # ifdef CONFIG_GUEST_PERF_EVENTS 750 + DECLARE_IDTENTRY_SYSVEC(PERF_GUEST_MEDIATED_PMI_VECTOR, sysvec_perf_guest_mediated_pmi_handler); 751 + #else 752 + # define fred_sysvec_perf_guest_mediated_pmi_handler NULL 753 + #endif 754 + 749 755 # ifdef CONFIG_X86_POSTED_MSI 750 756 DECLARE_IDTENTRY_SYSVEC(POSTED_MSI_NOTIFICATION_VECTOR, sysvec_posted_msi_notification); 751 757 #else
+3 -1
arch/x86/include/asm/irq_vectors.h
··· 77 77 */ 78 78 #define IRQ_WORK_VECTOR 0xf6 79 79 80 - /* 0xf5 - unused, was UV_BAU_MESSAGE */ 80 + /* IRQ vector for PMIs when running a guest with a mediated PMU. */ 81 + #define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 82 + 81 83 #define DEFERRED_ERROR_VECTOR 0xf4 82 84 83 85 /* Vector on which hypervisor callbacks will be delivered */
+4
arch/x86/include/asm/kvm-x86-pmu-ops.h
··· 23 23 KVM_X86_PMU_OP_OPTIONAL(deliver_pmi) 24 24 KVM_X86_PMU_OP_OPTIONAL(cleanup) 25 25 26 + KVM_X86_PMU_OP_OPTIONAL(write_global_ctrl) 27 + KVM_X86_PMU_OP(mediated_load) 28 + KVM_X86_PMU_OP(mediated_put) 29 + 26 30 #undef KVM_X86_PMU_OP 27 31 #undef KVM_X86_PMU_OP_OPTIONAL
+3
arch/x86/include/asm/kvm_host.h
··· 537 537 */ 538 538 u64 emulated_counter; 539 539 u64 eventsel; 540 + u64 eventsel_hw; 540 541 struct perf_event *perf_event; 541 542 struct kvm_vcpu *vcpu; 542 543 /* ··· 566 565 unsigned nr_arch_fixed_counters; 567 566 unsigned available_event_types; 568 567 u64 fixed_ctr_ctrl; 568 + u64 fixed_ctr_ctrl_hw; 569 569 u64 fixed_ctr_ctrl_rsvd; 570 570 u64 global_ctrl; 571 571 u64 global_status; ··· 1505 1503 1506 1504 bool bus_lock_detection_enabled; 1507 1505 bool enable_pmu; 1506 + bool created_mediated_pmu; 1508 1507 1509 1508 u32 notify_window; 1510 1509 u32 notify_vmexit_flags;
+1
arch/x86/include/asm/msr-index.h
··· 1219 1219 #define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e 1220 1220 #define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f 1221 1221 #define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390 1222 + #define MSR_CORE_PERF_GLOBAL_STATUS_SET 0x00000391 1222 1223 1223 1224 #define MSR_PERF_METRICS 0x00000329 1224 1225
+6
arch/x86/include/asm/perf_event.h
··· 301 301 unsigned int events_mask; 302 302 int events_mask_len; 303 303 unsigned int pebs_ept :1; 304 + unsigned int mediated :1; 304 305 }; 305 306 306 307 /* ··· 758 757 759 758 static inline void perf_events_lapic_init(void) { } 760 759 static inline void perf_check_microcode(void) { } 760 + #endif 761 + 762 + #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU 763 + extern void perf_load_guest_lvtpc(u32 guest_lvtpc); 764 + extern void perf_put_guest_lvtpc(void); 761 765 #endif 762 766 763 767 #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL)
+14 -13
arch/x86/include/asm/unwind_user.h
··· 2 2 #ifndef _ASM_X86_UNWIND_USER_H 3 3 #define _ASM_X86_UNWIND_USER_H 4 4 5 - #ifdef CONFIG_HAVE_UNWIND_USER_FP 5 + #ifdef CONFIG_UNWIND_USER 6 6 7 7 #include <asm/ptrace.h> 8 8 #include <asm/uprobes.h> 9 + 10 + static inline int unwind_user_word_size(struct pt_regs *regs) 11 + { 12 + /* We can't unwind VM86 stacks */ 13 + if (regs->flags & X86_VM_MASK) 14 + return 0; 15 + return user_64bit_mode(regs) ? 8 : 4; 16 + } 17 + 18 + #endif /* CONFIG_UNWIND_USER */ 19 + 20 + #ifdef CONFIG_HAVE_UNWIND_USER_FP 9 21 10 22 #define ARCH_INIT_USER_FP_FRAME(ws) \ 11 23 .cfa_off = 2*(ws), \ ··· 31 19 .fp_off = 0, \ 32 20 .use_fp = false, 33 21 34 - static inline int unwind_user_word_size(struct pt_regs *regs) 35 - { 36 - /* We can't unwind VM86 stacks */ 37 - if (regs->flags & X86_VM_MASK) 38 - return 0; 39 - #ifdef CONFIG_X86_64 40 - if (!user_64bit_mode(regs)) 41 - return sizeof(int); 42 - #endif 43 - return sizeof(long); 44 - } 45 - 46 22 static inline bool unwind_user_at_function_start(struct pt_regs *regs) 47 23 { 48 24 return is_uprobe_at_func_entry(regs); 49 25 } 26 + #define unwind_user_at_function_start unwind_user_at_function_start 50 27 51 28 #endif /* CONFIG_HAVE_UNWIND_USER_FP */ 52 29
+1
arch/x86/include/asm/vmx.h
··· 107 107 #define VM_EXIT_PT_CONCEAL_PIP 0x01000000 108 108 #define VM_EXIT_CLEAR_IA32_RTIT_CTL 0x02000000 109 109 #define VM_EXIT_LOAD_CET_STATE 0x10000000 110 + #define VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL 0x40000000 110 111 111 112 #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff 112 113
+3
arch/x86/kernel/idt.c
··· 158 158 INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi), 159 159 INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi), 160 160 # endif 161 + #ifdef CONFIG_GUEST_PERF_EVENTS 162 + INTG(PERF_GUEST_MEDIATED_PMI_VECTOR, asm_sysvec_perf_guest_mediated_pmi_handler), 163 + #endif 161 164 # ifdef CONFIG_IRQ_WORK 162 165 INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work), 163 166 # endif
+19
arch/x86/kernel/irq.c
··· 192 192 irq_stats(j)->kvm_posted_intr_wakeup_ipis); 193 193 seq_puts(p, " Posted-interrupt wakeup event\n"); 194 194 #endif 195 + #ifdef CONFIG_GUEST_PERF_EVENTS 196 + seq_printf(p, "%*s: ", prec, "VPMI"); 197 + for_each_online_cpu(j) 198 + seq_printf(p, "%10u ", 199 + irq_stats(j)->perf_guest_mediated_pmis); 200 + seq_puts(p, " Perf Guest Mediated PMI\n"); 201 + #endif 195 202 #ifdef CONFIG_X86_POSTED_MSI 196 203 seq_printf(p, "%*s: ", prec, "PMN"); 197 204 for_each_online_cpu(j) ··· 353 346 x86_platform_ipi_callback(); 354 347 trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); 355 348 set_irq_regs(old_regs); 349 + } 350 + #endif 351 + 352 + #ifdef CONFIG_GUEST_PERF_EVENTS 353 + /* 354 + * Handler for PERF_GUEST_MEDIATED_PMI_VECTOR. 355 + */ 356 + DEFINE_IDTENTRY_SYSVEC(sysvec_perf_guest_mediated_pmi_handler) 357 + { 358 + apic_eoi(); 359 + inc_irq_stat(perf_guest_mediated_pmis); 360 + perf_guest_handle_mediated_pmi(); 356 361 } 357 362 #endif 358 363
+1
arch/x86/kvm/Kconfig
··· 37 37 select SCHED_INFO 38 38 select PERF_EVENTS 39 39 select GUEST_PERF_EVENTS 40 + select PERF_GUEST_MEDIATED_PMU 40 41 select HAVE_KVM_MSI 41 42 select HAVE_KVM_CPU_RELAX_INTERCEPT 42 43 select HAVE_KVM_NO_POLL
+263 -6
arch/x86/kvm/pmu.c
··· 103 103 #undef __KVM_X86_PMU_OP 104 104 } 105 105 106 - void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) 106 + void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops) 107 107 { 108 108 bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; 109 109 int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; ··· 135 135 enable_pmu = false; 136 136 } 137 137 138 + if (!enable_pmu || !enable_mediated_pmu || !kvm_host_pmu.mediated || 139 + !pmu_ops->is_mediated_pmu_supported(&kvm_host_pmu)) 140 + enable_mediated_pmu = false; 141 + 142 + if (!enable_mediated_pmu) 143 + pmu_ops->write_global_ctrl = NULL; 144 + 138 145 if (!enable_pmu) { 139 146 memset(&kvm_pmu_cap, 0, sizeof(kvm_pmu_cap)); 140 147 return; ··· 158 151 perf_get_hw_event_config(PERF_COUNT_HW_INSTRUCTIONS); 159 152 kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED = 160 153 perf_get_hw_event_config(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 154 + } 155 + 156 + void kvm_handle_guest_mediated_pmi(void) 157 + { 158 + struct kvm_vcpu *vcpu = kvm_get_running_vcpu(); 159 + 160 + if (WARN_ON_ONCE(!vcpu || !kvm_vcpu_has_mediated_pmu(vcpu))) 161 + return; 162 + 163 + kvm_make_request(KVM_REQ_PMI, vcpu); 161 164 } 162 165 163 166 static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi) ··· 379 362 380 363 void pmc_write_counter(struct kvm_pmc *pmc, u64 val) 381 364 { 365 + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) { 366 + pmc->counter = val & pmc_bitmask(pmc); 367 + return; 368 + } 369 + 382 370 /* 383 371 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a 384 372 * read-modify-write. Adjust the counter value so that its value is ··· 520 498 return is_fixed_event_allowed(filter, pmc->idx); 521 499 } 522 500 501 + static void kvm_mediated_pmu_refresh_event_filter(struct kvm_pmc *pmc) 502 + { 503 + bool allowed = pmc_is_event_allowed(pmc); 504 + struct kvm_pmu *pmu = pmc_to_pmu(pmc); 505 + 506 + if (pmc_is_gp(pmc)) { 507 + pmc->eventsel_hw &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 508 + if (allowed) 509 + pmc->eventsel_hw |= pmc->eventsel & 510 + ARCH_PERFMON_EVENTSEL_ENABLE; 511 + } else { 512 + u64 mask = intel_fixed_bits_by_idx(pmc->idx - KVM_FIXED_PMC_BASE_IDX, 0xf); 513 + 514 + pmu->fixed_ctr_ctrl_hw &= ~mask; 515 + if (allowed) 516 + pmu->fixed_ctr_ctrl_hw |= pmu->fixed_ctr_ctrl & mask; 517 + } 518 + } 519 + 523 520 static int reprogram_counter(struct kvm_pmc *pmc) 524 521 { 525 522 struct kvm_pmu *pmu = pmc_to_pmu(pmc); ··· 546 505 u64 new_config = eventsel; 547 506 bool emulate_overflow; 548 507 u8 fixed_ctr_ctrl; 508 + 509 + if (kvm_vcpu_has_mediated_pmu(pmu_to_vcpu(pmu))) { 510 + kvm_mediated_pmu_refresh_event_filter(pmc); 511 + return 0; 512 + } 549 513 550 514 emulate_overflow = pmc_pause_counter(pmc); 551 515 ··· 746 700 return 0; 747 701 } 748 702 703 + static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu) 704 + { 705 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 706 + 707 + if (!kvm_vcpu_has_mediated_pmu(vcpu)) 708 + return true; 709 + 710 + /* 711 + * Note! Check *host* PMU capabilities, not KVM's PMU capabilities, as 712 + * KVM's capabilities are constrained based on KVM support, i.e. KVM's 713 + * capabilities themselves may be a subset of hardware capabilities. 
714 + */ 715 + return pmu->nr_arch_gp_counters != kvm_host_pmu.num_counters_gp || 716 + pmu->nr_arch_fixed_counters != kvm_host_pmu.num_counters_fixed; 717 + } 718 + 719 + bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu) 720 + { 721 + return kvm_need_any_pmc_intercept(vcpu) || 722 + !kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); 723 + } 724 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept); 725 + 726 + bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu) 727 + { 728 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 729 + 730 + /* 731 + * VMware allows access to these Pseduo-PMCs even when read via RDPMC 732 + * in Ring3 when CR4.PCE=0. 733 + */ 734 + if (enable_vmware_backdoor) 735 + return true; 736 + 737 + return kvm_need_any_pmc_intercept(vcpu) || 738 + pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) || 739 + pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1); 740 + } 741 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_rdpmc_intercept); 742 + 749 743 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) 750 744 { 751 745 if (lapic_in_kernel(vcpu)) { ··· 881 795 pmu->global_ctrl = data; 882 796 reprogram_counters(pmu, diff); 883 797 } 798 + /* 799 + * Unconditionally forward writes to vendor code, i.e. to the 800 + * VMC{B,S}, as pmu->global_ctrl is per-VCPU, not per-VMC{B,S}. 801 + */ 802 + if (kvm_vcpu_has_mediated_pmu(vcpu)) 803 + kvm_pmu_call(write_global_ctrl)(data); 884 804 break; 885 805 case MSR_CORE_PERF_GLOBAL_OVF_CTRL: 886 806 /* ··· 927 835 pmc->counter = 0; 928 836 pmc->emulated_counter = 0; 929 837 930 - if (pmc_is_gp(pmc)) 838 + if (pmc_is_gp(pmc)) { 931 839 pmc->eventsel = 0; 840 + pmc->eventsel_hw = 0; 841 + } 932 842 } 933 843 934 - pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0; 844 + pmu->fixed_ctr_ctrl = pmu->fixed_ctr_ctrl_hw = 0; 845 + pmu->global_ctrl = pmu->global_status = 0; 935 846 936 847 kvm_pmu_call(reset)(vcpu); 937 848 } ··· 983 888 * in the global controls). Emulate that behavior when refreshing the 984 889 * PMU so that userspace doesn't need to manually set PERF_GLOBAL_CTRL. 985 890 */ 986 - if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters) 891 + if (pmu->nr_arch_gp_counters && 892 + (kvm_pmu_has_perf_global_ctrl(pmu) || kvm_vcpu_has_mediated_pmu(vcpu))) 987 893 pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0); 894 + 895 + if (kvm_vcpu_has_mediated_pmu(vcpu)) 896 + kvm_pmu_call(write_global_ctrl)(pmu->global_ctrl); 988 897 989 898 bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); 990 899 bitmap_set(pmu->all_valid_pmc_idx, KVM_FIXED_PMC_BASE_IDX, ··· 1031 932 kvm_pmu_reset(vcpu); 1032 933 } 1033 934 935 + static bool pmc_is_pmi_enabled(struct kvm_pmc *pmc) 936 + { 937 + u8 fixed_ctr_ctrl; 938 + 939 + if (pmc_is_gp(pmc)) 940 + return pmc->eventsel & ARCH_PERFMON_EVENTSEL_INT; 941 + 942 + fixed_ctr_ctrl = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl, 943 + pmc->idx - KVM_FIXED_PMC_BASE_IDX); 944 + return fixed_ctr_ctrl & INTEL_FIXED_0_ENABLE_PMI; 945 + } 946 + 1034 947 static void kvm_pmu_incr_counter(struct kvm_pmc *pmc) 1035 948 { 1036 - pmc->emulated_counter++; 1037 - kvm_pmu_request_counter_reprogram(pmc); 949 + struct kvm_vcpu *vcpu = pmc->vcpu; 950 + 951 + /* 952 + * For perf-based PMUs, accumulate software-emulated events separately 953 + * from pmc->counter, as pmc->counter is offset by the count of the 954 + * associated perf event. 
Request reprogramming, which will consult 955 + * both emulated and hardware-generated events to detect overflow. 956 + */ 957 + if (!kvm_vcpu_has_mediated_pmu(vcpu)) { 958 + pmc->emulated_counter++; 959 + kvm_pmu_request_counter_reprogram(pmc); 960 + return; 961 + } 962 + 963 + /* 964 + * For mediated PMUs, pmc->counter is updated when the vCPU's PMU is 965 + * put, and will be loaded into hardware when the PMU is loaded. Simply 966 + * increment the counter and signal overflow if it wraps to zero. 967 + */ 968 + pmc->counter = (pmc->counter + 1) & pmc_bitmask(pmc); 969 + if (!pmc->counter) { 970 + pmc_to_pmu(pmc)->global_status |= BIT_ULL(pmc->idx); 971 + if (pmc_is_pmi_enabled(pmc)) 972 + kvm_make_request(KVM_REQ_PMI, vcpu); 973 + } 1038 974 } 1039 975 1040 976 static inline bool cpl_is_matched(struct kvm_pmc *pmc) ··· 1281 1147 cleanup: 1282 1148 kfree(filter); 1283 1149 return r; 1150 + } 1151 + 1152 + static __always_inline u32 fixed_counter_msr(u32 idx) 1153 + { 1154 + return kvm_pmu_ops.FIXED_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; 1155 + } 1156 + 1157 + static __always_inline u32 gp_counter_msr(u32 idx) 1158 + { 1159 + return kvm_pmu_ops.GP_COUNTER_BASE + idx * kvm_pmu_ops.MSR_STRIDE; 1160 + } 1161 + 1162 + static __always_inline u32 gp_eventsel_msr(u32 idx) 1163 + { 1164 + return kvm_pmu_ops.GP_EVENTSEL_BASE + idx * kvm_pmu_ops.MSR_STRIDE; 1165 + } 1166 + 1167 + static void kvm_pmu_load_guest_pmcs(struct kvm_vcpu *vcpu) 1168 + { 1169 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 1170 + struct kvm_pmc *pmc; 1171 + u32 i; 1172 + 1173 + /* 1174 + * No need to zero out unexposed GP/fixed counters/selectors since RDPMC 1175 + * is intercepted if hardware has counters that aren't visible to the 1176 + * guest (KVM will inject #GP as appropriate). 1177 + */ 1178 + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 1179 + pmc = &pmu->gp_counters[i]; 1180 + 1181 + if (pmc->counter != rdpmc(i)) 1182 + wrmsrl(gp_counter_msr(i), pmc->counter); 1183 + wrmsrl(gp_eventsel_msr(i), pmc->eventsel_hw); 1184 + } 1185 + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { 1186 + pmc = &pmu->fixed_counters[i]; 1187 + 1188 + if (pmc->counter != rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i)) 1189 + wrmsrl(fixed_counter_msr(i), pmc->counter); 1190 + } 1191 + } 1192 + 1193 + void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu) 1194 + { 1195 + if (!kvm_vcpu_has_mediated_pmu(vcpu) || 1196 + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) 1197 + return; 1198 + 1199 + lockdep_assert_irqs_disabled(); 1200 + 1201 + perf_load_guest_context(); 1202 + 1203 + /* 1204 + * Explicitly clear PERF_GLOBAL_CTRL, as "loading" the guest's context 1205 + * disables all individual counters (if any were enabled), but doesn't 1206 + * globally disable the entire PMU. Loading event selectors and PMCs 1207 + * with guest values while PERF_GLOBAL_CTRL is non-zero will generate 1208 + * unexpected events and PMIs. 1209 + * 1210 + * VMX will enable/disable counters at VM-Enter/VM-Exit by atomically 1211 + * loading PERF_GLOBAL_CONTROL. SVM effectively performs the switch by 1212 + * configuring all events to be GUEST_ONLY. Clear PERF_GLOBAL_CONTROL 1213 + * even for SVM to minimize the damage if a perf event is left enabled, 1214 + * and to ensure a consistent starting state. 
1215 + */ 1216 + wrmsrq(kvm_pmu_ops.PERF_GLOBAL_CTRL, 0); 1217 + 1218 + perf_load_guest_lvtpc(kvm_lapic_get_reg(vcpu->arch.apic, APIC_LVTPC)); 1219 + 1220 + kvm_pmu_load_guest_pmcs(vcpu); 1221 + 1222 + kvm_pmu_call(mediated_load)(vcpu); 1223 + } 1224 + 1225 + static void kvm_pmu_put_guest_pmcs(struct kvm_vcpu *vcpu) 1226 + { 1227 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 1228 + struct kvm_pmc *pmc; 1229 + u32 i; 1230 + 1231 + /* 1232 + * Clear selectors and counters to ensure hardware doesn't count using 1233 + * guest controls when the host (perf) restores its state. 1234 + */ 1235 + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 1236 + pmc = &pmu->gp_counters[i]; 1237 + 1238 + pmc->counter = rdpmc(i); 1239 + if (pmc->counter) 1240 + wrmsrq(gp_counter_msr(i), 0); 1241 + if (pmc->eventsel_hw) 1242 + wrmsrq(gp_eventsel_msr(i), 0); 1243 + } 1244 + 1245 + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { 1246 + pmc = &pmu->fixed_counters[i]; 1247 + 1248 + pmc->counter = rdpmc(INTEL_PMC_FIXED_RDPMC_BASE | i); 1249 + if (pmc->counter) 1250 + wrmsrq(fixed_counter_msr(i), 0); 1251 + } 1252 + } 1253 + 1254 + void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu) 1255 + { 1256 + if (!kvm_vcpu_has_mediated_pmu(vcpu) || 1257 + KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm)) 1258 + return; 1259 + 1260 + lockdep_assert_irqs_disabled(); 1261 + 1262 + /* 1263 + * Defer handling of PERF_GLOBAL_CTRL to vendor code. On Intel, it's 1264 + * atomically cleared on VM-Exit, i.e. doesn't need to be clear here. 1265 + */ 1266 + kvm_pmu_call(mediated_put)(vcpu); 1267 + 1268 + kvm_pmu_put_guest_pmcs(vcpu); 1269 + 1270 + perf_put_guest_lvtpc(); 1271 + 1272 + perf_put_guest_context(); 1284 1273 }
+36 -1
arch/x86/kvm/pmu.h
··· 37 37 void (*deliver_pmi)(struct kvm_vcpu *vcpu); 38 38 void (*cleanup)(struct kvm_vcpu *vcpu); 39 39 40 + bool (*is_mediated_pmu_supported)(struct x86_pmu_capability *host_pmu); 41 + void (*mediated_load)(struct kvm_vcpu *vcpu); 42 + void (*mediated_put)(struct kvm_vcpu *vcpu); 43 + void (*write_global_ctrl)(u64 global_ctrl); 44 + 40 45 const u64 EVENTSEL_EVENT; 41 46 const int MAX_NR_GP_COUNTERS; 42 47 const int MIN_NR_GP_COUNTERS; 48 + 49 + const u32 PERF_GLOBAL_CTRL; 50 + const u32 GP_EVENTSEL_BASE; 51 + const u32 GP_COUNTER_BASE; 52 + const u32 FIXED_COUNTER_BASE; 53 + const u32 MSR_STRIDE; 43 54 }; 44 55 45 56 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); 57 + 58 + void kvm_handle_guest_mediated_pmi(void); 46 59 47 60 static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) 48 61 { ··· 69 56 * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. 70 57 */ 71 58 return pmu->version > 1; 59 + } 60 + 61 + static inline bool kvm_vcpu_has_mediated_pmu(struct kvm_vcpu *vcpu) 62 + { 63 + return enable_mediated_pmu && vcpu_to_pmu(vcpu)->version; 72 64 } 73 65 74 66 /* ··· 118 100 static inline u64 pmc_read_counter(struct kvm_pmc *pmc) 119 101 { 120 102 u64 counter, enabled, running; 103 + 104 + if (kvm_vcpu_has_mediated_pmu(pmc->vcpu)) 105 + return pmc->counter & pmc_bitmask(pmc); 121 106 122 107 counter = pmc->counter + pmc->emulated_counter; 123 108 ··· 195 174 196 175 extern struct x86_pmu_capability kvm_pmu_cap; 197 176 198 - void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops); 177 + void kvm_init_pmu_capability(struct kvm_pmu_ops *pmu_ops); 199 178 200 179 void kvm_pmu_recalc_pmc_emulation(struct kvm_pmu *pmu, struct kvm_pmc *pmc); 201 180 ··· 234 213 return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); 235 214 } 236 215 216 + static inline bool kvm_pmu_is_fastpath_emulation_allowed(struct kvm_vcpu *vcpu) 217 + { 218 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 219 + 220 + return !kvm_vcpu_has_mediated_pmu(vcpu) || 221 + !bitmap_intersects(pmu->pmc_counting_instructions, 222 + (unsigned long *)&pmu->global_ctrl, 223 + X86_PMC_IDX_MAX); 224 + } 225 + 237 226 void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); 238 227 void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); 239 228 int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); ··· 258 227 int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp); 259 228 void kvm_pmu_instruction_retired(struct kvm_vcpu *vcpu); 260 229 void kvm_pmu_branch_retired(struct kvm_vcpu *vcpu); 230 + void kvm_mediated_pmu_load(struct kvm_vcpu *vcpu); 231 + void kvm_mediated_pmu_put(struct kvm_vcpu *vcpu); 261 232 262 233 bool is_vmware_backdoor_pmc(u32 pmc_idx); 234 + bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu); 235 + bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu); 263 236 264 237 extern struct kvm_pmu_ops intel_pmu_ops; 265 238 extern struct kvm_pmu_ops amd_pmu_ops;
+17 -1
arch/x86/kvm/svm/nested.c
··· 193 193 * Hardcode the capacity of the array based on the maximum number of _offsets_. 194 194 * MSRs are batched together, so there are fewer offsets than MSRs. 195 195 */ 196 - static int nested_svm_msrpm_merge_offsets[7] __ro_after_init; 196 + static int nested_svm_msrpm_merge_offsets[10] __ro_after_init; 197 197 static int nested_svm_nr_msrpm_merge_offsets __ro_after_init; 198 198 typedef unsigned long nsvm_msrpm_merge_t; 199 199 ··· 221 221 MSR_IA32_LASTBRANCHTOIP, 222 222 MSR_IA32_LASTINTFROMIP, 223 223 MSR_IA32_LASTINTTOIP, 224 + 225 + MSR_K7_PERFCTR0, 226 + MSR_K7_PERFCTR1, 227 + MSR_K7_PERFCTR2, 228 + MSR_K7_PERFCTR3, 229 + MSR_F15H_PERF_CTR0, 230 + MSR_F15H_PERF_CTR1, 231 + MSR_F15H_PERF_CTR2, 232 + MSR_F15H_PERF_CTR3, 233 + MSR_F15H_PERF_CTR4, 234 + MSR_F15H_PERF_CTR5, 235 + 236 + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 237 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 238 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 239 + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, 224 240 }; 225 241 int i, j; 226 242
+44
arch/x86/kvm/svm/pmu.c
··· 166 166 data &= ~pmu->reserved_bits; 167 167 if (data != pmc->eventsel) { 168 168 pmc->eventsel = data; 169 + pmc->eventsel_hw = (data & ~AMD64_EVENTSEL_HOSTONLY) | 170 + AMD64_EVENTSEL_GUESTONLY; 169 171 kvm_pmu_request_counter_reprogram(pmc); 170 172 } 171 173 return 0; ··· 229 227 } 230 228 } 231 229 230 + static bool amd_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) 231 + { 232 + return host_pmu->version >= 2; 233 + } 234 + 235 + static void amd_mediated_pmu_load(struct kvm_vcpu *vcpu) 236 + { 237 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 238 + u64 global_status; 239 + 240 + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, global_status); 241 + /* Clear host global_status MSR if non-zero. */ 242 + if (global_status) 243 + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, global_status); 244 + 245 + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, pmu->global_status); 246 + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, pmu->global_ctrl); 247 + } 248 + 249 + static void amd_mediated_pmu_put(struct kvm_vcpu *vcpu) 250 + { 251 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 252 + 253 + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 0); 254 + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, pmu->global_status); 255 + 256 + /* Clear global status bits if non-zero */ 257 + if (pmu->global_status) 258 + wrmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, pmu->global_status); 259 + } 260 + 232 261 struct kvm_pmu_ops amd_pmu_ops __initdata = { 233 262 .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, 234 263 .msr_idx_to_pmc = amd_msr_idx_to_pmc, ··· 269 236 .set_msr = amd_pmu_set_msr, 270 237 .refresh = amd_pmu_refresh, 271 238 .init = amd_pmu_init, 239 + 240 + .is_mediated_pmu_supported = amd_pmu_is_mediated_pmu_supported, 241 + .mediated_load = amd_mediated_pmu_load, 242 + .mediated_put = amd_mediated_pmu_put, 243 + 272 244 .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, 273 245 .MAX_NR_GP_COUNTERS = KVM_MAX_NR_AMD_GP_COUNTERS, 274 246 .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, 247 + 248 + .PERF_GLOBAL_CTRL = MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 249 + .GP_EVENTSEL_BASE = MSR_F15H_PERF_CTL0, 250 + .GP_COUNTER_BASE = MSR_F15H_PERF_CTR0, 251 + .FIXED_COUNTER_BASE = 0, 252 + .MSR_STRIDE = 2, 275 253 };
+46
arch/x86/kvm/svm/svm.c
··· 170 170 bool vnmi = true; 171 171 module_param(vnmi, bool, 0444); 172 172 173 + module_param(enable_mediated_pmu, bool, 0444); 174 + 173 175 static bool svm_gp_erratum_intercept = true; 174 176 175 177 static u8 rsm_ins_bytes[] = "\x0f\xaa"; ··· 731 729 __free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE)); 732 730 } 733 731 732 + static void svm_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) 733 + { 734 + bool intercept = !kvm_vcpu_has_mediated_pmu(vcpu); 735 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 736 + int i; 737 + 738 + if (!enable_mediated_pmu) 739 + return; 740 + 741 + /* Legacy counters are always available for AMD CPUs with a PMU. */ 742 + for (i = 0; i < min(pmu->nr_arch_gp_counters, AMD64_NUM_COUNTERS); i++) 743 + svm_set_intercept_for_msr(vcpu, MSR_K7_PERFCTR0 + i, 744 + MSR_TYPE_RW, intercept); 745 + 746 + intercept |= !guest_cpu_cap_has(vcpu, X86_FEATURE_PERFCTR_CORE); 747 + for (i = 0; i < pmu->nr_arch_gp_counters; i++) 748 + svm_set_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, 749 + MSR_TYPE_RW, intercept); 750 + 751 + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) 752 + svm_enable_intercept_for_msr(vcpu, MSR_F15H_PERF_CTR + 2 * i, 753 + MSR_TYPE_RW); 754 + 755 + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 756 + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL, 757 + MSR_TYPE_RW, intercept); 758 + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, 759 + MSR_TYPE_RW, intercept); 760 + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, 761 + MSR_TYPE_RW, intercept); 762 + svm_set_intercept_for_msr(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_SET, 763 + MSR_TYPE_RW, intercept); 764 + } 765 + 734 766 static void svm_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 735 767 { 736 768 struct vcpu_svm *svm = to_svm(vcpu); ··· 832 796 833 797 if (sev_es_guest(vcpu->kvm)) 834 798 sev_es_recalc_msr_intercepts(vcpu); 799 + 800 + svm_recalc_pmu_msr_intercepts(vcpu); 835 801 836 802 /* 837 803 * x2APIC intercepts are modified on-demand and cannot be filtered by ··· 1051 1013 svm_clr_intercept(svm, INTERCEPT_VMSAVE); 1052 1014 } 1053 1015 } 1016 + 1017 + if (kvm_need_rdpmc_intercept(vcpu)) 1018 + svm_set_intercept(svm, INTERCEPT_RDPMC); 1019 + else 1020 + svm_clr_intercept(svm, INTERCEPT_RDPMC); 1054 1021 } 1055 1022 1056 1023 static void svm_recalc_intercepts(struct kvm_vcpu *vcpu) ··· 4427 4384 kvm_read_and_reset_apf_flags(); 4428 4385 4429 4386 vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; 4387 + 4388 + if (!msr_write_intercepted(vcpu, MSR_AMD64_PERF_CNTR_GLOBAL_CTL)) 4389 + rdmsrq(MSR_AMD64_PERF_CNTR_GLOBAL_CTL, vcpu_to_pmu(vcpu)->global_ctrl); 4430 4390 4431 4391 trace_kvm_exit(vcpu, KVM_ISA_SVM); 4432 4392
+8 -1
arch/x86/kvm/vmx/capabilities.h
··· 109 109 { 110 110 return (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_CET_STATE); 111 111 } 112 + 113 + static inline bool cpu_has_save_perf_global_ctrl(void) 114 + { 115 + return vmcs_config.vmexit_ctrl & VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; 116 + } 117 + 112 118 static inline bool cpu_has_vmx_mpx(void) 113 119 { 114 120 return vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS; ··· 401 395 402 396 static inline bool vmx_pebs_supported(void) 403 397 { 404 - return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept; 398 + return boot_cpu_has(X86_FEATURE_PEBS) && kvm_pmu_cap.pebs_ept && 399 + !enable_mediated_pmu; 405 400 } 406 401 407 402 static inline bool cpu_has_notify_vmexit(void)
+77 -67
arch/x86/kvm/vmx/nested.c
··· 621 621 msr_bitmap_l0, msr); 622 622 } 623 623 624 + #define nested_vmx_merge_msr_bitmaps(msr, type) \ 625 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, \ 626 + msr_bitmap_l0, msr, type) 627 + 628 + #define nested_vmx_merge_msr_bitmaps_read(msr) \ 629 + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_R) 630 + 631 + #define nested_vmx_merge_msr_bitmaps_write(msr) \ 632 + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_W) 633 + 634 + #define nested_vmx_merge_msr_bitmaps_rw(msr) \ 635 + nested_vmx_merge_msr_bitmaps(msr, MSR_TYPE_RW) 636 + 637 + static void nested_vmx_merge_pmu_msr_bitmaps(struct kvm_vcpu *vcpu, 638 + unsigned long *msr_bitmap_l1, 639 + unsigned long *msr_bitmap_l0) 640 + { 641 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 642 + struct vcpu_vmx *vmx = to_vmx(vcpu); 643 + int i; 644 + 645 + /* 646 + * Skip the merges if the vCPU doesn't have a mediated PMU MSR, i.e. if 647 + * none of the MSRs can possibly be passed through to L1. 648 + */ 649 + if (!kvm_vcpu_has_mediated_pmu(vcpu)) 650 + return; 651 + 652 + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 653 + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PERFCTR0 + i); 654 + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_PMC0 + i); 655 + } 656 + 657 + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 658 + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_FIXED_CTR0 + i); 659 + 660 + nested_vmx_merge_msr_bitmaps_rw(MSR_CORE_PERF_GLOBAL_CTRL); 661 + nested_vmx_merge_msr_bitmaps_read(MSR_CORE_PERF_GLOBAL_STATUS); 662 + nested_vmx_merge_msr_bitmaps_write(MSR_CORE_PERF_GLOBAL_OVF_CTRL); 663 + } 664 + 624 665 /* 625 666 * Merge L0's and L1's MSR bitmap, return false to indicate that 626 667 * we do not use the hardware. ··· 745 704 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. 746 705 */ 747 706 #ifdef CONFIG_X86_64 748 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 749 - MSR_FS_BASE, MSR_TYPE_RW); 750 - 751 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 752 - MSR_GS_BASE, MSR_TYPE_RW); 753 - 754 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 755 - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); 707 + nested_vmx_merge_msr_bitmaps_rw(MSR_FS_BASE); 708 + nested_vmx_merge_msr_bitmaps_rw(MSR_GS_BASE); 709 + nested_vmx_merge_msr_bitmaps_rw(MSR_KERNEL_GS_BASE); 756 710 #endif 757 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 758 - MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); 759 - 760 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 761 - MSR_IA32_PRED_CMD, MSR_TYPE_W); 762 - 763 - nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 764 - MSR_IA32_FLUSH_CMD, MSR_TYPE_W); 711 + nested_vmx_merge_msr_bitmaps_rw(MSR_IA32_SPEC_CTRL); 712 + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_PRED_CMD); 713 + nested_vmx_merge_msr_bitmaps_write(MSR_IA32_FLUSH_CMD); 765 714 766 715 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 767 716 MSR_IA32_APERF, MSR_TYPE_R); ··· 776 745 777 746 nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, 778 747 MSR_IA32_PL3_SSP, MSR_TYPE_RW); 748 + 749 + nested_vmx_merge_pmu_msr_bitmaps(vcpu, msr_bitmap_l1, msr_bitmap_l0); 779 750 780 751 kvm_vcpu_unmap(vcpu, &map); 781 752 ··· 1079 1046 * does not include the time taken for emulation of the L2->L1 1080 1047 * VM-exit in L0, use the more accurate value. 
1081 1048 */ 1082 - if (msr_index == MSR_IA32_TSC) { 1083 - int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest, 1084 - MSR_IA32_TSC); 1049 + if (msr_index == MSR_IA32_TSC && vmx->nested.tsc_autostore_slot >= 0) { 1050 + int slot = vmx->nested.tsc_autostore_slot; 1051 + u64 host_tsc = vmx->msr_autostore.val[slot].value; 1085 1052 1086 - if (i >= 0) { 1087 - u64 val = vmx->msr_autostore.guest.val[i].value; 1088 - 1089 - *data = kvm_read_l1_tsc(vcpu, val); 1090 - return true; 1091 - } 1053 + *data = kvm_read_l1_tsc(vcpu, host_tsc); 1054 + return true; 1092 1055 } 1093 1056 1094 1057 if (kvm_emulate_msr_read(vcpu, msr_index, data)) { ··· 1161 1132 return true; 1162 1133 } 1163 1134 return false; 1164 - } 1165 - 1166 - static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu, 1167 - u32 msr_index) 1168 - { 1169 - struct vcpu_vmx *vmx = to_vmx(vcpu); 1170 - struct vmx_msrs *autostore = &vmx->msr_autostore.guest; 1171 - bool in_vmcs12_store_list; 1172 - int msr_autostore_slot; 1173 - bool in_autostore_list; 1174 - int last; 1175 - 1176 - msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index); 1177 - in_autostore_list = msr_autostore_slot >= 0; 1178 - in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index); 1179 - 1180 - if (in_vmcs12_store_list && !in_autostore_list) { 1181 - if (autostore->nr == MAX_NR_LOADSTORE_MSRS) { 1182 - /* 1183 - * Emulated VMEntry does not fail here. Instead a less 1184 - * accurate value will be returned by 1185 - * nested_vmx_get_vmexit_msr_value() by reading KVM's 1186 - * internal MSR state instead of reading the value from 1187 - * the vmcs02 VMExit MSR-store area. 1188 - */ 1189 - pr_warn_ratelimited( 1190 - "Not enough msr entries in msr_autostore. Can't add msr %x\n", 1191 - msr_index); 1192 - return; 1193 - } 1194 - last = autostore->nr++; 1195 - autostore->val[last].index = msr_index; 1196 - } else if (!in_vmcs12_store_list && in_autostore_list) { 1197 - last = --autostore->nr; 1198 - autostore->val[msr_autostore_slot] = autostore->val[last]; 1199 - } 1200 1135 } 1201 1136 1202 1137 /* ··· 2330 2337 * addresses are constant (for vmcs02), the counts can change based 2331 2338 * on L2's behavior, e.g. switching to/from long mode. 2332 2339 */ 2333 - vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2340 + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 2334 2341 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2335 2342 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2336 2343 ··· 2662 2669 } 2663 2670 2664 2671 /* 2665 - * Make sure the msr_autostore list is up to date before we set the 2666 - * count in the vmcs02. 2672 + * If vmcs12 is configured to save TSC on exit via the auto-store list, 2673 + * append the MSR to vmcs02's auto-store list so that KVM effectively 2674 + * reads TSC at the time of VM-Exit from L2. The saved value will be 2675 + * propagated to vmcs12's list on nested VM-Exit. 2676 + * 2677 + * Don't increment the number of MSRs in the vCPU structure, as saving 2678 + * TSC is specific to this particular incarnation of vmcb02, i.e. must 2679 + * not bleed into vmcs01. 
2667 2680 */ 2668 - prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2681 + if (nested_msr_store_list_has_msr(&vmx->vcpu, MSR_IA32_TSC) && 2682 + !WARN_ON_ONCE(vmx->msr_autostore.nr >= ARRAY_SIZE(vmx->msr_autostore.val))) { 2683 + vmx->nested.tsc_autostore_slot = vmx->msr_autostore.nr; 2684 + vmx->msr_autostore.val[vmx->msr_autostore.nr].index = MSR_IA32_TSC; 2669 2685 2670 - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2686 + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr + 1); 2687 + } else { 2688 + vmx->nested.tsc_autostore_slot = -1; 2689 + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 2690 + } 2671 2691 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2672 2692 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2673 2693 ··· 5124 5118 5125 5119 kvm_nested_vmexit_handle_ibrs(vcpu); 5126 5120 5127 - /* Update any VMCS fields that might have changed while L2 ran */ 5121 + /* 5122 + * Update any VMCS fields that might have changed while vmcs02 was the 5123 + * active VMCS. The tracking is per-vCPU, not per-VMCS. 5124 + */ 5125 + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.nr); 5128 5126 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 5129 5127 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 5130 5128 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+79 -13
arch/x86/kvm/vmx/pmu_intel.c
··· 61 61 int i; 62 62 63 63 pmu->fixed_ctr_ctrl = data; 64 + pmu->fixed_ctr_ctrl_hw = data; 64 65 for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { 65 66 u8 new_ctrl = fixed_ctrl_field(data, i); 66 67 u8 old_ctrl = fixed_ctrl_field(old_fixed_ctr_ctrl, i); ··· 127 126 128 127 *mask &= bitmask; 129 128 return &counters[array_index_nospec(idx, num_counters)]; 130 - } 131 - 132 - static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) 133 - { 134 - if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) 135 - return 0; 136 - 137 - return vcpu->arch.perf_capabilities; 138 - } 139 - 140 - static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) 141 - { 142 - return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; 143 129 } 144 130 145 131 static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) ··· 431 443 432 444 if (data != pmc->eventsel) { 433 445 pmc->eventsel = data; 446 + pmc->eventsel_hw = data; 434 447 kvm_pmu_request_counter_reprogram(pmc); 435 448 } 436 449 break; ··· 756 767 } 757 768 } 758 769 770 + static bool intel_pmu_is_mediated_pmu_supported(struct x86_pmu_capability *host_pmu) 771 + { 772 + u64 host_perf_cap = 0; 773 + 774 + if (boot_cpu_has(X86_FEATURE_PDCM)) 775 + rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 776 + 777 + /* 778 + * Require v4+ for MSR_CORE_PERF_GLOBAL_STATUS_SET, and full-width 779 + * writes so that KVM can precisely load guest counter values. 780 + */ 781 + if (host_pmu->version < 4 || !(host_perf_cap & PERF_CAP_FW_WRITES)) 782 + return false; 783 + 784 + /* 785 + * All CPUs that support a mediated PMU are expected to support loading 786 + * PERF_GLOBAL_CTRL via dedicated VMCS fields. 787 + */ 788 + if (WARN_ON_ONCE(!cpu_has_load_perf_global_ctrl())) 789 + return false; 790 + 791 + return true; 792 + } 793 + 794 + static void intel_pmu_write_global_ctrl(u64 global_ctrl) 795 + { 796 + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, global_ctrl); 797 + } 798 + 799 + 800 + static void intel_mediated_pmu_load(struct kvm_vcpu *vcpu) 801 + { 802 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 803 + u64 global_status, toggle; 804 + 805 + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, global_status); 806 + toggle = pmu->global_status ^ global_status; 807 + if (global_status & toggle) 808 + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, global_status & toggle); 809 + if (pmu->global_status & toggle) 810 + wrmsrq(MSR_CORE_PERF_GLOBAL_STATUS_SET, pmu->global_status & toggle); 811 + 812 + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, pmu->fixed_ctr_ctrl_hw); 813 + } 814 + 815 + static void intel_mediated_pmu_put(struct kvm_vcpu *vcpu) 816 + { 817 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 818 + 819 + /* MSR_CORE_PERF_GLOBAL_CTRL is already saved at VM-exit. */ 820 + rdmsrq(MSR_CORE_PERF_GLOBAL_STATUS, pmu->global_status); 821 + 822 + /* Clear hardware MSR_CORE_PERF_GLOBAL_STATUS MSR, if non-zero. */ 823 + if (pmu->global_status) 824 + wrmsrq(MSR_CORE_PERF_GLOBAL_OVF_CTRL, pmu->global_status); 825 + 826 + /* 827 + * Clear hardware FIXED_CTR_CTRL MSR to avoid information leakage and 828 + * also to avoid accidentally enabling fixed counters (based on guest 829 + * state) while running in the host, e.g. when setting global ctrl. 
830 + */ 831 + if (pmu->fixed_ctr_ctrl_hw) 832 + wrmsrq(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); 833 + } 834 + 759 835 struct kvm_pmu_ops intel_pmu_ops __initdata = { 760 836 .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, 761 837 .msr_idx_to_pmc = intel_msr_idx_to_pmc, ··· 832 778 .reset = intel_pmu_reset, 833 779 .deliver_pmi = intel_pmu_deliver_pmi, 834 780 .cleanup = intel_pmu_cleanup, 781 + 782 + .is_mediated_pmu_supported = intel_pmu_is_mediated_pmu_supported, 783 + .mediated_load = intel_mediated_pmu_load, 784 + .mediated_put = intel_mediated_pmu_put, 785 + .write_global_ctrl = intel_pmu_write_global_ctrl, 786 + 835 787 .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, 836 788 .MAX_NR_GP_COUNTERS = KVM_MAX_NR_INTEL_GP_COUNTERS, 837 789 .MIN_NR_GP_COUNTERS = 1, 790 + 791 + .PERF_GLOBAL_CTRL = MSR_CORE_PERF_GLOBAL_CTRL, 792 + .GP_EVENTSEL_BASE = MSR_P6_EVNTSEL0, 793 + .GP_COUNTER_BASE = MSR_IA32_PMC0, 794 + .FIXED_COUNTER_BASE = MSR_CORE_PERF_FIXED_CTR0, 795 + .MSR_STRIDE = 1, 838 796 };
+15
arch/x86/kvm/vmx/pmu_intel.h
··· 4 4 5 5 #include <linux/kvm_host.h> 6 6 7 + #include "cpuid.h" 8 + 9 + static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu) 10 + { 11 + if (!guest_cpu_cap_has(vcpu, X86_FEATURE_PDCM)) 12 + return 0; 13 + 14 + return vcpu->arch.perf_capabilities; 15 + } 16 + 17 + static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) 18 + { 19 + return (vcpu_get_perf_capabilities(vcpu) & PERF_CAP_FW_WRITES) != 0; 20 + } 21 + 7 22 bool intel_pmu_lbr_is_enabled(struct kvm_vcpu *vcpu); 8 23 int intel_pmu_create_guest_lbr_event(struct kvm_vcpu *vcpu); 9 24
+163 -51
arch/x86/kvm/vmx/vmx.c
··· 150 150 extern bool __read_mostly allow_smaller_maxphyaddr; 151 151 module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); 152 152 153 + module_param(enable_mediated_pmu, bool, 0444); 154 + 153 155 #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) 154 156 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE 155 157 #define KVM_VM_CR0_ALWAYS_ON \ ··· 1029 1027 vm_exit_controls_clearbit(vmx, exit); 1030 1028 } 1031 1029 1032 - int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 1030 + static int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr) 1033 1031 { 1034 1032 unsigned int i; 1035 1033 ··· 1040 1038 return -ENOENT; 1041 1039 } 1042 1040 1043 - static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1041 + static void vmx_remove_auto_msr(struct vmx_msrs *m, u32 msr, 1042 + unsigned long vmcs_count_field) 1044 1043 { 1045 1044 int i; 1045 + 1046 + i = vmx_find_loadstore_msr_slot(m, msr); 1047 + if (i < 0) 1048 + return; 1049 + 1050 + --m->nr; 1051 + m->val[i] = m->val[m->nr]; 1052 + vmcs_write32(vmcs_count_field, m->nr); 1053 + } 1054 + 1055 + static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) 1056 + { 1046 1057 struct msr_autoload *m = &vmx->msr_autoload; 1047 1058 1048 1059 switch (msr) { ··· 1076 1061 } 1077 1062 break; 1078 1063 } 1079 - i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1080 - if (i < 0) 1081 - goto skip_guest; 1082 - --m->guest.nr; 1083 - m->guest.val[i] = m->guest.val[m->guest.nr]; 1084 - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1085 1064 1086 - skip_guest: 1087 - i = vmx_find_loadstore_msr_slot(&m->host, msr); 1088 - if (i < 0) 1089 - return; 1090 - 1091 - --m->host.nr; 1092 - m->host.val[i] = m->host.val[m->host.nr]; 1093 - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1065 + vmx_remove_auto_msr(&m->guest, msr, VM_ENTRY_MSR_LOAD_COUNT); 1066 + vmx_remove_auto_msr(&m->host, msr, VM_EXIT_MSR_LOAD_COUNT); 1094 1067 } 1095 1068 1096 1069 static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx, ··· 1093 1090 vm_exit_controls_setbit(vmx, exit); 1094 1091 } 1095 1092 1096 - static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1097 - u64 guest_val, u64 host_val, bool entry_only) 1093 + static void vmx_add_auto_msr(struct vmx_msrs *m, u32 msr, u64 value, 1094 + unsigned long vmcs_count_field, struct kvm *kvm) 1098 1095 { 1099 - int i, j = 0; 1096 + int i; 1097 + 1098 + i = vmx_find_loadstore_msr_slot(m, msr); 1099 + if (i < 0) { 1100 + if (KVM_BUG_ON(m->nr == MAX_NR_LOADSTORE_MSRS, kvm)) 1101 + return; 1102 + 1103 + i = m->nr++; 1104 + m->val[i].index = msr; 1105 + vmcs_write32(vmcs_count_field, m->nr); 1106 + } 1107 + m->val[i].value = value; 1108 + } 1109 + 1110 + static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, 1111 + u64 guest_val, u64 host_val) 1112 + { 1100 1113 struct msr_autoload *m = &vmx->msr_autoload; 1114 + struct kvm *kvm = vmx->vcpu.kvm; 1101 1115 1102 1116 switch (msr) { 1103 1117 case MSR_EFER: ··· 1148 1128 wrmsrq(MSR_IA32_PEBS_ENABLE, 0); 1149 1129 } 1150 1130 1151 - i = vmx_find_loadstore_msr_slot(&m->guest, msr); 1152 - if (!entry_only) 1153 - j = vmx_find_loadstore_msr_slot(&m->host, msr); 1154 - 1155 - if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) || 1156 - (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) { 1157 - printk_once(KERN_WARNING "Not enough msr switch entries. 
" 1158 - "Can't add msr %x\n", msr); 1159 - return; 1160 - } 1161 - if (i < 0) { 1162 - i = m->guest.nr++; 1163 - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr); 1164 - } 1165 - m->guest.val[i].index = msr; 1166 - m->guest.val[i].value = guest_val; 1167 - 1168 - if (entry_only) 1169 - return; 1170 - 1171 - if (j < 0) { 1172 - j = m->host.nr++; 1173 - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr); 1174 - } 1175 - m->host.val[j].index = msr; 1176 - m->host.val[j].value = host_val; 1131 + vmx_add_auto_msr(&m->guest, msr, guest_val, VM_ENTRY_MSR_LOAD_COUNT, kvm); 1132 + vmx_add_auto_msr(&m->guest, msr, host_val, VM_EXIT_MSR_LOAD_COUNT, kvm); 1177 1133 } 1178 1134 1179 1135 static bool update_transition_efer(struct vcpu_vmx *vmx) ··· 1183 1187 if (!(guest_efer & EFER_LMA)) 1184 1188 guest_efer &= ~EFER_LME; 1185 1189 if (guest_efer != kvm_host.efer) 1186 - add_atomic_switch_msr(vmx, MSR_EFER, 1187 - guest_efer, kvm_host.efer, false); 1190 + add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, kvm_host.efer); 1188 1191 else 1189 1192 clear_atomic_switch_msr(vmx, MSR_EFER); 1190 1193 return false; ··· 1202 1207 vmx->guest_uret_msrs[i].mask = ~ignore_bits; 1203 1208 1204 1209 return true; 1210 + } 1211 + 1212 + static void vmx_add_autostore_msr(struct vcpu_vmx *vmx, u32 msr) 1213 + { 1214 + vmx_add_auto_msr(&vmx->msr_autostore, msr, 0, VM_EXIT_MSR_STORE_COUNT, 1215 + vmx->vcpu.kvm); 1216 + } 1217 + 1218 + static void vmx_remove_autostore_msr(struct vcpu_vmx *vmx, u32 msr) 1219 + { 1220 + vmx_remove_auto_msr(&vmx->msr_autostore, msr, VM_EXIT_MSR_STORE_COUNT); 1205 1221 } 1206 1222 1207 1223 #ifdef CONFIG_X86_32 ··· 4284 4278 } 4285 4279 } 4286 4280 4281 + static void vmx_recalc_pmu_msr_intercepts(struct kvm_vcpu *vcpu) 4282 + { 4283 + u64 vm_exit_controls_bits = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | 4284 + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; 4285 + bool has_mediated_pmu = kvm_vcpu_has_mediated_pmu(vcpu); 4286 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 4287 + struct vcpu_vmx *vmx = to_vmx(vcpu); 4288 + bool intercept = !has_mediated_pmu; 4289 + int i; 4290 + 4291 + if (!enable_mediated_pmu) 4292 + return; 4293 + 4294 + if (!cpu_has_save_perf_global_ctrl()) { 4295 + vm_exit_controls_bits &= ~VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL; 4296 + 4297 + if (has_mediated_pmu) 4298 + vmx_add_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); 4299 + else 4300 + vmx_remove_autostore_msr(vmx, MSR_CORE_PERF_GLOBAL_CTRL); 4301 + } 4302 + 4303 + vm_entry_controls_changebit(vmx, VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, 4304 + has_mediated_pmu); 4305 + 4306 + vm_exit_controls_changebit(vmx, vm_exit_controls_bits, has_mediated_pmu); 4307 + 4308 + for (i = 0; i < pmu->nr_arch_gp_counters; i++) { 4309 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4310 + MSR_TYPE_RW, intercept); 4311 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, MSR_TYPE_RW, 4312 + intercept || !fw_writes_is_enabled(vcpu)); 4313 + } 4314 + for ( ; i < kvm_pmu_cap.num_counters_gp; i++) { 4315 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PERFCTR0 + i, 4316 + MSR_TYPE_RW, true); 4317 + vmx_set_intercept_for_msr(vcpu, MSR_IA32_PMC0 + i, 4318 + MSR_TYPE_RW, true); 4319 + } 4320 + 4321 + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) 4322 + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4323 + MSR_TYPE_RW, intercept); 4324 + for ( ; i < kvm_pmu_cap.num_counters_fixed; i++) 4325 + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_FIXED_CTR0 + i, 4326 + MSR_TYPE_RW, true); 4327 + 4328 + intercept = kvm_need_perf_global_ctrl_intercept(vcpu); 
4329 + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_STATUS, 4330 + MSR_TYPE_RW, intercept); 4331 + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4332 + MSR_TYPE_RW, intercept); 4333 + vmx_set_intercept_for_msr(vcpu, MSR_CORE_PERF_GLOBAL_OVF_CTRL, 4334 + MSR_TYPE_RW, intercept); 4335 + } 4336 + 4287 4337 static void vmx_recalc_msr_intercepts(struct kvm_vcpu *vcpu) 4288 4338 { 4289 4339 bool intercept; ··· 4406 4344 vmx_set_intercept_for_msr(vcpu, MSR_IA32_S_CET, MSR_TYPE_RW, intercept); 4407 4345 } 4408 4346 4347 + vmx_recalc_pmu_msr_intercepts(vcpu); 4348 + 4409 4349 /* 4410 4350 * x2APIC and LBR MSR intercepts are modified on-demand and cannot be 4411 4351 * filtered by userspace. 4412 4352 */ 4413 4353 } 4414 4354 4355 + static void vmx_recalc_instruction_intercepts(struct kvm_vcpu *vcpu) 4356 + { 4357 + exec_controls_changebit(to_vmx(vcpu), CPU_BASED_RDPMC_EXITING, 4358 + kvm_need_rdpmc_intercept(vcpu)); 4359 + } 4360 + 4415 4361 void vmx_recalc_intercepts(struct kvm_vcpu *vcpu) 4416 4362 { 4363 + vmx_recalc_instruction_intercepts(vcpu); 4417 4364 vmx_recalc_msr_intercepts(vcpu); 4418 4365 } 4419 4366 ··· 4590 4519 vmcs_writel(HOST_SSP, 0); 4591 4520 vmcs_writel(HOST_INTR_SSP_TABLE, 0); 4592 4521 } 4522 + 4523 + /* 4524 + * When running a guest with a mediated PMU, guest state is resident in 4525 + * hardware after VM-Exit. Zero PERF_GLOBAL_CTRL on exit so that host 4526 + * activity doesn't bleed into the guest counters. When running with 4527 + * an emulated PMU, PERF_GLOBAL_CTRL is dynamically computed on every 4528 + * entry/exit to merge guest and host PMU usage. 4529 + */ 4530 + if (enable_mediated_pmu) 4531 + vmcs_write64(HOST_IA32_PERF_GLOBAL_CTRL, 0); 4593 4532 } 4594 4533 4595 4534 void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) ··· 4667 4586 VM_EXIT_CLEAR_IA32_RTIT_CTL); 4668 4587 /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ 4669 4588 return vmexit_ctrl & 4670 - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); 4589 + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER | 4590 + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL); 4671 4591 } 4672 4592 4673 4593 void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) ··· 5000 4918 vmcs_write64(VM_FUNCTION_CONTROL, 0); 5001 4919 5002 4920 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); 4921 + vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.val)); 5003 4922 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 5004 4923 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 5005 4924 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); ··· 6667 6584 if (vmcs_read32(VM_ENTRY_MSR_LOAD_COUNT) > 0) 6668 6585 vmx_dump_msrs("guest autoload", &vmx->msr_autoload.guest); 6669 6586 if (vmcs_read32(VM_EXIT_MSR_STORE_COUNT) > 0) 6670 - vmx_dump_msrs("guest autostore", &vmx->msr_autostore.guest); 6587 + vmx_dump_msrs("autostore", &vmx->msr_autostore); 6671 6588 6672 6589 if (vmentry_ctl & VM_ENTRY_LOAD_CET_STATE) 6673 6590 pr_err("S_CET = 0x%016lx, SSP = 0x%016lx, SSP TABLE = 0x%016lx\n", ··· 7419 7336 struct perf_guest_switch_msr *msrs; 7420 7337 struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu); 7421 7338 7339 + if (kvm_vcpu_has_mediated_pmu(&vmx->vcpu)) 7340 + return; 7341 + 7422 7342 pmu->host_cross_mapped_mask = 0; 7423 7343 if (pmu->pebs_enable & pmu->global_ctrl) 7424 7344 intel_pmu_cross_mapped_check(pmu); ··· 7436 7350 clear_atomic_switch_msr(vmx, msrs[i].msr); 7437 7351 else 7438 7352 add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, 7439 - msrs[i].host, false); 7353 + 
msrs[i].host); 7354 + } 7355 + 7356 + static void vmx_refresh_guest_perf_global_control(struct kvm_vcpu *vcpu) 7357 + { 7358 + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); 7359 + struct vcpu_vmx *vmx = to_vmx(vcpu); 7360 + 7361 + if (msr_write_intercepted(vmx, MSR_CORE_PERF_GLOBAL_CTRL)) 7362 + return; 7363 + 7364 + if (!cpu_has_save_perf_global_ctrl()) { 7365 + int slot = vmx_find_loadstore_msr_slot(&vmx->msr_autostore, 7366 + MSR_CORE_PERF_GLOBAL_CTRL); 7367 + 7368 + if (WARN_ON_ONCE(slot < 0)) 7369 + return; 7370 + 7371 + pmu->global_ctrl = vmx->msr_autostore.val[slot].value; 7372 + vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL, pmu->global_ctrl); 7373 + return; 7374 + } 7375 + 7376 + pmu->global_ctrl = vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL); 7440 7377 } 7441 7378 7442 7379 static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit) ··· 7746 7637 return EXIT_FASTPATH_NONE; 7747 7638 7748 7639 vmx->loaded_vmcs->launched = 1; 7640 + 7641 + vmx_refresh_guest_perf_global_control(vcpu); 7749 7642 7750 7643 vmx_recover_nmi_blocking(vmx); 7751 7644 vmx_complete_interrupts(vmx); ··· 8142 8031 if (boot_cpu_has(X86_FEATURE_PDCM)) 8143 8032 rdmsrq(MSR_IA32_PERF_CAPABILITIES, host_perf_cap); 8144 8033 8145 - if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) { 8034 + if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR) && 8035 + !enable_mediated_pmu) { 8146 8036 x86_perf_get_lbr(&vmx_lbr_caps); 8147 8037 8148 8038 /*
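For the general-purpose counters, vmx_recalc_pmu_msr_intercepts() boils down to a simple per-counter policy: with a mediated vPMU, only counters present in the vCPU model are passed through, the IA32_A_PMCx full-width aliases additionally require FW_WRITES, and everything stays intercepted for the emulated vPMU or for hardware counters beyond the guest's model. A standalone sketch of that decision, using toy function names and made-up counter counts rather than kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the per-GP-counter decision in vmx_recalc_pmu_msr_intercepts(). */
static bool intercept_perfctr(bool mediated, int idx, int nr_guest_counters)
{
	if (!mediated)
		return true;			/* emulated vPMU: always intercept */
	return idx >= nr_guest_counters;	/* pass through only the vCPU's counters */
}

static bool intercept_pmc_alias(bool mediated, bool fw_writes, int idx, int nr_guest_counters)
{
	/* The full-width IA32_A_PMCx alias additionally needs FW_WRITES. */
	return intercept_perfctr(mediated, idx, nr_guest_counters) || !fw_writes;
}

int main(void)
{
	const bool mediated = true, fw_writes = false;
	const int nr_guest_counters = 4, nr_hw_counters = 8;

	for (int i = 0; i < nr_hw_counters; i++)
		printf("GP%d: PERFCTR %s, PMC %s\n", i,
		       intercept_perfctr(mediated, i, nr_guest_counters) ? "intercepted" : "passed through",
		       intercept_pmc_alias(mediated, fw_writes, i, nr_guest_counters) ? "intercepted" : "passed through");
	return 0;
}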
+4 -5
arch/x86/kvm/vmx/vmx.h
··· 182 182 u16 vpid02; 183 183 u16 last_vpid; 184 184 185 + int tsc_autostore_slot; 185 186 struct nested_vmx_msrs msrs; 186 187 187 188 /* SMM related state */ ··· 237 236 struct vmx_msrs host; 238 237 } msr_autoload; 239 238 240 - struct msr_autostore { 241 - struct vmx_msrs guest; 242 - } msr_autostore; 239 + struct vmx_msrs msr_autostore; 243 240 244 241 struct { 245 242 int vm86_active; ··· 375 376 unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx); 376 377 bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, 377 378 unsigned int flags); 378 - int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr); 379 379 void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu); 380 380 381 381 void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type, bool set); ··· 499 501 VM_EXIT_CLEAR_BNDCFGS | \ 500 502 VM_EXIT_PT_CONCEAL_PIP | \ 501 503 VM_EXIT_CLEAR_IA32_RTIT_CTL | \ 502 - VM_EXIT_LOAD_CET_STATE) 504 + VM_EXIT_LOAD_CET_STATE | \ 505 + VM_EXIT_SAVE_IA32_PERF_GLOBAL_CTRL) 503 506 504 507 #define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \ 505 508 (PIN_BASED_EXT_INTR_MASK | \
+51 -3
arch/x86/kvm/x86.c
··· 185 185 EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_pmu); 186 186 module_param(enable_pmu, bool, 0444); 187 187 188 + /* Enable/disabled mediated PMU virtualization. */ 189 + bool __read_mostly enable_mediated_pmu; 190 + EXPORT_SYMBOL_FOR_KVM_INTERNAL(enable_mediated_pmu); 191 + 188 192 bool __read_mostly eager_page_split = true; 189 193 module_param(eager_page_split, bool, 0644); 190 194 ··· 2217 2213 2218 2214 fastpath_t handle_fastpath_invd(struct kvm_vcpu *vcpu) 2219 2215 { 2216 + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) 2217 + return EXIT_FASTPATH_NONE; 2218 + 2220 2219 if (!kvm_emulate_invd(vcpu)) 2221 2220 return EXIT_FASTPATH_EXIT_USERSPACE; 2222 2221 ··· 2276 2269 2277 2270 static fastpath_t __handle_fastpath_wrmsr(struct kvm_vcpu *vcpu, u32 msr, u64 data) 2278 2271 { 2272 + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) 2273 + return EXIT_FASTPATH_NONE; 2274 + 2279 2275 switch (msr) { 2280 2276 case APIC_BASE_MSR + (APIC_ICR >> 4): 2281 2277 if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic) || ··· 3954 3944 3955 3945 vcpu->arch.perf_capabilities = data; 3956 3946 kvm_pmu_refresh(vcpu); 3947 + kvm_make_request(KVM_REQ_RECALC_INTERCEPTS, vcpu); 3957 3948 break; 3958 3949 case MSR_IA32_PRED_CMD: { 3959 3950 u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); ··· 6892 6881 break; 6893 6882 6894 6883 mutex_lock(&kvm->lock); 6895 - if (!kvm->created_vcpus) { 6884 + if (!kvm->created_vcpus && !kvm->arch.created_mediated_pmu) { 6896 6885 kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); 6897 6886 r = 0; 6898 6887 } ··· 10162 10151 set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); 10163 10152 #endif 10164 10153 10165 - kvm_register_perf_callbacks(ops->handle_intel_pt_intr); 10154 + __kvm_register_perf_callbacks(ops->handle_intel_pt_intr, 10155 + enable_mediated_pmu ? kvm_handle_guest_mediated_pmi : NULL); 10166 10156 10167 10157 if (IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled) 10168 10158 kvm_caps.supported_vm_types |= BIT(KVM_X86_SW_PROTECTED_VM); ··· 11371 11359 run_flags |= KVM_RUN_LOAD_DEBUGCTL; 11372 11360 vcpu->arch.host_debugctl = debug_ctl; 11373 11361 11362 + kvm_mediated_pmu_load(vcpu); 11363 + 11374 11364 guest_timing_enter_irqoff(); 11375 11365 11376 11366 /* ··· 11410 11396 } 11411 11397 11412 11398 kvm_load_host_pkru(vcpu); 11399 + 11400 + kvm_mediated_pmu_put(vcpu); 11413 11401 11414 11402 /* 11415 11403 * Do this here before restoring debug registers on the host. 
And ··· 11750 11734 11751 11735 fastpath_t handle_fastpath_hlt(struct kvm_vcpu *vcpu) 11752 11736 { 11737 + if (!kvm_pmu_is_fastpath_emulation_allowed(vcpu)) 11738 + return EXIT_FASTPATH_NONE; 11739 + 11753 11740 if (!kvm_emulate_halt(vcpu)) 11754 11741 return EXIT_FASTPATH_EXIT_USERSPACE; 11755 11742 ··· 12692 12673 return 0; 12693 12674 } 12694 12675 12676 + #define PERF_MEDIATED_PMU_MSG \ 12677 + "Failed to enable mediated vPMU, try disabling system wide perf events and nmi_watchdog.\n" 12678 + 12695 12679 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) 12696 12680 { 12681 + int r; 12682 + 12697 12683 if (kvm_check_tsc_unstable() && kvm->created_vcpus) 12698 12684 pr_warn_once("SMP vm created on host with unstable TSC; " 12699 12685 "guest TSC will not be reliable\n"); ··· 12709 12685 if (id >= kvm->arch.max_vcpu_ids) 12710 12686 return -EINVAL; 12711 12687 12712 - return kvm_x86_call(vcpu_precreate)(kvm); 12688 + /* 12689 + * Note, any actions done by .vcpu_create() must be idempotent with 12690 + * respect to creating multiple vCPUs, and therefore are not undone if 12691 + * creating a vCPU fails (including failure during pre-create). 12692 + */ 12693 + r = kvm_x86_call(vcpu_precreate)(kvm); 12694 + if (r) 12695 + return r; 12696 + 12697 + if (enable_mediated_pmu && kvm->arch.enable_pmu && 12698 + !kvm->arch.created_mediated_pmu) { 12699 + if (irqchip_in_kernel(kvm)) { 12700 + r = perf_create_mediated_pmu(); 12701 + if (r) { 12702 + pr_warn_ratelimited(PERF_MEDIATED_PMU_MSG); 12703 + return r; 12704 + } 12705 + kvm->arch.created_mediated_pmu = true; 12706 + } else { 12707 + kvm->arch.enable_pmu = false; 12708 + } 12709 + } 12710 + return 0; 12713 12711 } 12714 12712 12715 12713 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) ··· 13397 13351 __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); 13398 13352 mutex_unlock(&kvm->slots_lock); 13399 13353 } 13354 + if (kvm->arch.created_mediated_pmu) 13355 + perf_release_mediated_pmu(); 13400 13356 kvm_destroy_vcpus(kvm); 13401 13357 kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); 13402 13358 #ifdef CONFIG_KVM_IOAPIC
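The precreate hook above is why mediated PMU setup can fail while system-wide perf sessions or the NMI watchdog are active: perf_create_mediated_pmu() refuses to hand the PMU to a VM as long as !exclude_guest events exist on mediated-capable PMUs. Host-side profiling that sets exclude_guest remains allowed; such events are simply scheduled out while a mediated-PMU guest occupies the hardware. A small example of opening such an event from user space, assuming a Linux host with perf events available and with error handling kept minimal:

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.disabled = 1;
	attr.exclude_guest = 1;	/* don't count while a guest is running */

	fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
		     -1 /* any CPU */, -1 /* no group */, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
	sleep(1);
	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}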
+1
arch/x86/kvm/x86.h
··· 481 481 extern struct kvm_host_values kvm_host; 482 482 483 483 extern bool enable_pmu; 484 + extern bool enable_mediated_pmu; 484 485 485 486 void kvm_setup_xss_caps(void); 486 487
+1
include/asm-generic/Kbuild
··· 32 32 mandatory-y += kdebug.h 33 33 mandatory-y += kmap_size.h 34 34 mandatory-y += kprobes.h 35 + mandatory-y += kvm_types.h 35 36 mandatory-y += linkage.h 36 37 mandatory-y += local.h 37 38 mandatory-y += local64.h
+9 -2
include/linux/kvm_host.h
··· 1756 1756 #ifdef CONFIG_GUEST_PERF_EVENTS 1757 1757 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu); 1758 1758 1759 - void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)); 1759 + void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), 1760 + void (*mediated_pmi_handler)(void)); 1761 + 1762 + static inline void kvm_register_perf_callbacks(void) 1763 + { 1764 + __kvm_register_perf_callbacks(NULL, NULL); 1765 + } 1766 + 1760 1767 void kvm_unregister_perf_callbacks(void); 1761 1768 #else 1762 - static inline void kvm_register_perf_callbacks(void *ign) {} 1769 + static inline void kvm_register_perf_callbacks(void) {} 1763 1770 static inline void kvm_unregister_perf_callbacks(void) {} 1764 1771 #endif /* CONFIG_GUEST_PERF_EVENTS */ 1765 1772
+29 -6
include/linux/perf_event.h
··· 305 305 #define PERF_PMU_CAP_EXTENDED_HW_TYPE 0x0100 306 306 #define PERF_PMU_CAP_AUX_PAUSE 0x0200 307 307 #define PERF_PMU_CAP_AUX_PREFER_LARGE 0x0400 308 + #define PERF_PMU_CAP_MEDIATED_VPMU 0x0800 308 309 309 310 /** 310 311 * pmu::scope ··· 999 998 u64 index; 1000 999 }; 1001 1000 1001 + struct perf_time_ctx { 1002 + u64 time; 1003 + u64 stamp; 1004 + u64 offset; 1005 + }; 1002 1006 1003 1007 /** 1004 1008 * struct perf_event_context - event context structure ··· 1042 1036 /* 1043 1037 * Context clock, runs when context enabled. 1044 1038 */ 1045 - u64 time; 1046 - u64 timestamp; 1047 - u64 timeoffset; 1039 + struct perf_time_ctx time; 1040 + 1041 + /* 1042 + * Context clock, runs when in the guest mode. 1043 + */ 1044 + struct perf_time_ctx timeguest; 1048 1045 1049 1046 /* 1050 1047 * These fields let us detect when two contexts have both ··· 1180 1171 * This is a per-cpu dynamically allocated data structure. 1181 1172 */ 1182 1173 struct perf_cgroup_info { 1183 - u64 time; 1184 - u64 timestamp; 1185 - u64 timeoffset; 1174 + struct perf_time_ctx time; 1175 + struct perf_time_ctx timeguest; 1186 1176 int active; 1187 1177 }; 1188 1178 ··· 1677 1669 unsigned int (*state)(void); 1678 1670 unsigned long (*get_ip)(void); 1679 1671 unsigned int (*handle_intel_pt_intr)(void); 1672 + 1673 + void (*handle_mediated_pmi)(void); 1680 1674 }; 1681 1675 1682 1676 #ifdef CONFIG_GUEST_PERF_EVENTS ··· 1688 1678 DECLARE_STATIC_CALL(__perf_guest_state, *perf_guest_cbs->state); 1689 1679 DECLARE_STATIC_CALL(__perf_guest_get_ip, *perf_guest_cbs->get_ip); 1690 1680 DECLARE_STATIC_CALL(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); 1681 + DECLARE_STATIC_CALL(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); 1691 1682 1692 1683 static inline unsigned int perf_guest_state(void) 1693 1684 { ··· 1703 1692 static inline unsigned int perf_guest_handle_intel_pt_intr(void) 1704 1693 { 1705 1694 return static_call(__perf_guest_handle_intel_pt_intr)(); 1695 + } 1696 + 1697 + static inline void perf_guest_handle_mediated_pmi(void) 1698 + { 1699 + static_call(__perf_guest_handle_mediated_pmi)(); 1706 1700 } 1707 1701 1708 1702 extern void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs); ··· 1929 1913 extern int perf_event_account_interrupt(struct perf_event *event); 1930 1914 extern int perf_event_period(struct perf_event *event, u64 value); 1931 1915 extern u64 perf_event_pause(struct perf_event *event, bool reset); 1916 + 1917 + #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU 1918 + int perf_create_mediated_pmu(void); 1919 + void perf_release_mediated_pmu(void); 1920 + void perf_load_guest_context(void); 1921 + void perf_put_guest_context(void); 1922 + #endif 1932 1923 1933 1924 #else /* !CONFIG_PERF_EVENTS: */ 1934 1925
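The struct perf_time_ctx split keeps the existing lock-free offset trick (time-at-now == now + offset) and adds a parallel guest clock, so an exclude_guest event's enabled/running time becomes total time minus guest time. A tiny worked example of both computations, in plain C with made-up timestamps:

#include <stdint.h>
#include <stdio.h>

struct toy_time_ctx {
	uint64_t time;		/* accumulated time while "enabled" */
	uint64_t stamp;		/* timestamp of the last update */
	uint64_t offset;	/* time - stamp, for lock-free reads */
};

static void toy_update(struct toy_time_ctx *t, uint64_t now)
{
	t->time += now - t->stamp;
	t->stamp = now;
	t->offset = t->time - t->stamp;	/* so that time-at-now == now + offset */
}

int main(void)
{
	struct toy_time_ctx total = { 0 };		/* context clock, runs from t=0 */
	struct toy_time_ctx guest = { .stamp = 100 };	/* guest clock, starts at VM-entry */

	toy_update(&total, 100);	/* host ran for 100 units, then entered the guest */
	toy_update(&guest, 130);	/* 30 units spent inside the guest */
	toy_update(&total, 160);	/* back on the host until t=160 */

	/* exclude_guest view of the same context: total minus guest time */
	printf("total=%llu guest=%llu exclude_guest=%llu\n",
	       (unsigned long long)total.time, (unsigned long long)guest.time,
	       (unsigned long long)(total.time - guest.time));

	/* lock-free read at now=200 (the same identity applies to the guest clock) */
	printf("total@200=%llu\n", (unsigned long long)(200 + total.offset));
	return 0;
}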
+16 -2
include/linux/unwind_user.h
··· 5 5 #include <linux/unwind_user_types.h> 6 6 #include <asm/unwind_user.h> 7 7 8 - #ifndef ARCH_INIT_USER_FP_FRAME 9 - #define ARCH_INIT_USER_FP_FRAME 8 + #ifndef CONFIG_HAVE_UNWIND_USER_FP 9 + 10 + #define ARCH_INIT_USER_FP_FRAME(ws) 11 + 12 + #endif 13 + 14 + #ifndef ARCH_INIT_USER_FP_ENTRY_FRAME 15 + #define ARCH_INIT_USER_FP_ENTRY_FRAME(ws) 16 + #endif 17 + 18 + #ifndef unwind_user_at_function_start 19 + static inline bool unwind_user_at_function_start(struct pt_regs *regs) 20 + { 21 + return false; 22 + } 23 + #define unwind_user_at_function_start unwind_user_at_function_start 10 24 #endif 11 25 12 26 int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries);
+4
init/Kconfig
··· 2061 2061 bool 2062 2062 depends on HAVE_PERF_EVENTS 2063 2063 2064 + config PERF_GUEST_MEDIATED_PMU 2065 + bool 2066 + depends on GUEST_PERF_EVENTS 2067 + 2064 2068 config PERF_USE_VMALLOC 2065 2069 bool 2066 2070 help
+403 -120
kernel/events/core.c
··· 57 57 #include <linux/task_work.h> 58 58 #include <linux/percpu-rwsem.h> 59 59 #include <linux/unwind_deferred.h> 60 + #include <linux/kvm_types.h> 60 61 61 62 #include "internal.h" 62 63 ··· 167 166 EVENT_CPU = 0x10, 168 167 EVENT_CGROUP = 0x20, 169 168 169 + /* 170 + * EVENT_GUEST is set when scheduling in/out events between the host 171 + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST 172 + * is used: 173 + * 174 + * - In for_each_epc() to skip PMUs that don't support events in a 175 + * MEDIATED_VPMU guest, i.e. don't need to be context switched. 176 + * - To indicate the start/end point of the events in a guest. Guest 177 + * running time is deducted for host-only (exclude_guest) events. 178 + */ 179 + EVENT_GUEST = 0x40, 180 + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, 170 181 /* compound helpers */ 171 182 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 172 183 EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, ··· 470 457 static cpumask_var_t perf_online_pkg_mask; 471 458 static cpumask_var_t perf_online_sys_mask; 472 459 static struct kmem_cache *perf_event_cache; 460 + 461 + #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU 462 + static DEFINE_PER_CPU(bool, guest_ctx_loaded); 463 + 464 + static __always_inline bool is_guest_mediated_pmu_loaded(void) 465 + { 466 + return __this_cpu_read(guest_ctx_loaded); 467 + } 468 + #else 469 + static __always_inline bool is_guest_mediated_pmu_loaded(void) 470 + { 471 + return false; 472 + } 473 + #endif 473 474 474 475 /* 475 476 * perf event paranoia level: ··· 806 779 ___p; \ 807 780 }) 808 781 809 - #define for_each_epc(_epc, _ctx, _pmu, _cgroup) \ 782 + static bool perf_skip_pmu_ctx(struct perf_event_pmu_context *pmu_ctx, 783 + enum event_type_t event_type) 784 + { 785 + if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) 786 + return true; 787 + if ((event_type & EVENT_GUEST) && 788 + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) 789 + return true; 790 + return false; 791 + } 792 + 793 + #define for_each_epc(_epc, _ctx, _pmu, _event_type) \ 810 794 list_for_each_entry(_epc, &((_ctx)->pmu_ctx_list), pmu_ctx_entry) \ 811 - if (_cgroup && !_epc->nr_cgroups) \ 795 + if (perf_skip_pmu_ctx(_epc, _event_type)) \ 812 796 continue; \ 813 797 else if (_pmu && _epc->pmu != _pmu) \ 814 798 continue; \ 815 799 else 816 800 817 - static void perf_ctx_disable(struct perf_event_context *ctx, bool cgroup) 801 + static void perf_ctx_disable(struct perf_event_context *ctx, 802 + enum event_type_t event_type) 818 803 { 819 804 struct perf_event_pmu_context *pmu_ctx; 820 805 821 - for_each_epc(pmu_ctx, ctx, NULL, cgroup) 806 + for_each_epc(pmu_ctx, ctx, NULL, event_type) 822 807 perf_pmu_disable(pmu_ctx->pmu); 823 808 } 824 809 825 - static void perf_ctx_enable(struct perf_event_context *ctx, bool cgroup) 810 + static void perf_ctx_enable(struct perf_event_context *ctx, 811 + enum event_type_t event_type) 826 812 { 827 813 struct perf_event_pmu_context *pmu_ctx; 828 814 829 - for_each_epc(pmu_ctx, ctx, NULL, cgroup) 815 + for_each_epc(pmu_ctx, ctx, NULL, event_type) 830 816 perf_pmu_enable(pmu_ctx->pmu); 831 817 } 832 818 833 819 static void ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 834 820 static void ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type); 821 + 822 + static inline void update_perf_time_ctx(struct perf_time_ctx *time, u64 now, bool adv) 823 + { 824 + if (adv) 825 + time->time += now - time->stamp; 826 + time->stamp = now; 827 + 828 
+ /* 829 + * The above: time' = time + (now - timestamp), can be re-arranged 830 + * into: time` = now + (time - timestamp), which gives a single value 831 + * offset to compute future time without locks on. 832 + * 833 + * See perf_event_time_now(), which can be used from NMI context where 834 + * it's (obviously) not possible to acquire ctx->lock in order to read 835 + * both the above values in a consistent manner. 836 + */ 837 + WRITE_ONCE(time->offset, time->time - time->stamp); 838 + } 839 + 840 + static_assert(offsetof(struct perf_event_context, timeguest) - 841 + offsetof(struct perf_event_context, time) == 842 + sizeof(struct perf_time_ctx)); 843 + 844 + #define T_TOTAL 0 845 + #define T_GUEST 1 846 + 847 + static inline u64 __perf_event_time_ctx(struct perf_event *event, 848 + struct perf_time_ctx *times) 849 + { 850 + u64 time = times[T_TOTAL].time; 851 + 852 + if (event->attr.exclude_guest) 853 + time -= times[T_GUEST].time; 854 + 855 + return time; 856 + } 857 + 858 + static inline u64 __perf_event_time_ctx_now(struct perf_event *event, 859 + struct perf_time_ctx *times, 860 + u64 now) 861 + { 862 + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { 863 + /* 864 + * (now + times[total].offset) - (now + times[guest].offset) := 865 + * times[total].offset - times[guest].offset 866 + */ 867 + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); 868 + } 869 + 870 + return now + READ_ONCE(times[T_TOTAL].offset); 871 + } 835 872 836 873 #ifdef CONFIG_CGROUP_PERF 837 874 ··· 933 842 return event->cgrp != NULL; 934 843 } 935 844 845 + static_assert(offsetof(struct perf_cgroup_info, timeguest) - 846 + offsetof(struct perf_cgroup_info, time) == 847 + sizeof(struct perf_time_ctx)); 848 + 936 849 static inline u64 perf_cgroup_event_time(struct perf_event *event) 937 850 { 938 851 struct perf_cgroup_info *t; 939 852 940 853 t = per_cpu_ptr(event->cgrp->info, event->cpu); 941 - return t->time; 854 + return __perf_event_time_ctx(event, &t->time); 942 855 } 943 856 944 857 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) ··· 951 856 952 857 t = per_cpu_ptr(event->cgrp->info, event->cpu); 953 858 if (!__load_acquire(&t->active)) 954 - return t->time; 955 - now += READ_ONCE(t->timeoffset); 956 - return now; 859 + return __perf_event_time_ctx(event, &t->time); 860 + 861 + return __perf_event_time_ctx_now(event, &t->time, now); 957 862 } 958 863 959 - static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) 864 + static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) 960 865 { 961 - if (adv) 962 - info->time += now - info->timestamp; 963 - info->timestamp = now; 964 - /* 965 - * see update_context_time() 966 - */ 967 - WRITE_ONCE(info->timeoffset, info->time - info->timestamp); 866 + update_perf_time_ctx(&info->timeguest, now, adv); 867 + } 868 + 869 + static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) 870 + { 871 + update_perf_time_ctx(&info->time, now, true); 872 + if (is_guest_mediated_pmu_loaded()) 873 + __update_cgrp_guest_time(info, now, true); 968 874 } 969 875 970 876 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) ··· 981 885 cgrp = container_of(css, struct perf_cgroup, css); 982 886 info = this_cpu_ptr(cgrp->info); 983 887 984 - __update_cgrp_time(info, now, true); 888 + update_cgrp_time(info, now); 985 889 if (final) 986 890 __store_release(&info->active, 0); 987 891 } ··· 1004 
908 * Do not update time when cgroup is not active 1005 909 */ 1006 910 if (info->active) 1007 - __update_cgrp_time(info, perf_clock(), true); 911 + update_cgrp_time(info, perf_clock()); 1008 912 } 1009 913 1010 914 static inline void 1011 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 915 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1012 916 { 1013 917 struct perf_event_context *ctx = &cpuctx->ctx; 1014 918 struct perf_cgroup *cgrp = cpuctx->cgrp; ··· 1028 932 for (css = &cgrp->css; css; css = css->parent) { 1029 933 cgrp = container_of(css, struct perf_cgroup, css); 1030 934 info = this_cpu_ptr(cgrp->info); 1031 - __update_cgrp_time(info, ctx->timestamp, false); 1032 - __store_release(&info->active, 1); 935 + if (guest) { 936 + __update_cgrp_guest_time(info, ctx->time.stamp, false); 937 + } else { 938 + update_perf_time_ctx(&info->time, ctx->time.stamp, false); 939 + __store_release(&info->active, 1); 940 + } 1033 941 } 1034 942 } 1035 943 ··· 1064 964 return; 1065 965 1066 966 WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0); 1067 - 1068 - perf_ctx_disable(&cpuctx->ctx, true); 967 + perf_ctx_disable(&cpuctx->ctx, EVENT_CGROUP); 1069 968 1070 969 ctx_sched_out(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 1071 970 /* ··· 1080 981 */ 1081 982 ctx_sched_in(&cpuctx->ctx, NULL, EVENT_ALL|EVENT_CGROUP); 1082 983 1083 - perf_ctx_enable(&cpuctx->ctx, true); 984 + perf_ctx_enable(&cpuctx->ctx, EVENT_CGROUP); 1084 985 } 1085 986 1086 987 static int perf_cgroup_ensure_storage(struct perf_event *event, ··· 1237 1138 } 1238 1139 1239 1140 static inline void 1240 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 1141 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1241 1142 { 1242 1143 } 1243 1144 ··· 1649 1550 */ 1650 1551 static void __update_context_time(struct perf_event_context *ctx, bool adv) 1651 1552 { 1652 - u64 now = perf_clock(); 1653 - 1654 1553 lockdep_assert_held(&ctx->lock); 1655 1554 1656 - if (adv) 1657 - ctx->time += now - ctx->timestamp; 1658 - ctx->timestamp = now; 1555 + update_perf_time_ctx(&ctx->time, perf_clock(), adv); 1556 + } 1659 1557 1660 - /* 1661 - * The above: time' = time + (now - timestamp), can be re-arranged 1662 - * into: time` = now + (time - timestamp), which gives a single value 1663 - * offset to compute future time without locks on. 1664 - * 1665 - * See perf_event_time_now(), which can be used from NMI context where 1666 - * it's (obviously) not possible to acquire ctx->lock in order to read 1667 - * both the above values in a consistent manner. 
1668 - */ 1669 - WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); 1558 + static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) 1559 + { 1560 + lockdep_assert_held(&ctx->lock); 1561 + 1562 + /* must be called after __update_context_time(); */ 1563 + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); 1670 1564 } 1671 1565 1672 1566 static void update_context_time(struct perf_event_context *ctx) 1673 1567 { 1674 1568 __update_context_time(ctx, true); 1569 + if (is_guest_mediated_pmu_loaded()) 1570 + __update_context_guest_time(ctx, true); 1675 1571 } 1676 1572 1677 1573 static u64 perf_event_time(struct perf_event *event) ··· 1679 1585 if (is_cgroup_event(event)) 1680 1586 return perf_cgroup_event_time(event); 1681 1587 1682 - return ctx->time; 1588 + return __perf_event_time_ctx(event, &ctx->time); 1683 1589 } 1684 1590 1685 1591 static u64 perf_event_time_now(struct perf_event *event, u64 now) ··· 1693 1599 return perf_cgroup_event_time_now(event, now); 1694 1600 1695 1601 if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) 1696 - return ctx->time; 1602 + return __perf_event_time_ctx(event, &ctx->time); 1697 1603 1698 - now += READ_ONCE(ctx->timeoffset); 1699 - return now; 1604 + return __perf_event_time_ctx_now(event, &ctx->time, now); 1700 1605 } 1701 1606 1702 1607 static enum event_type_t get_event_type(struct perf_event *event) ··· 2515 2422 } 2516 2423 2517 2424 static inline void 2518 - __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) 2425 + __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, 2426 + bool final, enum event_type_t event_type) 2519 2427 { 2520 2428 if (ctx->is_active & EVENT_TIME) { 2521 2429 if (ctx->is_active & EVENT_FROZEN) 2522 2430 return; 2431 + 2523 2432 update_context_time(ctx); 2524 - update_cgrp_time_from_cpuctx(cpuctx, final); 2433 + /* vPMU should not stop time */ 2434 + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); 2525 2435 } 2526 2436 } 2527 2437 2528 2438 static inline void 2529 2439 ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2530 2440 { 2531 - __ctx_time_update(cpuctx, ctx, false); 2441 + __ctx_time_update(cpuctx, ctx, false, 0); 2532 2442 } 2533 2443 2534 2444 /* ··· 2957 2861 2958 2862 static void perf_event_sched_in(struct perf_cpu_context *cpuctx, 2959 2863 struct perf_event_context *ctx, 2960 - struct pmu *pmu) 2864 + struct pmu *pmu, 2865 + enum event_type_t event_type) 2961 2866 { 2962 - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED); 2867 + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_PINNED | event_type); 2963 2868 if (ctx) 2964 - ctx_sched_in(ctx, pmu, EVENT_PINNED); 2965 - ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 2869 + ctx_sched_in(ctx, pmu, EVENT_PINNED | event_type); 2870 + ctx_sched_in(&cpuctx->ctx, pmu, EVENT_FLEXIBLE | event_type); 2966 2871 if (ctx) 2967 - ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE); 2872 + ctx_sched_in(ctx, pmu, EVENT_FLEXIBLE | event_type); 2968 2873 } 2969 2874 2970 2875 /* ··· 2999 2902 3000 2903 event_type &= EVENT_ALL; 3001 2904 3002 - for_each_epc(epc, &cpuctx->ctx, pmu, false) 2905 + for_each_epc(epc, &cpuctx->ctx, pmu, 0) 3003 2906 perf_pmu_disable(epc->pmu); 3004 2907 3005 2908 if (task_ctx) { 3006 - for_each_epc(epc, task_ctx, pmu, false) 2909 + for_each_epc(epc, task_ctx, pmu, 0) 3007 2910 perf_pmu_disable(epc->pmu); 3008 2911 3009 2912 task_ctx_sched_out(task_ctx, pmu, event_type); ··· 3021 2924 else if (event_type & 
EVENT_PINNED) 3022 2925 ctx_sched_out(&cpuctx->ctx, pmu, EVENT_FLEXIBLE); 3023 2926 3024 - perf_event_sched_in(cpuctx, task_ctx, pmu); 2927 + perf_event_sched_in(cpuctx, task_ctx, pmu, 0); 3025 2928 3026 - for_each_epc(epc, &cpuctx->ctx, pmu, false) 2929 + for_each_epc(epc, &cpuctx->ctx, pmu, 0) 3027 2930 perf_pmu_enable(epc->pmu); 3028 2931 3029 2932 if (task_ctx) { 3030 - for_each_epc(epc, task_ctx, pmu, false) 2933 + for_each_epc(epc, task_ctx, pmu, 0) 3031 2934 perf_pmu_enable(epc->pmu); 3032 2935 } 3033 2936 } ··· 3576 3479 ctx_sched_out(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) 3577 3480 { 3578 3481 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 3482 + enum event_type_t active_type = event_type & ~EVENT_FLAGS; 3579 3483 struct perf_event_pmu_context *pmu_ctx; 3580 3484 int is_active = ctx->is_active; 3581 - bool cgroup = event_type & EVENT_CGROUP; 3582 3485 3583 - event_type &= ~EVENT_CGROUP; 3584 3486 3585 3487 lockdep_assert_held(&ctx->lock); 3586 3488 ··· 3603 3507 * 3604 3508 * would only update time for the pinned events. 3605 3509 */ 3606 - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); 3510 + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); 3607 3511 3608 3512 /* 3609 3513 * CPU-release for the below ->is_active store, 3610 3514 * see __load_acquire() in perf_event_time_now() 3611 3515 */ 3612 3516 barrier(); 3613 - ctx->is_active &= ~event_type; 3517 + ctx->is_active &= ~active_type; 3614 3518 3615 3519 if (!(ctx->is_active & EVENT_ALL)) { 3616 3520 /* ··· 3629 3533 cpuctx->task_ctx = NULL; 3630 3534 } 3631 3535 3632 - is_active ^= ctx->is_active; /* changed bits */ 3536 + if (event_type & EVENT_GUEST) { 3537 + /* 3538 + * Schedule out all exclude_guest events of PMU 3539 + * with PERF_PMU_CAP_MEDIATED_VPMU. 3540 + */ 3541 + is_active = EVENT_ALL; 3542 + __update_context_guest_time(ctx, false); 3543 + perf_cgroup_set_timestamp(cpuctx, true); 3544 + barrier(); 3545 + } else { 3546 + is_active ^= ctx->is_active; /* changed bits */ 3547 + } 3633 3548 3634 - for_each_epc(pmu_ctx, ctx, pmu, cgroup) 3549 + for_each_epc(pmu_ctx, ctx, pmu, event_type) 3635 3550 __pmu_ctx_sched_out(pmu_ctx, is_active); 3636 3551 } 3637 3552 ··· 3798 3691 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 3799 3692 if (context_equiv(ctx, next_ctx)) { 3800 3693 3801 - perf_ctx_disable(ctx, false); 3694 + perf_ctx_disable(ctx, 0); 3802 3695 3803 3696 /* PMIs are disabled; ctx->nr_no_switch_fast is stable. 
*/ 3804 3697 if (local_read(&ctx->nr_no_switch_fast) || ··· 3822 3715 3823 3716 perf_ctx_sched_task_cb(ctx, task, false); 3824 3717 3825 - perf_ctx_enable(ctx, false); 3718 + perf_ctx_enable(ctx, 0); 3826 3719 3827 3720 /* 3828 3721 * RCU_INIT_POINTER here is safe because we've not ··· 3846 3739 3847 3740 if (do_switch) { 3848 3741 raw_spin_lock(&ctx->lock); 3849 - perf_ctx_disable(ctx, false); 3742 + perf_ctx_disable(ctx, 0); 3850 3743 3851 3744 inside_switch: 3852 3745 perf_ctx_sched_task_cb(ctx, task, false); 3853 3746 task_ctx_sched_out(ctx, NULL, EVENT_ALL); 3854 3747 3855 - perf_ctx_enable(ctx, false); 3748 + perf_ctx_enable(ctx, 0); 3856 3749 raw_spin_unlock(&ctx->lock); 3857 3750 } 3858 3751 } ··· 4099 3992 event_update_userpage(event); 4100 3993 } 4101 3994 3995 + struct merge_sched_data { 3996 + int can_add_hw; 3997 + enum event_type_t event_type; 3998 + }; 3999 + 4102 4000 static int merge_sched_in(struct perf_event *event, void *data) 4103 4001 { 4104 4002 struct perf_event_context *ctx = event->ctx; 4105 - int *can_add_hw = data; 4003 + struct merge_sched_data *msd = data; 4106 4004 4107 4005 if (event->state <= PERF_EVENT_STATE_OFF) 4108 4006 return 0; ··· 4115 4003 if (!event_filter_match(event)) 4116 4004 return 0; 4117 4005 4118 - if (group_can_go_on(event, *can_add_hw)) { 4006 + /* 4007 + * Don't schedule in any host events from PMU with 4008 + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 4009 + */ 4010 + if (is_guest_mediated_pmu_loaded() && 4011 + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && 4012 + !(msd->event_type & EVENT_GUEST)) 4013 + return 0; 4014 + 4015 + if (group_can_go_on(event, msd->can_add_hw)) { 4119 4016 if (!group_sched_in(event, ctx)) 4120 4017 list_add_tail(&event->active_list, get_event_list(event)); 4121 4018 } 4122 4019 4123 4020 if (event->state == PERF_EVENT_STATE_INACTIVE) { 4124 - *can_add_hw = 0; 4021 + msd->can_add_hw = 0; 4125 4022 if (event->attr.pinned) { 4126 4023 perf_cgroup_event_disable(event, ctx); 4127 4024 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); ··· 4153 4032 4154 4033 static void pmu_groups_sched_in(struct perf_event_context *ctx, 4155 4034 struct perf_event_groups *groups, 4156 - struct pmu *pmu) 4035 + struct pmu *pmu, 4036 + enum event_type_t event_type) 4157 4037 { 4158 - int can_add_hw = 1; 4038 + struct merge_sched_data msd = { 4039 + .can_add_hw = 1, 4040 + .event_type = event_type, 4041 + }; 4159 4042 visit_groups_merge(ctx, groups, smp_processor_id(), pmu, 4160 - merge_sched_in, &can_add_hw); 4043 + merge_sched_in, &msd); 4161 4044 } 4162 4045 4163 4046 static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, ··· 4170 4045 struct perf_event_context *ctx = pmu_ctx->ctx; 4171 4046 4172 4047 if (event_type & EVENT_PINNED) 4173 - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); 4048 + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); 4174 4049 if (event_type & EVENT_FLEXIBLE) 4175 - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); 4050 + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); 4176 4051 } 4177 4052 4178 4053 static void 4179 4054 ctx_sched_in(struct perf_event_context *ctx, struct pmu *pmu, enum event_type_t event_type) 4180 4055 { 4181 4056 struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 4057 + enum event_type_t active_type = event_type & ~EVENT_FLAGS; 4182 4058 struct perf_event_pmu_context *pmu_ctx; 4183 4059 int is_active = ctx->is_active; 4184 - bool cgroup = 
event_type & EVENT_CGROUP; 4185 - 4186 - event_type &= ~EVENT_CGROUP; 4187 4060 4188 4061 lockdep_assert_held(&ctx->lock); 4189 4062 ··· 4189 4066 return; 4190 4067 4191 4068 if (!(is_active & EVENT_TIME)) { 4069 + /* EVENT_TIME should be active while the guest runs */ 4070 + WARN_ON_ONCE(event_type & EVENT_GUEST); 4192 4071 /* start ctx time */ 4193 4072 __update_context_time(ctx, false); 4194 - perf_cgroup_set_timestamp(cpuctx); 4073 + perf_cgroup_set_timestamp(cpuctx, false); 4195 4074 /* 4196 4075 * CPU-release for the below ->is_active store, 4197 4076 * see __load_acquire() in perf_event_time_now() ··· 4201 4076 barrier(); 4202 4077 } 4203 4078 4204 - ctx->is_active |= (event_type | EVENT_TIME); 4079 + ctx->is_active |= active_type | EVENT_TIME; 4205 4080 if (ctx->task) { 4206 4081 if (!(is_active & EVENT_ALL)) 4207 4082 cpuctx->task_ctx = ctx; ··· 4209 4084 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 4210 4085 } 4211 4086 4212 - is_active ^= ctx->is_active; /* changed bits */ 4087 + if (event_type & EVENT_GUEST) { 4088 + /* 4089 + * Schedule in the required exclude_guest events of PMU 4090 + * with PERF_PMU_CAP_MEDIATED_VPMU. 4091 + */ 4092 + is_active = event_type & EVENT_ALL; 4093 + 4094 + /* 4095 + * Update ctx time to set the new start time for 4096 + * the exclude_guest events. 4097 + */ 4098 + update_context_time(ctx); 4099 + update_cgrp_time_from_cpuctx(cpuctx, false); 4100 + barrier(); 4101 + } else { 4102 + is_active ^= ctx->is_active; /* changed bits */ 4103 + } 4213 4104 4214 4105 /* 4215 4106 * First go through the list and put on any pinned groups 4216 4107 * in order to give them the best chance of going on. 4217 4108 */ 4218 4109 if (is_active & EVENT_PINNED) { 4219 - for_each_epc(pmu_ctx, ctx, pmu, cgroup) 4220 - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); 4110 + for_each_epc(pmu_ctx, ctx, pmu, event_type) 4111 + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); 4221 4112 } 4222 4113 4223 4114 /* Then walk through the lower prio flexible groups */ 4224 4115 if (is_active & EVENT_FLEXIBLE) { 4225 - for_each_epc(pmu_ctx, ctx, pmu, cgroup) 4226 - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); 4116 + for_each_epc(pmu_ctx, ctx, pmu, event_type) 4117 + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); 4227 4118 } 4228 4119 } 4229 4120 ··· 4255 4114 4256 4115 if (cpuctx->task_ctx == ctx) { 4257 4116 perf_ctx_lock(cpuctx, ctx); 4258 - perf_ctx_disable(ctx, false); 4117 + perf_ctx_disable(ctx, 0); 4259 4118 4260 4119 perf_ctx_sched_task_cb(ctx, task, true); 4261 4120 4262 - perf_ctx_enable(ctx, false); 4121 + perf_ctx_enable(ctx, 0); 4263 4122 perf_ctx_unlock(cpuctx, ctx); 4264 4123 goto rcu_unlock; 4265 4124 } ··· 4272 4131 if (!ctx->nr_events) 4273 4132 goto unlock; 4274 4133 4275 - perf_ctx_disable(ctx, false); 4134 + perf_ctx_disable(ctx, 0); 4276 4135 /* 4277 4136 * We want to keep the following priority order: 4278 4137 * cpu pinned (that don't need to move), task pinned, ··· 4282 4141 * events, no need to flip the cpuctx's events around. 
4283 4142 */ 4284 4143 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) { 4285 - perf_ctx_disable(&cpuctx->ctx, false); 4144 + perf_ctx_disable(&cpuctx->ctx, 0); 4286 4145 ctx_sched_out(&cpuctx->ctx, NULL, EVENT_FLEXIBLE); 4287 4146 } 4288 4147 4289 - perf_event_sched_in(cpuctx, ctx, NULL); 4148 + perf_event_sched_in(cpuctx, ctx, NULL, 0); 4290 4149 4291 4150 perf_ctx_sched_task_cb(cpuctx->task_ctx, task, true); 4292 4151 4293 4152 if (!RB_EMPTY_ROOT(&ctx->pinned_groups.tree)) 4294 - perf_ctx_enable(&cpuctx->ctx, false); 4153 + perf_ctx_enable(&cpuctx->ctx, 0); 4295 4154 4296 - perf_ctx_enable(ctx, false); 4155 + perf_ctx_enable(ctx, 0); 4297 4156 4298 4157 unlock: 4299 4158 perf_ctx_unlock(cpuctx, ctx); ··· 5735 5594 { 5736 5595 struct pmu *pmu = event->pmu; 5737 5596 5597 + security_perf_event_free(event); 5598 + 5738 5599 if (event->attach_state & PERF_ATTACH_CALLCHAIN) 5739 5600 put_callchain_buffers(); 5740 5601 ··· 5790 5647 call_rcu(&event->rcu_head, free_event_rcu); 5791 5648 } 5792 5649 5650 + static void mediated_pmu_unaccount_event(struct perf_event *event); 5651 + 5793 5652 DEFINE_FREE(__free_event, struct perf_event *, if (_T) __free_event(_T)) 5794 5653 5795 5654 /* vs perf_event_alloc() success */ ··· 5801 5656 irq_work_sync(&event->pending_disable_irq); 5802 5657 5803 5658 unaccount_event(event); 5804 - 5805 - security_perf_event_free(event); 5659 + mediated_pmu_unaccount_event(event); 5806 5660 5807 5661 if (event->rb) { 5808 5662 /* ··· 6324 6180 } 6325 6181 EXPORT_SYMBOL_GPL(perf_event_pause); 6326 6182 6183 + #ifdef CONFIG_PERF_GUEST_MEDIATED_PMU 6184 + static atomic_t nr_include_guest_events __read_mostly; 6185 + 6186 + static atomic_t nr_mediated_pmu_vms __read_mostly; 6187 + static DEFINE_MUTEX(perf_mediated_pmu_mutex); 6188 + 6189 + /* !exclude_guest event of PMU with PERF_PMU_CAP_MEDIATED_VPMU */ 6190 + static inline bool is_include_guest_event(struct perf_event *event) 6191 + { 6192 + if ((event->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU) && 6193 + !event->attr.exclude_guest) 6194 + return true; 6195 + 6196 + return false; 6197 + } 6198 + 6199 + static int mediated_pmu_account_event(struct perf_event *event) 6200 + { 6201 + if (!is_include_guest_event(event)) 6202 + return 0; 6203 + 6204 + if (atomic_inc_not_zero(&nr_include_guest_events)) 6205 + return 0; 6206 + 6207 + guard(mutex)(&perf_mediated_pmu_mutex); 6208 + if (atomic_read(&nr_mediated_pmu_vms)) 6209 + return -EOPNOTSUPP; 6210 + 6211 + atomic_inc(&nr_include_guest_events); 6212 + return 0; 6213 + } 6214 + 6215 + static void mediated_pmu_unaccount_event(struct perf_event *event) 6216 + { 6217 + if (!is_include_guest_event(event)) 6218 + return; 6219 + 6220 + if (WARN_ON_ONCE(!atomic_read(&nr_include_guest_events))) 6221 + return; 6222 + 6223 + atomic_dec(&nr_include_guest_events); 6224 + } 6225 + 6226 + /* 6227 + * Currently invoked at VM creation to 6228 + * - Check whether there are existing !exclude_guest events of PMU with 6229 + * PERF_PMU_CAP_MEDIATED_VPMU 6230 + * - Set nr_mediated_pmu_vms to prevent !exclude_guest event creation on 6231 + * PMUs with PERF_PMU_CAP_MEDIATED_VPMU 6232 + * 6233 + * No impact for the PMU without PERF_PMU_CAP_MEDIATED_VPMU. The perf 6234 + * still owns all the PMU resources. 
6235 + */ 6236 + int perf_create_mediated_pmu(void) 6237 + { 6238 + if (atomic_inc_not_zero(&nr_mediated_pmu_vms)) 6239 + return 0; 6240 + 6241 + guard(mutex)(&perf_mediated_pmu_mutex); 6242 + if (atomic_read(&nr_include_guest_events)) 6243 + return -EBUSY; 6244 + 6245 + atomic_inc(&nr_mediated_pmu_vms); 6246 + return 0; 6247 + } 6248 + EXPORT_SYMBOL_FOR_KVM(perf_create_mediated_pmu); 6249 + 6250 + void perf_release_mediated_pmu(void) 6251 + { 6252 + if (WARN_ON_ONCE(!atomic_read(&nr_mediated_pmu_vms))) 6253 + return; 6254 + 6255 + atomic_dec(&nr_mediated_pmu_vms); 6256 + } 6257 + EXPORT_SYMBOL_FOR_KVM(perf_release_mediated_pmu); 6258 + 6259 + /* When loading a guest's mediated PMU, schedule out all exclude_guest events. */ 6260 + void perf_load_guest_context(void) 6261 + { 6262 + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 6263 + 6264 + lockdep_assert_irqs_disabled(); 6265 + 6266 + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); 6267 + 6268 + if (WARN_ON_ONCE(__this_cpu_read(guest_ctx_loaded))) 6269 + return; 6270 + 6271 + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); 6272 + ctx_sched_out(&cpuctx->ctx, NULL, EVENT_GUEST); 6273 + if (cpuctx->task_ctx) { 6274 + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); 6275 + task_ctx_sched_out(cpuctx->task_ctx, NULL, EVENT_GUEST); 6276 + } 6277 + 6278 + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); 6279 + if (cpuctx->task_ctx) 6280 + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); 6281 + 6282 + __this_cpu_write(guest_ctx_loaded, true); 6283 + } 6284 + EXPORT_SYMBOL_GPL(perf_load_guest_context); 6285 + 6286 + void perf_put_guest_context(void) 6287 + { 6288 + struct perf_cpu_context *cpuctx = this_cpu_ptr(&perf_cpu_context); 6289 + 6290 + lockdep_assert_irqs_disabled(); 6291 + 6292 + guard(perf_ctx_lock)(cpuctx, cpuctx->task_ctx); 6293 + 6294 + if (WARN_ON_ONCE(!__this_cpu_read(guest_ctx_loaded))) 6295 + return; 6296 + 6297 + perf_ctx_disable(&cpuctx->ctx, EVENT_GUEST); 6298 + if (cpuctx->task_ctx) 6299 + perf_ctx_disable(cpuctx->task_ctx, EVENT_GUEST); 6300 + 6301 + perf_event_sched_in(cpuctx, cpuctx->task_ctx, NULL, EVENT_GUEST); 6302 + 6303 + if (cpuctx->task_ctx) 6304 + perf_ctx_enable(cpuctx->task_ctx, EVENT_GUEST); 6305 + perf_ctx_enable(&cpuctx->ctx, EVENT_GUEST); 6306 + 6307 + __this_cpu_write(guest_ctx_loaded, false); 6308 + } 6309 + EXPORT_SYMBOL_GPL(perf_put_guest_context); 6310 + #else 6311 + static int mediated_pmu_account_event(struct perf_event *event) { return 0; } 6312 + static void mediated_pmu_unaccount_event(struct perf_event *event) {} 6313 + #endif 6314 + 6327 6315 /* 6328 6316 * Holding the top-level event's child_mutex means that any 6329 6317 * descendant process that has inherited this event will block ··· 6824 6548 goto unlock; 6825 6549 6826 6550 /* 6827 - * compute total_time_enabled, total_time_running 6828 - * based on snapshot values taken when the event 6829 - * was last scheduled in. 6830 - * 6831 - * we cannot simply called update_context_time() 6832 - * because of locking issue as we can be called in 6833 - * NMI context 6834 - */ 6835 - calc_timer_values(event, &now, &enabled, &running); 6836 - 6837 - userpg = rb->user_page; 6838 - /* 6839 6551 * Disable preemption to guarantee consistent time stamps are stored to 6840 6552 * the user page. 6841 6553 */ 6842 6554 preempt_disable(); 6555 + 6556 + /* 6557 + * Compute total_time_enabled, total_time_running based on snapshot 6558 + * values taken when the event was last scheduled in. 
6559 + * 6560 + * We cannot simply call update_context_time() because doing so would 6561 + * lead to deadlock when called from NMI context. 6562 + */ 6563 + calc_timer_values(event, &now, &enabled, &running); 6564 + 6565 + userpg = rb->user_page; 6566 + 6843 6567 ++userpg->lock; 6844 6568 barrier(); 6845 6569 userpg->index = perf_event_index(event); ··· 7659 7383 DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state); 7660 7384 DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip); 7661 7385 DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr); 7386 + DEFINE_STATIC_CALL_RET0(__perf_guest_handle_mediated_pmi, *perf_guest_cbs->handle_mediated_pmi); 7662 7387 7663 7388 void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) 7664 7389 { ··· 7674 7397 if (cbs->handle_intel_pt_intr) 7675 7398 static_call_update(__perf_guest_handle_intel_pt_intr, 7676 7399 cbs->handle_intel_pt_intr); 7400 + 7401 + if (cbs->handle_mediated_pmi) 7402 + static_call_update(__perf_guest_handle_mediated_pmi, 7403 + cbs->handle_mediated_pmi); 7677 7404 } 7678 7405 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); 7679 7406 ··· 7689 7408 rcu_assign_pointer(perf_guest_cbs, NULL); 7690 7409 static_call_update(__perf_guest_state, (void *)&__static_call_return0); 7691 7410 static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0); 7692 - static_call_update(__perf_guest_handle_intel_pt_intr, 7693 - (void *)&__static_call_return0); 7411 + static_call_update(__perf_guest_handle_intel_pt_intr, (void *)&__static_call_return0); 7412 + static_call_update(__perf_guest_handle_mediated_pmi, (void *)&__static_call_return0); 7694 7413 synchronize_rcu(); 7695 7414 } 7696 7415 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); ··· 8150 7869 u64 read_format = event->attr.read_format; 8151 7870 8152 7871 /* 8153 - * compute total_time_enabled, total_time_running 8154 - * based on snapshot values taken when the event 8155 - * was last scheduled in. 7872 + * Compute total_time_enabled, total_time_running based on snapshot 7873 + * values taken when the event was last scheduled in. 8156 7874 * 8157 - * we cannot simply called update_context_time() 8158 - * because of locking issue as we are called in 8159 - * NMI context 7875 + * We cannot simply call update_context_time() because doing so would 7876 + * lead to deadlock when called from NMI context. 
8160 7877 */ 8161 7878 if (read_format & PERF_FORMAT_TOTAL_TIMES) 8162 7879 calc_timer_values(event, &now, &enabled, &running); ··· 12322 12043 static void task_clock_event_start(struct perf_event *event, int flags) 12323 12044 { 12324 12045 event->hw.state = 0; 12325 - local64_set(&event->hw.prev_count, event->ctx->time); 12046 + local64_set(&event->hw.prev_count, event->ctx->time.time); 12326 12047 perf_swevent_start_hrtimer(event); 12327 12048 } 12328 12049 ··· 12331 12052 event->hw.state = PERF_HES_STOPPED; 12332 12053 perf_swevent_cancel_hrtimer(event); 12333 12054 if (flags & PERF_EF_UPDATE) 12334 - task_clock_event_update(event, event->ctx->time); 12055 + task_clock_event_update(event, event->ctx->time.time); 12335 12056 } 12336 12057 12337 12058 static int task_clock_event_add(struct perf_event *event, int flags) ··· 12351 12072 static void task_clock_event_read(struct perf_event *event) 12352 12073 { 12353 12074 u64 now = perf_clock(); 12354 - u64 delta = now - event->ctx->timestamp; 12355 - u64 time = event->ctx->time + delta; 12075 + u64 delta = now - event->ctx->time.stamp; 12076 + u64 time = event->ctx->time.time + delta; 12356 12077 12357 12078 task_clock_event_update(event, time); 12358 12079 } ··· 13431 13152 } 13432 13153 13433 13154 err = security_perf_event_alloc(event); 13155 + if (err) 13156 + return ERR_PTR(err); 13157 + 13158 + err = mediated_pmu_account_event(event); 13434 13159 if (err) 13435 13160 return ERR_PTR(err); 13436 13161
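The nr_include_guest_events / nr_mediated_pmu_vms pair makes !exclude_guest events on mediated-capable PMUs and mediated-PMU VMs mutually exclusive: each side first tries a fast atomic_inc_not_zero() when its count is already non-zero, and only takes the mutex to check the other side when going from zero to one. A condensed user-space model of that pattern, using C11 atomics and a pthread mutex purely for illustration:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <errno.h>

static atomic_int nr_include_guest_events;
static atomic_int nr_mediated_pmu_vms;
static pthread_mutex_t excl_mutex = PTHREAD_MUTEX_INITIALIZER;

/* "open a !exclude_guest event" side */
static int account_include_guest_event(void)
{
	int old = atomic_load(&nr_include_guest_events);

	/* fast path, emulating atomic_inc_not_zero(): piggy-back on an existing ref */
	while (old && !atomic_compare_exchange_weak(&nr_include_guest_events, &old, old + 1))
		;
	if (old)
		return 0;

	pthread_mutex_lock(&excl_mutex);
	if (atomic_load(&nr_mediated_pmu_vms)) {
		pthread_mutex_unlock(&excl_mutex);
		return -EOPNOTSUPP;
	}
	atomic_fetch_add(&nr_include_guest_events, 1);
	pthread_mutex_unlock(&excl_mutex);
	return 0;
}

/* "create a mediated-PMU VM" side, the mirror image */
static int create_mediated_pmu(void)
{
	int old = atomic_load(&nr_mediated_pmu_vms);

	while (old && !atomic_compare_exchange_weak(&nr_mediated_pmu_vms, &old, old + 1))
		;
	if (old)
		return 0;

	pthread_mutex_lock(&excl_mutex);
	if (atomic_load(&nr_include_guest_events)) {
		pthread_mutex_unlock(&excl_mutex);
		return -EBUSY;
	}
	atomic_fetch_add(&nr_mediated_pmu_vms, 1);
	pthread_mutex_unlock(&excl_mutex);
	return 0;
}

int main(void)
{
	printf("open !exclude_guest event: %d\n", account_include_guest_event());
	printf("create mediated-PMU VM:    %d\n", create_mediated_pmu()); /* -EBUSY */
	return 0;
}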
+4 -8
kernel/unwind/user.c
··· 31 31 { 32 32 unsigned long cfa, fp, ra; 33 33 34 + /* Get the Canonical Frame Address (CFA) */ 34 35 if (frame->use_fp) { 35 36 if (state->fp < state->sp) 36 37 return -EINVAL; ··· 39 38 } else { 40 39 cfa = state->sp; 41 40 } 42 - 43 - /* Get the Canonical Frame Address (CFA) */ 44 41 cfa += frame->cfa_off; 45 42 46 - /* stack going in wrong direction? */ 43 + /* Make sure that stack is not going in wrong direction */ 47 44 if (cfa <= state->sp) 48 45 return -EINVAL; 49 46 ··· 49 50 if (cfa & (state->ws - 1)) 50 51 return -EINVAL; 51 52 52 - /* Find the Return Address (RA) */ 53 + /* Get the Return Address (RA) */ 53 54 if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) 54 55 return -EINVAL; 55 56 57 + /* Get the Frame Pointer (FP) */ 56 58 if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) 57 59 return -EINVAL; 58 60 ··· 67 67 68 68 static int unwind_user_next_fp(struct unwind_user_state *state) 69 69 { 70 - #ifdef CONFIG_HAVE_UNWIND_USER_FP 71 70 struct pt_regs *regs = task_pt_regs(current); 72 71 73 72 if (state->topmost && unwind_user_at_function_start(regs)) { ··· 80 81 ARCH_INIT_USER_FP_FRAME(state->ws) 81 82 }; 82 83 return unwind_user_next_common(state, &fp_frame); 83 - #else 84 - return -EINVAL; 85 - #endif 86 84 } 87 85 88 86 static int unwind_user_next(struct unwind_user_state *state)
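The unwind step above is driven entirely by the frame description: the CFA is the frame (or stack) pointer plus cfa_off, and the return address and previous frame pointer are read at fixed offsets from that CFA. As an assumed example, the usual x86-64 frame-pointer layout would be cfa_off = 16, ra_off = -8, fp_off = -16; the sketch below walks one such frame on a fake stack held in an array:

#include <stdint.h>
#include <stdio.h>

/*
 * One frame-pointer unwind step on a fake stack. Word i lives at "address"
 * 8 * i. Assumed x86-64 layout after "push %rbp; mov %rsp, %rbp":
 * CFA = FP + 16, return address at CFA - 8, caller's FP at CFA - 16.
 */
int main(void)
{
	uint64_t stack[8] = { 0 };
	uint64_t fp = 8 * 2;		/* current frame pointer */

	stack[2] = 8 * 5;		/* saved caller frame pointer */
	stack[3] = 0x401234;		/* return address into the caller */

	uint64_t cfa = fp + 16;
	uint64_t ra = stack[(cfa - 8) / 8];
	uint64_t prev_fp = stack[(cfa - 16) / 8];

	/* a sane step: cfa moved toward higher addresses and stayed word aligned */
	printf("cfa=%#llx ra=%#llx next fp=%#llx\n",
	       (unsigned long long)cfa, (unsigned long long)ra,
	       (unsigned long long)prev_fp);
	return 0;
}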
+2 -1
tools/perf/trace/beauty/arch/x86/include/asm/irq_vectors.h
··· 77 77 */ 78 78 #define IRQ_WORK_VECTOR 0xf6 79 79 80 - /* 0xf5 - unused, was UV_BAU_MESSAGE */ 80 + #define PERF_GUEST_MEDIATED_PMI_VECTOR 0xf5 81 + 81 82 #define DEFERRED_ERROR_VECTOR 0xf4 82 83 83 84 /* Vector on which hypervisor callbacks will be delivered */
+5 -1
virt/kvm/kvm_main.c
··· 6482 6482 .state = kvm_guest_state, 6483 6483 .get_ip = kvm_guest_get_ip, 6484 6484 .handle_intel_pt_intr = NULL, 6485 + .handle_mediated_pmi = NULL, 6485 6486 }; 6486 6487 6487 - void kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void)) 6488 + void __kvm_register_perf_callbacks(unsigned int (*pt_intr_handler)(void), 6489 + void (*mediated_pmi_handler)(void)) 6488 6490 { 6489 6491 kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler; 6492 + kvm_guest_cbs.handle_mediated_pmi = mediated_pmi_handler; 6493 + 6490 6494 perf_register_guest_info_callbacks(&kvm_guest_cbs); 6491 6495 } 6492 6496 void kvm_unregister_perf_callbacks(void)