Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

perf: Add an EVENT_GUEST flag

Current perf doesn't explicitly schedule out all exclude_guest events
while the guest is running. This is not a problem with the current
emulated vPMU, because perf owns all the PMU counters. It can mask the
counter which is assigned to an exclude_guest event when a guest is
running (Intel way), or set the corresponding HOSTONLY bit in eventsel
(AMD way). The counter doesn't count when a guest is running.

However, neither way works with the newly introduced mediated vPMU.
A guest owns all the PMU counters when it's running. The host should not
mask any counters. The counter may be used by the guest. The eventsel
may be overwritten.

Perf should explicitly schedule out all exclude_guest events to release
the PMU resources when entering a guest, and resume the counting when
exiting the guest.

It's possible that an exclude_guest event is created while a guest is
running. The new event should not be scheduled in either.

The ctx time is shared among different PMUs, so it cannot be stopped
when a guest is running: it is still required to calculate the time for
events from other PMUs, e.g., uncore events. Add timeguest to track the
guest run time. For an exclude_guest event, the elapsed time equals
the ctx time minus the guest time.
Cgroups have dedicated times. Use the same method to deduct the guest
time from the cgroup time as well.

[sean: massage comments]
Co-developed-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Xudong Hao <xudong.hao@intel.com>
Link: https://patch.msgid.link/20251206001720.468579-7-seanjc@google.com

authored by

Kan Liang and committed by
Peter Zijlstra
4593b4b6 f5c7de8f

+187 -53
+6
include/linux/perf_event.h
··· 1045 1045 struct perf_time_ctx time; 1046 1046 1047 1047 /* 1048 + * Context clock, runs when in the guest mode. 1049 + */ 1050 + struct perf_time_ctx timeguest; 1051 + 1052 + /* 1048 1053 * These fields let us detect when two contexts have both 1049 1054 * been cloned (inherited) from a common ancestor. 1050 1055 */ ··· 1181 1176 */ 1182 1177 struct perf_cgroup_info { 1183 1178 struct perf_time_ctx time; 1179 + struct perf_time_ctx timeguest; 1184 1180 int active; 1185 1181 }; 1186 1182
+181 -53
kernel/events/core.c
··· 165 165 /* see ctx_resched() for details */ 166 166 EVENT_CPU = 0x10, 167 167 EVENT_CGROUP = 0x20, 168 - EVENT_FLAGS = EVENT_CGROUP, 168 + 169 + /* 170 + * EVENT_GUEST is set when scheduling in/out events between the host 171 + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST 172 + * is used: 173 + * 174 + * - In for_each_epc() to skip PMUs that don't support events in a 175 + * MEDIATED_VPMU guest, i.e. don't need to be context switched. 176 + * - To indicate the start/end point of the events in a guest. Guest 177 + * running time is deducted for host-only (exclude_guest) events. 178 + */ 179 + EVENT_GUEST = 0x40, 180 + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, 169 181 /* compound helpers */ 170 182 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 171 183 EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, ··· 469 457 static cpumask_var_t perf_online_pkg_mask; 470 458 static cpumask_var_t perf_online_sys_mask; 471 459 static struct kmem_cache *perf_event_cache; 460 + 461 + static __always_inline bool is_guest_mediated_pmu_loaded(void) 462 + { 463 + return false; 464 + } 472 465 473 466 /* 474 467 * perf event paranoia level: ··· 801 784 { 802 785 if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) 803 786 return true; 787 + if ((event_type & EVENT_GUEST) && 788 + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) 789 + return true; 804 790 return false; 805 791 } 806 792 ··· 854 834 WRITE_ONCE(time->offset, time->time - time->stamp); 855 835 } 856 836 837 + static_assert(offsetof(struct perf_event_context, timeguest) - 838 + offsetof(struct perf_event_context, time) == 839 + sizeof(struct perf_time_ctx)); 840 + 841 + #define T_TOTAL 0 842 + #define T_GUEST 1 843 + 844 + static inline u64 __perf_event_time_ctx(struct perf_event *event, 845 + struct perf_time_ctx *times) 846 + { 847 + u64 time = times[T_TOTAL].time; 848 + 849 + if (event->attr.exclude_guest) 850 + time -= times[T_GUEST].time; 851 + 852 + return time; 853 + } 854 + 855 + 
static inline u64 __perf_event_time_ctx_now(struct perf_event *event, 856 + struct perf_time_ctx *times, 857 + u64 now) 858 + { 859 + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { 860 + /* 861 + * (now + times[total].offset) - (now + times[guest].offset) := 862 + * times[total].offset - times[guest].offset 863 + */ 864 + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); 865 + } 866 + 867 + return now + READ_ONCE(times[T_TOTAL].offset); 868 + } 869 + 857 870 #ifdef CONFIG_CGROUP_PERF 858 871 859 872 static inline bool ··· 923 870 return event->cgrp != NULL; 924 871 } 925 872 873 + static_assert(offsetof(struct perf_cgroup_info, timeguest) - 874 + offsetof(struct perf_cgroup_info, time) == 875 + sizeof(struct perf_time_ctx)); 876 + 926 877 static inline u64 perf_cgroup_event_time(struct perf_event *event) 927 878 { 928 879 struct perf_cgroup_info *t; 929 880 930 881 t = per_cpu_ptr(event->cgrp->info, event->cpu); 931 - return t->time.time; 882 + return __perf_event_time_ctx(event, &t->time); 932 883 } 933 884 934 885 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) ··· 941 884 942 885 t = per_cpu_ptr(event->cgrp->info, event->cpu); 943 886 if (!__load_acquire(&t->active)) 944 - return t->time.time; 945 - now += READ_ONCE(t->time.offset); 946 - return now; 887 + return __perf_event_time_ctx(event, &t->time); 888 + 889 + return __perf_event_time_ctx_now(event, &t->time, now); 890 + } 891 + 892 + static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) 893 + { 894 + update_perf_time_ctx(&info->timeguest, now, adv); 895 + } 896 + 897 + static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) 898 + { 899 + update_perf_time_ctx(&info->time, now, true); 900 + if (is_guest_mediated_pmu_loaded()) 901 + __update_cgrp_guest_time(info, now, true); 947 902 } 948 903 949 904 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context 
*cpuctx, bool final) ··· 971 902 cgrp = container_of(css, struct perf_cgroup, css); 972 903 info = this_cpu_ptr(cgrp->info); 973 904 974 - update_perf_time_ctx(&info->time, now, true); 905 + update_cgrp_time(info, now); 975 906 if (final) 976 907 __store_release(&info->active, 0); 977 908 } ··· 994 925 * Do not update time when cgroup is not active 995 926 */ 996 927 if (info->active) 997 - update_perf_time_ctx(&info->time, perf_clock(), true); 928 + update_cgrp_time(info, perf_clock()); 998 929 } 999 930 1000 931 static inline void 1001 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 932 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1002 933 { 1003 934 struct perf_event_context *ctx = &cpuctx->ctx; 1004 935 struct perf_cgroup *cgrp = cpuctx->cgrp; ··· 1018 949 for (css = &cgrp->css; css; css = css->parent) { 1019 950 cgrp = container_of(css, struct perf_cgroup, css); 1020 951 info = this_cpu_ptr(cgrp->info); 1021 - update_perf_time_ctx(&info->time, ctx->time.stamp, false); 1022 - __store_release(&info->active, 1); 952 + if (guest) { 953 + __update_cgrp_guest_time(info, ctx->time.stamp, false); 954 + } else { 955 + update_perf_time_ctx(&info->time, ctx->time.stamp, false); 956 + __store_release(&info->active, 1); 957 + } 1023 958 } 1024 959 } 1025 960 ··· 1227 1154 } 1228 1155 1229 1156 static inline void 1230 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 1157 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1231 1158 { 1232 1159 } 1233 1160 ··· 1639 1566 */ 1640 1567 static void __update_context_time(struct perf_event_context *ctx, bool adv) 1641 1568 { 1642 - u64 now = perf_clock(); 1643 - 1644 1569 lockdep_assert_held(&ctx->lock); 1645 1570 1646 - update_perf_time_ctx(&ctx->time, now, adv); 1571 + update_perf_time_ctx(&ctx->time, perf_clock(), adv); 1572 + } 1573 + 1574 + static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) 1575 + { 1576 + 
lockdep_assert_held(&ctx->lock); 1577 + 1578 + /* must be called after __update_context_time(); */ 1579 + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); 1647 1580 } 1648 1581 1649 1582 static void update_context_time(struct perf_event_context *ctx) 1650 1583 { 1651 1584 __update_context_time(ctx, true); 1585 + if (is_guest_mediated_pmu_loaded()) 1586 + __update_context_guest_time(ctx, true); 1652 1587 } 1653 1588 1654 1589 static u64 perf_event_time(struct perf_event *event) ··· 1669 1588 if (is_cgroup_event(event)) 1670 1589 return perf_cgroup_event_time(event); 1671 1590 1672 - return ctx->time.time; 1591 + return __perf_event_time_ctx(event, &ctx->time); 1673 1592 } 1674 1593 1675 1594 static u64 perf_event_time_now(struct perf_event *event, u64 now) ··· 1683 1602 return perf_cgroup_event_time_now(event, now); 1684 1603 1685 1604 if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) 1686 - return ctx->time.time; 1605 + return __perf_event_time_ctx(event, &ctx->time); 1687 1606 1688 - now += READ_ONCE(ctx->time.offset); 1689 - return now; 1607 + return __perf_event_time_ctx_now(event, &ctx->time, now); 1690 1608 } 1691 1609 1692 1610 static enum event_type_t get_event_type(struct perf_event *event) ··· 2505 2425 } 2506 2426 2507 2427 static inline void 2508 - __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) 2428 + __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, 2429 + bool final, enum event_type_t event_type) 2509 2430 { 2510 2431 if (ctx->is_active & EVENT_TIME) { 2511 2432 if (ctx->is_active & EVENT_FROZEN) 2512 2433 return; 2434 + 2513 2435 update_context_time(ctx); 2514 - update_cgrp_time_from_cpuctx(cpuctx, final); 2436 + /* vPMU should not stop time */ 2437 + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); 2515 2438 } 2516 2439 } 2517 2440 2518 2441 static inline void 2519 2442 ctx_time_update(struct perf_cpu_context *cpuctx, struct 
perf_event_context *ctx) 2520 2443 { 2521 - __ctx_time_update(cpuctx, ctx, false); 2444 + __ctx_time_update(cpuctx, ctx, false, 0); 2522 2445 } 2523 2446 2524 2447 /* ··· 3593 3510 * 3594 3511 * would only update time for the pinned events. 3595 3512 */ 3596 - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); 3513 + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); 3597 3514 3598 3515 /* 3599 3516 * CPU-release for the below ->is_active store, ··· 3619 3536 cpuctx->task_ctx = NULL; 3620 3537 } 3621 3538 3622 - is_active ^= ctx->is_active; /* changed bits */ 3539 + if (event_type & EVENT_GUEST) { 3540 + /* 3541 + * Schedule out all exclude_guest events of PMU 3542 + * with PERF_PMU_CAP_MEDIATED_VPMU. 3543 + */ 3544 + is_active = EVENT_ALL; 3545 + __update_context_guest_time(ctx, false); 3546 + perf_cgroup_set_timestamp(cpuctx, true); 3547 + barrier(); 3548 + } else { 3549 + is_active ^= ctx->is_active; /* changed bits */ 3550 + } 3623 3551 3624 3552 for_each_epc(pmu_ctx, ctx, pmu, event_type) 3625 3553 __pmu_ctx_sched_out(pmu_ctx, is_active); ··· 4089 3995 event_update_userpage(event); 4090 3996 } 4091 3997 3998 + struct merge_sched_data { 3999 + int can_add_hw; 4000 + enum event_type_t event_type; 4001 + }; 4002 + 4092 4003 static int merge_sched_in(struct perf_event *event, void *data) 4093 4004 { 4094 4005 struct perf_event_context *ctx = event->ctx; 4095 - int *can_add_hw = data; 4006 + struct merge_sched_data *msd = data; 4096 4007 4097 4008 if (event->state <= PERF_EVENT_STATE_OFF) 4098 4009 return 0; ··· 4105 4006 if (!event_filter_match(event)) 4106 4007 return 0; 4107 4008 4108 - if (group_can_go_on(event, *can_add_hw)) { 4009 + /* 4010 + * Don't schedule in any host events from PMU with 4011 + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 
4012 + */ 4013 + if (is_guest_mediated_pmu_loaded() && 4014 + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && 4015 + !(msd->event_type & EVENT_GUEST)) 4016 + return 0; 4017 + 4018 + if (group_can_go_on(event, msd->can_add_hw)) { 4109 4019 if (!group_sched_in(event, ctx)) 4110 4020 list_add_tail(&event->active_list, get_event_list(event)); 4111 4021 } 4112 4022 4113 4023 if (event->state == PERF_EVENT_STATE_INACTIVE) { 4114 - *can_add_hw = 0; 4024 + msd->can_add_hw = 0; 4115 4025 if (event->attr.pinned) { 4116 4026 perf_cgroup_event_disable(event, ctx); 4117 4027 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); ··· 4143 4035 4144 4036 static void pmu_groups_sched_in(struct perf_event_context *ctx, 4145 4037 struct perf_event_groups *groups, 4146 - struct pmu *pmu) 4038 + struct pmu *pmu, 4039 + enum event_type_t event_type) 4147 4040 { 4148 - int can_add_hw = 1; 4041 + struct merge_sched_data msd = { 4042 + .can_add_hw = 1, 4043 + .event_type = event_type, 4044 + }; 4149 4045 visit_groups_merge(ctx, groups, smp_processor_id(), pmu, 4150 - merge_sched_in, &can_add_hw); 4046 + merge_sched_in, &msd); 4151 4047 } 4152 4048 4153 4049 static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, ··· 4160 4048 struct perf_event_context *ctx = pmu_ctx->ctx; 4161 4049 4162 4050 if (event_type & EVENT_PINNED) 4163 - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); 4051 + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); 4164 4052 if (event_type & EVENT_FLEXIBLE) 4165 - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); 4053 + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); 4166 4054 } 4167 4055 4168 4056 static void ··· 4179 4067 return; 4180 4068 4181 4069 if (!(is_active & EVENT_TIME)) { 4070 + /* EVENT_TIME should be active while the guest runs */ 4071 + WARN_ON_ONCE(event_type & EVENT_GUEST); 4182 4072 /* start ctx time */ 4183 4073 __update_context_time(ctx, false); 
4184 - perf_cgroup_set_timestamp(cpuctx); 4074 + perf_cgroup_set_timestamp(cpuctx, false); 4185 4075 /* 4186 4076 * CPU-release for the below ->is_active store, 4187 4077 * see __load_acquire() in perf_event_time_now() ··· 4199 4085 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 4200 4086 } 4201 4087 4202 - is_active ^= ctx->is_active; /* changed bits */ 4088 + if (event_type & EVENT_GUEST) { 4089 + /* 4090 + * Schedule in the required exclude_guest events of PMU 4091 + * with PERF_PMU_CAP_MEDIATED_VPMU. 4092 + */ 4093 + is_active = event_type & EVENT_ALL; 4094 + 4095 + /* 4096 + * Update ctx time to set the new start time for 4097 + * the exclude_guest events. 4098 + */ 4099 + update_context_time(ctx); 4100 + update_cgrp_time_from_cpuctx(cpuctx, false); 4101 + barrier(); 4102 + } else { 4103 + is_active ^= ctx->is_active; /* changed bits */ 4104 + } 4203 4105 4204 4106 /* 4205 4107 * First go through the list and put on any pinned groups ··· 4223 4093 */ 4224 4094 if (is_active & EVENT_PINNED) { 4225 4095 for_each_epc(pmu_ctx, ctx, pmu, event_type) 4226 - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); 4096 + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); 4227 4097 } 4228 4098 4229 4099 /* Then walk through the lower prio flexible groups */ 4230 4100 if (is_active & EVENT_FLEXIBLE) { 4231 4101 for_each_epc(pmu_ctx, ctx, pmu, event_type) 4232 - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); 4102 + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); 4233 4103 } 4234 4104 } 4235 4105 ··· 6757 6627 goto unlock; 6758 6628 6759 6629 /* 6760 - * compute total_time_enabled, total_time_running 6761 - * based on snapshot values taken when the event 6762 - * was last scheduled in. 
6763 - * 6764 - * we cannot simply called update_context_time() 6765 - * because of locking issue as we can be called in 6766 - * NMI context 6767 - */ 6768 - calc_timer_values(event, &now, &enabled, &running); 6769 - 6770 - userpg = rb->user_page; 6771 - /* 6772 6630 * Disable preemption to guarantee consistent time stamps are stored to 6773 6631 * the user page. 6774 6632 */ 6775 6633 preempt_disable(); 6634 + 6635 + /* 6636 + * Compute total_time_enabled, total_time_running based on snapshot 6637 + * values taken when the event was last scheduled in. 6638 + * 6639 + * We cannot simply call update_context_time() because doing so would 6640 + * lead to deadlock when called from NMI context. 6641 + */ 6642 + calc_timer_values(event, &now, &enabled, &running); 6643 + 6644 + userpg = rb->user_page; 6645 + 6776 6646 ++userpg->lock; 6777 6647 barrier(); 6778 6648 userpg->index = perf_event_index(event); ··· 8069 7939 u64 read_format = event->attr.read_format; 8070 7940 8071 7941 /* 8072 - * compute total_time_enabled, total_time_running 8073 - * based on snapshot values taken when the event 8074 - * was last scheduled in. 7942 + * Compute total_time_enabled, total_time_running based on snapshot 7943 + * values taken when the event was last scheduled in. 8075 7944 * 8076 - * we cannot simply called update_context_time() 8077 - * because of locking issue as we are called in 8078 - * NMI context 7945 + * We cannot simply call update_context_time() because doing so would 7946 + * lead to deadlock when called from NMI context. 8079 7947 */ 8080 7948 if (read_format & PERF_FORMAT_TOTAL_TIMES) 8081 7949 calc_timer_values(event, &now, &enabled, &running);