perf: Add a EVENT_GUEST flag · tjh.dev/kernel@4593b4b

+187 -53

2 changed files

expand all

include

linux

perf_event.h

kernel

events

core.c

include/linux/perf_event.h

··· 1045 1045 struct perf_time_ctx time; 1046 1046 1047 1047 /* 1048 + * Context clock, runs when in the guest mode. 1049 + */ 1050 + struct perf_time_ctx timeguest; 1051 + 1052 + /* 1048 1053 * These fields let us detect when two contexts have both 1049 1054 * been cloned (inherited) from a common ancestor. 1050 1055 */ ··· 1181 1176 */ 1182 1177 struct perf_cgroup_info { 1183 1178 struct perf_time_ctx time; 1179 + struct perf_time_ctx timeguest; 1184 1180 int active; 1185 1181 }; 1186 1182

+181 -53

kernel/events/core.c

··· 165 165 /* see ctx_resched() for details */ 166 166 EVENT_CPU = 0x10, 167 167 EVENT_CGROUP = 0x20, 168 - EVENT_FLAGS = EVENT_CGROUP, 168 + 169 + /* 170 + * EVENT_GUEST is set when scheduling in/out events between the host 171 + * and a guest with a mediated vPMU. Among other things, EVENT_GUEST 172 + * is used: 173 + * 174 + * - In for_each_epc() to skip PMUs that don't support events in a 175 + * MEDIATED_VPMU guest, i.e. don't need to be context switched. 176 + * - To indicate the start/end point of the events in a guest. Guest 177 + * running time is deducted for host-only (exclude_guest) events. 178 + */ 179 + EVENT_GUEST = 0x40, 180 + EVENT_FLAGS = EVENT_CGROUP | EVENT_GUEST, 169 181 /* compound helpers */ 170 182 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, 171 183 EVENT_TIME_FROZEN = EVENT_TIME | EVENT_FROZEN, ··· 469 457 static cpumask_var_t perf_online_pkg_mask; 470 458 static cpumask_var_t perf_online_sys_mask; 471 459 static struct kmem_cache *perf_event_cache; 460 + 461 + static __always_inline bool is_guest_mediated_pmu_loaded(void) 462 + { 463 + return false; 464 + } 472 465 473 466 /* 474 467 * perf event paranoia level: ··· 801 784 { 802 785 if ((event_type & EVENT_CGROUP) && !pmu_ctx->nr_cgroups) 803 786 return true; 787 + if ((event_type & EVENT_GUEST) && 788 + !(pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU)) 789 + return true; 804 790 return false; 805 791 } 806 792 ··· 854 834 WRITE_ONCE(time->offset, time->time - time->stamp); 855 835 } 856 836 837 + static_assert(offsetof(struct perf_event_context, timeguest) - 838 + offsetof(struct perf_event_context, time) == 839 + sizeof(struct perf_time_ctx)); 840 + 841 + #define T_TOTAL 0 842 + #define T_GUEST 1 843 + 844 + static inline u64 __perf_event_time_ctx(struct perf_event *event, 845 + struct perf_time_ctx *times) 846 + { 847 + u64 time = times[T_TOTAL].time; 848 + 849 + if (event->attr.exclude_guest) 850 + time -= times[T_GUEST].time; 851 + 852 + return time; 853 + } 854 + 855 + static inline u64 __perf_event_time_ctx_now(struct perf_event *event, 856 + struct perf_time_ctx *times, 857 + u64 now) 858 + { 859 + if (is_guest_mediated_pmu_loaded() && event->attr.exclude_guest) { 860 + /* 861 + * (now + times[total].offset) - (now + times[guest].offset) := 862 + * times[total].offset - times[guest].offset 863 + */ 864 + return READ_ONCE(times[T_TOTAL].offset) - READ_ONCE(times[T_GUEST].offset); 865 + } 866 + 867 + return now + READ_ONCE(times[T_TOTAL].offset); 868 + } 869 + 857 870 #ifdef CONFIG_CGROUP_PERF 858 871 859 872 static inline bool ··· 923 870 return event->cgrp != NULL; 924 871 } 925 872 873 + static_assert(offsetof(struct perf_cgroup_info, timeguest) - 874 + offsetof(struct perf_cgroup_info, time) == 875 + sizeof(struct perf_time_ctx)); 876 + 926 877 static inline u64 perf_cgroup_event_time(struct perf_event *event) 927 878 { 928 879 struct perf_cgroup_info *t; 929 880 930 881 t = per_cpu_ptr(event->cgrp->info, event->cpu); 931 - return t->time.time; 882 + return __perf_event_time_ctx(event, &t->time); 932 883 } 933 884 934 885 static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) ··· 941 884 942 885 t = per_cpu_ptr(event->cgrp->info, event->cpu); 943 886 if (!__load_acquire(&t->active)) 944 - return t->time.time; 945 - now += READ_ONCE(t->time.offset); 946 - return now; 887 + return __perf_event_time_ctx(event, &t->time); 888 + 889 + return __perf_event_time_ctx_now(event, &t->time, now); 890 + } 891 + 892 + static inline void __update_cgrp_guest_time(struct perf_cgroup_info *info, u64 now, bool adv) 893 + { 894 + update_perf_time_ctx(&info->timeguest, now, adv); 895 + } 896 + 897 + static inline void update_cgrp_time(struct perf_cgroup_info *info, u64 now) 898 + { 899 + update_perf_time_ctx(&info->time, now, true); 900 + if (is_guest_mediated_pmu_loaded()) 901 + __update_cgrp_guest_time(info, now, true); 947 902 } 948 903 949 904 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) ··· 971 902 cgrp = container_of(css, struct perf_cgroup, css); 972 903 info = this_cpu_ptr(cgrp->info); 973 904 974 - update_perf_time_ctx(&info->time, now, true); 905 + update_cgrp_time(info, now); 975 906 if (final) 976 907 __store_release(&info->active, 0); 977 908 } ··· 994 925 * Do not update time when cgroup is not active 995 926 */ 996 927 if (info->active) 997 - update_perf_time_ctx(&info->time, perf_clock(), true); 928 + update_cgrp_time(info, perf_clock()); 998 929 } 999 930 1000 931 static inline void 1001 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 932 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1002 933 { 1003 934 struct perf_event_context *ctx = &cpuctx->ctx; 1004 935 struct perf_cgroup *cgrp = cpuctx->cgrp; ··· 1018 949 for (css = &cgrp->css; css; css = css->parent) { 1019 950 cgrp = container_of(css, struct perf_cgroup, css); 1020 951 info = this_cpu_ptr(cgrp->info); 1021 - update_perf_time_ctx(&info->time, ctx->time.stamp, false); 1022 - __store_release(&info->active, 1); 952 + if (guest) { 953 + __update_cgrp_guest_time(info, ctx->time.stamp, false); 954 + } else { 955 + update_perf_time_ctx(&info->time, ctx->time.stamp, false); 956 + __store_release(&info->active, 1); 957 + } 1023 958 } 1024 959 } 1025 960 ··· 1227 1154 } 1228 1155 1229 1156 static inline void 1230 - perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx) 1157 + perf_cgroup_set_timestamp(struct perf_cpu_context *cpuctx, bool guest) 1231 1158 { 1232 1159 } 1233 1160 ··· 1639 1566 */ 1640 1567 static void __update_context_time(struct perf_event_context *ctx, bool adv) 1641 1568 { 1642 - u64 now = perf_clock(); 1643 - 1644 1569 lockdep_assert_held(&ctx->lock); 1645 1570 1646 - update_perf_time_ctx(&ctx->time, now, adv); 1571 + update_perf_time_ctx(&ctx->time, perf_clock(), adv); 1572 + } 1573 + 1574 + static void __update_context_guest_time(struct perf_event_context *ctx, bool adv) 1575 + { 1576 + lockdep_assert_held(&ctx->lock); 1577 + 1578 + /* must be called after __update_context_time(); */ 1579 + update_perf_time_ctx(&ctx->timeguest, ctx->time.stamp, adv); 1647 1580 } 1648 1581 1649 1582 static void update_context_time(struct perf_event_context *ctx) 1650 1583 { 1651 1584 __update_context_time(ctx, true); 1585 + if (is_guest_mediated_pmu_loaded()) 1586 + __update_context_guest_time(ctx, true); 1652 1587 } 1653 1588 1654 1589 static u64 perf_event_time(struct perf_event *event) ··· 1669 1588 if (is_cgroup_event(event)) 1670 1589 return perf_cgroup_event_time(event); 1671 1590 1672 - return ctx->time.time; 1591 + return __perf_event_time_ctx(event, &ctx->time); 1673 1592 } 1674 1593 1675 1594 static u64 perf_event_time_now(struct perf_event *event, u64 now) ··· 1683 1602 return perf_cgroup_event_time_now(event, now); 1684 1603 1685 1604 if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) 1686 - return ctx->time.time; 1605 + return __perf_event_time_ctx(event, &ctx->time); 1687 1606 1688 - now += READ_ONCE(ctx->time.offset); 1689 - return now; 1607 + return __perf_event_time_ctx_now(event, &ctx->time, now); 1690 1608 } 1691 1609 1692 1610 static enum event_type_t get_event_type(struct perf_event *event) ··· 2505 2425 } 2506 2426 2507 2427 static inline void 2508 - __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, bool final) 2428 + __ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, 2429 + bool final, enum event_type_t event_type) 2509 2430 { 2510 2431 if (ctx->is_active & EVENT_TIME) { 2511 2432 if (ctx->is_active & EVENT_FROZEN) 2512 2433 return; 2434 + 2513 2435 update_context_time(ctx); 2514 - update_cgrp_time_from_cpuctx(cpuctx, final); 2436 + /* vPMU should not stop time */ 2437 + update_cgrp_time_from_cpuctx(cpuctx, !(event_type & EVENT_GUEST) && final); 2515 2438 } 2516 2439 } 2517 2440 2518 2441 static inline void 2519 2442 ctx_time_update(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) 2520 2443 { 2521 - __ctx_time_update(cpuctx, ctx, false); 2444 + __ctx_time_update(cpuctx, ctx, false, 0); 2522 2445 } 2523 2446 2524 2447 /* ··· 3593 3510 * 3594 3511 * would only update time for the pinned events. 3595 3512 */ 3596 - __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx); 3513 + __ctx_time_update(cpuctx, ctx, ctx == &cpuctx->ctx, event_type); 3597 3514 3598 3515 /* 3599 3516 * CPU-release for the below ->is_active store, ··· 3619 3536 cpuctx->task_ctx = NULL; 3620 3537 } 3621 3538 3622 - is_active ^= ctx->is_active; /* changed bits */ 3539 + if (event_type & EVENT_GUEST) { 3540 + /* 3541 + * Schedule out all exclude_guest events of PMU 3542 + * with PERF_PMU_CAP_MEDIATED_VPMU. 3543 + */ 3544 + is_active = EVENT_ALL; 3545 + __update_context_guest_time(ctx, false); 3546 + perf_cgroup_set_timestamp(cpuctx, true); 3547 + barrier(); 3548 + } else { 3549 + is_active ^= ctx->is_active; /* changed bits */ 3550 + } 3623 3551 3624 3552 for_each_epc(pmu_ctx, ctx, pmu, event_type) 3625 3553 __pmu_ctx_sched_out(pmu_ctx, is_active); ··· 4089 3995 event_update_userpage(event); 4090 3996 } 4091 3997 3998 + struct merge_sched_data { 3999 + int can_add_hw; 4000 + enum event_type_t event_type; 4001 + }; 4002 + 4092 4003 static int merge_sched_in(struct perf_event *event, void *data) 4093 4004 { 4094 4005 struct perf_event_context *ctx = event->ctx; 4095 - int *can_add_hw = data; 4006 + struct merge_sched_data *msd = data; 4096 4007 4097 4008 if (event->state <= PERF_EVENT_STATE_OFF) 4098 4009 return 0; ··· 4105 4006 if (!event_filter_match(event)) 4106 4007 return 0; 4107 4008 4108 - if (group_can_go_on(event, *can_add_hw)) { 4009 + /* 4010 + * Don't schedule in any host events from PMU with 4011 + * PERF_PMU_CAP_MEDIATED_VPMU, while a guest is running. 4012 + */ 4013 + if (is_guest_mediated_pmu_loaded() && 4014 + event->pmu_ctx->pmu->capabilities & PERF_PMU_CAP_MEDIATED_VPMU && 4015 + !(msd->event_type & EVENT_GUEST)) 4016 + return 0; 4017 + 4018 + if (group_can_go_on(event, msd->can_add_hw)) { 4109 4019 if (!group_sched_in(event, ctx)) 4110 4020 list_add_tail(&event->active_list, get_event_list(event)); 4111 4021 } 4112 4022 4113 4023 if (event->state == PERF_EVENT_STATE_INACTIVE) { 4114 - *can_add_hw = 0; 4024 + msd->can_add_hw = 0; 4115 4025 if (event->attr.pinned) { 4116 4026 perf_cgroup_event_disable(event, ctx); 4117 4027 perf_event_set_state(event, PERF_EVENT_STATE_ERROR); ··· 4143 4035 4144 4036 static void pmu_groups_sched_in(struct perf_event_context *ctx, 4145 4037 struct perf_event_groups *groups, 4146 - struct pmu *pmu) 4038 + struct pmu *pmu, 4039 + enum event_type_t event_type) 4147 4040 { 4148 - int can_add_hw = 1; 4041 + struct merge_sched_data msd = { 4042 + .can_add_hw = 1, 4043 + .event_type = event_type, 4044 + }; 4149 4045 visit_groups_merge(ctx, groups, smp_processor_id(), pmu, 4150 - merge_sched_in, &can_add_hw); 4046 + merge_sched_in, &msd); 4151 4047 } 4152 4048 4153 4049 static void __pmu_ctx_sched_in(struct perf_event_pmu_context *pmu_ctx, ··· 4160 4048 struct perf_event_context *ctx = pmu_ctx->ctx; 4161 4049 4162 4050 if (event_type & EVENT_PINNED) 4163 - pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu); 4051 + pmu_groups_sched_in(ctx, &ctx->pinned_groups, pmu_ctx->pmu, event_type); 4164 4052 if (event_type & EVENT_FLEXIBLE) 4165 - pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu); 4053 + pmu_groups_sched_in(ctx, &ctx->flexible_groups, pmu_ctx->pmu, event_type); 4166 4054 } 4167 4055 4168 4056 static void ··· 4179 4067 return; 4180 4068 4181 4069 if (!(is_active & EVENT_TIME)) { 4070 + /* EVENT_TIME should be active while the guest runs */ 4071 + WARN_ON_ONCE(event_type & EVENT_GUEST); 4182 4072 /* start ctx time */ 4183 4073 __update_context_time(ctx, false); 4184 - perf_cgroup_set_timestamp(cpuctx); 4074 + perf_cgroup_set_timestamp(cpuctx, false); 4185 4075 /* 4186 4076 * CPU-release for the below ->is_active store, 4187 4077 * see __load_acquire() in perf_event_time_now() ··· 4199 4085 WARN_ON_ONCE(cpuctx->task_ctx != ctx); 4200 4086 } 4201 4087 4202 - is_active ^= ctx->is_active; /* changed bits */ 4088 + if (event_type & EVENT_GUEST) { 4089 + /* 4090 + * Schedule in the required exclude_guest events of PMU 4091 + * with PERF_PMU_CAP_MEDIATED_VPMU. 4092 + */ 4093 + is_active = event_type & EVENT_ALL; 4094 + 4095 + /* 4096 + * Update ctx time to set the new start time for 4097 + * the exclude_guest events. 4098 + */ 4099 + update_context_time(ctx); 4100 + update_cgrp_time_from_cpuctx(cpuctx, false); 4101 + barrier(); 4102 + } else { 4103 + is_active ^= ctx->is_active; /* changed bits */ 4104 + } 4203 4105 4204 4106 /* 4205 4107 * First go through the list and put on any pinned groups ··· 4223 4093 */ 4224 4094 if (is_active & EVENT_PINNED) { 4225 4095 for_each_epc(pmu_ctx, ctx, pmu, event_type) 4226 - __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED); 4096 + __pmu_ctx_sched_in(pmu_ctx, EVENT_PINNED | (event_type & EVENT_GUEST)); 4227 4097 } 4228 4098 4229 4099 /* Then walk through the lower prio flexible groups */ 4230 4100 if (is_active & EVENT_FLEXIBLE) { 4231 4101 for_each_epc(pmu_ctx, ctx, pmu, event_type) 4232 - __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE); 4102 + __pmu_ctx_sched_in(pmu_ctx, EVENT_FLEXIBLE | (event_type & EVENT_GUEST)); 4233 4103 } 4234 4104 } 4235 4105 ··· 6757 6627 goto unlock; 6758 6628 6759 6629 /* 6760 - * compute total_time_enabled, total_time_running 6761 - * based on snapshot values taken when the event 6762 - * was last scheduled in. 6763 - * 6764 - * we cannot simply called update_context_time() 6765 - * because of locking issue as we can be called in 6766 - * NMI context 6767 - */ 6768 - calc_timer_values(event, &now, &enabled, &running); 6769 - 6770 - userpg = rb->user_page; 6771 - /* 6772 6630 * Disable preemption to guarantee consistent time stamps are stored to 6773 6631 * the user page. 6774 6632 */ 6775 6633 preempt_disable(); 6634 + 6635 + /* 6636 + * Compute total_time_enabled, total_time_running based on snapshot 6637 + * values taken when the event was last scheduled in. 6638 + * 6639 + * We cannot simply call update_context_time() because doing so would 6640 + * lead to deadlock when called from NMI context. 6641 + */ 6642 + calc_timer_values(event, &now, &enabled, &running); 6643 + 6644 + userpg = rb->user_page; 6645 + 6776 6646 ++userpg->lock; 6777 6647 barrier(); 6778 6648 userpg->index = perf_event_index(event); ··· 8069 7939 u64 read_format = event->attr.read_format; 8070 7940 8071 7941 /* 8072 - * compute total_time_enabled, total_time_running 8073 - * based on snapshot values taken when the event 8074 - * was last scheduled in. 7942 + * Compute total_time_enabled, total_time_running based on snapshot 7943 + * values taken when the event was last scheduled in. 8075 7944 * 8076 - * we cannot simply called update_context_time() 8077 - * because of locking issue as we are called in 8078 - * NMI context 7945 + * We cannot simply call update_context_time() because doing so would 7946 + * lead to deadlock when called from NMI context. 8079 7947 */ 8080 7948 if (read_format & PERF_FORMAT_TOTAL_TIMES) 8081 7949 calc_timer_values(event, &now, &enabled, &running);

Configure Feed

Configure Feed