Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/xe: Convert GT stats to per-cpu counters

Current GT statistics use atomic64_t counters. Every atomic update incurs
a global cache-coherency penalty, as the counter's cache line bounces
between the CPUs that increment it.

Transition to dynamically allocated per-CPU counters using alloc_percpu().
This allows stats to be incremented via this_cpu_add(), which compiles to
a single non-locking instruction. This approach keeps hot-path updates
local to the executing CPU, avoiding expensive cross-core cache
invalidation traffic.

Use for_each_possible_cpu() during aggregation and clear operations to
ensure data consistency across CPU hotplug events.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Stuart Summers <stuart.summers@intel.com>
Link: https://patch.msgid.link/20260217200552.596718-1-matthew.brost@intel.com

+82 -16
+5
drivers/gpu/drm/xe/xe_gt.c
··· 33 33 #include "xe_gt_printk.h" 34 34 #include "xe_gt_sriov_pf.h" 35 35 #include "xe_gt_sriov_vf.h" 36 + #include "xe_gt_stats.h" 36 37 #include "xe_gt_sysfs.h" 37 38 #include "xe_gt_topology.h" 38 39 #include "xe_guc_exec_queue_types.h" ··· 453 452 xe_gt_mmio_init(gt); 454 453 455 454 err = xe_uc_init_noalloc(&gt->uc); 455 + if (err) 456 + return err; 457 + 458 + err = xe_gt_stats_init(gt); 456 459 if (err) 457 460 return err; 458 461
+51 -12
drivers/gpu/drm/xe/xe_gt_stats.c
··· 3 3 * Copyright © 2024 Intel Corporation 4 4 */ 5 5 6 - #include <linux/atomic.h> 7 - 6 + #include <drm/drm_managed.h> 8 7 #include <drm/drm_print.h> 9 8 9 + #include "xe_device.h" 10 10 #include "xe_gt_stats.h" 11 - #include "xe_gt_types.h" 11 + 12 + static void xe_gt_stats_fini(struct drm_device *drm, void *arg) 13 + { 14 + struct xe_gt *gt = arg; 15 + 16 + free_percpu(gt->stats); 17 + } 18 + 19 + /** 20 + * xe_gt_stats_init() - Initialize GT statistics 21 + * @gt: GT structure 22 + * 23 + * Allocate per-CPU GT statistics. Using per-CPU stats allows increments 24 + * to occur without cross-CPU atomics. 25 + * 26 + * Return: 0 on success, -ENOMEM on failure. 27 + */ 28 + int xe_gt_stats_init(struct xe_gt *gt) 29 + { 30 + gt->stats = alloc_percpu(struct xe_gt_stats); 31 + if (!gt->stats) 32 + return -ENOMEM; 33 + 34 + return drmm_add_action_or_reset(&gt_to_xe(gt)->drm, xe_gt_stats_fini, 35 + gt); 36 + } 12 37 13 38 /** 14 39 * xe_gt_stats_incr - Increments the specified stats counter ··· 48 23 if (id >= __XE_GT_STATS_NUM_IDS) 49 24 return; 50 25 51 - atomic64_add(incr, &gt->stats.counters[id]); 26 + this_cpu_add(gt->stats->counters[id], incr); 52 27 } 53 28 54 29 #define DEF_STAT_STR(ID, name) [XE_GT_STATS_ID_##ID] = name ··· 119 94 { 120 95 enum xe_gt_stats_id id; 121 96 122 - for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id) 123 - drm_printf(p, "%s: %lld\n", stat_description[id], 124 - atomic64_read(&gt->stats.counters[id])); 97 + for (id = 0; id < __XE_GT_STATS_NUM_IDS; ++id) { 98 + u64 total = 0; 99 + int cpu; 100 + 101 + for_each_possible_cpu(cpu) { 102 + struct xe_gt_stats *s = per_cpu_ptr(gt->stats, cpu); 103 + 104 + total += s->counters[id]; 105 + } 106 + 107 + drm_printf(p, "%s: %lld\n", stat_description[id], total); 108 + } 125 109 126 110 return 0; 127 111 } 128 112 129 113 /** 130 - * xe_gt_stats_clear - Clear the GT stats 114 + * xe_gt_stats_clear() - Clear the GT stats 131 115 * @gt: GT structure 132 116 * 133 - * This clear (zeros) all the available 
GT stats. 117 + * Clear (zero) all available GT stats. Note that if the stats are being 118 + * updated while this function is running, the results may be unpredictable. 119 + * Intended to be called on an idle GPU. 134 120 */ 135 121 void xe_gt_stats_clear(struct xe_gt *gt) 136 122 { 137 - int id; 123 + int cpu; 138 124 139 - for (id = 0; id < ARRAY_SIZE(gt->stats.counters); ++id) 140 - atomic64_set(&gt->stats.counters[id], 0); 125 + for_each_possible_cpu(cpu) { 126 + struct xe_gt_stats *s = per_cpu_ptr(gt->stats, cpu); 127 + 128 + memset(s, 0, sizeof(*s)); 129 + } 141 130 }
+6
drivers/gpu/drm/xe/xe_gt_stats.h
··· 14 14 struct drm_printer; 15 15 16 16 #ifdef CONFIG_DEBUG_FS 17 + int xe_gt_stats_init(struct xe_gt *gt); 17 18 int xe_gt_stats_print_info(struct xe_gt *gt, struct drm_printer *p); 18 19 void xe_gt_stats_clear(struct xe_gt *gt); 19 20 void xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, int incr); 20 21 #else 22 + static inline int xe_gt_stats_init(struct xe_gt *gt) 23 + { 24 + return 0; 25 + } 26 + 21 27 static inline void 22 28 xe_gt_stats_incr(struct xe_gt *gt, const enum xe_gt_stats_id id, 23 29 int incr)
+19
drivers/gpu/drm/xe/xe_gt_stats_types.h
··· 6 6 #ifndef _XE_GT_STATS_TYPES_H_ 7 7 #define _XE_GT_STATS_TYPES_H_ 8 8 9 + #include <linux/types.h> 10 + 9 11 enum xe_gt_stats_id { 10 12 XE_GT_STATS_ID_SVM_PAGEFAULT_COUNT, 11 13 XE_GT_STATS_ID_TLB_INVAL, ··· 59 57 /* must be the last entry */ 60 58 __XE_GT_STATS_NUM_IDS, 61 59 }; 60 + 61 + /** 62 + * struct xe_gt_stats - Per-CPU GT statistics counters 63 + * @counters: Array of 64-bit counters indexed by &enum xe_gt_stats_id 64 + * 65 + * This structure is used for high-frequency, per-CPU statistics collection 66 + * in the Xe driver. By using a per-CPU allocation and ensuring the structure 67 + * is cache-line aligned, we avoid the performance-heavy atomics and cache 68 + * coherency traffic. 69 + * 70 + * Updates to these counters should be performed using the this_cpu_add() 71 + * macro to ensure they are atomic with respect to local interrupts and 72 + * preemption-safe without the overhead of explicit locking. 73 + */ 74 + struct xe_gt_stats { 75 + u64 counters[__XE_GT_STATS_NUM_IDS]; 76 + } ____cacheline_aligned; 62 77 63 78 #endif
+1 -4
drivers/gpu/drm/xe/xe_gt_types.h
··· 158 158 159 159 #if IS_ENABLED(CONFIG_DEBUG_FS) 160 160 /** @stats: GT stats */ 161 - struct { 162 - /** @stats.counters: counters for various GT stats */ 163 - atomic64_t counters[__XE_GT_STATS_NUM_IDS]; 164 - } stats; 161 + struct xe_gt_stats __percpu *stats; 165 162 #endif 166 163 167 164 /**