Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/i915: Support replaying GPU hangs with captured context image

When debugging GPU hangs, Mesa developers are finding it useful to replay
the captured error state against the simulator. But due to various simulator
limitations which prevent replicating all hangs, one step further is being
able to replay against a real GPU.

This is almost doable today, with the missing part being the ability to
upload the captured context image into the driver state prior to executing
the uploaded hanging batch and all the buffers.

To enable this last part we add a new context parameter called
I915_CONTEXT_PARAM_CONTEXT_IMAGE. It follows the existing SSEU
configuration pattern of being able to select which context to apply
against, paired with the actual image and its size.

Since this is adding a new concept of debug-only uapi, we hide it behind
a new kconfig option and also require activation with a module parameter.
Together with a warning banner printed at driver load, all those combined
should be sufficient to guard against inadvertently enabling the feature.

In terms of implementation we allow the legacy context set param to be
used since that removes the need to record the per context data in the
proto context, while still allowing flexibility of specifying context
images for any context.

Mesa MR using the uapi can be seen at:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27594

v2:
* Fix whitespace alignment as per checkpatch.
* Added warning on userspace misuse.
* Rebase for extracting ce->default_state shadowing.

v3:
* Rebase for I915_CONTEXT_PARAM_LOW_LATENCY.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Cc: Carlos Santa <carlos.santa@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Tested-by: Carlos Santa <carlos.santa@intel.com>
Signed-off-by: Tvrtko Ursulin <tursulin@igalia.com>
Signed-off-by: Tvrtko Ursulin <tursulin@ursulin.net>
Link: https://patchwork.freedesktop.org/patch/msgid/20240514145939.87427-2-tursulin@igalia.com

authored by

Tvrtko Ursulin and committed by
Tvrtko Ursulin
0f1bb41b e1eb97c2

+193 -3
+17
drivers/gpu/drm/i915/Kconfig.debug
··· 16 16 17 17 If in doubt, say "N". 18 18 19 + config DRM_I915_REPLAY_GPU_HANGS_API 20 + bool "Enable GPU hang replay userspace API" 21 + depends on DRM_I915 22 + depends on EXPERT 23 + default n 24 + help 25 + Choose this option if you want to enable special and unstable 26 + userspace API used for replaying GPU hangs on a running system. 27 + 28 + This API is intended to be used by userspace graphics stack developers 29 + and provides no stability guarantees. 30 + 31 + The API needs to be activated at boot time using the 32 + enable_debug_only_api module parameter. 33 + 34 + If in doubt, say "N". 35 + 19 36 config DRM_I915_DEBUG 20 37 bool "Enable additional driver debugging" 21 38 depends on DRM_I915
+113
drivers/gpu/drm/i915/gem/i915_gem_context.c
··· 78 78 #include "gt/intel_engine_user.h" 79 79 #include "gt/intel_gpu_commands.h" 80 80 #include "gt/intel_ring.h" 81 + #include "gt/shmem_utils.h" 81 82 82 83 #include "pxp/intel_pxp.h" 83 84 ··· 958 957 case I915_CONTEXT_PARAM_NO_ZEROMAP: 959 958 case I915_CONTEXT_PARAM_BAN_PERIOD: 960 959 case I915_CONTEXT_PARAM_RINGSIZE: 960 + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: 961 961 default: 962 962 ret = -EINVAL; 963 963 break; ··· 2106 2104 return 0; 2107 2105 } 2108 2106 2107 + static int set_context_image(struct i915_gem_context *ctx, 2108 + struct drm_i915_gem_context_param *args) 2109 + { 2110 + struct i915_gem_context_param_context_image user; 2111 + struct intel_context *ce; 2112 + struct file *shmem_state; 2113 + unsigned long lookup; 2114 + void *state; 2115 + int ret = 0; 2116 + 2117 + if (!IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)) 2118 + return -EINVAL; 2119 + 2120 + if (!ctx->i915->params.enable_debug_only_api) 2121 + return -EINVAL; 2122 + 2123 + if (args->size < sizeof(user)) 2124 + return -EINVAL; 2125 + 2126 + if (copy_from_user(&user, u64_to_user_ptr(args->value), sizeof(user))) 2127 + return -EFAULT; 2128 + 2129 + if (user.mbz) 2130 + return -EINVAL; 2131 + 2132 + if (user.flags & ~(I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX)) 2133 + return -EINVAL; 2134 + 2135 + lookup = 0; 2136 + if (user.flags & I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX) 2137 + lookup |= LOOKUP_USER_INDEX; 2138 + 2139 + ce = lookup_user_engine(ctx, lookup, &user.engine); 2140 + if (IS_ERR(ce)) 2141 + return PTR_ERR(ce); 2142 + 2143 + if (user.size < ce->engine->context_size) { 2144 + ret = -EINVAL; 2145 + goto out_ce; 2146 + } 2147 + 2148 + if (drm_WARN_ON_ONCE(&ctx->i915->drm, 2149 + test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) { 2150 + /* 2151 + * This is racy but for a debug only API, if userspace is keen 2152 + * to create and configure contexts, while simultaneously using 2153 + * them from a second thread, let them suffer by potentially not 2154 + * executing with the context 
image they just raced to apply. 2155 + */ 2156 + ret = -EBUSY; 2157 + goto out_ce; 2158 + } 2159 + 2160 + state = kmalloc(ce->engine->context_size, GFP_KERNEL); 2161 + if (!state) { 2162 + ret = -ENOMEM; 2163 + goto out_ce; 2164 + } 2165 + 2166 + if (copy_from_user(state, u64_to_user_ptr(user.image), 2167 + ce->engine->context_size)) { 2168 + ret = -EFAULT; 2169 + goto out_state; 2170 + } 2171 + 2172 + shmem_state = shmem_create_from_data(ce->engine->name, 2173 + state, ce->engine->context_size); 2174 + if (IS_ERR(shmem_state)) { 2175 + ret = PTR_ERR(shmem_state); 2176 + goto out_state; 2177 + } 2178 + 2179 + if (intel_context_set_own_state(ce)) { 2180 + ret = -EBUSY; 2181 + fput(shmem_state); 2182 + goto out_state; 2183 + } 2184 + 2185 + ce->default_state = shmem_state; 2186 + 2187 + args->size = sizeof(user); 2188 + 2189 + out_state: 2190 + kfree(state); 2191 + out_ce: 2192 + intel_context_put(ce); 2193 + return ret; 2194 + } 2195 + 2109 2196 static int ctx_setparam(struct drm_i915_file_private *fpriv, 2110 2197 struct i915_gem_context *ctx, 2111 2198 struct drm_i915_gem_context_param *args) ··· 2245 2154 2246 2155 case I915_CONTEXT_PARAM_PERSISTENCE: 2247 2156 ret = set_persistence(ctx, args); 2157 + break; 2158 + 2159 + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: 2160 + ret = set_context_image(ctx, args); 2248 2161 break; 2249 2162 2250 2163 case I915_CONTEXT_PARAM_PROTECTED_CONTENT: ··· 2595 2500 case I915_CONTEXT_PARAM_BAN_PERIOD: 2596 2501 case I915_CONTEXT_PARAM_ENGINES: 2597 2502 case I915_CONTEXT_PARAM_RINGSIZE: 2503 + case I915_CONTEXT_PARAM_CONTEXT_IMAGE: 2598 2504 default: 2599 2505 ret = -EINVAL; 2600 2506 break; ··· 2707 2611 slab_luts = KMEM_CACHE(i915_lut_handle, 0); 2708 2612 if (!slab_luts) 2709 2613 return -ENOMEM; 2614 + 2615 + if (IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API)) { 2616 + pr_notice("**************************************************************\n"); 2617 + pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2618 
+ pr_notice("** **\n"); 2619 + if (i915_modparams.enable_debug_only_api) 2620 + pr_notice("** i915.enable_debug_only_api is intended to be set **\n"); 2621 + else 2622 + pr_notice("** CONFIG_DRM_I915_REPLAY_GPU_HANGS_API builds are intended **\n"); 2623 + pr_notice("** for specific userspace graphics stack developers only! **\n"); 2624 + pr_notice("** **\n"); 2625 + pr_notice("** If you are seeing this message please report this to the **\n"); 2626 + pr_notice("** provider of your kernel build. **\n"); 2627 + pr_notice("** **\n"); 2628 + pr_notice("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); 2629 + pr_notice("**************************************************************\n"); 2630 + } 2710 2631 2711 2632 return 0; 2712 2633 }
+2
drivers/gpu/drm/i915/gt/intel_context.c
··· 27 27 struct intel_context *ce = container_of(rcu, typeof(*ce), rcu); 28 28 29 29 trace_intel_context_free(ce); 30 + if (intel_context_has_own_state(ce)) 31 + fput(ce->default_state); 30 32 kmem_cache_free(slab_ce, ce); 31 33 } 32 34
+22
drivers/gpu/drm/i915/gt/intel_context.h
··· 375 375 clear_bit(CONTEXT_NOPREEMPT, &ce->flags); 376 376 } 377 377 378 + #if IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) 379 + static inline bool intel_context_has_own_state(const struct intel_context *ce) 380 + { 381 + return test_bit(CONTEXT_OWN_STATE, &ce->flags); 382 + } 383 + 384 + static inline bool intel_context_set_own_state(struct intel_context *ce) 385 + { 386 + return test_and_set_bit(CONTEXT_OWN_STATE, &ce->flags); 387 + } 388 + #else 389 + static inline bool intel_context_has_own_state(const struct intel_context *ce) 390 + { 391 + return false; 392 + } 393 + 394 + static inline bool intel_context_set_own_state(struct intel_context *ce) 395 + { 396 + return true; 397 + } 398 + #endif 399 + 378 400 u64 intel_context_get_total_runtime_ns(struct intel_context *ce); 379 401 u64 intel_context_get_avg_runtime_ns(struct intel_context *ce); 380 402
+1
drivers/gpu/drm/i915/gt/intel_context_types.h
··· 133 133 #define CONTEXT_IS_PARKING 12 134 134 #define CONTEXT_EXITING 13 135 135 #define CONTEXT_LOW_LATENCY 14 136 + #define CONTEXT_OWN_STATE 15 136 137 137 138 struct { 138 139 u64 timeout_us;
+2 -1
drivers/gpu/drm/i915/gt/intel_lrc.c
··· 1130 1130 1131 1131 GEM_BUG_ON(ce->state); 1132 1132 1133 - ce->default_state = engine->default_state; 1133 + if (!intel_context_has_own_state(ce)) 1134 + ce->default_state = engine->default_state; 1134 1135 1135 1136 vma = __lrc_alloc_state(ce, engine); 1136 1137 if (IS_ERR(vma))
+2 -1
drivers/gpu/drm/i915/gt/intel_ring_submission.c
··· 569 569 { 570 570 struct intel_engine_cs *engine = ce->engine; 571 571 572 - ce->default_state = engine->default_state; 572 + if (!intel_context_has_own_state(ce)) 573 + ce->default_state = engine->default_state; 573 574 574 575 /* One ringbuffer to rule them all */ 575 576 GEM_BUG_ON(!engine->legacy.ring);
+5
drivers/gpu/drm/i915/i915_params.c
··· 131 131 i915_param_named_unsafe(lmem_bar_size, uint, 0400, 132 132 "Set the lmem bar size(in MiB)."); 133 133 134 + #if IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) 135 + i915_param_named(enable_debug_only_api, bool, 0400, 136 + "Enable support for unstable debug only userspace API. (default:false)"); 137 + #endif 138 + 134 139 static void _param_print_bool(struct drm_printer *p, const char *name, 135 140 bool val) 136 141 {
+2 -1
drivers/gpu/drm/i915/i915_params.h
··· 63 63 /* leave bools at the end to not create holes */ \ 64 64 param(bool, enable_hangcheck, true, 0600) \ 65 65 param(bool, error_capture, true, IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR) ? 0600 : 0) \ 66 - param(bool, enable_gvt, false, IS_ENABLED(CONFIG_DRM_I915_GVT) ? 0400 : 0) 66 + param(bool, enable_gvt, false, IS_ENABLED(CONFIG_DRM_I915_GVT) ? 0400 : 0) \ 67 + param(bool, enable_debug_only_api, false, IS_ENABLED(CONFIG_DRM_I915_REPLAY_GPU_HANGS_API) ? 0400 : 0) 67 68 68 69 #define MEMBER(T, member, ...) T member; 69 70 struct i915_params {
+27
include/uapi/drm/i915_drm.h
··· 2163 2163 * supports this per context flag. 2164 2164 */ 2165 2165 #define I915_CONTEXT_PARAM_LOW_LATENCY 0xe 2166 + 2167 + /* 2168 + * I915_CONTEXT_PARAM_CONTEXT_IMAGE: 2169 + * 2170 + * Allows userspace to provide own context images. 2171 + * 2172 + * Note that this is a debug API not available on production kernel builds. 2173 + */ 2174 + #define I915_CONTEXT_PARAM_CONTEXT_IMAGE 0xf 2166 2175 /* Must be kept compact -- no holes and well documented */ 2167 2176 2168 2177 /** @value: Context parameter value to be set or queried */ ··· 2572 2563 __u64 extensions; \ 2573 2564 struct i915_engine_class_instance engines[N__]; \ 2574 2565 } __attribute__((packed)) name__ 2566 + 2567 + struct i915_gem_context_param_context_image { 2568 + /** @engine: Engine class & instance to be configured. */ 2569 + struct i915_engine_class_instance engine; 2570 + 2571 + /** @flags: One of the supported flags or zero. */ 2572 + __u32 flags; 2573 + #define I915_CONTEXT_IMAGE_FLAG_ENGINE_INDEX (1u << 0) 2574 + 2575 + /** @size: Size of the image blob pointed to by @image. */ 2576 + __u32 size; 2577 + 2578 + /** @mbz: Must be zero. */ 2579 + __u32 mbz; 2580 + 2581 + /** @image: Userspace memory containing the context image. */ 2582 + __u64 image; 2583 + } __attribute__((packed)); 2575 2584 2576 2585 /** 2577 2586 * struct drm_i915_gem_context_create_ext_setparam - Context parameter