Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amd/amdgpu: Add independent hang detect work for user queue fence

In error scenarios (e.g., malformed commands), user queue fences may never
be signaled, causing processes to wait indefinitely. To address this while
preserving the requirement of infinite fence waits, implement an independent
timeout detection mechanism:

1. Initialize a hang detect work when creating a user queue (one-time setup)
2. Start the work with a queue-type-specific timeout (gfx/compute/sdma) when
the last fence is created via amdgpu_userq_signal_ioctl (per-fence timing)
3. Trigger queue reset logic if the timer expires before the fence is signaled

v2: make timeout per queue type (adev->gfx_timeout vs adev->compute_timeout vs adev->sdma_timeout) to be consistent with kernel queues. (Alex)
v3: The timeout detection must be independent of the fence, i.e. you don't wait for a timeout on the fence
but rather have the timeout start as soon as the fence is initialized. (Christian)
v4: replace the timer with the `hang_detect_work` delayed work.

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Jesse Zhang <jesse.zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Jesse.Zhang and committed by
Alex Deucher
fc3336be 5aaa5058

+73 -1
+69 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
··· 148 148 return r; 149 149 } 150 150 151 + static void amdgpu_userq_hang_detect_work(struct work_struct *work) 152 + { 153 + struct amdgpu_usermode_queue *queue = container_of(work, 154 + struct amdgpu_usermode_queue, 155 + hang_detect_work.work); 156 + struct dma_fence *fence; 157 + struct amdgpu_userq_mgr *uq_mgr; 158 + 159 + if (!queue || !queue->userq_mgr) 160 + return; 161 + 162 + uq_mgr = queue->userq_mgr; 163 + fence = READ_ONCE(queue->hang_detect_fence); 164 + /* Fence already signaled – no action needed */ 165 + if (!fence || dma_fence_is_signaled(fence)) 166 + return; 167 + 168 + mutex_lock(&uq_mgr->userq_mutex); 169 + amdgpu_userq_detect_and_reset_queues(uq_mgr); 170 + mutex_unlock(&uq_mgr->userq_mutex); 171 + } 172 + 173 + /* 174 + * Start hang detection for a user queue fence. A delayed work will be scheduled 175 + * to check if the fence is still pending after the timeout period. 176 + */ 177 + void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue) 178 + { 179 + struct amdgpu_device *adev; 180 + unsigned long timeout_ms; 181 + 182 + if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev) 183 + return; 184 + 185 + adev = queue->userq_mgr->adev; 186 + /* Determine timeout based on queue type */ 187 + switch (queue->queue_type) { 188 + case AMDGPU_RING_TYPE_GFX: 189 + timeout_ms = adev->gfx_timeout; 190 + break; 191 + case AMDGPU_RING_TYPE_COMPUTE: 192 + timeout_ms = adev->compute_timeout; 193 + break; 194 + case AMDGPU_RING_TYPE_SDMA: 195 + timeout_ms = adev->sdma_timeout; 196 + break; 197 + default: 198 + timeout_ms = adev->gfx_timeout; 199 + break; 200 + } 201 + 202 + /* Store the fence to monitor and schedule hang detection */ 203 + WRITE_ONCE(queue->hang_detect_fence, queue->last_fence); 204 + schedule_delayed_work(&queue->hang_detect_work, 205 + msecs_to_jiffies(timeout_ms)); 206 + } 207 + 208 + static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue) 209 + { 210 + 
INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work); 211 + queue->hang_detect_fence = NULL; 212 + } 213 + 151 214 static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue, 152 215 struct amdgpu_bo_va_mapping *va_map, u64 addr) 153 216 { ··· 635 572 636 573 cancel_delayed_work_sync(&uq_mgr->resume_work); 637 574 mutex_lock(&uq_mgr->userq_mutex); 638 - 639 575 queue = amdgpu_userq_find(uq_mgr, queue_id); 640 576 if (!queue) { 641 577 drm_dbg_driver(adev_to_drm(uq_mgr->adev), "Invalid queue id to destroy\n"); ··· 642 580 return -EINVAL; 643 581 } 644 582 amdgpu_userq_wait_for_last_fence(queue); 583 + /* Cancel any pending hang detection work and cleanup */ 584 + if (queue->hang_detect_fence) { 585 + cancel_delayed_work_sync(&queue->hang_detect_work); 586 + queue->hang_detect_fence = NULL; 587 + } 645 588 r = amdgpu_bo_reserve(queue->db_obj.obj, true); 646 589 if (!r) { 647 590 amdgpu_bo_unpin(queue->db_obj.obj); ··· 886 819 queue->debugfs_queue = debugfs_create_dir(queue_name, filp->debugfs_client); 887 820 debugfs_create_file("mqd_info", 0444, queue->debugfs_queue, queue, &amdgpu_mqd_info_fops); 888 821 #endif 822 + amdgpu_userq_init_hang_detect_work(queue); 889 823 kfree(queue_name); 890 824 891 825 args->out.queue_id = qid;
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
··· 72 72 u32 xcp_id; 73 73 int priority; 74 74 struct dentry *debugfs_queue; 75 + struct delayed_work hang_detect_work; 76 + struct dma_fence *hang_detect_fence; 75 77 76 78 struct list_head userq_va_list; 77 79 }; ··· 149 147 void amdgpu_userq_reset_work(struct work_struct *work); 150 148 void amdgpu_userq_pre_reset(struct amdgpu_device *adev); 151 149 int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost); 150 + void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue); 152 151 153 152 int amdgpu_userq_input_va_validate(struct amdgpu_device *adev, 154 153 struct amdgpu_usermode_queue *queue,
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
··· 574 574 575 575 dma_fence_put(queue->last_fence); 576 576 queue->last_fence = dma_fence_get(fence); 577 + amdgpu_userq_start_hang_detect_work(queue); 577 578 mutex_unlock(&userq_mgr->userq_mutex); 578 579 579 580 drm_exec_init(&exec, DRM_EXEC_INTERRUPTIBLE_WAIT,