Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: completely rework eviction fence handling v2

Well that was broken on multiple levels.

First of all, a lot of checks were placed at incorrect locations, especially
the checks for whether the resume worker should run or not.

Then a bunch of code was just mid-layering because of an incorrect assignment
of who should do what.

And finally, the comments explained what happens instead of why.

Just re-write it from scratch; that should at least fix some of the hangs we
are seeing.

Use RCU for the eviction fence pointer in the manager; the spinlock usage was
mostly incorrect as well. Then finally remove all the nonsense checks and
actually add proper checks in the correct locations.

v2: some typo fixes and cleanups suggested by Sunil

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Sunil Khatri <sunil.khatri@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Christian König and committed by
Alex Deucher
2cd7284b 87327658

+113 -209
+2 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
··· 2952 2952 int idx; 2953 2953 2954 2954 if (fpriv && drm_dev_enter(dev, &idx)) { 2955 - fpriv->evf_mgr.fd_closing = true; 2956 - amdgpu_eviction_fence_destroy(&fpriv->evf_mgr); 2955 + amdgpu_evf_mgr_shutdown(&fpriv->evf_mgr); 2957 2956 amdgpu_userq_mgr_fini(&fpriv->userq_mgr); 2957 + amdgpu_evf_mgr_fini(&fpriv->evf_mgr); 2958 2958 drm_dev_exit(idx); 2959 2959 } 2960 2960
+66 -152
drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.c
··· 25 25 #include <drm/drm_exec.h> 26 26 #include "amdgpu.h" 27 27 28 - #define work_to_evf_mgr(w, name) container_of(w, struct amdgpu_eviction_fence_mgr, name) 29 - #define evf_mgr_to_fpriv(e) container_of(e, struct amdgpu_fpriv, evf_mgr) 30 - 31 28 static const char * 32 29 amdgpu_eviction_fence_get_driver_name(struct dma_fence *fence) 33 30 { ··· 40 43 return ef->timeline_name; 41 44 } 42 45 43 - int 44 - amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 45 - struct drm_exec *exec) 46 - { 47 - struct amdgpu_eviction_fence *old_ef, *new_ef; 48 - struct drm_gem_object *obj; 49 - unsigned long index; 50 - int ret; 51 - 52 - if (evf_mgr->ev_fence && 53 - !dma_fence_is_signaled(&evf_mgr->ev_fence->base)) 54 - return 0; 55 - /* 56 - * Steps to replace eviction fence: 57 - * * lock all objects in exec (caller) 58 - * * create a new eviction fence 59 - * * update new eviction fence in evf_mgr 60 - * * attach the new eviction fence to BOs 61 - * * release the old fence 62 - * * unlock the objects (caller) 63 - */ 64 - new_ef = amdgpu_eviction_fence_create(evf_mgr); 65 - if (!new_ef) { 66 - DRM_ERROR("Failed to create new eviction fence\n"); 67 - return -ENOMEM; 68 - } 69 - 70 - /* Update the eviction fence now */ 71 - spin_lock(&evf_mgr->ev_fence_lock); 72 - old_ef = evf_mgr->ev_fence; 73 - evf_mgr->ev_fence = new_ef; 74 - spin_unlock(&evf_mgr->ev_fence_lock); 75 - 76 - /* Attach the new fence */ 77 - drm_exec_for_each_locked_object(exec, index, obj) { 78 - struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 79 - 80 - if (!bo) 81 - continue; 82 - ret = amdgpu_eviction_fence_attach(evf_mgr, bo); 83 - if (ret) { 84 - DRM_ERROR("Failed to attch new eviction fence\n"); 85 - goto free_err; 86 - } 87 - } 88 - 89 - /* Free old fence */ 90 - if (old_ef) 91 - dma_fence_put(&old_ef->base); 92 - return 0; 93 - 94 - free_err: 95 - kfree(new_ef); 96 - return ret; 97 - } 98 - 99 - static void 100 - amdgpu_eviction_fence_suspend_worker(struct work_struct *work) 
101 - { 102 - struct amdgpu_eviction_fence_mgr *evf_mgr = work_to_evf_mgr(work, suspend_work.work); 103 - struct amdgpu_fpriv *fpriv = evf_mgr_to_fpriv(evf_mgr); 104 - struct amdgpu_userq_mgr *uq_mgr = &fpriv->userq_mgr; 105 - struct amdgpu_eviction_fence *ev_fence; 106 - 107 - mutex_lock(&uq_mgr->userq_mutex); 108 - spin_lock(&evf_mgr->ev_fence_lock); 109 - ev_fence = evf_mgr->ev_fence; 110 - if (ev_fence) 111 - dma_fence_get(&ev_fence->base); 112 - else 113 - goto unlock; 114 - spin_unlock(&evf_mgr->ev_fence_lock); 115 - 116 - amdgpu_userq_evict(uq_mgr, ev_fence); 117 - 118 - mutex_unlock(&uq_mgr->userq_mutex); 119 - dma_fence_put(&ev_fence->base); 120 - return; 121 - 122 - unlock: 123 - spin_unlock(&evf_mgr->ev_fence_lock); 124 - mutex_unlock(&uq_mgr->userq_mutex); 125 - } 126 - 127 46 static bool amdgpu_eviction_fence_enable_signaling(struct dma_fence *f) 128 47 { 129 - struct amdgpu_eviction_fence_mgr *evf_mgr; 130 - struct amdgpu_eviction_fence *ev_fence; 48 + struct amdgpu_eviction_fence *ev_fence = to_ev_fence(f); 131 49 132 - if (!f) 133 - return true; 134 - 135 - ev_fence = to_ev_fence(f); 136 - evf_mgr = ev_fence->evf_mgr; 137 - 138 - schedule_delayed_work(&evf_mgr->suspend_work, 0); 50 + schedule_work(&ev_fence->evf_mgr->suspend_work); 139 51 return true; 140 52 } 141 53 ··· 54 148 .enable_signaling = amdgpu_eviction_fence_enable_signaling, 55 149 }; 56 150 57 - void amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr, 58 - struct amdgpu_eviction_fence *ev_fence) 151 + static void 152 + amdgpu_eviction_fence_suspend_worker(struct work_struct *work) 59 153 { 60 - spin_lock(&evf_mgr->ev_fence_lock); 61 - dma_fence_signal(&ev_fence->base); 62 - spin_unlock(&evf_mgr->ev_fence_lock); 154 + struct amdgpu_eviction_fence_mgr *evf_mgr = 155 + container_of(work, struct amdgpu_eviction_fence_mgr, 156 + suspend_work); 157 + struct amdgpu_fpriv *fpriv = 158 + container_of(evf_mgr, struct amdgpu_fpriv, evf_mgr); 159 + struct amdgpu_userq_mgr 
*uq_mgr = &fpriv->userq_mgr; 160 + struct dma_fence *ev_fence; 161 + 162 + mutex_lock(&uq_mgr->userq_mutex); 163 + ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); 164 + amdgpu_userq_evict(uq_mgr, !evf_mgr->shutdown); 165 + 166 + /* 167 + * Signaling the eviction fence must be done while holding the 168 + * userq_mutex. Otherwise we won't resume the queues before issuing the 169 + * next fence. 170 + */ 171 + dma_fence_signal(ev_fence); 172 + dma_fence_put(ev_fence); 173 + mutex_unlock(&uq_mgr->userq_mutex); 63 174 } 64 175 65 - struct amdgpu_eviction_fence * 66 - amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr) 176 + void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 177 + struct amdgpu_bo *bo) 178 + { 179 + struct dma_fence *ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); 180 + struct dma_resv *resv = bo->tbo.base.resv; 181 + 182 + dma_resv_add_fence(resv, ev_fence, DMA_RESV_USAGE_BOOKKEEP); 183 + dma_fence_put(ev_fence); 184 + } 185 + 186 + int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr, 187 + struct drm_exec *exec) 67 188 { 68 189 struct amdgpu_eviction_fence *ev_fence; 190 + struct drm_gem_object *obj; 191 + unsigned long index; 69 192 193 + /* Create and initialize a new eviction fence */ 70 194 ev_fence = kzalloc_obj(*ev_fence); 71 195 if (!ev_fence) 72 - return NULL; 196 + return -ENOMEM; 73 197 74 198 ev_fence->evf_mgr = evf_mgr; 75 199 get_task_comm(ev_fence->timeline_name, current); ··· 107 171 dma_fence_init64(&ev_fence->base, &amdgpu_eviction_fence_ops, 108 172 &ev_fence->lock, evf_mgr->ev_fence_ctx, 109 173 atomic_inc_return(&evf_mgr->ev_fence_seq)); 110 - return ev_fence; 111 - } 112 174 113 - void amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr) 114 - { 115 - struct amdgpu_eviction_fence *ev_fence; 175 + /* Remember it for newly added BOs */ 176 + dma_fence_put(evf_mgr->ev_fence); 177 + evf_mgr->ev_fence = &ev_fence->base; 116 178 117 - /* Wait for any pending 
work to execute */ 118 - flush_delayed_work(&evf_mgr->suspend_work); 179 + /* And add it to all existing BOs */ 180 + drm_exec_for_each_locked_object(exec, index, obj) { 181 + struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); 119 182 120 - spin_lock(&evf_mgr->ev_fence_lock); 121 - ev_fence = evf_mgr->ev_fence; 122 - spin_unlock(&evf_mgr->ev_fence_lock); 123 - 124 - if (!ev_fence) 125 - return; 126 - 127 - dma_fence_wait(&ev_fence->base, false); 128 - 129 - /* Last unref of ev_fence */ 130 - dma_fence_put(&ev_fence->base); 131 - } 132 - 133 - int amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, 134 - struct amdgpu_bo *bo) 135 - { 136 - struct amdgpu_eviction_fence *ev_fence; 137 - struct dma_resv *resv = bo->tbo.base.resv; 138 - int ret; 139 - 140 - if (!resv) 141 - return 0; 142 - 143 - ret = dma_resv_reserve_fences(resv, 1); 144 - if (ret) { 145 - DRM_DEBUG_DRIVER("Failed to resv fence space\n"); 146 - return ret; 183 + amdgpu_evf_mgr_attach_fence(evf_mgr, bo); 147 184 } 148 - 149 - spin_lock(&evf_mgr->ev_fence_lock); 150 - ev_fence = evf_mgr->ev_fence; 151 - if (ev_fence) 152 - dma_resv_add_fence(resv, &ev_fence->base, DMA_RESV_USAGE_BOOKKEEP); 153 - spin_unlock(&evf_mgr->ev_fence_lock); 154 - 155 185 return 0; 156 186 } 157 187 158 - void amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr, 159 - struct amdgpu_bo *bo) 188 + void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 189 + struct amdgpu_bo *bo) 160 190 { 161 191 struct dma_fence *stub = dma_fence_get_stub(); 162 192 ··· 131 229 dma_fence_put(stub); 132 230 } 133 231 134 - int amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr) 232 + void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr) 135 233 { 136 - /* This needs to be done one time per open */ 137 234 atomic_set(&evf_mgr->ev_fence_seq, 0); 138 235 evf_mgr->ev_fence_ctx = dma_fence_context_alloc(1); 139 - spin_lock_init(&evf_mgr->ev_fence_lock); 236 + 
evf_mgr->ev_fence = dma_fence_get_stub(); 140 237 141 - INIT_DELAYED_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker); 142 - return 0; 238 + INIT_WORK(&evf_mgr->suspend_work, amdgpu_eviction_fence_suspend_worker); 239 + } 240 + 241 + void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr) 242 + { 243 + evf_mgr->shutdown = true; 244 + flush_work(&evf_mgr->suspend_work); 245 + } 246 + 247 + void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr) 248 + { 249 + dma_fence_wait(rcu_dereference_protected(evf_mgr->ev_fence, true), 250 + false); 251 + flush_work(&evf_mgr->suspend_work); 252 + dma_fence_put(evf_mgr->ev_fence); 143 253 }
+28 -26
drivers/gpu/drm/amd/amdgpu/amdgpu_eviction_fence.h
··· 25 25 #ifndef AMDGPU_EV_FENCE_H_ 26 26 #define AMDGPU_EV_FENCE_H_ 27 27 28 + #include <linux/dma-fence.h> 29 + 28 30 struct amdgpu_eviction_fence { 29 31 struct dma_fence base; 30 32 spinlock_t lock; ··· 37 35 struct amdgpu_eviction_fence_mgr { 38 36 u64 ev_fence_ctx; 39 37 atomic_t ev_fence_seq; 40 - spinlock_t ev_fence_lock; 41 - struct amdgpu_eviction_fence *ev_fence; 42 - struct delayed_work suspend_work; 43 - uint8_t fd_closing; 38 + 39 + /* 40 + * Only updated while holding the VM resv lock. 41 + * Only signaled while holding the userq mutex. 42 + */ 43 + struct dma_fence __rcu *ev_fence; 44 + struct work_struct suspend_work; 45 + bool shutdown; 44 46 }; 45 47 46 - /* Eviction fence helper functions */ 47 - struct amdgpu_eviction_fence * 48 - amdgpu_eviction_fence_create(struct amdgpu_eviction_fence_mgr *evf_mgr); 48 + static inline struct dma_fence * 49 + amdgpu_evf_mgr_get_fence(struct amdgpu_eviction_fence_mgr *evf_mgr) 50 + { 51 + struct dma_fence *ev_fence; 49 52 50 - void 51 - amdgpu_eviction_fence_destroy(struct amdgpu_eviction_fence_mgr *evf_mgr); 53 + rcu_read_lock(); 54 + ev_fence = dma_fence_get_rcu_safe(&evf_mgr->ev_fence); 55 + rcu_read_unlock(); 56 + return ev_fence; 57 + } 52 58 53 - int 54 - amdgpu_eviction_fence_attach(struct amdgpu_eviction_fence_mgr *evf_mgr, 55 - struct amdgpu_bo *bo); 59 + void amdgpu_evf_mgr_attach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 60 + struct amdgpu_bo *bo); 61 + int amdgpu_evf_mgr_rearm(struct amdgpu_eviction_fence_mgr *evf_mgr, 62 + struct drm_exec *exec); 63 + void amdgpu_evf_mgr_detach_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 64 + struct amdgpu_bo *bo); 65 + void amdgpu_evf_mgr_init(struct amdgpu_eviction_fence_mgr *evf_mgr); 66 + void amdgpu_evf_mgr_shutdown(struct amdgpu_eviction_fence_mgr *evf_mgr); 67 + void amdgpu_evf_mgr_fini(struct amdgpu_eviction_fence_mgr *evf_mgr); 56 68 57 - void 58 - amdgpu_eviction_fence_detach(struct amdgpu_eviction_fence_mgr *evf_mgr, 59 - struct amdgpu_bo 
*bo); 60 - 61 - int 62 - amdgpu_eviction_fence_init(struct amdgpu_eviction_fence_mgr *evf_mgr); 63 - 64 - void 65 - amdgpu_eviction_fence_signal(struct amdgpu_eviction_fence_mgr *evf_mgr, 66 - struct amdgpu_eviction_fence *ev_fence); 67 - 68 - int 69 - amdgpu_eviction_fence_replace_fence(struct amdgpu_eviction_fence_mgr *evf_mgr, 70 - struct drm_exec *exec); 71 69 #endif
+2 -8
drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
··· 263 263 else 264 264 ++bo_va->ref_count; 265 265 266 - /* attach gfx eviction fence */ 267 - r = amdgpu_eviction_fence_attach(&fpriv->evf_mgr, abo); 268 - if (r) { 269 - DRM_DEBUG_DRIVER("Failed to attach eviction fence to BO\n"); 270 - amdgpu_bo_unreserve(abo); 271 - return r; 272 - } 266 + amdgpu_evf_mgr_attach_fence(&fpriv->evf_mgr, abo); 273 267 drm_exec_fini(&exec); 274 268 275 269 /* Validate and add eviction fence to DMABuf imports with dynamic ··· 331 337 } 332 338 333 339 if (!amdgpu_vm_is_bo_always_valid(vm, bo)) 334 - amdgpu_eviction_fence_detach(&fpriv->evf_mgr, bo); 340 + amdgpu_evf_mgr_detach_fence(&fpriv->evf_mgr, bo); 335 341 336 342 bo_va = amdgpu_vm_bo_find(vm, bo); 337 343 if (!bo_va || --bo_va->ref_count)
+1 -4
drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
··· 1522 1522 "Failed to init usermode queue manager (%d), use legacy workload submission only\n", 1523 1523 r); 1524 1524 1525 - r = amdgpu_eviction_fence_init(&fpriv->evf_mgr); 1526 - if (r) 1527 - goto error_vm; 1528 - 1525 + amdgpu_evf_mgr_init(&fpriv->evf_mgr); 1529 1526 amdgpu_ctx_mgr_init(&fpriv->ctx_mgr, adev); 1530 1527 1531 1528 file_priv->driver_priv = fpriv;
+13 -16
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
··· 472 472 amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *uq_mgr, 473 473 struct amdgpu_eviction_fence_mgr *evf_mgr) 474 474 { 475 - struct amdgpu_eviction_fence *ev_fence; 475 + struct dma_fence *ev_fence; 476 476 477 477 retry: 478 478 /* Flush any pending resume work to create ev_fence */ 479 479 flush_delayed_work(&uq_mgr->resume_work); 480 480 481 481 mutex_lock(&uq_mgr->userq_mutex); 482 - spin_lock(&evf_mgr->ev_fence_lock); 483 - ev_fence = evf_mgr->ev_fence; 484 - spin_unlock(&evf_mgr->ev_fence_lock); 485 - if (!ev_fence || dma_fence_is_signaled(&ev_fence->base)) { 482 + ev_fence = amdgpu_evf_mgr_get_fence(evf_mgr); 483 + if (dma_fence_is_signaled(ev_fence)) { 484 + dma_fence_put(ev_fence); 486 485 mutex_unlock(&uq_mgr->userq_mutex); 487 486 /* 488 487 * Looks like there was no pending resume work, ··· 490 491 schedule_delayed_work(&uq_mgr->resume_work, 0); 491 492 goto retry; 492 493 } 494 + dma_fence_put(ev_fence); 493 495 } 494 496 495 497 int amdgpu_userq_create_object(struct amdgpu_userq_mgr *uq_mgr, ··· 1197 1197 dma_fence_wait(bo_va->last_pt_update, false); 1198 1198 dma_fence_wait(vm->last_update, false); 1199 1199 1200 - ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec); 1200 + ret = amdgpu_evf_mgr_rearm(&fpriv->evf_mgr, &exec); 1201 1201 if (ret) 1202 1202 drm_file_err(uq_mgr->file, "Failed to replace eviction fence\n"); 1203 1203 ··· 1217 1217 { 1218 1218 struct amdgpu_userq_mgr *uq_mgr = work_to_uq_mgr(work, resume_work.work); 1219 1219 struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); 1220 + struct dma_fence *ev_fence; 1220 1221 int ret; 1221 1222 1222 - flush_delayed_work(&fpriv->evf_mgr.suspend_work); 1223 - 1224 1223 mutex_lock(&uq_mgr->userq_mutex); 1224 + ev_fence = amdgpu_evf_mgr_get_fence(&fpriv->evf_mgr); 1225 + if (!dma_fence_is_signaled(ev_fence)) 1226 + goto unlock; 1225 1227 1226 1228 ret = amdgpu_userq_vm_validate(uq_mgr); 1227 1229 if (ret) { ··· 1239 1237 1240 1238 unlock: 1241 1239 
mutex_unlock(&uq_mgr->userq_mutex); 1240 + dma_fence_put(ev_fence); 1242 1241 } 1243 1242 1244 1243 static int ··· 1315 1312 } 1316 1313 1317 1314 void 1318 - amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, 1319 - struct amdgpu_eviction_fence *ev_fence) 1315 + amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, bool schedule_resume) 1320 1316 { 1321 - struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr); 1322 - struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr; 1323 1317 struct amdgpu_device *adev = uq_mgr->adev; 1324 1318 int ret; 1325 1319 ··· 1329 1329 if (ret) 1330 1330 dev_err(adev->dev, "Failed to evict userqueue\n"); 1331 1331 1332 - /* Signal current eviction fence */ 1333 - amdgpu_eviction_fence_signal(evf_mgr, ev_fence); 1334 - 1335 - if (!evf_mgr->fd_closing) 1332 + if (schedule_resume) 1336 1333 schedule_delayed_work(&uq_mgr->resume_work, 0); 1337 1334 } 1338 1335
+1 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
··· 133 133 struct amdgpu_userq_obj *userq_obj); 134 134 135 135 void amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr, 136 - struct amdgpu_eviction_fence *ev_fence); 136 + bool schedule_resume); 137 137 138 138 void amdgpu_userq_ensure_ev_fence(struct amdgpu_userq_mgr *userq_mgr, 139 139 struct amdgpu_eviction_fence_mgr *evf_mgr);