Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdkfd: implement per queue sdma reset for gfx 9.4+

To reset hung SDMA queues on GFX 9.4+ for the GFX9 family, a soft reset
must be issued through SMU. Since soft resets will reset an entire SDMA
engine, use a common KGD call to do the reset as the KGD will handle
avoiding a reset of in-flight GFX and paging queues on that engine.

In addition, create a common call for all reset types to simplify
the handling of module parameter settings that block GPU resets.

Signed-off-by: Jonathan Kim <jonathan.kim@amd.com>
Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Jonathan Kim and committed by Alex Deucher
bac38ca8 057fef20

+171 -25
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_aldebaran.c
··· 193 193 .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, 194 194 .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, 195 195 .hqd_reset = kgd_gfx_v9_hqd_reset, 196 + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell 196 197 };
+2 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
··· 419 419 .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, 420 420 .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, 421 421 .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, 422 - .hqd_reset = kgd_gfx_v9_hqd_reset 422 + .hqd_reset = kgd_gfx_v9_hqd_reset, 423 + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell 423 424 };
+13 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gc_9_4_3.c
··· 509 509 return 0; 510 510 } 511 511 512 + static uint32_t kgd_gfx_v9_4_3_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 513 + int engine, int queue) 514 + { 515 + uint32_t reg_offset = get_sdma_rlc_reg_offset(adev, engine, queue); 516 + uint32_t status = RREG32(regSDMA_RLC0_CONTEXT_STATUS + reg_offset); 517 + uint32_t doorbell_off = RREG32(regSDMA_RLC0_DOORBELL_OFFSET + reg_offset); 518 + bool is_active = !!REG_GET_FIELD(status, SDMA_RLC0_CONTEXT_STATUS, SELECTED); 519 + 520 + return is_active ? doorbell_off >> 2 : 0; 521 + } 522 + 512 523 const struct kfd2kgd_calls gc_9_4_3_kfd2kgd = { 513 524 .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, 514 525 .set_pasid_vmid_mapping = kgd_gfx_v9_4_3_set_pasid_vmid_mapping, ··· 554 543 .set_address_watch = kgd_gfx_v9_4_3_set_address_watch, 555 544 .clear_address_watch = kgd_gfx_v9_4_3_clear_address_watch, 556 545 .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, 557 - .hqd_reset = kgd_gfx_v9_hqd_reset 546 + .hqd_reset = kgd_gfx_v9_hqd_reset, 547 + .hqd_sdma_get_doorbell = kgd_gfx_v9_4_3_hqd_sdma_get_doorbell 558 548 };
+8 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
··· 1084 1084 return 0; 1085 1085 } 1086 1086 1087 + uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 1088 + int engine, int queue) 1089 + { 1090 + return 0; 1091 + } 1092 + 1087 1093 const struct kfd2kgd_calls gfx_v10_kfd2kgd = { 1088 1094 .program_sh_mem_settings = kgd_program_sh_mem_settings, 1089 1095 .set_pasid_vmid_mapping = kgd_set_pasid_vmid_mapping, ··· 1118 1112 .build_grace_period_packet_info = kgd_gfx_v10_build_grace_period_packet_info, 1119 1113 .program_trap_handler_settings = program_trap_handler_settings, 1120 1114 .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, 1121 - .hqd_reset = kgd_gfx_v10_hqd_reset 1115 + .hqd_reset = kgd_gfx_v10_hqd_reset, 1116 + .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell 1122 1117 };
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.h
··· 65 65 uint32_t queue_id, 66 66 uint32_t inst, 67 67 unsigned int utimeout); 68 + uint32_t kgd_gfx_v10_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 69 + int engine, int queue);
+2 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10_3.c
··· 682 682 .set_address_watch = kgd_gfx_v10_set_address_watch, 683 683 .clear_address_watch = kgd_gfx_v10_clear_address_watch, 684 684 .hqd_get_pq_addr = kgd_gfx_v10_hqd_get_pq_addr, 685 - .hqd_reset = kgd_gfx_v10_hqd_reset 685 + .hqd_reset = kgd_gfx_v10_hqd_reset, 686 + .hqd_sdma_get_doorbell = kgd_gfx_v10_hqd_sdma_get_doorbell 686 687 };
+8 -1
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v11.c
··· 800 800 return 0; 801 801 } 802 802 803 + static uint32_t kgd_gfx_v11_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 804 + int engine, int queue) 805 + { 806 + return 0; 807 + } 808 + 803 809 const struct kfd2kgd_calls gfx_v11_kfd2kgd = { 804 810 .program_sh_mem_settings = program_sh_mem_settings_v11, 805 811 .set_pasid_vmid_mapping = set_pasid_vmid_mapping_v11, ··· 830 824 .set_address_watch = kgd_gfx_v11_set_address_watch, 831 825 .clear_address_watch = kgd_gfx_v11_clear_address_watch, 832 826 .hqd_get_pq_addr = kgd_gfx_v11_hqd_get_pq_addr, 833 - .hqd_reset = kgd_gfx_v11_hqd_reset 827 + .hqd_reset = kgd_gfx_v11_hqd_reset, 828 + .hqd_sdma_get_doorbell = kgd_gfx_v11_hqd_sdma_get_doorbell 834 829 };
+7
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v12.c
··· 361 361 return 0; 362 362 } 363 363 364 + static uint32_t kgd_gfx_v12_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 365 + int engine, int queue) 366 + { 367 + return 0; 368 + } 369 + 364 370 const struct kfd2kgd_calls gfx_v12_kfd2kgd = { 365 371 .init_interrupts = init_interrupts_v12, 366 372 .hqd_dump = hqd_dump_v12, ··· 380 374 .set_wave_launch_mode = kgd_gfx_v12_set_wave_launch_mode, 381 375 .set_address_watch = kgd_gfx_v12_set_address_watch, 382 376 .clear_address_watch = kgd_gfx_v12_clear_address_watch, 377 + .hqd_sdma_get_doorbell = kgd_gfx_v12_hqd_sdma_get_doorbell 383 378 };
+9 -7
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
··· 1131 1131 uint32_t low, high; 1132 1132 uint64_t queue_addr = 0; 1133 1133 1134 - if (!amdgpu_gpu_recovery) 1135 - return 0; 1136 - 1137 1134 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 1138 1135 amdgpu_gfx_rlc_enter_safe_mode(adev, inst); 1139 1136 ··· 1179 1182 uint32_t low, high, pipe_reset_data = 0; 1180 1183 uint64_t queue_addr = 0; 1181 1184 1182 - if (!amdgpu_gpu_recovery) 1183 - return 0; 1184 - 1185 1185 kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst); 1186 1186 amdgpu_gfx_rlc_enter_safe_mode(adev, inst); 1187 1187 ··· 1223 1229 return queue_addr; 1224 1230 } 1225 1231 1232 + uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 1233 + int engine, int queue) 1234 + 1235 + { 1236 + return 0; 1237 + } 1238 + 1226 1239 const struct kfd2kgd_calls gfx_v9_kfd2kgd = { 1227 1240 .program_sh_mem_settings = kgd_gfx_v9_program_sh_mem_settings, 1228 1241 .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, ··· 1259 1258 .get_cu_occupancy = kgd_gfx_v9_get_cu_occupancy, 1260 1259 .program_trap_handler_settings = kgd_gfx_v9_program_trap_handler_settings, 1261 1260 .hqd_get_pq_addr = kgd_gfx_v9_hqd_get_pq_addr, 1262 - .hqd_reset = kgd_gfx_v9_hqd_reset 1261 + .hqd_reset = kgd_gfx_v9_hqd_reset, 1262 + .hqd_sdma_get_doorbell = kgd_gfx_v9_hqd_sdma_get_doorbell 1263 1263 };
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
··· 111 111 uint32_t queue_id, 112 112 uint32_t inst, 113 113 unsigned int utimeout); 114 + uint32_t kgd_gfx_v9_hqd_sdma_get_doorbell(struct amdgpu_device *adev, 115 + int engine, int queue);
+115 -13
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
··· 36 36 #include "kfd_kernel_queue.h" 37 37 #include "amdgpu_amdkfd.h" 38 38 #include "amdgpu_reset.h" 39 + #include "amdgpu_sdma.h" 39 40 #include "mes_v11_api_def.h" 40 41 #include "kfd_debug.h" 41 42 ··· 67 66 static int allocate_hqd(struct device_queue_manager *dqm, struct queue *q); 68 67 static int allocate_sdma_queue(struct device_queue_manager *dqm, 69 68 struct queue *q, const uint32_t *restore_sdma_id); 69 + 70 + static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma); 70 71 71 72 static inline 72 73 enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type) ··· 2208 2205 return NULL; 2209 2206 } 2210 2207 2211 - /* only for compute queue */ 2212 - static int reset_queues_on_hws_hang(struct device_queue_manager *dqm) 2208 + static int reset_hung_queues(struct device_queue_manager *dqm) 2213 2209 { 2214 2210 int r = 0, reset_count = 0, i; 2215 2211 ··· 2261 2259 return r; 2262 2260 } 2263 2261 2262 + static bool sdma_has_hang(struct device_queue_manager *dqm) 2263 + { 2264 + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); 2265 + int engine_end = engine_start + get_num_all_sdma_engines(dqm); 2266 + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; 2267 + int i, j; 2268 + 2269 + for (i = engine_start; i < engine_end; i++) { 2270 + for (j = 0; j < num_queues_per_eng; j++) { 2271 + if (!dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j)) 2272 + continue; 2273 + 2274 + return true; 2275 + } 2276 + } 2277 + 2278 + return false; 2279 + } 2280 + 2281 + static bool set_sdma_queue_as_reset(struct device_queue_manager *dqm, 2282 + uint32_t doorbell_off) 2283 + { 2284 + struct device_process_node *cur; 2285 + struct qcm_process_device *qpd; 2286 + struct queue *q; 2287 + 2288 + list_for_each_entry(cur, &dqm->queues, list) { 2289 + qpd = cur->qpd; 2290 + list_for_each_entry(q, &qpd->queues_list, list) { 2291 + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA || 
2292 + q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) && 2293 + q->properties.doorbell_off == doorbell_off) { 2294 + set_queue_as_reset(dqm, q, qpd); 2295 + return true; 2296 + } 2297 + } 2298 + } 2299 + 2300 + return false; 2301 + } 2302 + 2303 + static int reset_hung_queues_sdma(struct device_queue_manager *dqm) 2304 + { 2305 + int engine_start = dqm->dev->node_id * get_num_all_sdma_engines(dqm); 2306 + int engine_end = engine_start + get_num_all_sdma_engines(dqm); 2307 + int num_queues_per_eng = dqm->dev->kfd->device_info.num_sdma_queues_per_engine; 2308 + int r = 0, i, j; 2309 + 2310 + if (dqm->is_hws_hang) 2311 + return -EIO; 2312 + 2313 + /* Scan for hung HW queues and reset engine. */ 2314 + dqm->detect_hang_count = 0; 2315 + for (i = engine_start; i < engine_end; i++) { 2316 + for (j = 0; j < num_queues_per_eng; j++) { 2317 + uint32_t doorbell_off = 2318 + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j); 2319 + 2320 + if (!doorbell_off) 2321 + continue; 2322 + 2323 + /* Reset engine and check. */ 2324 + if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) || 2325 + dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) || 2326 + !set_sdma_queue_as_reset(dqm, doorbell_off)) { 2327 + r = -ENOTRECOVERABLE; 2328 + goto reset_fail; 2329 + } 2330 + 2331 + /* Should only expect one queue active per engine */ 2332 + dqm->detect_hang_count++; 2333 + break; 2334 + } 2335 + } 2336 + 2337 + /* Signal process reset */ 2338 + if (dqm->detect_hang_count) 2339 + kfd_signal_reset_event(dqm->dev); 2340 + else 2341 + r = -ENOTRECOVERABLE; 2342 + 2343 + reset_fail: 2344 + dqm->detect_hang_count = 0; 2345 + 2346 + return r; 2347 + } 2348 + 2349 + static int reset_queues_on_hws_hang(struct device_queue_manager *dqm, bool is_sdma) 2350 + { 2351 + while (halt_if_hws_hang) 2352 + schedule(); 2353 + 2354 + if (!amdgpu_gpu_recovery) 2355 + return -ENOTRECOVERABLE; 2356 + 2357 + return is_sdma ? 
reset_hung_queues_sdma(dqm) : reset_hung_queues(dqm); 2358 + } 2359 + 2264 2360 /* dqm->lock mutex has to be locked before calling this function */ 2265 2361 static int unmap_queues_cpsch(struct device_queue_manager *dqm, 2266 2362 enum kfd_unmap_queues_filter filter, ··· 2409 2309 * check those fields 2410 2310 */ 2411 2311 mqd_mgr = dqm->mqd_mgrs[KFD_MQD_TYPE_HIQ]; 2412 - if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd)) { 2413 - while (halt_if_hws_hang) 2414 - schedule(); 2415 - if (reset_queues_on_hws_hang(dqm)) { 2416 - dqm->is_hws_hang = true; 2417 - kfd_hws_hang(dqm); 2418 - retval = -ETIME; 2419 - goto out; 2420 - } 2421 - } 2312 + if (mqd_mgr->check_preemption_failed(mqd_mgr, dqm->packet_mgr.priv_queue->queue->mqd) && 2313 + reset_queues_on_hws_hang(dqm, false)) 2314 + goto reset_fail; 2315 + 2316 + /* Check for SDMA hang and attempt SDMA reset */ 2317 + if (sdma_has_hang(dqm) && reset_queues_on_hws_hang(dqm, true)) 2318 + goto reset_fail; 2422 2319 2423 2320 /* We need to reset the grace period value for this device */ 2424 2321 if (grace_period != USE_DEFAULT_GRACE_PERIOD) { ··· 2426 2329 2427 2330 pm_release_ib(&dqm->packet_mgr); 2428 2331 dqm->active_runlist = false; 2429 - 2430 2332 out: 2431 2333 up_read(&dqm->dev->adev->reset_domain->sem); 2432 2334 return retval; 2335 + 2336 + reset_fail: 2337 + dqm->is_hws_hang = true; 2338 + kfd_hws_hang(dqm); 2339 + up_read(&dqm->dev->adev->reset_domain->sem); 2340 + return -ETIME; 2433 2341 } 2434 2342 2435 2343 /* only for compute queue */
+2
drivers/gpu/drm/amd/include/kgd_kfd_interface.h
··· 330 330 uint64_t (*hqd_reset)(struct amdgpu_device *adev, 331 331 uint32_t pipe_id, uint32_t queue_id, 332 332 uint32_t inst, unsigned int utimeout); 333 + uint32_t (*hqd_sdma_get_doorbell)(struct amdgpu_device *adev, 334 + int engine, int queue); 333 335 }; 334 336 335 337 #endif /* KGD_KFD_INTERFACE_H_INCLUDED */