Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Add reset control handling to reset workflow

This prefers reset-control-based handling if it is implemented
for a particular ASIC. If not, it takes the legacy path. It uses
the legacy method of preparing the environment (job, scheduler tasks)
and restoring the environment.

v2: remove unused variable (Alex)

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Lijo Lazar and committed by
Alex Deucher
04442bf7 e071dce3

+97 -39
+5 -6
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 270 270 struct amdgpu_atif; 271 271 struct kfd_vm_fault_info; 272 272 struct amdgpu_hive_info; 273 + struct amdgpu_reset_context; 273 274 struct amdgpu_reset_control; 274 275 275 276 enum amdgpu_cp_irq { ··· 1076 1075 1077 1076 bool in_pci_err_recovery; 1078 1077 struct pci_saved_state *pci_state; 1078 + 1079 1079 struct amdgpu_reset_control *reset_cntl; 1080 1080 }; 1081 1081 ··· 1129 1127 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev); 1130 1128 1131 1129 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 1132 - struct amdgpu_job *job, 1133 - bool *need_full_reset_arg); 1130 + struct amdgpu_reset_context *reset_context); 1134 1131 1135 - int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 1136 - struct list_head *device_list_handle, 1137 - bool *need_full_reset_arg, 1138 - bool skip_hw_reset); 1132 + int amdgpu_do_asic_reset(struct list_head *device_list_handle, 1133 + struct amdgpu_reset_context *reset_context); 1139 1134 1140 1135 int emu_soc_asic_init(struct amdgpu_device *adev); 1141 1136
+78 -30
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 65 65 #include "amdgpu_ras.h" 66 66 #include "amdgpu_pmu.h" 67 67 #include "amdgpu_fru_eeprom.h" 68 + #include "amdgpu_reset.h" 68 69 69 70 #include <linux/suspend.h> 70 71 #include <drm/task_barrier.h> ··· 3422 3421 goto fence_driver_init; 3423 3422 } 3424 3423 3424 + amdgpu_reset_init(adev); 3425 + 3425 3426 /* detect if we are with an SRIOV vbios */ 3426 3427 amdgpu_device_detect_sriov_bios(adev); 3427 3428 ··· 3674 3671 release_firmware(adev->firmware.gpu_info_fw); 3675 3672 adev->firmware.gpu_info_fw = NULL; 3676 3673 adev->accel_working = false; 3674 + 3675 + amdgpu_reset_fini(adev); 3676 + 3677 3677 /* free i2c buses */ 3678 3678 if (!amdgpu_device_has_dc_support(adev)) 3679 3679 amdgpu_i2c_fini(adev); ··· 4245 4239 } 4246 4240 4247 4241 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev, 4248 - struct amdgpu_job *job, 4249 - bool *need_full_reset_arg) 4242 + struct amdgpu_reset_context *reset_context) 4250 4243 { 4251 4244 int i, r = 0; 4252 - bool need_full_reset = *need_full_reset_arg; 4245 + struct amdgpu_job *job = NULL; 4246 + bool need_full_reset = 4247 + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4248 + 4249 + if (reset_context->reset_req_dev == adev) 4250 + job = reset_context->job; 4253 4251 4254 4252 /* no need to dump if device is not in good state during probe period */ 4255 4253 if (!adev->gmc.xgmi.pending_reset) ··· 4278 4268 if(job) 4279 4269 drm_sched_increase_karma(&job->base); 4280 4270 4271 + r = amdgpu_reset_prepare_hwcontext(adev, reset_context); 4272 + if (r != -ENOSYS) 4273 + return r; 4274 + 4281 4275 /* Don't suspend on bare metal if we are not going to HW reset the ASIC */ 4282 4276 if (!amdgpu_sriov_vf(adev)) { 4283 4277 ··· 4300 4286 4301 4287 if (need_full_reset) 4302 4288 r = amdgpu_device_ip_suspend(adev); 4303 - 4304 - *need_full_reset_arg = need_full_reset; 4289 + if (need_full_reset) 4290 + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4291 + else 4292 + clear_bit(AMDGPU_NEED_FULL_RESET, 
4293 + &reset_context->flags); 4305 4294 } 4306 4295 4307 4296 return r; 4308 4297 } 4309 4298 4310 - int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, 4311 - struct list_head *device_list_handle, 4312 - bool *need_full_reset_arg, 4313 - bool skip_hw_reset) 4299 + int amdgpu_do_asic_reset(struct list_head *device_list_handle, 4300 + struct amdgpu_reset_context *reset_context) 4314 4301 { 4315 4302 struct amdgpu_device *tmp_adev = NULL; 4316 - bool need_full_reset = *need_full_reset_arg, vram_lost = false; 4303 + bool need_full_reset, skip_hw_reset, vram_lost = false; 4317 4304 int r = 0; 4305 + 4306 + /* Try reset handler method first */ 4307 + tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 4308 + reset_list); 4309 + r = amdgpu_reset_perform_reset(tmp_adev, reset_context); 4310 + 4311 + if (r != -ENOSYS) 4312 + return r; 4313 + 4314 + /* Reset handler not implemented, use the default method */ 4315 + need_full_reset = 4316 + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4317 + skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); 4318 4318 4319 4319 /* 4320 4320 * ASIC reset has to be done on all XGMI hive nodes ASAP ··· 4413 4385 */ 4414 4386 amdgpu_register_gpu_instance(tmp_adev); 4415 4387 4416 - if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4388 + if (!reset_context->hive && 4389 + tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4417 4390 amdgpu_xgmi_add_device(tmp_adev); 4418 4391 4419 4392 r = amdgpu_device_ip_late_init(tmp_adev); ··· 4442 4413 } 4443 4414 4444 4415 /* Update PSP FW topology after reset */ 4445 - if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4446 - r = amdgpu_xgmi_update_topology(hive, tmp_adev); 4416 + if (reset_context->hive && 4417 + tmp_adev->gmc.xgmi.num_physical_nodes > 1) 4418 + r = amdgpu_xgmi_update_topology( 4419 + reset_context->hive, tmp_adev); 4447 4420 } 4448 4421 } 4449 4422 ··· 4469 4438 } 4470 4439 4471 4440 end: 4472 - *need_full_reset_arg = 
need_full_reset; 4441 + if (need_full_reset) 4442 + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4443 + else 4444 + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); 4473 4445 return r; 4474 4446 } 4475 4447 ··· 4609 4575 return 0; 4610 4576 } 4611 4577 4612 - void amdgpu_device_recheck_guilty_jobs(struct amdgpu_device *adev, 4613 - struct amdgpu_hive_info *hive, 4614 - struct list_head *device_list_handle, 4615 - bool *need_full_reset) 4578 + void amdgpu_device_recheck_guilty_jobs( 4579 + struct amdgpu_device *adev, struct list_head *device_list_handle, 4580 + struct amdgpu_reset_context *reset_context) 4616 4581 { 4617 4582 int i, r = 0; 4618 4583 ··· 4647 4614 if (r) 4648 4615 adev->asic_reset_res = r; 4649 4616 } else { 4650 - r = amdgpu_do_asic_reset(hive, device_list_handle, 4651 - need_full_reset, false); 4617 + clear_bit(AMDGPU_SKIP_HW_RESET, 4618 + &reset_context->flags); 4619 + r = amdgpu_do_asic_reset(device_list_handle, 4620 + reset_context); 4652 4621 if (r && r == -EAGAIN) 4653 4622 goto retry; 4654 4623 } ··· 4692 4657 struct amdgpu_job *job) 4693 4658 { 4694 4659 struct list_head device_list, *device_list_handle = NULL; 4695 - bool need_full_reset = false; 4696 4660 bool job_signaled = false; 4697 4661 struct amdgpu_hive_info *hive = NULL; 4698 4662 struct amdgpu_device *tmp_adev = NULL; ··· 4699 4665 bool need_emergency_restart = false; 4700 4666 bool audio_suspended = false; 4701 4667 int tmp_vram_lost_counter; 4668 + struct amdgpu_reset_context reset_context; 4669 + 4670 + memset(&reset_context, 0, sizeof(reset_context)); 4702 4671 4703 4672 /* 4704 4673 * Special case: RAS triggered and full reset isn't supported ··· 4741 4704 } 4742 4705 mutex_lock(&hive->hive_lock); 4743 4706 } 4707 + 4708 + reset_context.method = AMD_RESET_METHOD_NONE; 4709 + reset_context.reset_req_dev = adev; 4710 + reset_context.job = job; 4711 + reset_context.hive = hive; 4712 + clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 4744 4713 4745 
4714 /* 4746 4715 * lock the device before we try to operate the linked list ··· 4848 4805 4849 4806 retry: /* Rest of adevs pre asic reset from XGMI hive. */ 4850 4807 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 4851 - r = amdgpu_device_pre_asic_reset(tmp_adev, 4852 - (tmp_adev == adev) ? job : NULL, 4853 - &need_full_reset); 4808 + r = amdgpu_device_pre_asic_reset(tmp_adev, &reset_context); 4854 4809 /*TODO Should we stop ?*/ 4855 4810 if (r) { 4856 4811 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", ··· 4865 4824 if (r) 4866 4825 adev->asic_reset_res = r; 4867 4826 } else { 4868 - r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false); 4827 + r = amdgpu_do_asic_reset(device_list_handle, &reset_context); 4869 4828 if (r && r == -EAGAIN) 4870 4829 goto retry; 4871 4830 } ··· 4884 4843 */ 4885 4844 if (amdgpu_gpu_recovery == 2 && 4886 4845 !(tmp_vram_lost_counter < atomic_read(&adev->vram_lost_counter))) 4887 - amdgpu_device_recheck_guilty_jobs(tmp_adev, hive, 4888 - device_list_handle, &need_full_reset); 4846 + amdgpu_device_recheck_guilty_jobs( 4847 + tmp_adev, device_list_handle, &reset_context); 4889 4848 4890 4849 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 4891 4850 struct amdgpu_ring *ring = tmp_adev->rings[i]; ··· 5230 5189 struct drm_device *dev = pci_get_drvdata(pdev); 5231 5190 struct amdgpu_device *adev = drm_to_adev(dev); 5232 5191 int r, i; 5233 - bool need_full_reset = true; 5192 + struct amdgpu_reset_context reset_context; 5234 5193 u32 memsize; 5235 5194 struct list_head device_list; 5236 5195 5237 5196 DRM_INFO("PCI error: slot reset callback!!\n"); 5197 + 5198 + memset(&reset_context, 0, sizeof(reset_context)); 5238 5199 5239 5200 INIT_LIST_HEAD(&device_list); 5240 5201 list_add_tail(&adev->reset_list, &device_list); ··· 5260 5217 goto out; 5261 5218 } 5262 5219 5220 + reset_context.method = AMD_RESET_METHOD_NONE; 5221 + reset_context.reset_req_dev = adev; 5222 + 
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 5223 + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 5224 + 5263 5225 adev->in_pci_err_recovery = true; 5264 - r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 5226 + r = amdgpu_device_pre_asic_reset(adev, &reset_context); 5265 5227 adev->in_pci_err_recovery = false; 5266 5228 if (r) 5267 5229 goto out; 5268 5230 5269 - r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 5231 + r = amdgpu_do_asic_reset(&device_list, &reset_context); 5270 5232 5271 5233 out: 5272 5234 if (!r) {
+14 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
··· 47 47 48 48 #include "amdgpu_ras.h" 49 49 #include "amdgpu_xgmi.h" 50 + #include "amdgpu_reset.h" 50 51 51 52 /* 52 53 * KMS wrapper. ··· 1350 1349 struct list_head device_list; 1351 1350 struct amdgpu_device *adev; 1352 1351 int i, r; 1353 - bool need_full_reset = true; 1352 + struct amdgpu_reset_context reset_context; 1353 + 1354 + memset(&reset_context, 0, sizeof(reset_context)); 1354 1355 1355 1356 mutex_lock(&mgpu_info.mutex); 1356 1357 if (mgpu_info.pending_reset == true) { ··· 1362 1359 mgpu_info.pending_reset = true; 1363 1360 mutex_unlock(&mgpu_info.mutex); 1364 1361 1362 + /* Use a common context, just need to make sure full reset is done */ 1363 + reset_context.method = AMD_RESET_METHOD_NONE; 1364 + set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); 1365 + 1365 1366 for (i = 0; i < mgpu_info.num_dgpu; i++) { 1366 1367 adev = mgpu_info.gpu_ins[i].adev; 1367 - r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset); 1368 + reset_context.reset_req_dev = adev; 1369 + r = amdgpu_device_pre_asic_reset(adev, &reset_context); 1368 1370 if (r) { 1369 1371 dev_err(adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 1370 1372 r, adev_to_drm(adev)->unique); ··· 1396 1388 list_for_each_entry(adev, &device_list, reset_list) 1397 1389 amdgpu_unregister_gpu_instance(adev); 1398 1390 1399 - r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true); 1391 + /* Use a common context, just need to make sure full reset is done */ 1392 + set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags); 1393 + r = amdgpu_do_asic_reset(&device_list, &reset_context); 1394 + 1400 1395 if (r) { 1401 1396 DRM_ERROR("reinit gpus failure"); 1402 1397 return;