Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Add fatal error detected flag

For a RAS error that needs a full reset to recover, set the fatal error
status. Clear the status once the device is reset.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Asad Kamal <asad.kamal@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Lijo Lazar and committed by
Alex Deucher
1b6ef74b 34b811a2

+39
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
···
5321 5321   list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5322 5322       if (need_full_reset) {
5323 5323           /* post card */
     5324 +         amdgpu_ras_set_fed(tmp_adev, false);
5324 5325           r = amdgpu_device_asic_init(tmp_adev);
5325 5326           if (r) {
5326 5327               dev_warn(tmp_adev->dev, "asic atom init failed!");
+32
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
···
2439 2439           ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
2440 2440           set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2441 2441
     2442 +         /* For any RAS error that needs a full reset to
     2443 +          * recover, set the fatal error status
     2444 +          */
     2445 +         if (hive) {
     2446 +             list_for_each_entry(remote_adev,
     2447 +                                 &hive->device_list,
     2448 +                                 gmc.xgmi.head)
     2449 +                 amdgpu_ras_set_fed(remote_adev,
     2450 +                                    true);
     2451 +         } else {
     2452 +             amdgpu_ras_set_fed(adev, true);
     2453 +         }
2442 2454           psp_fatal_error_recovery_quirk(&adev->psp);
2443 2455       }
2444 2456   }
···
3450 3438       kfree(con);
3451 3439
3452 3440       return 0;
     3441 + }
     3442 +
     3443 + bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev)
     3444 + {
     3445 +     struct amdgpu_ras *ras;
     3446 +
     3447 +     ras = amdgpu_ras_get_context(adev);
     3448 +     if (!ras)
     3449 +         return false;
     3450 +
     3451 +     return atomic_read(&ras->fed);
     3452 + }
     3453 +
     3454 + void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status)
     3455 + {
     3456 +     struct amdgpu_ras *ras;
     3457 +
     3458 +     ras = amdgpu_ras_get_context(adev);
     3459 +     if (ras)
     3460 +         atomic_set(&ras->fed, !!status);
3453 3461   }
3454 3462
3455 3463   void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
+6
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
···
477 477       wait_queue_head_t page_retirement_wq;
478 478       struct mutex page_retirement_lock;
479 479       atomic_t page_retirement_req_cnt;
    480 +     /* Fatal error detected flag */
    481 +     atomic_t fed;
480 482   };
481 483
482 484   struct ras_fs_data {
···
875 873
876 874   void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,
877 875           struct ras_err_addr *mca_err_addr);
    876 +
    877 + void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);
    878 + bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev);
    879 +
878 880   #endif