Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: add RAS page retirement functions for MCA

Define page retirement functions for MCA platform.

v2: remove page retirement handling from MCA poison handler,
let MCA notifier do page retirement.

v3: remove specific poison handler for MCA to simplify code.

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Tao Zhou and committed by
Alex Deucher
cbe4d43e 25135748

+55
+53
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
··· 22 22 */ 23 23 24 24 #include "amdgpu.h" 25 + #include "umc_v6_7.h" 26 + 27 + static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev, 28 + struct ras_err_data *err_data, uint64_t err_addr, 29 + uint32_t ch_inst, uint32_t umc_inst) 30 + { 31 + switch (adev->ip_versions[UMC_HWIP][0]) { 32 + case IP_VERSION(6, 7, 0): 33 + umc_v6_7_convert_error_address(adev, 34 + err_data, err_addr, ch_inst, umc_inst); 35 + break; 36 + default: 37 + dev_warn(adev->dev, 38 + "UMC address to Physical address translation is not supported\n"); 39 + return AMDGPU_RAS_FAIL; 40 + } 41 + 42 + return AMDGPU_RAS_SUCCESS; 43 + } 44 + 45 + int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, 46 + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst) 47 + { 48 + struct ras_err_data err_data = {0, 0, 0, NULL}; 49 + int ret = AMDGPU_RAS_FAIL; 50 + 51 + err_data.err_addr = 52 + kcalloc(adev->umc.max_ras_err_cnt_per_query, 53 + sizeof(struct eeprom_table_record), GFP_KERNEL); 54 + if (!err_data.err_addr) { 55 + dev_warn(adev->dev, 56 + "Failed to alloc memory for umc error record in MCA notifier!\n"); 57 + return AMDGPU_RAS_FAIL; 58 + } 59 + 60 + /* 61 + * Translate UMC channel address to Physical address 62 + */ 63 + ret = amdgpu_umc_convert_error_address(adev, &err_data, err_addr, 64 + ch_inst, umc_inst); 65 + if (ret) 66 + goto out; 67 + 68 + if (amdgpu_bad_page_threshold != 0) { 69 + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, 70 + err_data.err_addr_cnt); 71 + amdgpu_ras_save_bad_pages(adev); 72 + } 73 + 74 + out: 75 + kfree(err_data.err_addr); 76 + return ret; 77 + } 25 78 26 79 static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev, 27 80 void *ras_error_status,
+2
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
··· 98 98 int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev, 99 99 void *ras_error_status, 100 100 struct amdgpu_iv_entry *entry); 101 + int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev, 102 + uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst); 101 103 #endif