Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: add flag to indicate the type of RAS eeprom record

One UMC MCA address can map to multiple physical addresses (PAs):

AMDGPU_RAS_EEPROM_REC_PA: one record stores one PA
AMDGPU_RAS_EEPROM_REC_MCA: one record stores one MCA address; the PA
is disregarded

Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Tao Zhou and committed by
Alex Deucher
772df3df 95024c71

+40 -7
+26 -7
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 2821 2821 return -ENOMEM; 2822 2822 2823 2823 ret = amdgpu_ras_eeprom_read(control, bps, control->ras_num_recs); 2824 - if (ret) 2824 + if (ret) { 2825 2825 dev_err(adev->dev, "Failed to load EEPROM table records!"); 2826 - else 2826 + } else { 2827 + if (control->ras_num_recs > 1 && 2828 + adev->umc.ras && adev->umc.ras->convert_ras_err_addr) { 2829 + if ((bps[0].address == bps[1].address) && 2830 + (bps[0].mem_channel == bps[1].mem_channel)) 2831 + control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; 2832 + else 2833 + control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; 2834 + } 2835 + 2827 2836 ret = amdgpu_ras_add_bad_pages(adev, bps, control->ras_num_recs); 2837 + } 2828 2838 2829 2839 kfree(bps); 2830 2840 return ret; ··· 3215 3205 int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev) 3216 3206 { 3217 3207 struct amdgpu_ras *con = amdgpu_ras_get_context(adev); 3208 + struct amdgpu_ras_eeprom_control *control; 3218 3209 int ret; 3219 3210 3220 3211 if (!con || amdgpu_sriov_vf(adev)) 3221 3212 return 0; 3222 3213 3223 - ret = amdgpu_ras_eeprom_init(&con->eeprom_control); 3224 - 3214 + control = &con->eeprom_control; 3215 + ret = amdgpu_ras_eeprom_init(control); 3225 3216 if (ret) 3226 3217 return ret; 3227 3218 ··· 3230 3219 if (amdgpu_ras_is_rma(adev)) 3231 3220 return -EHWPOISON; 3232 3221 3233 - if (con->eeprom_control.ras_num_recs) { 3222 + if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr) 3223 + control->rec_type = AMDGPU_RAS_EEPROM_REC_PA; 3224 + 3225 + /* default status is MCA storage */ 3226 + if (control->ras_num_recs <= 1 && 3227 + adev->umc.ras && adev->umc.ras->convert_ras_err_addr) 3228 + control->rec_type = AMDGPU_RAS_EEPROM_REC_MCA; 3229 + 3230 + if (control->ras_num_recs) { 3234 3231 ret = amdgpu_ras_load_bad_pages(adev); 3235 3232 if (ret) 3236 3233 return ret; 3237 3234 3238 3235 amdgpu_dpm_send_hbm_bad_pages_num( 3239 - adev, con->eeprom_control.ras_num_recs); 3236 + adev, control->ras_num_recs); 3240 3237 3241 3238 if 
(con->update_channel_flag == true) { 3242 3239 amdgpu_dpm_send_hbm_bad_channel_flag( 3243 - adev, con->eeprom_control.bad_channel_bitmap); 3240 + adev, control->bad_channel_bitmap); 3244 3241 con->update_channel_flag = false; 3245 3242 } 3246 3243 }
+14
drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
··· 43 43 AMDGPU_RAS_EEPROM_ERR_COUNT, 44 44 }; 45 45 46 + /* 47 + * one UMC MCA address could map to multiply physical address (PA), 48 + * such as 1:16, we use eeprom_table_record.address to store MCA 49 + * address and use eeprom_table_record.retired_page to save PA. 50 + * 51 + * AMDGPU_RAS_EEPROM_REC_PA: one record store one PA 52 + * AMDGPU_RAS_EEPROM_REC_MCA: one record store one MCA address 53 + */ 54 + enum amdgpu_ras_eeprom_rec_type { 55 + AMDGPU_RAS_EEPROM_REC_PA, 56 + AMDGPU_RAS_EEPROM_REC_MCA, 57 + }; 58 + 46 59 struct amdgpu_ras_eeprom_table_header { 47 60 uint32_t header; 48 61 uint32_t version; ··· 115 102 /* Record channel info which occurred bad pages 116 103 */ 117 104 u32 bad_channel_bitmap; 105 + enum amdgpu_ras_eeprom_rec_type rec_type; 118 106 }; 119 107 120 108 /*