Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amd/ras: adapt page retirement process for pmfw eeprom

Read bad page data from the PMFW EEPROM when page retirement
is triggered, and use the timestamp read from the EEPROM.

Signed-off-by: Gangliang Xie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Gangliang Xie and committed by
Alex Deucher
72289903 42c46be2

+65 -9
+20 -9
drivers/gpu/drm/amd/ras/rascore/ras_aca.c
··· 234 234 bank_ecc->de_count) { 235 235 struct ras_bank_ecc ras_ecc = {0}; 236 236 237 - ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); 238 - ras_ecc.addr = bank_ecc->bank_info.addr; 239 - ras_ecc.ipid = bank_ecc->bank_info.ipid; 240 - ras_ecc.status = bank_ecc->bank_info.status; 241 - ras_ecc.seq_no = bank->seq_no; 237 + if (ras_fw_eeprom_supported(ras_core)) { 238 + ret = ras_fw_eeprom_update_record(ras_core, &ras_ecc); 239 + if (!ret) { 240 + ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); 241 + ras_ecc.status = bank_ecc->bank_info.status; 242 + ras_ecc.seq_no = bank->seq_no; 243 + } 244 + } else { 245 + ras_ecc.nps = ras_core_get_curr_nps_mode(ras_core); 246 + ras_ecc.addr = bank_ecc->bank_info.addr; 247 + ras_ecc.ipid = bank_ecc->bank_info.ipid; 248 + ras_ecc.status = bank_ecc->bank_info.status; 249 + ras_ecc.seq_no = bank->seq_no; 250 + } 242 251 243 - if (ras_core_gpu_in_reset(ras_core)) 244 - ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); 245 - else 246 - ras_umc_log_bad_bank(ras_core, &ras_ecc); 252 + if (!ret) { 253 + if (ras_core_gpu_in_reset(ras_core)) 254 + ras_umc_log_bad_bank_pending(ras_core, &ras_ecc); 255 + else 256 + ras_umc_log_bad_bank(ras_core, &ras_ecc); 257 + } 247 258 } 248 259 249 260 aca_report_ecc_info(ras_core,
+40
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
··· 24 24 25 25 #include "ras.h" 26 26 27 + #define RAS_SMU_MESSAGE_TIMEOUT_MS 1000 /* 1s */ 28 + 27 29 void ras_fw_init_feature_flags(struct ras_core_context *ras_core) 28 30 { 29 31 struct ras_mp1 *mp1 = &ras_core->ras_mp1; ··· 330 328 return 0; 331 329 332 330 return ras_core->ras_fw_eeprom.ras_num_recs; 331 + } 332 + 333 + int ras_fw_eeprom_update_record(struct ras_core_context *ras_core, 334 + struct ras_bank_ecc *ras_ecc) 335 + { 336 + struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom; 337 + int ret, retry = 20; 338 + u32 recs_num_new = control->ras_num_recs; 339 + 340 + do { 341 + /* 1000ms timeout is long enough, smu_get_badpage_count won't 342 + * return -EBUSY before timeout. 343 + */ 344 + ret = ras_fw_get_badpage_count(ras_core, 345 + &recs_num_new, RAS_SMU_MESSAGE_TIMEOUT_MS); 346 + if (!ret && 347 + (recs_num_new == control->ras_num_recs)) { 348 + /* record number update in PMFW needs some time, 349 + * smu_get_badpage_count may return immediately without 350 + * count update, sleep for a while and retry again. 351 + */ 352 + msleep(50); 353 + retry--; 354 + } else { 355 + break; 356 + } 357 + } while (retry); 358 + 359 + if (ret) 360 + return ret; 361 + 362 + if (recs_num_new > control->ras_num_recs) 363 + ret = ras_fw_eeprom_read_idx(ras_core, 0, 364 + ras_ecc, control->ras_num_recs, 1); 365 + else 366 + ret = -EINVAL; 367 + 368 + return ret; 333 369 }
+2
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
··· 75 75 struct ras_bank_ecc *ras_ecc, 76 76 u32 rec_idx, const u32 num); 77 77 uint32_t ras_fw_eeprom_get_record_count(struct ras_core_context *ras_core); 78 + int ras_fw_eeprom_update_record(struct ras_core_context *ras_core, 79 + struct ras_bank_ecc *ras_ecc); 78 80 79 81 #endif
+3
drivers/gpu/drm/amd/ras/rascore/ras_umc_v12_0.c
··· 373 373 ACA_ADDR_2_ERR_ADDR(bank->addr), ACA_IPID_2_UMC_INST(bank->ipid), 374 374 &nps_addr, bank->nps, record); 375 375 376 + if (ras_fw_eeprom_supported(ras_core) && bank->ts) 377 + record->ts = bank->ts; 378 + 376 379 lookup_bad_pages_in_a_row(ras_core, record, 377 380 bank->nps, NULL, 0, bank->seq_no, true); 378 381