Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amd/ras: add check func for pmfw eeprom

Add check functions for the PMFW EEPROM: ras_fw_eeprom_check_storage_status() and ras_fw_eeprom_check_gpu_status().

Signed-off-by: Gangliang Xie <ganglxie@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

Authored by Gangliang Xie and committed by Alex Deucher
e82f9aac b2d13a41

+67 -9
+2 -1
drivers/gpu/drm/amd/ras/ras_mgr/amdgpu_ras_sys.c
··· 137 137 break; 138 138 case RAS_EVENT_ID__DEVICE_RMA: 139 139 ras_log_ring_add_log_event(ras_core, RAS_LOG_EVENT_RMA, NULL, NULL); 140 - ret = amdgpu_dpm_send_rma_reason(ras_core->dev); 140 + if (!ras_fw_eeprom_supported(ras_core)) 141 + ret = amdgpu_dpm_send_rma_reason(ras_core->dev); 141 142 break; 142 143 case RAS_EVENT_ID__RESET_GPU: 143 144 ret = amdgpu_ras_mgr_reset_gpu(ras_core->dev, *(uint32_t *)data);
+7
drivers/gpu/drm/amd/ras/rascore/ras.h
··· 50 50 #define GPU_RESET_CAUSE_FATAL (RAS_CORE_RESET_GPU | 0x0002) 51 51 #define GPU_RESET_CAUSE_RMA (RAS_CORE_RESET_GPU | 0x0004) 52 52 53 + enum ras_gpu_health_status { 54 + RAS_GPU_HEALTH_NONE = 0, 55 + RAS_GPU_HEALTH_USABLE = 1, 56 + RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, 57 + RAS_GPU_IN_BAD_STATUS = 3, 58 + }; 59 + 53 60 enum ras_core_fw_feature_flags { 54 61 RAS_CORE_FW_FEATURE_BIT__RAS_EEPROM = BIT_ULL(0), 55 62 };
+4 -1
drivers/gpu/drm/amd/ras/rascore/ras_core.c
··· 403 403 goto init_err6; 404 404 } 405 405 406 - ret = ras_eeprom_check_storage_status(ras_core); 406 + if (ras_fw_eeprom_supported(ras_core)) 407 + ret = ras_fw_eeprom_check_storage_status(ras_core); 408 + else 409 + ret = ras_eeprom_check_storage_status(ras_core); 407 410 if (ret) 408 411 goto init_err6; 409 412
-7
drivers/gpu/drm/amd/ras/rascore/ras_eeprom.h
··· 57 57 (RECORD)->retired_row_pfn = tmp; \ 58 58 } while (0) 59 59 60 - enum ras_gpu_health_status { 61 - RAS_GPU_HEALTH_NONE = 0, 62 - RAS_GPU_HEALTH_USABLE = 1, 63 - RAS_GPU_RETIRED__ECC_REACH_THRESHOLD = 2, 64 - RAS_GPU_IN_BAD_STATUS = 3, 65 - }; 66 - 67 60 enum ras_eeprom_err_type { 68 61 RAS_EEPROM_ERR_NA, 69 62 RAS_EEPROM_ERR_RECOVERABLE,
+51
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.c
··· 453 453 454 454 return 0; 455 455 } 456 + 457 + int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core) 458 + { 459 + struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom; 460 + int bad_page_count; 461 + 462 + bad_page_count = ras_umc_get_badpage_count(ras_core); 463 + 464 + if ((control->record_threshold_count < bad_page_count) && 465 + (control->record_threshold_config != 0)) { 466 + RAS_DEV_ERR(ras_core->dev, "RAS records:%d exceed threshold:%d", 467 + bad_page_count, control->record_threshold_count); 468 + if ((control->record_threshold_config == WARN_NONSTOP_OVER_THRESHOLD) || 469 + (control->record_threshold_config == NONSTOP_OVER_THRESHOLD)) { 470 + RAS_DEV_WARN(ras_core->dev, 471 + "Please consult AMD Service Action Guide (SAG) for appropriate service procedures\n"); 472 + } else { 473 + ras_core->is_rma = true; 474 + RAS_DEV_ERR(ras_core->dev, 475 + "User defined threshold is set, runtime service will be halt when threshold is reached\n"); 476 + } 477 + return 0; 478 + } 479 + 480 + RAS_DEV_INFO(ras_core->dev, 481 + "Found existing EEPROM table with %d records\n", 482 + bad_page_count); 483 + /* Warn if we are at 90% of the threshold or above 484 + */ 485 + if (10 * bad_page_count >= 9 * control->record_threshold_count) 486 + RAS_DEV_WARN(ras_core->dev, 487 + "RAS records:%u exceeds 90%% of threshold:%d\n", 488 + bad_page_count, 489 + control->record_threshold_count); 490 + 491 + return 0; 492 + } 493 + 494 + enum ras_gpu_health_status 495 + ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core) 496 + { 497 + struct ras_fw_eeprom_control *control = &ras_core->ras_fw_eeprom; 498 + 499 + if (!control->record_threshold_config) 500 + return RAS_GPU_HEALTH_NONE; 501 + 502 + if (ras_core->is_rma) 503 + return RAS_GPU_RETIRED__ECC_REACH_THRESHOLD; 504 + 505 + return RAS_GPU_HEALTH_USABLE; 506 + }
+3
drivers/gpu/drm/amd/ras/rascore/ras_eeprom_fw.h
··· 79 79 struct ras_bank_ecc *ras_ecc); 80 80 int ras_fw_eeprom_hw_init(struct ras_core_context *ras_core); 81 81 int ras_fw_eeprom_hw_fini(struct ras_core_context *ras_core); 82 + int ras_fw_eeprom_check_storage_status(struct ras_core_context *ras_core); 83 + enum ras_gpu_health_status 84 + ras_fw_eeprom_check_gpu_status(struct ras_core_context *ras_core); 82 85 83 86 #endif