Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Correct the loss of aca bank reg info

Poll the ACA bank count to ensure that valid ACA bank
reg info can be obtained.

v2: add a corresponding delay before sending the message to the SMU
to query MCA bank info. (Stanley)

v3: the loop cannot exit. (Thomas)

v4: remove amdgpu_aca_clear_bank_count. (Kevin)

v5: when CEs are injected continuously, a poison creation interrupt
occurring at that time causes the bank reg info to be lost. (Thomas)
v5: each polling cycle is delayed by 100 ms. (Tao)

Signed-off-by: Ce Sun <cesun102@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Ce Sun and committed by
Alex Deucher
d8442bca 0989b764

+29 -32
+21 -29
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 122 122 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */ 123 123 #define RAS_BAD_PAGE_COVER (100 * 1024 * 1024ULL) 124 124 125 - #define MAX_UMC_POISON_POLLING_TIME_ASYNC 300 //ms 125 + #define MAX_UMC_POISON_POLLING_TIME_ASYNC 10 126 126 127 127 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms 128 128 ··· 3317 3317 3318 3318 INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL); 3319 3319 ecc_log->de_queried_count = 0; 3320 - ecc_log->prev_de_queried_count = 0; 3320 + ecc_log->consumption_q_count = 0; 3321 3321 } 3322 3322 3323 3323 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log) ··· 3337 3337 3338 3338 mutex_destroy(&ecc_log->lock); 3339 3339 ecc_log->de_queried_count = 0; 3340 - ecc_log->prev_de_queried_count = 0; 3340 + ecc_log->consumption_q_count = 0; 3341 3341 } 3342 3342 3343 3343 static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con, ··· 3387 3387 int ret = 0; 3388 3388 struct ras_ecc_log_info *ecc_log; 3389 3389 struct ras_query_if info; 3390 - uint32_t timeout = 0; 3390 + u32 timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 3391 3391 struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 3392 - uint64_t de_queried_count; 3393 - uint32_t new_detect_count, total_detect_count; 3394 - uint32_t need_query_count = poison_creation_count; 3392 + u64 de_queried_count; 3393 + u64 consumption_q_count; 3395 3394 enum ras_event_type type = RAS_EVENT_TYPE_POISON_CREATION; 3396 3395 3397 3396 memset(&info, 0, sizeof(info)); 3398 3397 info.head.block = AMDGPU_RAS_BLOCK__UMC; 3399 3398 3400 3399 ecc_log = &ras->umc_ecc_log; 3401 - total_detect_count = 0; 3400 + ecc_log->de_queried_count = 0; 3401 + ecc_log->consumption_q_count = 0; 3402 + 3402 3403 do { 3403 3404 ret = amdgpu_ras_query_error_status_with_event(adev, &info, type); 3404 3405 if (ret) 3405 3406 return ret; 3406 3407 3407 3408 de_queried_count = ecc_log->de_queried_count; 3408 - if (de_queried_count > ecc_log->prev_de_queried_count) { 3409 - new_detect_count = 
de_queried_count - ecc_log->prev_de_queried_count; 3410 - ecc_log->prev_de_queried_count = de_queried_count; 3411 - timeout = 0; 3412 - } else { 3413 - new_detect_count = 0; 3414 - } 3409 + consumption_q_count = ecc_log->consumption_q_count; 3415 3410 3416 - if (new_detect_count) { 3417 - total_detect_count += new_detect_count; 3418 - } else { 3419 - if (!timeout && need_query_count) 3420 - timeout = MAX_UMC_POISON_POLLING_TIME_ASYNC; 3411 + if (de_queried_count && consumption_q_count) 3412 + break; 3421 3413 3422 - if (timeout) { 3423 - if (!--timeout) 3424 - break; 3425 - msleep(1); 3426 - } 3427 - } 3428 - } while (total_detect_count < need_query_count); 3414 + msleep(100); 3415 + } while (--timeout); 3429 3416 3430 - if (total_detect_count) 3417 + if (de_queried_count) 3431 3418 schedule_delayed_work(&ras->page_retirement_dwork, 0); 3432 3419 3433 3420 if (amdgpu_ras_is_rma(adev) && atomic_cmpxchg(&ras->rma_in_recovery, 0, 1) == 0) ··· 3512 3525 atomic_sub(poison_creation_count, &con->poison_creation_count); 3513 3526 atomic_sub(poison_creation_count, &con->page_retirement_req_cnt); 3514 3527 } 3515 - } while (atomic_read(&con->poison_creation_count)); 3528 + } while (atomic_read(&con->poison_creation_count) && 3529 + !atomic_read(&con->poison_consumption_count)); 3516 3530 3517 3531 if (ret != -EIO) { 3518 3532 msg_count = kfifo_len(&con->poison_fifo); ··· 3530 3542 /* gpu mode-1 reset is ongoing or just completed ras mode-1 reset */ 3531 3543 /* Clear poison creation request */ 3532 3544 atomic_set(&con->poison_creation_count, 0); 3545 + atomic_set(&con->poison_consumption_count, 0); 3533 3546 3534 3547 /* Clear poison fifo */ 3535 3548 amdgpu_ras_clear_poison_fifo(adev); ··· 3554 3565 amdgpu_ras_clear_poison_fifo(adev); 3555 3566 atomic_sub(msg_count, &con->page_retirement_req_cnt); 3556 3567 } 3568 + 3569 + atomic_set(&con->poison_consumption_count, 0); 3557 3570 3558 3571 /* Wake up work to save bad pages to eeprom */ 3559 3572 
schedule_delayed_work(&con->page_retirement_dwork, 0); ··· 3662 3671 init_waitqueue_head(&con->page_retirement_wq); 3663 3672 atomic_set(&con->page_retirement_req_cnt, 0); 3664 3673 atomic_set(&con->poison_creation_count, 0); 3674 + atomic_set(&con->poison_consumption_count, 0); 3665 3675 con->page_retirement_thread = 3666 3676 kthread_run(amdgpu_ras_page_retirement_thread, adev, "umc_page_retirement"); 3667 3677 if (IS_ERR(con->page_retirement_thread)) {
+3 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 492 492 struct ras_ecc_log_info { 493 493 struct mutex lock; 494 494 struct radix_tree_root de_page_tree; 495 - uint64_t de_queried_count; 496 - uint64_t prev_de_queried_count; 495 + uint64_t de_queried_count; 496 + uint64_t consumption_q_count; 497 497 }; 498 498 499 499 struct ras_critical_region { ··· 565 565 struct mutex page_retirement_lock; 566 566 atomic_t page_retirement_req_cnt; 567 567 atomic_t poison_creation_count; 568 + atomic_t poison_consumption_count; 568 569 struct mutex page_rsv_lock; 569 570 DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); 570 571 struct ras_ecc_log_info umc_ecc_log;
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
··· 252 252 block, pasid, pasid_fn, data, reset); 253 253 if (!ret) { 254 254 atomic_inc(&con->page_retirement_req_cnt); 255 + atomic_inc(&con->poison_consumption_count); 255 256 wake_up(&con->page_retirement_wq); 256 257 } 257 258 }
+4 -1
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
··· 536 536 hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID); 537 537 mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType); 538 538 539 - if ((hwid != MCA_UMC_HWID_V12_0) || (mcatype != MCA_UMC_MCATYPE_V12_0)) 539 + /* The IP block decode of consumption is SMU */ 540 + if (hwid != MCA_UMC_HWID_V12_0 || mcatype != MCA_UMC_MCATYPE_V12_0) { 541 + con->umc_ecc_log.consumption_q_count++; 540 542 return 0; 543 + } 541 544 542 545 if (!status) 543 546 return 0;