Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: added xgmi ras error reset sequence

Added a mechanism to clear the xgmi ras status in between error queries.

Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: John Clements <john.clements@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

John Clements and committed by
Alex Deucher
66399248 3aa0115d

+31
+30
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
··· 604 604 adev->gmc.xgmi.num_physical_nodes == 0) 605 605 return 0; 606 606 607 + amdgpu_xgmi_reset_ras_error_count(adev); 608 + 607 609 if (!adev->gmc.xgmi.ras_if) { 608 610 adev->gmc.xgmi.ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL); 609 611 if (!adev->gmc.xgmi.ras_if) ··· 668 666 dev_warn(adev->dev, "failed to enable DF-Cstate\n"); 669 667 670 668 return addr + dram_base_addr; 669 + } 670 + 671 + static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg) 672 + { 673 + WREG32_PCIE(pcs_status_reg, 0xFFFFFFFF); 674 + WREG32_PCIE(pcs_status_reg, 0); 675 + } 676 + 677 + void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev) 678 + { 679 + uint32_t i; 680 + 681 + switch (adev->asic_type) { 682 + case CHIP_ARCTURUS: 683 + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_arct); i++) 684 + pcs_clear_status(adev, 685 + xgmi_pcs_err_status_reg_arct[i]); 686 + break; 687 + case CHIP_VEGA20: 688 + for (i = 0; i < ARRAY_SIZE(xgmi_pcs_err_status_reg_vg20); i++) 689 + pcs_clear_status(adev, 690 + xgmi_pcs_err_status_reg_vg20[i]); 691 + break; 692 + default: 693 + break; 694 + } 671 695 } 672 696 673 697 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev, ··· 785 757 } 786 758 break; 787 759 } 760 + 761 + amdgpu_xgmi_reset_ras_error_count(adev); 788 762 789 763 err_data->ue_count += ue_cnt; 790 764 err_data->ce_count += ce_cnt;
+1
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
··· 56 56 uint64_t addr); 57 57 int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev, 58 58 void *ras_error_status); 59 + void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev); 59 60 60 61 static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev, 61 62 struct amdgpu_device *bo_adev)