Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: bypass querying ras error count registers

Add a guard that controls whether the RAS error count register
harvest of all IPs is bypassed or executed once RAS recovery has been
triggered by a RAS sync flood interrupt or a RAS controller interrupt.

Signed-off-by: Guchun Chen <guchun.chen@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Dennis Li <Dennis.Li@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Guchun Chen and committed by
Alex Deucher
f75e94d8 0cf0ee98

+38 -29
+12 -10
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
··· 1547 1547 struct list_head device_list, *device_list_handle = NULL; 1548 1548 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false); 1549 1549 1550 - /* Build list of devices to query RAS related errors */ 1551 - if (hive && adev->gmc.xgmi.num_physical_nodes > 1) 1552 - device_list_handle = &hive->device_list; 1553 - else { 1554 - INIT_LIST_HEAD(&device_list); 1555 - list_add_tail(&adev->gmc.xgmi.head, &device_list); 1556 - device_list_handle = &device_list; 1557 - } 1550 + if (!ras->disable_ras_err_cnt_harvest) { 1551 + /* Build list of devices to query RAS related errors */ 1552 + if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { 1553 + device_list_handle = &hive->device_list; 1554 + } else { 1555 + INIT_LIST_HEAD(&device_list); 1556 + list_add_tail(&adev->gmc.xgmi.head, &device_list); 1557 + device_list_handle = &device_list; 1558 + } 1558 1559 1559 - list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) { 1560 - amdgpu_ras_log_on_err_counter(remote_adev); 1560 + list_for_each_entry(remote_adev, 1561 + device_list_handle, gmc.xgmi.head) 1562 + amdgpu_ras_log_on_err_counter(remote_adev); 1561 1563 } 1562 1564 1563 1565 if (amdgpu_device_should_recover_gpu(ras->adev))
+3
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
··· 343 343 344 344 /* bad page count threshold */ 345 345 uint32_t bad_page_cnt_threshold; 346 + 347 + /* disable ras error count harvest in recovery */ 348 + bool disable_ras_err_cnt_harvest; 346 349 }; 347 350 348 351 struct ras_fs_data {
+23 -19
drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
··· 302 302 uint32_t bif_doorbell_intr_cntl; 303 303 struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if); 304 304 struct ras_err_data err_data = {0, 0, 0, NULL}; 305 + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); 305 306 306 307 bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL); 307 308 if (REG_GET_FIELD(bif_doorbell_intr_cntl, ··· 313 312 RAS_CNTLR_INTERRUPT_CLEAR, 1); 314 313 WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl); 315 314 316 - /* 317 - * clear error status after ras_controller_intr according to 318 - * hw team and count ue number for query 319 - */ 320 - nbio_v7_4_query_ras_error_count(adev, &err_data); 315 + if (!ras->disable_ras_err_cnt_harvest) { 316 + /* 317 + * clear error status after ras_controller_intr 318 + * according to hw team and count ue number 319 + * for query 320 + */ 321 + nbio_v7_4_query_ras_error_count(adev, &err_data); 321 322 322 - /* logging on error counter and printing for awareness */ 323 - obj->err_data.ue_count += err_data.ue_count; 324 - obj->err_data.ce_count += err_data.ce_count; 323 + /* logging on error cnt and printing for awareness */ 324 + obj->err_data.ue_count += err_data.ue_count; 325 + obj->err_data.ce_count += err_data.ce_count; 325 326 326 - if (err_data.ce_count) 327 - dev_info(adev->dev, "%ld correctable hardware " 328 - "errors detected in %s block, " 329 - "no user action is needed.\n", 330 - obj->err_data.ce_count, 331 - adev->nbio.ras_if->name); 327 + if (err_data.ce_count) 328 + dev_info(adev->dev, "%ld correctable hardware " 329 + "errors detected in %s block, " 330 + "no user action is needed.\n", 331 + obj->err_data.ce_count, 332 + adev->nbio.ras_if->name); 332 333 333 - if (err_data.ue_count) 334 - dev_info(adev->dev, "%ld uncorrectable hardware " 335 - "errors detected in %s block\n", 336 - obj->err_data.ue_count, 337 - adev->nbio.ras_if->name); 334 + if (err_data.ue_count) 335 + dev_info(adev->dev, "%ld uncorrectable hardware " 336 + "errors detected in %s block\n", 337 + obj->err_data.ue_count, 338 + adev->nbio.ras_if->name); 339 + } 338 340 339 341 dev_info(adev->dev, "RAS controller interrupt triggered " 340 342 "by NBIF error\n");