Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Add support for CPERs on virtualization

Add support for CPERs on VFs.

VFs do not receive PMFW messages directly; as such, they need to
query them from the host. To avoid hitting the host event guard,
CPER queries need to be rate limited. CPER queries share the same
RAS telemetry buffer as error count queries, so a mutex protecting
the shared buffer was added as well.

For readability, amdgpu_detect_virtualization() was refactored
into multiple individual functions.

Signed-off-by: Tony Yi <Tony.Yi@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Tony Yi and committed by
Alex Deucher
a91d91b6 ca17c8e1

+195 -13
+3 -4
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 3099 3099 3100 3100 amdgpu_fru_get_product_info(adev); 3101 3101 3102 - r = amdgpu_cper_init(adev); 3102 + if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_ras_cper_en(adev)) 3103 + r = amdgpu_cper_init(adev); 3103 3104 3104 3105 init_failed: 3105 3106 ··· 4334 4333 * for throttling interrupt) = 60 seconds. 4335 4334 */ 4336 4335 ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1); 4337 - ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1); 4338 4336 4339 4337 ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE); 4340 - ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE); 4341 4338 4342 4339 /* Registers mapping */ 4343 4340 /* TODO: block userspace mapping of io register */ ··· 4367 4368 return -ENOMEM; 4368 4369 4369 4370 /* detect hw virtualization here */ 4370 - amdgpu_detect_virtualization(adev); 4371 + amdgpu_virt_init(adev); 4371 4372 4372 4373 amdgpu_device_get_pcie_info(adev); 4373 4374
+28 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
··· 578 578 return result; 579 579 } 580 580 581 + static ssize_t amdgpu_debugfs_virt_ring_read(struct file *f, char __user *buf, 582 + size_t size, loff_t *pos) 583 + { 584 + struct amdgpu_ring *ring = file_inode(f)->i_private; 585 + 586 + if (*pos & 3 || size & 3) 587 + return -EINVAL; 588 + 589 + if (ring->funcs->type == AMDGPU_RING_TYPE_CPER) 590 + amdgpu_virt_req_ras_cper_dump(ring->adev, false); 591 + 592 + return amdgpu_debugfs_ring_read(f, buf, size, pos); 593 + } 594 + 581 595 static const struct file_operations amdgpu_debugfs_ring_fops = { 582 596 .owner = THIS_MODULE, 583 597 .read = amdgpu_debugfs_ring_read, 598 + .llseek = default_llseek 599 + }; 600 + 601 + static const struct file_operations amdgpu_debugfs_virt_ring_fops = { 602 + .owner = THIS_MODULE, 603 + .read = amdgpu_debugfs_virt_ring_read, 584 604 .llseek = default_llseek 585 605 }; 586 606 ··· 691 671 char name[32]; 692 672 693 673 sprintf(name, "amdgpu_ring_%s", ring->name); 694 - debugfs_create_file_size(name, S_IFREG | 0444, root, ring, 695 - &amdgpu_debugfs_ring_fops, 696 - ring->ring_size + 12); 674 + if (amdgpu_sriov_vf(adev)) 675 + debugfs_create_file_size(name, S_IFREG | 0444, root, ring, 676 + &amdgpu_debugfs_virt_ring_fops, 677 + ring->ring_size + 12); 678 + else 679 + debugfs_create_file_size(name, S_IFREG | 0444, root, ring, 680 + &amdgpu_debugfs_ring_fops, 681 + ring->ring_size + 12); 697 682 698 683 if (ring->mqd_obj) { 699 684 sprintf(name, "amdgpu_mqd_%s", ring->name);
+135 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
··· 739 739 } 740 740 } 741 741 742 - void amdgpu_detect_virtualization(struct amdgpu_device *adev) 742 + static u32 amdgpu_virt_init_detect_asic(struct amdgpu_device *adev) 743 743 { 744 744 uint32_t reg; 745 745 ··· 775 775 adev->virt.caps |= AMDGPU_PASSTHROUGH_MODE; 776 776 } 777 777 778 + return reg; 779 + } 780 + 781 + static bool amdgpu_virt_init_req_data(struct amdgpu_device *adev, u32 reg) 782 + { 783 + bool is_sriov = false; 784 + 778 785 /* we have the ability to check now */ 779 786 if (amdgpu_sriov_vf(adev)) { 787 + is_sriov = true; 788 + 780 789 switch (adev->asic_type) { 781 790 case CHIP_TONGA: 782 791 case CHIP_FIJI: ··· 814 805 amdgpu_virt_request_init_data(adev); 815 806 break; 816 807 default: /* other chip doesn't support SRIOV */ 808 + is_sriov = false; 817 809 DRM_ERROR("Unknown asic type: %d!\n", adev->asic_type); 818 810 break; 819 811 } 820 812 } 813 + 814 + return is_sriov; 815 + } 816 + 817 + static void amdgpu_virt_init_ras(struct amdgpu_device *adev) 818 + { 819 + ratelimit_state_init(&adev->virt.ras.ras_error_cnt_rs, 5 * HZ, 1); 820 + ratelimit_state_init(&adev->virt.ras.ras_cper_dump_rs, 5 * HZ, 1); 821 + 822 + ratelimit_set_flags(&adev->virt.ras.ras_error_cnt_rs, 823 + RATELIMIT_MSG_ON_RELEASE); 824 + ratelimit_set_flags(&adev->virt.ras.ras_cper_dump_rs, 825 + RATELIMIT_MSG_ON_RELEASE); 826 + 827 + mutex_init(&adev->virt.ras.ras_telemetry_mutex); 828 + 829 + adev->virt.ras.cper_rptr = 0; 830 + } 831 + 832 + void amdgpu_virt_init(struct amdgpu_device *adev) 833 + { 834 + bool is_sriov = false; 835 + uint32_t reg = amdgpu_virt_init_detect_asic(adev); 836 + 837 + is_sriov = amdgpu_virt_init_req_data(adev, reg); 838 + 839 + if (is_sriov) 840 + amdgpu_virt_init_ras(adev); 821 841 } 822 842 823 843 static bool amdgpu_virt_access_debugfs_is_mmio(struct amdgpu_device *adev) ··· 1326 1288 * will ignore incoming guest messages. Ratelimit the guest messages to 1327 1289 * prevent guest self DOS. 
1328 1290 */ 1329 - if (__ratelimit(&adev->virt.ras_telemetry_rs) || force_update) { 1291 + if (__ratelimit(&virt->ras.ras_error_cnt_rs) || force_update) { 1292 + mutex_lock(&virt->ras.ras_telemetry_mutex); 1330 1293 if (!virt->ops->req_ras_err_count(adev)) 1331 1294 amdgpu_virt_cache_host_error_counts(adev, 1332 - adev->virt.fw_reserve.ras_telemetry); 1295 + virt->fw_reserve.ras_telemetry); 1296 + mutex_unlock(&virt->ras.ras_telemetry_mutex); 1333 1297 } 1334 1298 1335 1299 return 0; ··· 1360 1320 err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count; 1361 1321 1362 1322 return 0; 1323 + } 1324 + 1325 + static int 1326 + amdgpu_virt_write_cpers_to_ring(struct amdgpu_device *adev, 1327 + struct amdsriov_ras_telemetry *host_telemetry, 1328 + u32 *more) 1329 + { 1330 + struct amd_sriov_ras_cper_dump *cper_dump = NULL; 1331 + struct cper_hdr *entry = NULL; 1332 + struct amdgpu_ring *ring = &adev->cper.ring_buf; 1333 + uint32_t checksum, used_size, i; 1334 + int ret = 0; 1335 + 1336 + checksum = host_telemetry->header.checksum; 1337 + used_size = host_telemetry->header.used_size; 1338 + 1339 + if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10)) 1340 + return 0; 1341 + 1342 + cper_dump = kmemdup(&host_telemetry->body.cper_dump, used_size, GFP_KERNEL); 1343 + if (!cper_dump) 1344 + return -ENOMEM; 1345 + 1346 + if (checksum != amd_sriov_msg_checksum(cper_dump, used_size, 0, 0)) 1347 + goto out; 1348 + 1349 + *more = cper_dump->more; 1350 + 1351 + if (cper_dump->wptr < adev->virt.ras.cper_rptr) { 1352 + dev_warn( 1353 + adev->dev, 1354 + "guest specified rptr that was too high! 
guest rptr: 0x%llx, host rptr: 0x%llx\n", 1355 + adev->virt.ras.cper_rptr, cper_dump->wptr); 1356 + 1357 + adev->virt.ras.cper_rptr = cper_dump->wptr; 1358 + goto out; 1359 + } 1360 + 1361 + entry = (struct cper_hdr *)&cper_dump->buf[0]; 1362 + 1363 + for (i = 0; i < cper_dump->count; i++) { 1364 + amdgpu_cper_ring_write(ring, entry, entry->record_length); 1365 + entry = (struct cper_hdr *)((char *)entry + 1366 + entry->record_length); 1367 + } 1368 + 1369 + if (cper_dump->overflow_count) 1370 + dev_warn(adev->dev, 1371 + "host reported CPER overflow of 0x%llx entries!\n", 1372 + cper_dump->overflow_count); 1373 + 1374 + adev->virt.ras.cper_rptr = cper_dump->wptr; 1375 + out: 1376 + kfree(cper_dump); 1377 + 1378 + return ret; 1379 + } 1380 + 1381 + static int amdgpu_virt_req_ras_cper_dump_internal(struct amdgpu_device *adev) 1382 + { 1383 + struct amdgpu_virt *virt = &adev->virt; 1384 + int ret = 0; 1385 + uint32_t more = 0; 1386 + 1387 + if (!amdgpu_sriov_ras_cper_en(adev)) 1388 + return -EOPNOTSUPP; 1389 + 1390 + do { 1391 + if (!virt->ops->req_ras_cper_dump(adev, virt->ras.cper_rptr)) 1392 + ret = amdgpu_virt_write_cpers_to_ring( 1393 + adev, virt->fw_reserve.ras_telemetry, &more); 1394 + else 1395 + ret = 0; 1396 + } while (more); 1397 + 1398 + return ret; 1399 + } 1400 + 1401 + int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update) 1402 + { 1403 + struct amdgpu_virt *virt = &adev->virt; 1404 + int ret = 0; 1405 + 1406 + if ((__ratelimit(&virt->ras.ras_cper_dump_rs) || force_update) && 1407 + down_read_trylock(&adev->reset_domain->sem)) { 1408 + mutex_lock(&virt->ras.ras_telemetry_mutex); 1409 + ret = amdgpu_virt_req_ras_cper_dump_internal(adev); 1410 + mutex_unlock(&virt->ras.ras_telemetry_mutex); 1411 + up_read(&adev->reset_domain->sem); 1412 + } 1413 + 1414 + return ret; 1363 1415 } 1364 1416 1365 1417 int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev)
+15 -3
drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
··· 96 96 enum amdgpu_ras_block block); 97 97 bool (*rcvd_ras_intr)(struct amdgpu_device *adev); 98 98 int (*req_ras_err_count)(struct amdgpu_device *adev); 99 + int (*req_ras_cper_dump)(struct amdgpu_device *adev, u64 vf_rptr); 99 100 }; 100 101 101 102 /* ··· 141 140 AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8), 142 141 AMDGIM_FEATURE_RAS_CAPS = (1 << 9), 143 142 AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10), 143 + AMDGIM_FEATURE_RAS_CPER = (1 << 11), 144 144 }; 145 145 146 146 enum AMDGIM_REG_ACCESS_FLAG { ··· 244 242 int last_reserved; 245 243 }; 246 244 245 + struct amdgpu_virt_ras { 246 + struct ratelimit_state ras_error_cnt_rs; 247 + struct ratelimit_state ras_cper_dump_rs; 248 + struct mutex ras_telemetry_mutex; 249 + uint64_t cper_rptr; 250 + }; 251 + 247 252 /* GPU virtualization */ 248 253 struct amdgpu_virt { 249 254 uint32_t caps; ··· 293 284 294 285 union amd_sriov_ras_caps ras_en_caps; 295 286 union amd_sriov_ras_caps ras_telemetry_en_caps; 296 - 297 - struct ratelimit_state ras_telemetry_rs; 287 + struct amdgpu_virt_ras ras; 298 288 struct amd_sriov_ras_telemetry_error_count count_cache; 299 289 }; 300 290 ··· 348 340 #define amdgpu_sriov_ras_telemetry_block_en(adev, sriov_blk) \ 349 341 (amdgpu_sriov_ras_telemetry_en((adev)) && (adev)->virt.ras_telemetry_en_caps.all & BIT(sriov_blk)) 350 342 343 + #define amdgpu_sriov_ras_cper_en(adev) \ 344 + ((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CPER) 345 + 351 346 static inline bool is_virtual_machine(void) 352 347 { 353 348 #if defined(CONFIG_X86) ··· 389 378 void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev); 390 379 void amdgpu_virt_exchange_data(struct amdgpu_device *adev); 391 380 void amdgpu_virt_fini_data_exchange(struct amdgpu_device *adev); 392 - void amdgpu_detect_virtualization(struct amdgpu_device *adev); 381 + void amdgpu_virt_init(struct amdgpu_device *adev); 393 382 394 383 bool amdgpu_virt_can_access_debugfs(struct amdgpu_device *adev); 395 384 int 
amdgpu_virt_enable_access_debugfs(struct amdgpu_device *adev); ··· 417 406 bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev); 418 407 int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block, 419 408 struct ras_err_data *err_data); 409 + int amdgpu_virt_req_ras_cper_dump(struct amdgpu_device *adev, bool force_update); 420 410 int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev); 421 411 bool amdgpu_virt_ras_telemetry_block_en(struct amdgpu_device *adev, 422 412 enum amdgpu_ras_block block);
+14
drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
··· 184 184 case IDH_REQ_RAS_ERROR_COUNT: 185 185 event = IDH_RAS_ERROR_COUNT_READY; 186 186 break; 187 + case IDH_REQ_RAS_CPER_DUMP: 188 + event = IDH_RAS_CPER_DUMP_READY; 189 + break; 187 190 default: 188 191 break; 189 192 } ··· 470 467 return xgpu_nv_send_access_requests(adev, IDH_REQ_RAS_ERROR_COUNT); 471 468 } 472 469 470 + static int xgpu_nv_req_ras_cper_dump(struct amdgpu_device *adev, u64 vf_rptr) 471 + { 472 + uint32_t vf_rptr_hi, vf_rptr_lo; 473 + 474 + vf_rptr_hi = (uint32_t)(vf_rptr >> 32); 475 + vf_rptr_lo = (uint32_t)(vf_rptr & 0xFFFFFFFF); 476 + return xgpu_nv_send_access_requests_with_param( 477 + adev, IDH_REQ_RAS_CPER_DUMP, vf_rptr_hi, vf_rptr_lo, 0); 478 + } 479 + 473 480 const struct amdgpu_virt_ops xgpu_nv_virt_ops = { 474 481 .req_full_gpu = xgpu_nv_request_full_gpu_access, 475 482 .rel_full_gpu = xgpu_nv_release_full_gpu_access, ··· 491 478 .ras_poison_handler = xgpu_nv_ras_poison_handler, 492 479 .rcvd_ras_intr = xgpu_nv_rcvd_ras_intr, 493 480 .req_ras_err_count = xgpu_nv_req_ras_err_count, 481 + .req_ras_cper_dump = xgpu_nv_req_ras_cper_dump, 494 482 };