Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Fix map/unmap queue logic

In current logic, it calls ring_alloc followed by a ring_test. ring_test
in turn will call another ring_alloc. This is illegal usage as a
ring_alloc is expected to be closed properly with a ring_commit. Change
to commit the map/unmap queue packet first followed by a ring_test. Add a
comment about the usage of ring_test.

Also, reorder the current pre-condition checks of job hang or kiq ring
scheduler not ready. Without them being met, it is not useful to attempt
ring or memory allocations.

Fixes tag refers to the original patch which introduced this issue which
then got carried over into newer code.

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Le Ma <le.ma@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c")

authored by

Lijo Lazar and committed by
Alex Deucher
fa317985 2bb7dced

+63 -20
+11 -2
drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
··· 834 834 if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) 835 835 return -EINVAL; 836 836 837 + if (!kiq_ring->sched.ready || adev->job_hang) 838 + return 0; 839 + 837 840 ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); 838 841 if (!ring_funcs) 839 842 return -ENOMEM; ··· 861 858 862 859 kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0); 863 860 864 - if (kiq_ring->sched.ready && !adev->job_hang) 865 - r = amdgpu_ring_test_helper(kiq_ring); 861 + /* Submit unmap queue packet */ 862 + amdgpu_ring_commit(kiq_ring); 863 + /* 864 + * Ring test will do a basic scratch register change check. Just run 865 + * this to ensure that unmap queues that is submitted before got 866 + * processed successfully before returning. 867 + */ 868 + r = amdgpu_ring_test_helper(kiq_ring); 866 869 867 870 spin_unlock(&kiq->ring_lock); 868 871
+45 -18
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
··· 515 515 if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) 516 516 return -EINVAL; 517 517 518 + if (!kiq_ring->sched.ready || adev->job_hang) 519 + return 0; 520 + /** 521 + * This is workaround: only skip kiq_ring test 522 + * during ras recovery in suspend stage for gfx9.4.3 523 + */ 524 + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 525 + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 526 + amdgpu_ras_in_recovery(adev)) 527 + return 0; 528 + 518 529 spin_lock(&kiq->ring_lock); 519 530 if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * 520 531 adev->gfx.num_compute_rings)) { ··· 539 528 &adev->gfx.compute_ring[j], 540 529 RESET_QUEUES, 0, 0); 541 530 } 542 - 543 - /** 544 - * This is workaround: only skip kiq_ring test 545 - * during ras recovery in suspend stage for gfx9.4.3 531 + /* Submit unmap queue packet */ 532 + amdgpu_ring_commit(kiq_ring); 533 + /* 534 + * Ring test will do a basic scratch register change check. Just run 535 + * this to ensure that unmap queues that is submitted before got 536 + * processed successfully before returning. 546 537 */ 547 - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || 548 - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && 549 - amdgpu_ras_in_recovery(adev)) { 550 - spin_unlock(&kiq->ring_lock); 551 - return 0; 552 - } 538 + r = amdgpu_ring_test_helper(kiq_ring); 553 539 554 - if (kiq_ring->sched.ready && !adev->job_hang) 555 - r = amdgpu_ring_test_helper(kiq_ring); 556 540 spin_unlock(&kiq->ring_lock); 557 541 558 542 return r; ··· 575 569 if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) 576 570 return -EINVAL; 577 571 578 - spin_lock(&kiq->ring_lock); 572 + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) 573 + return 0; 574 + 579 575 if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { 576 + spin_lock(&kiq->ring_lock); 580 577 if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * 581 578 adev->gfx.num_gfx_rings)) { 582 579 spin_unlock(&kiq->ring_lock); ··· 592 583 &adev->gfx.gfx_ring[j], 593 584 PREEMPT_QUEUES, 0, 0); 594 585 } 595 - } 586 + /* Submit unmap queue packet */ 587 + amdgpu_ring_commit(kiq_ring); 596 588 597 - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang) 589 + /* 590 + * Ring test will do a basic scratch register change check. 591 + * Just run this to ensure that unmap queues that is submitted 592 + * before got processed successfully before returning. 593 + */ 598 594 r = amdgpu_ring_test_helper(kiq_ring); 599 - spin_unlock(&kiq->ring_lock); 595 + spin_unlock(&kiq->ring_lock); 596 + } 600 597 601 598 return r; 602 599 } ··· 707 692 kiq->pmf->kiq_map_queues(kiq_ring, 708 693 &adev->gfx.compute_ring[j]); 709 694 } 710 - 695 + /* Submit map queue packet */ 696 + amdgpu_ring_commit(kiq_ring); 697 + /* 698 + * Ring test will do a basic scratch register change check. Just run 699 + * this to ensure that map queues that is submitted before got 700 + * processed successfully before returning. 701 + */ 711 702 r = amdgpu_ring_test_helper(kiq_ring); 712 703 spin_unlock(&kiq->ring_lock); 713 704 if (r) ··· 764 743 &adev->gfx.gfx_ring[j]); 765 744 } 766 745 } 767 - 746 + /* Submit map queue packet */ 747 + amdgpu_ring_commit(kiq_ring); 748 + /* 749 + * Ring test will do a basic scratch register change check. Just run 750 + * this to ensure that map queues that is submitted before got 751 + * processed successfully before returning. 752 + */ 768 753 r = amdgpu_ring_test_helper(kiq_ring); 769 754 spin_unlock(&kiq->ring_lock); 770 755 if (r)
+7
drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
··· 4823 4823 amdgpu_ring_write(kiq_ring, 0); 4824 4824 amdgpu_ring_write(kiq_ring, 0); 4825 4825 } 4826 + /* Submit unmap queue packet */ 4827 + amdgpu_ring_commit(kiq_ring); 4828 + /* 4829 + * Ring test will do a basic scratch register change check. Just run 4830 + * this to ensure that unmap queues that is submitted before got 4831 + * processed successfully before returning. 4832 + */ 4826 4833 r = amdgpu_ring_test_helper(kiq_ring); 4827 4834 if (r) 4828 4835 DRM_ERROR("KCQ disable failed\n");