Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

drm/amdgpu: Implement Enforce Isolation Handler for KGD/KFD serialization

This commit introduces the Enforce Isolation Handler designed to enforce
shader isolation on AMD GPUs, which helps to prevent data leakage
between different processes.

For the partition it serves, the handler counts the number of emitted
fences on each GFX and compute ring. If the count is non-zero, it
reschedules itself to run again after a delay of `GFX_SLICE_PERIOD`
(250 ms). If there are no fences, it signals the Kernel Fusion Driver
(KFD) to resume the runqueue.

The handler is synchronized using the `enforce_isolation_mutex`.

This commit also introduces a reference count mechanism
(kfd_sch_req_count) to keep track of the number of outstanding requests
to keep the KFD scheduler disabled. When a request to enable the KFD
scheduler is made, the reference count is decremented. When the
reference count reaches zero and the KFD scheduler is currently
inactive, a delayed work is scheduled to enforce isolation after a
delay of GFX_SLICE_PERIOD.

When a request to disable the KFD scheduler is made,
amdgpu_gfx_kfd_sch_ctrl() first checks if the reference count is zero.
If it is, it cancels the delayed work for enforcing isolation and
checks if the KFD scheduler is active. If the KFD scheduler is active,
it sends a request to stop the KFD scheduler and sets the KFD scheduler
state to inactive. In either case, it then increments the reference
count.

amdgpu_gfx_kfd_sch_ctrl() is synchronized using the kfd_sch_mutex to
ensure that the KFD scheduler state and reference count are updated
atomically.

Cc: Christian König <christian.koenig@amd.com>
Cc: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Signed-off-by: Srinivasan Shanmugam <srinivasan.shanmugam@amd.com>
Suggested-by: Christian König <christian.koenig@amd.com>
Suggested-by: Alex Deucher <alexander.deucher@amd.com>

authored by

Srinivasan Shanmugam and committed by
Alex Deucher
afefd6f2 234eebe1

+200
+2
drivers/gpu/drm/amd/amdgpu/amdgpu.h
··· 118 118 119 119 #define MAX_GPU_INSTANCE 64 120 120 121 + #define GFX_SLICE_PERIOD msecs_to_jiffies(250) 122 + 121 123 struct amdgpu_gpu_instance { 122 124 struct amdgpu_device *adev; 123 125 int mgpu_fan_enabled;
+16
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
··· 4067 4067 mutex_init(&adev->gfx.reset_sem_mutex); 4068 4068 /* Initialize the mutex for cleaner shader isolation between GFX and compute processes */ 4069 4069 mutex_init(&adev->enforce_isolation_mutex); 4070 + mutex_init(&adev->gfx.kfd_sch_mutex); 4070 4071 4071 4072 amdgpu_device_init_apu_flags(adev); 4072 4073 ··· 4099 4098 amdgpu_device_delayed_init_work_handler); 4100 4099 INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work, 4101 4100 amdgpu_device_delay_enable_gfx_off); 4101 + /* 4102 + * Initialize the enforce_isolation work structures for each XCP 4103 + * partition. This work handler is responsible for enforcing shader 4104 + * isolation on AMD GPUs. It counts the number of emitted fences for 4105 + * each GFX and compute ring. If there are any fences, it schedules 4106 + * the `enforce_isolation_work` to be run after a delay. If there are 4107 + * no fences, it signals the Kernel Fusion Driver (KFD) to resume the 4108 + * runqueue. 4109 + */ 4110 + for (i = 0; i < MAX_XCP; i++) { 4111 + INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work, 4112 + amdgpu_gfx_enforce_isolation_handler); 4113 + adev->gfx.enforce_isolation[i].adev = adev; 4114 + adev->gfx.enforce_isolation[i].xcp_id = i; 4115 + } 4102 4116 4103 4117 INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func); 4104 4118
+167
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
··· 1686 1686 memcpy_toio(adev->gfx.cleaner_shader_cpu_ptr, cleaner_shader_ptr, 1687 1687 cleaner_shader_size); 1688 1688 } 1689 + 1690 + /** 1691 + * amdgpu_gfx_kfd_sch_ctrl - Control the KFD scheduler from the KGD (Graphics Driver) 1692 + * @adev: amdgpu_device pointer 1693 + * @idx: Index of the scheduler to control 1694 + * @enable: Whether to enable or disable the KFD scheduler 1695 + * 1696 + * This function is used to control the KFD (Kernel Fusion Driver) scheduler 1697 + * from the KGD. It is part of the cleaner shader feature. This function plays 1698 + * a key role in enforcing process isolation on the GPU. 1699 + * 1700 + * The function uses a reference count mechanism (kfd_sch_req_count) to keep 1701 + * track of the number of requests to enable the KFD scheduler. When a request 1702 + * to enable the KFD scheduler is made, the reference count is decremented. 1703 + * When the reference count reaches zero, a delayed work is scheduled to 1704 + * enforce isolation after a delay of GFX_SLICE_PERIOD. 1705 + * 1706 + * When a request to disable the KFD scheduler is made, the function first 1707 + * checks if the reference count is zero. If it is, it cancels the delayed work 1708 + * for enforcing isolation and checks if the KFD scheduler is active. If the 1709 + * KFD scheduler is active, it sends a request to stop the KFD scheduler and 1710 + * sets the KFD scheduler state to inactive. Then, it increments the reference 1711 + * count. 1712 + * 1713 + * The function is synchronized using the kfd_sch_mutex to ensure that the KFD 1714 + * scheduler state and reference count are updated atomically. 1715 + * 1716 + * Note: If the reference count is already zero when a request to enable the 1717 + * KFD scheduler is made, it means there's an imbalance bug somewhere. The 1718 + * function triggers a warning in this case. 
1719 + */ 1720 + static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, 1721 + bool enable) 1722 + { 1723 + mutex_lock(&adev->gfx.kfd_sch_mutex); 1724 + 1725 + if (enable) { 1726 + /* If the count is already 0, it means there's an imbalance bug somewhere. 1727 + * Note that the bug may be in a different caller than the one which triggers the 1728 + * WARN_ON_ONCE. 1729 + */ 1730 + if (WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx] == 0)) { 1731 + dev_err(adev->dev, "Attempted to enable KFD scheduler when reference count is already zero\n"); 1732 + goto unlock; 1733 + } 1734 + 1735 + adev->gfx.kfd_sch_req_count[idx]--; 1736 + 1737 + if (adev->gfx.kfd_sch_req_count[idx] == 0 && 1738 + adev->gfx.kfd_sch_inactive[idx]) { 1739 + schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, 1740 + GFX_SLICE_PERIOD); 1741 + } 1742 + } else { 1743 + if (adev->gfx.kfd_sch_req_count[idx] == 0) { 1744 + cancel_delayed_work_sync(&adev->gfx.enforce_isolation[idx].work); 1745 + if (!adev->gfx.kfd_sch_inactive[idx]) { 1746 + amdgpu_amdkfd_stop_sched(adev, idx); 1747 + adev->gfx.kfd_sch_inactive[idx] = true; 1748 + } 1749 + } 1750 + 1751 + adev->gfx.kfd_sch_req_count[idx]++; 1752 + } 1753 + 1754 + unlock: 1755 + mutex_unlock(&adev->gfx.kfd_sch_mutex); 1756 + } 1757 + 1758 + /** 1759 + * amdgpu_gfx_enforce_isolation_handler - work handler for enforcing shader isolation 1760 + * 1761 + * @work: work_struct. 1762 + * 1763 + * This function is the work handler for enforcing shader isolation on AMD GPUs. 1764 + * It counts the number of emitted fences for each GFX and compute ring. If there 1765 + * are any fences, it schedules the `enforce_isolation_work` to be run after a 1766 + * delay of `GFX_SLICE_PERIOD`. If there are no fences, it signals the Kernel Fusion 1767 + * Driver (KFD) to resume the runqueue. The function is synchronized using the 1768 + * `enforce_isolation_mutex`. 
1769 + */ 1770 + void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) 1771 + { 1772 + struct amdgpu_isolation_work *isolation_work = 1773 + container_of(work, struct amdgpu_isolation_work, work.work); 1774 + struct amdgpu_device *adev = isolation_work->adev; 1775 + u32 i, idx, fences = 0; 1776 + 1777 + if (isolation_work->xcp_id == AMDGPU_XCP_NO_PARTITION) 1778 + idx = 0; 1779 + else 1780 + idx = isolation_work->xcp_id; 1781 + 1782 + if (idx >= MAX_XCP) 1783 + return; 1784 + 1785 + mutex_lock(&adev->enforce_isolation_mutex); 1786 + for (i = 0; i < AMDGPU_MAX_GFX_RINGS; ++i) { 1787 + if (isolation_work->xcp_id == adev->gfx.gfx_ring[i].xcp_id) 1788 + fences += amdgpu_fence_count_emitted(&adev->gfx.gfx_ring[i]); 1789 + } 1790 + for (i = 0; i < (AMDGPU_MAX_COMPUTE_RINGS * AMDGPU_MAX_GC_INSTANCES); ++i) { 1791 + if (isolation_work->xcp_id == adev->gfx.compute_ring[i].xcp_id) 1792 + fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]); 1793 + } 1794 + if (fences) { 1795 + schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, 1796 + GFX_SLICE_PERIOD); 1797 + } else { 1798 + /* Tell KFD to resume the runqueue */ 1799 + if (adev->kfd.init_complete) { 1800 + WARN_ON_ONCE(!adev->gfx.kfd_sch_inactive[idx]); 1801 + WARN_ON_ONCE(adev->gfx.kfd_sch_req_count[idx]); 1802 + amdgpu_amdkfd_start_sched(adev, idx); 1803 + adev->gfx.kfd_sch_inactive[idx] = false; 1804 + } 1805 + } 1806 + mutex_unlock(&adev->enforce_isolation_mutex); 1807 + } 1808 + 1809 + void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) 1810 + { 1811 + struct amdgpu_device *adev = ring->adev; 1812 + u32 idx; 1813 + 1814 + if (!adev->gfx.enable_cleaner_shader) 1815 + return; 1816 + 1817 + if (ring->xcp_id == AMDGPU_XCP_NO_PARTITION) 1818 + idx = 0; 1819 + else 1820 + idx = ring->xcp_id; 1821 + 1822 + if (idx >= MAX_XCP) 1823 + return; 1824 + 1825 + mutex_lock(&adev->enforce_isolation_mutex); 1826 + if (adev->enforce_isolation[idx]) { 1827 + if 
(adev->kfd.init_complete) 1828 + amdgpu_gfx_kfd_sch_ctrl(adev, idx, false); 1829 + } 1830 + mutex_unlock(&adev->enforce_isolation_mutex); 1831 + } 1832 + 1833 + void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) 1834 + { 1835 + struct amdgpu_device *adev = ring->adev; 1836 + u32 idx; 1837 + 1838 + if (!adev->gfx.enable_cleaner_shader) 1839 + return; 1840 + 1841 + if (ring->xcp_id == AMDGPU_XCP_NO_PARTITION) 1842 + idx = 0; 1843 + else 1844 + idx = ring->xcp_id; 1845 + 1846 + if (idx >= MAX_XCP) 1847 + return; 1848 + 1849 + mutex_lock(&adev->enforce_isolation_mutex); 1850 + if (adev->enforce_isolation[idx]) { 1851 + if (adev->kfd.init_complete) 1852 + amdgpu_gfx_kfd_sch_ctrl(adev, idx, true); 1853 + } 1854 + mutex_unlock(&adev->enforce_isolation_mutex); 1855 + }
+15
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
··· 34 34 #include "soc15.h" 35 35 #include "amdgpu_ras.h" 36 36 #include "amdgpu_ring_mux.h" 37 + #include "amdgpu_xcp.h" 37 38 38 39 /* GFX current status */ 39 40 #define AMDGPU_GFX_NORMAL_MODE 0x00000000L ··· 344 343 DECLARE_BITMAP(queue_bitmap, AMDGPU_MAX_GFX_QUEUES); 345 344 }; 346 345 346 + struct amdgpu_isolation_work { 347 + struct amdgpu_device *adev; 348 + u32 xcp_id; 349 + struct delayed_work work; 350 + }; 351 + 347 352 struct amdgpu_gfx { 348 353 struct mutex gpu_clock_mutex; 349 354 struct amdgpu_gfx_config config; ··· 461 454 void *cleaner_shader_cpu_ptr; 462 455 const void *cleaner_shader_ptr; 463 456 bool enable_cleaner_shader; 457 + struct amdgpu_isolation_work enforce_isolation[MAX_XCP]; 458 + /* Mutex for synchronizing KFD scheduler operations */ 459 + struct mutex kfd_sch_mutex; 460 + u64 kfd_sch_req_count[MAX_XCP]; 461 + bool kfd_sch_inactive[MAX_XCP]; 464 462 }; 465 463 466 464 struct amdgpu_gfx_ras_reg_entry { ··· 575 563 const void *cleaner_shader_ptr); 576 564 int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev); 577 565 void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev); 566 + void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); 567 + void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); 568 + void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring); 578 569 579 570 static inline const char *amdgpu_gfx_compute_mode_desc(int mode) 580 571 {