Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

vfio/nvgrace-gpu: register device memory for poison handling

The nvgrace-gpu module [1] maps the device memory to the user VA (QEMU)
without adding the memory to the kernel. The device memory pages are PFNMAP
and not backed by struct page. The module can thus utilize the MM's PFNMAP
memory_failure mechanism that handles ECC/poison on regions with no struct
pages.

The kernel MM code exposes register/unregister APIs allowing modules to
register the device memory for memory_failure handling. Make nvgrace-gpu
register the GPU memory with the MM on open.

The module registers its memory region and the address_space with the
kernel MM for ECC handling and implements a callback function to convert
the PFN to the file page offset. The callback function checks whether the
PFN belongs to the device memory region and is also contained in the
VMA range; an error is returned otherwise.

Link: https://lore.kernel.org/all/20240220115055.23546-1-ankita@nvidia.com/ [1]

Suggested-by: Alex Williamson <alex@shazbot.org>
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Ankit Agrawal <ankita@nvidia.com>
Reviewed-by: Jiaqi Yan <jiaqiyan@google.com>
Link: https://lore.kernel.org/r/20260115202849.2921-3-ankita@nvidia.com
Signed-off-by: Alex Williamson <alex@shazbot.org>

authored by

Ankit Agrawal and committed by
Alex Williamson
e5f19b61 205e6d17

+109 -4
+109 -4
drivers/vfio/pci/nvgrace-gpu/main.c
··· 9 9 #include <linux/jiffies.h> 10 10 #include <linux/pci-p2pdma.h> 11 11 #include <linux/pm_runtime.h> 12 + #include <linux/memory-failure.h> 12 13 13 14 /* 14 15 * The device memory usable to the workloads running in the VM is cached ··· 50 49 void *memaddr; 51 50 void __iomem *ioaddr; 52 51 }; /* Base virtual address of the region */ 52 + struct pfn_address_space pfn_address_space; 53 53 }; 54 54 55 55 struct nvgrace_gpu_pci_core_device { ··· 90 88 return NULL; 91 89 } 92 90 91 + static int pfn_memregion_offset(struct nvgrace_gpu_pci_core_device *nvdev, 92 + unsigned int index, 93 + unsigned long pfn, 94 + pgoff_t *pfn_offset_in_region) 95 + { 96 + struct mem_region *region; 97 + unsigned long start_pfn, num_pages; 98 + 99 + region = nvgrace_gpu_memregion(index, nvdev); 100 + if (!region) 101 + return -EINVAL; 102 + 103 + start_pfn = PHYS_PFN(region->memphys); 104 + num_pages = region->memlength >> PAGE_SHIFT; 105 + 106 + if (pfn < start_pfn || pfn >= start_pfn + num_pages) 107 + return -EFAULT; 108 + 109 + *pfn_offset_in_region = pfn - start_pfn; 110 + 111 + return 0; 112 + } 113 + 114 + static inline 115 + struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma); 116 + 117 + static int nvgrace_gpu_pfn_to_vma_pgoff(struct vm_area_struct *vma, 118 + unsigned long pfn, 119 + pgoff_t *pgoff) 120 + { 121 + struct nvgrace_gpu_pci_core_device *nvdev; 122 + unsigned int index = 123 + vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); 124 + pgoff_t vma_offset_in_region = vma->vm_pgoff & 125 + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); 126 + pgoff_t pfn_offset_in_region; 127 + int ret; 128 + 129 + nvdev = vma_to_nvdev(vma); 130 + if (!nvdev) 131 + return -ENOENT; 132 + 133 + ret = pfn_memregion_offset(nvdev, index, pfn, &pfn_offset_in_region); 134 + if (ret) 135 + return ret; 136 + 137 + /* Ensure PFN is not before VMA's start within the region */ 138 + if (pfn_offset_in_region < vma_offset_in_region) 139 + return -EFAULT; 140 + 141 + 
/* Calculate offset from VMA start */ 142 + *pgoff = vma->vm_pgoff + 143 + (pfn_offset_in_region - vma_offset_in_region); 144 + 145 + return 0; 146 + } 147 + 148 + static int 149 + nvgrace_gpu_vfio_pci_register_pfn_range(struct vfio_device *core_vdev, 150 + struct mem_region *region) 151 + { 152 + unsigned long pfn, nr_pages; 153 + 154 + pfn = PHYS_PFN(region->memphys); 155 + nr_pages = region->memlength >> PAGE_SHIFT; 156 + 157 + region->pfn_address_space.node.start = pfn; 158 + region->pfn_address_space.node.last = pfn + nr_pages - 1; 159 + region->pfn_address_space.mapping = core_vdev->inode->i_mapping; 160 + region->pfn_address_space.pfn_to_vma_pgoff = nvgrace_gpu_pfn_to_vma_pgoff; 161 + 162 + return register_pfn_address_space(&region->pfn_address_space); 163 + } 164 + 93 165 static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) 94 166 { 95 167 struct vfio_pci_core_device *vdev = ··· 190 114 * memory mapping. 191 115 */ 192 116 ret = vfio_pci_core_setup_barmap(vdev, 0); 193 - if (ret) { 194 - vfio_pci_core_disable(vdev); 195 - return ret; 117 + if (ret) 118 + goto error_exit; 119 + 120 + if (nvdev->resmem.memlength) { 121 + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->resmem); 122 + if (ret && ret != -EOPNOTSUPP) 123 + goto error_exit; 196 124 } 197 125 198 - vfio_pci_core_finish_enable(vdev); 126 + ret = nvgrace_gpu_vfio_pci_register_pfn_range(core_vdev, &nvdev->usemem); 127 + if (ret && ret != -EOPNOTSUPP) 128 + goto register_mem_failed; 199 129 130 + vfio_pci_core_finish_enable(vdev); 200 131 return 0; 132 + 133 + register_mem_failed: 134 + if (nvdev->resmem.memlength) 135 + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); 136 + error_exit: 137 + vfio_pci_core_disable(vdev); 138 + return ret; 201 139 } 202 140 203 141 static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) ··· 219 129 struct nvgrace_gpu_pci_core_device *nvdev = 220 130 container_of(core_vdev, struct nvgrace_gpu_pci_core_device, 221 
131 core_device.vdev); 132 + 133 + if (nvdev->resmem.memlength) 134 + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); 135 + 136 + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); 222 137 223 138 /* Unmap the mapping to the device memory cached region */ 224 139 if (nvdev->usemem.memaddr) { ··· 341 246 .huge_fault = nvgrace_gpu_vfio_pci_huge_fault, 342 247 #endif 343 248 }; 249 + 250 + static inline 251 + struct nvgrace_gpu_pci_core_device *vma_to_nvdev(struct vm_area_struct *vma) 252 + { 253 + /* Check if this VMA belongs to us */ 254 + if (vma->vm_ops != &nvgrace_gpu_vfio_pci_mmap_ops) 255 + return NULL; 256 + 257 + return vma->vm_private_data; 258 + } 344 259 345 260 static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, 346 261 struct vm_area_struct *vma)