Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

dma-mapping: Separate DMA sync issuing and completion waiting

Currently, arch_sync_dma_for_cpu() and arch_sync_dma_for_device()
always wait for the completion of each DMA buffer. That is,
issuing the DMA sync and waiting for its completion are done in a
single API call.

For scatter-gather lists with multiple entries, this means that
issuing and waiting are repeated for each entry, which can hurt
performance. Architectures like ARM64 may be able to issue the
DMA sync operations for all entries first and then wait once for
all of them to complete.

To address this, arch_sync_dma_for_*() now only needs to issue
(batch) the DMA sync operations; callers then invoke
arch_sync_dma_flush() afterward to wait for their completion. On
ARM64, the flush is implemented with a dsb instruction in
arch_sync_dma_flush(). On other architectures,
arch_sync_dma_flush() is currently a nop.

Cc: Leon Romanovsky <leon@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Robin Murphy <robin.murphy@arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Joerg Roedel <joro@8bytes.org>
Cc: Stefano Stabellini <sstabellini@kernel.org>
Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Reviewed-by: Juergen Gross <jgross@suse.com> # drivers/xen/swiotlb-xen.c
Tested-by: Xueyuan Chen <xueyuan.chen21@gmail.com>
Signed-off-by: Barry Song <baohua@kernel.org>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Marek Szyprowski <m.szyprowski@samsung.com>
Link: https://lore.kernel.org/r/20260228221316.59934-1-21cnbao@gmail.com

authored by

Barry Song and committed by
Marek Szyprowski
d7eafe65 cf875c4b

+78 -22
+1
arch/arm64/Kconfig
··· 55 55 select ARCH_HAS_STRICT_MODULE_RWX 56 56 select ARCH_HAS_SYNC_DMA_FOR_DEVICE 57 57 select ARCH_HAS_SYNC_DMA_FOR_CPU 58 + select ARCH_HAS_BATCHED_DMA_SYNC 58 59 select ARCH_HAS_SYSCALL_WRAPPER 59 60 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 60 61 select ARCH_HAS_ZONE_DMA_SET if EXPERT
+5
arch/arm64/include/asm/cache.h
··· 87 87 88 88 #define dma_get_cache_alignment cache_line_size 89 89 90 + static inline void arch_sync_dma_flush(void) 91 + { 92 + dsb(sy); 93 + } 94 + 90 95 /* Compress a u64 MPIDR value into 32 bits. */ 91 96 static inline u64 arch_compact_of_hwid(u64 id) 92 97 {
+2 -2
arch/arm64/mm/dma-mapping.c
··· 17 17 { 18 18 unsigned long start = (unsigned long)phys_to_virt(paddr); 19 19 20 - dcache_clean_poc(start, start + size); 20 + dcache_clean_poc_nosync(start, start + size); 21 21 } 22 22 23 23 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, ··· 28 28 if (dir == DMA_TO_DEVICE) 29 29 return; 30 30 31 - dcache_inval_poc(start, start + size); 31 + dcache_inval_poc_nosync(start, start + size); 32 32 } 33 33 34 34 void arch_dma_prep_coherent(struct page *page, size_t size)
+27 -8
drivers/iommu/dma-iommu.c
··· 1095 1095 return; 1096 1096 1097 1097 phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); 1098 - if (!dev_is_dma_coherent(dev)) 1098 + if (!dev_is_dma_coherent(dev)) { 1099 1099 arch_sync_dma_for_cpu(phys, size, dir); 1100 + arch_sync_dma_flush(); 1101 + } 1100 1102 1101 1103 swiotlb_sync_single_for_cpu(dev, phys, size, dir); 1102 1104 } ··· 1114 1112 phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); 1115 1113 swiotlb_sync_single_for_device(dev, phys, size, dir); 1116 1114 1117 - if (!dev_is_dma_coherent(dev)) 1115 + if (!dev_is_dma_coherent(dev)) { 1118 1116 arch_sync_dma_for_device(phys, size, dir); 1117 + arch_sync_dma_flush(); 1118 + } 1119 1119 } 1120 1120 1121 1121 void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, ··· 1126 1122 struct scatterlist *sg; 1127 1123 int i; 1128 1124 1129 - if (sg_dma_is_swiotlb(sgl)) 1125 + if (sg_dma_is_swiotlb(sgl)) { 1130 1126 for_each_sg(sgl, sg, nelems, i) 1131 1127 iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), 1132 1128 sg->length, dir); 1133 - else if (!dev_is_dma_coherent(dev)) 1129 + } else if (!dev_is_dma_coherent(dev)) { 1134 1130 for_each_sg(sgl, sg, nelems, i) 1135 1131 arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); 1132 + arch_sync_dma_flush(); 1133 + } 1136 1134 } 1137 1135 1138 1136 void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, ··· 1143 1137 struct scatterlist *sg; 1144 1138 int i; 1145 1139 1146 - if (sg_dma_is_swiotlb(sgl)) 1140 + if (sg_dma_is_swiotlb(sgl)) { 1147 1141 for_each_sg(sgl, sg, nelems, i) 1148 1142 iommu_dma_sync_single_for_device(dev, 1149 1143 sg_dma_address(sg), 1150 1144 sg->length, dir); 1151 - else if (!dev_is_dma_coherent(dev)) 1145 + } else if (!dev_is_dma_coherent(dev)) { 1152 1146 for_each_sg(sgl, sg, nelems, i) 1153 1147 arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); 1148 + arch_sync_dma_flush(); 1149 + } 1154 1150 } 1155 1151 1156 1152 static phys_addr_t 
iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, ··· 1227 1219 return DMA_MAPPING_ERROR; 1228 1220 } 1229 1221 1230 - if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) 1222 + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { 1231 1223 arch_sync_dma_for_device(phys, size, dir); 1224 + arch_sync_dma_flush(); 1225 + } 1232 1226 1233 1227 iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); 1234 1228 if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) ··· 1252 1242 if (WARN_ON(!phys)) 1253 1243 return; 1254 1244 1255 - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) 1245 + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) { 1256 1246 arch_sync_dma_for_cpu(phys, size, dir); 1247 + arch_sync_dma_flush(); 1248 + } 1257 1249 1258 1250 __iommu_dma_unmap(dev, dma_handle, size); 1259 1251 ··· 1992 1980 dma_addr_t addr = state->addr + offset; 1993 1981 size_t iova_start_pad = iova_offset(iovad, addr); 1994 1982 1983 + if (!dev_is_dma_coherent(dev)) 1984 + arch_sync_dma_flush(); 1995 1985 return iommu_sync_map(domain, addr - iova_start_pad, 1996 1986 iova_align(iovad, size + iova_start_pad)); 1997 1987 } ··· 2007 1993 struct iommu_dma_cookie *cookie = domain->iova_cookie; 2008 1994 struct iova_domain *iovad = &cookie->iovad; 2009 1995 size_t iova_start_pad = iova_offset(iovad, addr); 1996 + bool need_sync_dma = !dev_is_dma_coherent(dev) && 1997 + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)); 2010 1998 dma_addr_t end = addr + size; 2011 1999 2012 2000 do { ··· 2032 2016 addr += len; 2033 2017 iova_start_pad = 0; 2034 2018 } while (addr < end); 2019 + 2020 + if (need_sync_dma) 2021 + arch_sync_dma_flush(); 2035 2022 } 2036 2023 2037 2024 static void __iommu_dma_iova_unlink(struct device *dev,
+16 -8
drivers/xen/swiotlb-xen.c
··· 262 262 263 263 done: 264 264 if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 265 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) 265 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) { 266 266 arch_sync_dma_for_device(phys, size, dir); 267 - else 267 + arch_sync_dma_flush(); 268 + } else { 268 269 xen_dma_sync_for_device(dev, dev_addr, size, dir); 270 + } 269 271 } 270 272 return dev_addr; 271 273 } ··· 289 287 BUG_ON(dir == DMA_NONE); 290 288 291 289 if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 292 - if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) 290 + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) { 293 291 arch_sync_dma_for_cpu(paddr, size, dir); 294 - else 292 + arch_sync_dma_flush(); 293 + } else { 295 294 xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); 295 + } 296 296 } 297 297 298 298 /* NOTE: We use dev_addr here, not paddr! */ ··· 312 308 struct io_tlb_pool *pool; 313 309 314 310 if (!dev_is_dma_coherent(dev)) { 315 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) 311 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { 316 312 arch_sync_dma_for_cpu(paddr, size, dir); 317 - else 313 + arch_sync_dma_flush(); 314 + } else { 318 315 xen_dma_sync_for_cpu(dev, dma_addr, size, dir); 316 + } 319 317 } 320 318 321 319 pool = xen_swiotlb_find_pool(dev, dma_addr); ··· 337 331 __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); 338 332 339 333 if (!dev_is_dma_coherent(dev)) { 340 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) 334 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { 341 335 arch_sync_dma_for_device(paddr, size, dir); 342 - else 336 + arch_sync_dma_flush(); 337 + } else { 343 338 xen_dma_sync_for_device(dev, dma_addr, size, dir); 339 + } 344 340 } 345 341 } 346 342
+6
include/linux/dma-map-ops.h
··· 361 361 } 362 362 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ 363 363 364 + #ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC 365 + static inline void arch_sync_dma_flush(void) 366 + { 367 + } 368 + #endif 369 + 364 370 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL 365 371 void arch_sync_dma_for_cpu_all(void); 366 372 #else
+3
kernel/dma/Kconfig
··· 72 72 config ARCH_HAS_FORCE_DMA_UNENCRYPTED 73 73 bool 74 74 75 + config ARCH_HAS_BATCHED_DMA_SYNC 76 + bool 77 + 75 78 # 76 79 # Select this option if the architecture assumes DMA devices are coherent 77 80 # by default.
+5 -1
kernel/dma/direct.c
··· 406 406 arch_sync_dma_for_device(paddr, sg->length, 407 407 dir); 408 408 } 409 + if (!dev_is_dma_coherent(dev)) 410 + arch_sync_dma_flush(); 409 411 } 410 412 #endif 411 413 ··· 429 427 swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); 430 428 } 431 429 432 - if (!dev_is_dma_coherent(dev)) 430 + if (!dev_is_dma_coherent(dev)) { 431 + arch_sync_dma_flush(); 433 432 arch_sync_dma_for_cpu_all(); 433 + } 434 434 } 435 435 436 436 /*
+7 -2
kernel/dma/direct.h
··· 60 60 61 61 swiotlb_sync_single_for_device(dev, paddr, size, dir); 62 62 63 - if (!dev_is_dma_coherent(dev)) 63 + if (!dev_is_dma_coherent(dev)) { 64 64 arch_sync_dma_for_device(paddr, size, dir); 65 + arch_sync_dma_flush(); 66 + } 65 67 } 66 68 67 69 static inline void dma_direct_sync_single_for_cpu(struct device *dev, ··· 73 71 74 72 if (!dev_is_dma_coherent(dev)) { 75 73 arch_sync_dma_for_cpu(paddr, size, dir); 74 + arch_sync_dma_flush(); 76 75 arch_sync_dma_for_cpu_all(); 77 76 } 78 77 ··· 109 106 } 110 107 111 108 if (!dev_is_dma_coherent(dev) && 112 - !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) 109 + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { 113 110 arch_sync_dma_for_device(phys, size, dir); 111 + arch_sync_dma_flush(); 112 + } 114 113 return dma_addr; 115 114 116 115 err_overflow:
+6 -1
kernel/dma/swiotlb.c
··· 867 867 if (orig_addr == INVALID_PHYS_ADDR) 868 868 return; 869 869 870 + if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev)) 871 + arch_sync_dma_flush(); 872 + 870 873 /* 871 874 * It's valid for tlb_offset to be negative. This can happen when the 872 875 * "offset" returned by swiotlb_align_offset() is non-zero, and the ··· 1598 1595 return DMA_MAPPING_ERROR; 1599 1596 } 1600 1597 1601 - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) 1598 + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 1602 1599 arch_sync_dma_for_device(swiotlb_addr, size, dir); 1600 + arch_sync_dma_flush(); 1601 + } 1603 1602 return dma_addr; 1604 1603 } 1605 1604