dma-mapping: Separate DMA sync issuing and completion waiting

+1

arch/arm64/Kconfig

··· 55 55 select ARCH_HAS_STRICT_MODULE_RWX 56 56 select ARCH_HAS_SYNC_DMA_FOR_DEVICE 57 57 select ARCH_HAS_SYNC_DMA_FOR_CPU 58 + select ARCH_HAS_BATCHED_DMA_SYNC 58 59 select ARCH_HAS_SYSCALL_WRAPPER 59 60 select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST 60 61 select ARCH_HAS_ZONE_DMA_SET if EXPERT

+5

arch/arm64/include/asm/cache.h

··· 87 87 88 88 #define dma_get_cache_alignment cache_line_size 89 89 90 + static inline void arch_sync_dma_flush(void) 91 + { 92 + dsb(sy); 93 + } 94 + 90 95 /* Compress a u64 MPIDR value into 32 bits. */ 91 96 static inline u64 arch_compact_of_hwid(u64 id) 92 97 {

+2 -2

arch/arm64/mm/dma-mapping.c

··· 17 17 { 18 18 unsigned long start = (unsigned long)phys_to_virt(paddr); 19 19 20 - dcache_clean_poc(start, start + size); 20 + dcache_clean_poc_nosync(start, start + size); 21 21 } 22 22 23 23 void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size, ··· 28 28 if (dir == DMA_TO_DEVICE) 29 29 return; 30 30 31 - dcache_inval_poc(start, start + size); 31 + dcache_inval_poc_nosync(start, start + size); 32 32 } 33 33 34 34 void arch_dma_prep_coherent(struct page *page, size_t size)

+27 -8

drivers/iommu/dma-iommu.c

··· 1095 1095 return; 1096 1096 1097 1097 phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); 1098 - if (!dev_is_dma_coherent(dev)) 1098 + if (!dev_is_dma_coherent(dev)) { 1099 1099 arch_sync_dma_for_cpu(phys, size, dir); 1100 + arch_sync_dma_flush(); 1101 + } 1100 1102 1101 1103 swiotlb_sync_single_for_cpu(dev, phys, size, dir); 1102 1104 } ··· 1114 1112 phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle); 1115 1113 swiotlb_sync_single_for_device(dev, phys, size, dir); 1116 1114 1117 - if (!dev_is_dma_coherent(dev)) 1115 + if (!dev_is_dma_coherent(dev)) { 1118 1116 arch_sync_dma_for_device(phys, size, dir); 1117 + arch_sync_dma_flush(); 1118 + } 1119 1119 } 1120 1120 1121 1121 void iommu_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, ··· 1126 1122 struct scatterlist *sg; 1127 1123 int i; 1128 1124 1129 - if (sg_dma_is_swiotlb(sgl)) 1125 + if (sg_dma_is_swiotlb(sgl)) { 1130 1126 for_each_sg(sgl, sg, nelems, i) 1131 1127 iommu_dma_sync_single_for_cpu(dev, sg_dma_address(sg), 1132 1128 sg->length, dir); 1133 - else if (!dev_is_dma_coherent(dev)) 1129 + } else if (!dev_is_dma_coherent(dev)) { 1134 1130 for_each_sg(sgl, sg, nelems, i) 1135 1131 arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir); 1132 + arch_sync_dma_flush(); 1133 + } 1136 1134 } 1137 1135 1138 1136 void iommu_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, ··· 1143 1137 struct scatterlist *sg; 1144 1138 int i; 1145 1139 1146 - if (sg_dma_is_swiotlb(sgl)) 1140 + if (sg_dma_is_swiotlb(sgl)) { 1147 1141 for_each_sg(sgl, sg, nelems, i) 1148 1142 iommu_dma_sync_single_for_device(dev, 1149 1143 sg_dma_address(sg), 1150 1144 sg->length, dir); 1151 - else if (!dev_is_dma_coherent(dev)) 1145 + } else if (!dev_is_dma_coherent(dev)) { 1152 1146 for_each_sg(sgl, sg, nelems, i) 1153 1147 arch_sync_dma_for_device(sg_phys(sg), sg->length, dir); 1148 + arch_sync_dma_flush(); 1149 + } 1154 1150 } 1155 1151 1156 1152 static phys_addr_t iommu_dma_map_swiotlb(struct device *dev, phys_addr_t phys, ··· 1227 1219 return DMA_MAPPING_ERROR; 1228 1220 } 1229 1221 1230 - if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) 1222 + if (!coherent && !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { 1231 1223 arch_sync_dma_for_device(phys, size, dir); 1224 + arch_sync_dma_flush(); 1225 + } 1232 1226 1233 1227 iova = __iommu_dma_map(dev, phys, size, prot, dma_mask); 1234 1228 if (iova == DMA_MAPPING_ERROR && !(attrs & DMA_ATTR_MMIO)) ··· 1252 1242 if (WARN_ON(!phys)) 1253 1243 return; 1254 1244 1255 - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) 1245 + if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && !dev_is_dma_coherent(dev)) { 1256 1246 arch_sync_dma_for_cpu(phys, size, dir); 1247 + arch_sync_dma_flush(); 1248 + } 1257 1249 1258 1250 __iommu_dma_unmap(dev, dma_handle, size); 1259 1251 ··· 1992 1980 dma_addr_t addr = state->addr + offset; 1993 1981 size_t iova_start_pad = iova_offset(iovad, addr); 1994 1982 1983 + if (!dev_is_dma_coherent(dev)) 1984 + arch_sync_dma_flush(); 1995 1985 return iommu_sync_map(domain, addr - iova_start_pad, 1996 1986 iova_align(iovad, size + iova_start_pad)); 1997 1987 } ··· 2007 1993 struct iommu_dma_cookie *cookie = domain->iova_cookie; 2008 1994 struct iova_domain *iovad = &cookie->iovad; 2009 1995 size_t iova_start_pad = iova_offset(iovad, addr); 1996 + bool need_sync_dma = !dev_is_dma_coherent(dev) && 1997 + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)); 2010 1998 dma_addr_t end = addr + size; 2011 1999 2012 2000 do { ··· 2032 2016 addr += len; 2033 2017 iova_start_pad = 0; 2034 2018 } while (addr < end); 2019 + 2020 + if (need_sync_dma) 2021 + arch_sync_dma_flush(); 2035 2022 } 2036 2023 2037 2024 static void __iommu_dma_iova_unlink(struct device *dev,

+16 -8

drivers/xen/swiotlb-xen.c

··· 262 262 263 263 done: 264 264 if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 265 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) 265 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dev_addr)))) { 266 266 arch_sync_dma_for_device(phys, size, dir); 267 - else 267 + arch_sync_dma_flush(); 268 + } else { 268 269 xen_dma_sync_for_device(dev, dev_addr, size, dir); 270 + } 269 271 } 270 272 return dev_addr; 271 273 } ··· 289 287 BUG_ON(dir == DMA_NONE); 290 288 291 289 if (!dev_is_dma_coherent(hwdev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 292 - if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) 290 + if (pfn_valid(PFN_DOWN(dma_to_phys(hwdev, dev_addr)))) { 293 291 arch_sync_dma_for_cpu(paddr, size, dir); 294 - else 292 + arch_sync_dma_flush(); 293 + } else { 295 294 xen_dma_sync_for_cpu(hwdev, dev_addr, size, dir); 295 + } 296 296 } 297 297 298 298 /* NOTE: We use dev_addr here, not paddr! */ ··· 312 308 struct io_tlb_pool *pool; 313 309 314 310 if (!dev_is_dma_coherent(dev)) { 315 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) 311 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { 316 312 arch_sync_dma_for_cpu(paddr, size, dir); 317 - else 313 + arch_sync_dma_flush(); 314 + } else { 318 315 xen_dma_sync_for_cpu(dev, dma_addr, size, dir); 316 + } 319 317 } 320 318 321 319 pool = xen_swiotlb_find_pool(dev, dma_addr); ··· 337 331 __swiotlb_sync_single_for_device(dev, paddr, size, dir, pool); 338 332 339 333 if (!dev_is_dma_coherent(dev)) { 340 - if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) 334 + if (pfn_valid(PFN_DOWN(dma_to_phys(dev, dma_addr)))) { 341 335 arch_sync_dma_for_device(paddr, size, dir); 342 - else 336 + arch_sync_dma_flush(); 337 + } else { 343 338 xen_dma_sync_for_device(dev, dma_addr, size, dir); 339 + } 344 340 } 345 341 } 346 342

+6

include/linux/dma-map-ops.h

··· 361 361 } 362 362 #endif /* ARCH_HAS_SYNC_DMA_FOR_CPU */ 363 363 364 + #ifndef CONFIG_ARCH_HAS_BATCHED_DMA_SYNC 365 + static inline void arch_sync_dma_flush(void) 366 + { 367 + } 368 + #endif 369 + 364 370 #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL 365 371 void arch_sync_dma_for_cpu_all(void); 366 372 #else

+3

kernel/dma/Kconfig

··· 72 72 config ARCH_HAS_FORCE_DMA_UNENCRYPTED 73 73 bool 74 74 75 + config ARCH_HAS_BATCHED_DMA_SYNC 76 + bool 77 + 75 78 # 76 79 # Select this option if the architecture assumes DMA devices are coherent 77 80 # by default.

+5 -1

kernel/dma/direct.c

··· 406 406 arch_sync_dma_for_device(paddr, sg->length, 407 407 dir); 408 408 } 409 + if (!dev_is_dma_coherent(dev)) 410 + arch_sync_dma_flush(); 409 411 } 410 412 #endif 411 413 ··· 429 427 swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir); 430 428 } 431 429 432 - if (!dev_is_dma_coherent(dev)) 430 + if (!dev_is_dma_coherent(dev)) { 431 + arch_sync_dma_flush(); 433 432 arch_sync_dma_for_cpu_all(); 433 + } 434 434 } 435 435 436 436 /*

+7 -2

kernel/dma/direct.h

··· 60 60 61 61 swiotlb_sync_single_for_device(dev, paddr, size, dir); 62 62 63 - if (!dev_is_dma_coherent(dev)) 63 + if (!dev_is_dma_coherent(dev)) { 64 64 arch_sync_dma_for_device(paddr, size, dir); 65 + arch_sync_dma_flush(); 66 + } 65 67 } 66 68 67 69 static inline void dma_direct_sync_single_for_cpu(struct device *dev, ··· 73 71 74 72 if (!dev_is_dma_coherent(dev)) { 75 73 arch_sync_dma_for_cpu(paddr, size, dir); 74 + arch_sync_dma_flush(); 76 75 arch_sync_dma_for_cpu_all(); 77 76 } 78 77 ··· 109 106 } 110 107 111 108 if (!dev_is_dma_coherent(dev) && 112 - !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) 109 + !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO))) { 113 110 arch_sync_dma_for_device(phys, size, dir); 111 + arch_sync_dma_flush(); 112 + } 114 113 return dma_addr; 115 114 116 115 err_overflow:

+6 -1

kernel/dma/swiotlb.c

··· 867 867 if (orig_addr == INVALID_PHYS_ADDR) 868 868 return; 869 869 870 + if (dir == DMA_FROM_DEVICE && !dev_is_dma_coherent(dev)) 871 + arch_sync_dma_flush(); 872 + 870 873 /* 871 874 * It's valid for tlb_offset to be negative. This can happen when the 872 875 * "offset" returned by swiotlb_align_offset() is non-zero, and the ··· 1598 1595 return DMA_MAPPING_ERROR; 1599 1596 } 1600 1597 1601 - if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) 1598 + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { 1602 1599 arch_sync_dma_for_device(swiotlb_addr, size, dir); 1600 + arch_sync_dma_flush(); 1601 + } 1603 1602 return dma_addr; 1604 1603 } 1605 1604

Configure Feed

Configure Feed