Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

block: add scatterlist-less DMA mapping helpers

Add a new blk_rq_dma_map / blk_rq_dma_unmap pair that does away with
the wasteful scatterlist structure. Instead it uses the mapping iterator
to either add segments to the IOVA for IOMMU operations, or just maps
them one by one for the direct mapping. For the IOMMU case instead of
a scatterlist with an entry for each segment, only a single [dma_addr,len]
pair needs to be stored for processing a request, and for the direct
mapping the per-segment allocation shrinks from
[page,offset,len,dma_addr,dma_len] to just [dma_addr,len].

One big difference from the scatterlist API, which could be considered a
downside, is that the IOVA collapsing only works when the driver sets
a virt_boundary that matches the IOMMU granule. For NVMe this is already
done, so it works perfectly.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Leon Romanovsky <leonro@nvidia.com>
Link: https://lore.kernel.org/r/20250625113531.522027-3-hch@lst.de
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Christoph Hellwig and committed by
Jens Axboe
858299dc 38446014

+224
+161
block/blk-mq-dma.c
··· 2 2 /* 3 3 * Copyright (C) 2025 Christoph Hellwig 4 4 */ 5 + #include <linux/blk-mq-dma.h> 5 6 #include "blk.h" 6 7 7 8 struct phys_vec { ··· 61 60 vec->len = bv.bv_len; 62 61 return true; 63 62 } 63 + 64 + /* 65 + * The IOVA-based DMA API wants to be able to coalesce at the minimal IOMMU page 66 + * size granularity (which is guaranteed to be <= PAGE_SIZE and usually 4k), so 67 + * we need to ensure our segments are aligned to this as well. 68 + * 69 + * Note that there is no point in using the slightly more complicated IOVA based 70 + * path for single segment mappings. 71 + */ 72 + static inline bool blk_can_dma_map_iova(struct request *req, 73 + struct device *dma_dev) 74 + { 75 + return !((queue_virt_boundary(req->q) + 1) & 76 + dma_get_merge_boundary(dma_dev)); 77 + } 78 + 79 + static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec) 80 + { 81 + iter->addr = pci_p2pdma_bus_addr_map(&iter->p2pdma, vec->paddr); 82 + iter->len = vec->len; 83 + return true; 84 + } 85 + 86 + static bool blk_dma_map_direct(struct request *req, struct device *dma_dev, 87 + struct blk_dma_iter *iter, struct phys_vec *vec) 88 + { 89 + iter->addr = dma_map_page(dma_dev, phys_to_page(vec->paddr), 90 + offset_in_page(vec->paddr), vec->len, rq_dma_dir(req)); 91 + if (dma_mapping_error(dma_dev, iter->addr)) { 92 + iter->status = BLK_STS_RESOURCE; 93 + return false; 94 + } 95 + iter->len = vec->len; 96 + return true; 97 + } 98 + 99 + static bool blk_rq_dma_map_iova(struct request *req, struct device *dma_dev, 100 + struct dma_iova_state *state, struct blk_dma_iter *iter, 101 + struct phys_vec *vec) 102 + { 103 + enum dma_data_direction dir = rq_dma_dir(req); 104 + unsigned int mapped = 0; 105 + int error; 106 + 107 + iter->addr = state->addr; 108 + iter->len = dma_iova_size(state); 109 + 110 + do { 111 + error = dma_iova_link(dma_dev, state, vec->paddr, mapped, 112 + vec->len, dir, 0); 113 + if (error) 114 + break; 115 + mapped += vec->len; 116 + } while 
(blk_map_iter_next(req, &iter->iter, vec)); 117 + 118 + error = dma_iova_sync(dma_dev, state, 0, mapped); 119 + if (error) { 120 + iter->status = errno_to_blk_status(error); 121 + return false; 122 + } 123 + 124 + return true; 125 + } 126 + 127 + /** 128 + * blk_rq_dma_map_iter_start - map the first DMA segment for a request 129 + * @req: request to map 130 + * @dma_dev: device to map to 131 + * @state: DMA IOVA state 132 + * @iter: block layer DMA iterator 133 + * 134 + * Start DMA mapping @req to @dma_dev. @state and @iter are provided by the 135 + * caller and don't need to be initialized. @state needs to be stored for use 136 + * at unmap time, @iter is only needed at map time. 137 + * 138 + * Returns %false if there is no segment to map, including due to an error, or 139 + * %true ft it did map a segment. 140 + * 141 + * If a segment was mapped, the DMA address for it is returned in @iter.addr and 142 + * the length in @iter.len. If no segment was mapped the status code is 143 + * returned in @iter.status. 144 + * 145 + * The caller can call blk_rq_dma_map_coalesce() to check if further segments 146 + * need to be mapped after this, or go straight to blk_rq_dma_map_iter_next() 147 + * to try to map the following segments. 148 + */ 149 + bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, 150 + struct dma_iova_state *state, struct blk_dma_iter *iter) 151 + { 152 + unsigned int total_len = blk_rq_payload_bytes(req); 153 + struct phys_vec vec; 154 + 155 + iter->iter.bio = req->bio; 156 + iter->iter.iter = req->bio->bi_iter; 157 + memset(&iter->p2pdma, 0, sizeof(iter->p2pdma)); 158 + iter->status = BLK_STS_OK; 159 + 160 + /* 161 + * Grab the first segment ASAP because we'll need it to check for P2P 162 + * transfers. 
163 + */ 164 + if (!blk_map_iter_next(req, &iter->iter, &vec)) 165 + return false; 166 + 167 + if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA)) { 168 + switch (pci_p2pdma_state(&iter->p2pdma, dma_dev, 169 + phys_to_page(vec.paddr))) { 170 + case PCI_P2PDMA_MAP_BUS_ADDR: 171 + return blk_dma_map_bus(iter, &vec); 172 + case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE: 173 + /* 174 + * P2P transfers through the host bridge are treated the 175 + * same as non-P2P transfers below and during unmap. 176 + */ 177 + req->cmd_flags &= ~REQ_P2PDMA; 178 + break; 179 + default: 180 + iter->status = BLK_STS_INVAL; 181 + return false; 182 + } 183 + } 184 + 185 + if (blk_can_dma_map_iova(req, dma_dev) && 186 + dma_iova_try_alloc(dma_dev, state, vec.paddr, total_len)) 187 + return blk_rq_dma_map_iova(req, dma_dev, state, iter, &vec); 188 + return blk_dma_map_direct(req, dma_dev, iter, &vec); 189 + } 190 + EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_start); 191 + 192 + /** 193 + * blk_rq_dma_map_iter_next - map the next DMA segment for a request 194 + * @req: request to map 195 + * @dma_dev: device to map to 196 + * @state: DMA IOVA state 197 + * @iter: block layer DMA iterator 198 + * 199 + * Iterate to the next mapping after a previous call to 200 + * blk_rq_dma_map_iter_start(). See there for a detailed description of the 201 + * arguments. 202 + * 203 + * Returns %false if there is no segment to map, including due to an error, or 204 + * %true ft it did map a segment. 205 + * 206 + * If a segment was mapped, the DMA address for it is returned in @iter.addr and 207 + * the length in @iter.len. If no segment was mapped the status code is 208 + * returned in @iter.status. 
209 + */ 210 + bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, 211 + struct dma_iova_state *state, struct blk_dma_iter *iter) 212 + { 213 + struct phys_vec vec; 214 + 215 + if (!blk_map_iter_next(req, &iter->iter, &vec)) 216 + return false; 217 + 218 + if (iter->p2pdma.map == PCI_P2PDMA_MAP_BUS_ADDR) 219 + return blk_dma_map_bus(iter, &vec); 220 + return blk_dma_map_direct(req, dma_dev, iter, &vec); 221 + } 222 + EXPORT_SYMBOL_GPL(blk_rq_dma_map_iter_next); 64 223 65 224 static inline struct scatterlist * 66 225 blk_next_sg(struct scatterlist **sg, struct scatterlist *sglist)
+63
include/linux/blk-mq-dma.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef BLK_MQ_DMA_H 3 + #define BLK_MQ_DMA_H 4 + 5 + #include <linux/blk-mq.h> 6 + #include <linux/pci-p2pdma.h> 7 + 8 + struct blk_dma_iter { 9 + /* Output address range for this iteration */ 10 + dma_addr_t addr; 11 + u32 len; 12 + 13 + /* Status code. Only valid when blk_rq_dma_map_iter_* returned false */ 14 + blk_status_t status; 15 + 16 + /* Internal to blk_rq_dma_map_iter_* */ 17 + struct req_iterator iter; 18 + struct pci_p2pdma_map_state p2pdma; 19 + }; 20 + 21 + bool blk_rq_dma_map_iter_start(struct request *req, struct device *dma_dev, 22 + struct dma_iova_state *state, struct blk_dma_iter *iter); 23 + bool blk_rq_dma_map_iter_next(struct request *req, struct device *dma_dev, 24 + struct dma_iova_state *state, struct blk_dma_iter *iter); 25 + 26 + /** 27 + * blk_rq_dma_map_coalesce - were all segments coalesced? 28 + * @state: DMA state to check 29 + * 30 + * Returns true if blk_rq_dma_map_iter_start coalesced all segments into a 31 + * single DMA range. 32 + */ 33 + static inline bool blk_rq_dma_map_coalesce(struct dma_iova_state *state) 34 + { 35 + return dma_use_iova(state); 36 + } 37 + 38 + /** 39 + * blk_rq_dma_unmap - try to DMA unmap a request 40 + * @req: request to unmap 41 + * @dma_dev: device to unmap from 42 + * @state: DMA IOVA state 43 + * @mapped_len: number of bytes to unmap 44 + * 45 + * Returns %false if the callers need to manually unmap every DMA segment 46 + * mapped using @iter or %true if no work is left to be done. 47 + */ 48 + static inline bool blk_rq_dma_unmap(struct request *req, struct device *dma_dev, 49 + struct dma_iova_state *state, size_t mapped_len) 50 + { 51 + if (req->cmd_flags & REQ_P2PDMA) 52 + return true; 53 + 54 + if (dma_use_iova(state)) { 55 + dma_iova_destroy(dma_dev, state, mapped_len, rq_dma_dir(req), 56 + 0); 57 + return true; 58 + } 59 + 60 + return !dma_need_unmap(dma_dev); 61 + } 62 + 63 + #endif /* BLK_MQ_DMA_H */