Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/core: add MR support for bvec-based RDMA operations

The bvec-based RDMA API currently returns -EOPNOTSUPP when Memory
Region registration is required. This prevents iWARP devices from
using the bvec path, since iWARP requires MR registration for RDMA
READ operations. The force_mr debug parameter is also unusable with
bvec input.

Add rdma_rw_init_mr_wrs_bvec() to handle MR registration for bvec
arrays. The approach builds a synthetic scatterlist from the bvec
pages, DMA-maps it, and then reuses the existing ib_map_mr_sg()
infrastructure. This avoids driver changes while keeping the
implementation small.

The synthetic scatterlist is stored in the rdma_rw_ctx for cleanup.
On destroy, the MRs are returned to the pool, the scatterlist's DMA
mappings are released, and the scatterlist itself is freed.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20260128005400.25147-4-cel@kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leon@kernel.org>

authored by

Chuck Lever and committed by
Leon Romanovsky
bea28ac1 853e8920

+154 -36
+153 -36
drivers/infiniband/core/rw.c
··· 122 122 return count; 123 123 } 124 124 125 + static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg, 126 + struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num, 127 + u64 remote_addr, u32 rkey, enum dma_data_direction dir) 128 + { 129 + if (prev) { 130 + if (reg->mr->need_inval) 131 + prev->wr.wr.next = &reg->inv_wr; 132 + else 133 + prev->wr.wr.next = &reg->reg_wr.wr; 134 + } 135 + 136 + reg->reg_wr.wr.next = &reg->wr.wr; 137 + 138 + reg->wr.wr.sg_list = &reg->sge; 139 + reg->wr.wr.num_sge = 1; 140 + reg->wr.remote_addr = remote_addr; 141 + reg->wr.rkey = rkey; 142 + 143 + if (dir == DMA_TO_DEVICE) { 144 + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; 145 + } else if (!rdma_cap_read_inv(qp->device, port_num)) { 146 + reg->wr.wr.opcode = IB_WR_RDMA_READ; 147 + } else { 148 + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; 149 + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; 150 + } 151 + 152 + return 1; 153 + } 154 + 125 155 static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 126 156 u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, 127 157 u64 remote_addr, u32 rkey, enum dma_data_direction dir) ··· 177 147 if (ret < 0) 178 148 goto out_free; 179 149 count += ret; 180 - 181 - if (prev) { 182 - if (reg->mr->need_inval) 183 - prev->wr.wr.next = &reg->inv_wr; 184 - else 185 - prev->wr.wr.next = &reg->reg_wr.wr; 186 - } 187 - 188 - reg->reg_wr.wr.next = &reg->wr.wr; 189 - 190 - reg->wr.wr.sg_list = &reg->sge; 191 - reg->wr.wr.num_sge = 1; 192 - reg->wr.remote_addr = remote_addr; 193 - reg->wr.rkey = rkey; 194 - if (dir == DMA_TO_DEVICE) { 195 - reg->wr.wr.opcode = IB_WR_RDMA_WRITE; 196 - } else if (!rdma_cap_read_inv(qp->device, port_num)) { 197 - reg->wr.wr.opcode = IB_WR_RDMA_READ; 198 - } else { 199 - reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; 200 - reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; 201 - } 202 - count++; 203 - 150 + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, 151 + remote_addr, rkey, dir); 204 152 
remote_addr += reg->sge.length; 205 153 sg_cnt -= nents; 206 154 for (j = 0; j < nents; j++) ··· 198 190 ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 199 191 kfree(ctx->reg); 200 192 out: 193 + return ret; 194 + } 195 + 196 + static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 197 + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, 198 + struct bvec_iter *iter, u64 remote_addr, u32 rkey, 199 + enum dma_data_direction dir) 200 + { 201 + struct ib_device *dev = qp->pd->device; 202 + struct rdma_rw_reg_ctx *prev = NULL; 203 + u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en); 204 + struct scatterlist *sg; 205 + int i, ret, count = 0; 206 + u32 nents = 0; 207 + 208 + ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr), 209 + sizeof(*ctx->reg), GFP_KERNEL); 210 + if (!ctx->reg) 211 + return -ENOMEM; 212 + 213 + /* 214 + * Build scatterlist from bvecs using the iterator. This follows 215 + * the pattern from __blk_rq_map_sg. 216 + */ 217 + ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec, 218 + sizeof(*ctx->reg[0].sgt.sgl), 219 + GFP_KERNEL); 220 + if (!ctx->reg[0].sgt.sgl) { 221 + ret = -ENOMEM; 222 + goto out_free_reg; 223 + } 224 + sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec); 225 + 226 + for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) { 227 + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); 228 + 229 + if (nents >= nr_bvec) { 230 + ret = -EINVAL; 231 + goto out_free_sgl; 232 + } 233 + sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset); 234 + bvec_iter_advance(bvecs, iter, bv.bv_len); 235 + nents++; 236 + } 237 + sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents)); 238 + ctx->reg[0].sgt.orig_nents = nents; 239 + 240 + /* DMA map the scatterlist */ 241 + ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 242 + if (ret) 243 + goto out_free_sgl; 244 + 245 + ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr); 246 + 247 + sg = ctx->reg[0].sgt.sgl; 248 + nents = 
ctx->reg[0].sgt.nents; 249 + for (i = 0; i < ctx->nr_ops; i++) { 250 + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; 251 + u32 sge_cnt = min(nents, pages_per_mr); 252 + 253 + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0); 254 + if (ret < 0) 255 + goto out_free_mrs; 256 + count += ret; 257 + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, 258 + remote_addr, rkey, dir); 259 + remote_addr += reg->sge.length; 260 + nents -= sge_cnt; 261 + sg += sge_cnt; 262 + prev = reg; 263 + } 264 + 265 + if (prev) 266 + prev->wr.wr.next = NULL; 267 + 268 + ctx->type = RDMA_RW_MR; 269 + return count; 270 + 271 + out_free_mrs: 272 + while (--i >= 0) 273 + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 274 + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 275 + out_free_sgl: 276 + kfree(ctx->reg[0].sgt.sgl); 277 + out_free_reg: 278 + kfree(ctx->reg); 201 279 return ret; 202 280 } 203 281 ··· 641 547 * @rkey: remote key to operate on 642 548 * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ 643 549 * 644 - * Accepts bio_vec arrays directly, avoiding scatterlist conversion for 645 - * callers that already have data in bio_vec form. Prefer this over 646 - * rdma_rw_ctx_init() when the source data is a bio_vec array. 647 - * 648 - * This function does not support devices requiring memory registration. 649 - * iWARP devices and configurations with force_mr=1 should use 650 - * rdma_rw_ctx_init() with a scatterlist instead. 550 + * Maps the bio_vec array directly, avoiding intermediate scatterlist 551 + * conversion. Supports MR registration for iWARP devices and force_mr mode. 
651 552 * 652 553 * Returns the number of WQEs that will be needed on the workqueue if 653 554 * successful, or a negative error code: 654 555 * 655 556 * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero 656 - * * -EOPNOTSUPP - device requires MR path (iWARP or force_mr=1) 657 557 * * -ENOMEM - DMA mapping or memory allocation failed 658 558 */ 659 559 int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, ··· 655 567 struct bvec_iter iter, u64 remote_addr, u32 rkey, 656 568 enum dma_data_direction dir) 657 569 { 570 + struct ib_device *dev = qp->pd->device; 658 571 int ret; 659 572 660 573 if (nr_bvec == 0 || iter.bi_size == 0) 661 574 return -EINVAL; 662 575 663 - /* MR path not supported for bvec - reject iWARP and force_mr */ 664 - if (rdma_rw_io_needs_mr(qp->device, port_num, dir, nr_bvec)) 665 - return -EOPNOTSUPP; 576 + /* 577 + * iWARP requires MR registration for all RDMA READs. The force_mr 578 + * debug option also mandates MR usage. 579 + */ 580 + if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num)) 581 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 582 + nr_bvec, &iter, remote_addr, 583 + rkey, dir); 584 + if (unlikely(rdma_rw_force_mr)) 585 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 586 + nr_bvec, &iter, remote_addr, 587 + rkey, dir); 666 588 667 589 if (nr_bvec == 1) 668 590 return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, ··· 680 582 681 583 /* 682 584 * Try IOVA-based mapping first for multi-bvec transfers. 683 - * This reduces IOTLB sync overhead by batching all mappings. 585 + * IOVA coalesces bvecs into a single DMA-contiguous region, 586 + * reducing the number of WRs needed and avoiding MR overhead. 684 587 */ 685 588 ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr, 686 589 rkey, dir); 687 590 if (ret != -EOPNOTSUPP) 688 591 return ret; 592 + 593 + /* 594 + * IOVA mapping not available. 
Check if MR registration provides 595 + * better performance than multiple SGE entries. 596 + */ 597 + if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec)) 598 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 599 + nr_bvec, &iter, remote_addr, 600 + rkey, dir); 689 601 690 602 return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, 691 603 remote_addr, rkey, dir); ··· 941 833 942 834 switch (ctx->type) { 943 835 case RDMA_RW_MR: 836 + /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */ 837 + WARN_ON_ONCE(ctx->reg[0].sgt.sgl); 944 838 for (i = 0; i < ctx->nr_ops; i++) 945 839 ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 946 840 kfree(ctx->reg); ··· 990 880 u32 i; 991 881 992 882 switch (ctx->type) { 883 + case RDMA_RW_MR: 884 + for (i = 0; i < ctx->nr_ops; i++) 885 + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 886 + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 887 + kfree(ctx->reg[0].sgt.sgl); 888 + kfree(ctx->reg); 889 + break; 993 890 case RDMA_RW_IOVA: 994 891 dma_iova_destroy(dev->dma_device, &ctx->iova.state, 995 892 ctx->iova.mapped_len, dir, 0);
+1
include/rdma/rw.h
··· 47 47 struct ib_reg_wr reg_wr; 48 48 struct ib_send_wr inv_wr; 49 49 struct ib_mr *mr; 50 + struct sg_table sgt; 50 51 } *reg; 51 52 }; 52 53 };