Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

svcrdma: use bvec-based RDMA read/write API

Convert svcrdma to the bvec-based RDMA API introduced earlier in
this series.

The bvec-based RDMA API eliminates the intermediate scatterlist
conversion step, allowing direct DMA mapping from bio_vec arrays.
This simplifies the svc_rdma_rw_ctxt structure by removing the
chained SG table management.

The structure retains an inline array approach similar to the
previous scatterlist implementation: an inline bvec array sized
to max_send_sge handles most I/O operations without additional
allocation. Larger requests fall back to dynamic allocation.
This preserves the allocation-free fast path for typical NFS
operations while supporting arbitrarily large transfers.

The bvec API handles all device types internally, including iWARP
devices which require memory registration. No explicit fallback
path is needed.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Link: https://patch.msgid.link/20260128005400.25147-6-cel@kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Chuck Lever and committed by Leon Romanovsky
5ee62b4a afcae7d7

+86 -69
+86 -69
net/sunrpc/xprtrdma/svc_rdma_rw.c
··· 5 5 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. 6 6 */ 7 7 8 + #include <linux/bvec.h> 9 + #include <linux/overflow.h> 8 10 #include <rdma/rw.h> 9 11 10 12 #include <linux/sunrpc/xdr.h> ··· 22 20 /* Each R/W context contains state for one chain of RDMA Read or 23 21 * Write Work Requests. 24 22 * 25 - * Each WR chain handles a single contiguous server-side buffer, 26 - * because scatterlist entries after the first have to start on 27 - * page alignment. xdr_buf iovecs cannot guarantee alignment. 23 + * Each WR chain handles a single contiguous server-side buffer. 24 + * - each xdr_buf iovec is a single contiguous buffer 25 + * - the xdr_buf pages array is a single contiguous buffer because the 26 + * second through the last element always start on a page boundary 28 27 * 29 28 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment 30 29 * from a client may contain a unique R_key, so each WR chain moves 31 30 * up to one segment at a time. 32 31 * 33 - * The scatterlist makes this data structure over 4KB in size. To 34 - * make it less likely to fail, and to handle the allocation for 35 - * smaller I/O requests without disabling bottom-halves, these 36 - * contexts are created on demand, but cached and reused until the 37 - * controlling svcxprt_rdma is destroyed. 32 + * The inline bvec array is sized to handle most I/O requests without 33 + * additional allocation. Larger requests fall back to dynamic allocation. 34 + * These contexts are created on demand, but cached and reused until 35 + * the controlling svcxprt_rdma is destroyed. 
38 36 */ 39 37 struct svc_rdma_rw_ctxt { 40 38 struct llist_node rw_node; 41 39 struct list_head rw_list; 42 40 struct rdma_rw_ctx rw_ctx; 43 41 unsigned int rw_nents; 44 - unsigned int rw_first_sgl_nents; 45 - struct sg_table rw_sg_table; 46 - struct scatterlist rw_first_sgl[]; 42 + unsigned int rw_first_bvec_nents; 43 + struct bio_vec *rw_bvec; 44 + struct bio_vec rw_first_bvec[]; 47 45 }; 46 + 47 + static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, 48 + struct svc_rdma_rw_ctxt *ctxt); 48 49 49 50 static inline struct svc_rdma_rw_ctxt * 50 51 svc_rdma_next_ctxt(struct list_head *list) ··· 57 52 } 58 53 59 54 static struct svc_rdma_rw_ctxt * 60 - svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) 55 + svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) 61 56 { 62 57 struct ib_device *dev = rdma->sc_cm_id->device; 63 - unsigned int first_sgl_nents = dev->attrs.max_send_sge; 58 + unsigned int first_bvec_nents = dev->attrs.max_send_sge; 64 59 struct svc_rdma_rw_ctxt *ctxt; 65 60 struct llist_node *node; 66 61 ··· 70 65 if (node) { 71 66 ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); 72 67 } else { 73 - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), 68 + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, 69 + first_bvec_nents), 74 70 GFP_KERNEL, ibdev_to_node(dev)); 75 71 if (!ctxt) 76 72 goto out_noctx; 77 73 78 74 INIT_LIST_HEAD(&ctxt->rw_list); 79 - ctxt->rw_first_sgl_nents = first_sgl_nents; 75 + ctxt->rw_first_bvec_nents = first_bvec_nents; 80 76 } 81 77 82 - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; 83 - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, 84 - ctxt->rw_sg_table.sgl, 85 - first_sgl_nents)) 86 - goto out_free; 78 + if (nr_bvec <= ctxt->rw_first_bvec_nents) { 79 + ctxt->rw_bvec = ctxt->rw_first_bvec; 80 + } else { 81 + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, 82 + sizeof(*ctxt->rw_bvec), 83 + GFP_KERNEL, 84 + ibdev_to_node(dev)); 85 + if (!ctxt->rw_bvec) 86 + 
goto out_free; 87 + } 87 88 return ctxt; 88 89 89 90 out_free: 90 - kfree(ctxt); 91 + /* Return cached contexts to cache; free freshly allocated ones */ 92 + if (node) 93 + svc_rdma_put_rw_ctxt(rdma, ctxt); 94 + else 95 + kfree(ctxt); 91 96 out_noctx: 92 - trace_svcrdma_rwctx_empty(rdma, sges); 97 + trace_svcrdma_rwctx_empty(rdma, nr_bvec); 93 98 return NULL; 94 99 } 95 100 96 101 static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, 97 102 struct llist_head *list) 98 103 { 99 - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); 104 + if (ctxt->rw_bvec != ctxt->rw_first_bvec) 105 + kfree(ctxt->rw_bvec); 100 106 llist_add(&ctxt->rw_node, list); 101 107 } 102 108 ··· 139 123 * @ctxt: R/W context to prepare 140 124 * @offset: RDMA offset 141 125 * @handle: RDMA tag/handle 126 + * @length: total number of bytes in the bvec array 142 127 * @direction: I/O direction 143 128 * 144 129 * Returns on success, the number of WQEs that will be needed ··· 147 130 */ 148 131 static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, 149 132 struct svc_rdma_rw_ctxt *ctxt, 150 - u64 offset, u32 handle, 133 + u64 offset, u32 handle, unsigned int length, 151 134 enum dma_data_direction direction) 152 135 { 136 + struct bvec_iter iter = { 137 + .bi_size = length, 138 + }; 153 139 int ret; 154 140 155 - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, 156 - ctxt->rw_sg_table.sgl, ctxt->rw_nents, 157 - 0, offset, handle, direction); 141 + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, 142 + rdma->sc_port_num, 143 + ctxt->rw_bvec, ctxt->rw_nents, 144 + iter, offset, handle, direction); 158 145 if (unlikely(ret < 0)) { 159 146 trace_svcrdma_dma_map_rw_err(rdma, offset, handle, 160 147 ctxt->rw_nents, ret); ··· 196 175 { 197 176 struct llist_node *first, *last; 198 177 struct svc_rdma_rw_ctxt *ctxt; 199 - LLIST_HEAD(free); 200 178 201 179 trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); 202 180 ··· 203 183 while ((ctxt = 
svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { 204 184 list_del(&ctxt->rw_list); 205 185 206 - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, 207 - rdma->sc_port_num, ctxt->rw_sg_table.sgl, 208 - ctxt->rw_nents, dir); 209 - __svc_rdma_put_rw_ctxt(ctxt, &free); 186 + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, 187 + rdma->sc_port_num, 188 + ctxt->rw_bvec, ctxt->rw_nents, dir); 189 + if (ctxt->rw_bvec != ctxt->rw_first_bvec) 190 + kfree(ctxt->rw_bvec); 210 191 211 192 ctxt->rw_node.next = first; 212 193 first = &ctxt->rw_node; ··· 435 414 return -ENOTCONN; 436 415 } 437 416 438 - /* Build and DMA-map an SGL that covers one kvec in an xdr_buf 417 + /* Build a bvec that covers one kvec in an xdr_buf. 439 418 */ 440 - static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, 441 - unsigned int len, 442 - struct svc_rdma_rw_ctxt *ctxt) 419 + static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, 420 + unsigned int len, 421 + struct svc_rdma_rw_ctxt *ctxt) 443 422 { 444 - struct scatterlist *sg = ctxt->rw_sg_table.sgl; 445 - 446 - sg_set_buf(&sg[0], info->wi_base, len); 423 + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); 447 424 info->wi_base += len; 448 425 449 426 ctxt->rw_nents = 1; 450 427 } 451 428 452 - /* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. 429 + /* Build a bvec array that covers part of an xdr_buf's pagelist. 
453 430 */ 454 - static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, 455 - unsigned int remaining, 456 - struct svc_rdma_rw_ctxt *ctxt) 431 + static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, 432 + unsigned int remaining, 433 + struct svc_rdma_rw_ctxt *ctxt) 457 434 { 458 - unsigned int sge_no, sge_bytes, page_off, page_no; 435 + unsigned int bvec_idx, bvec_len, page_off, page_no; 459 436 const struct xdr_buf *xdr = info->wi_xdr; 460 - struct scatterlist *sg; 461 437 struct page **page; 462 438 463 439 page_off = info->wi_next_off + xdr->page_base; ··· 462 444 page_off = offset_in_page(page_off); 463 445 page = xdr->pages + page_no; 464 446 info->wi_next_off += remaining; 465 - sg = ctxt->rw_sg_table.sgl; 466 - sge_no = 0; 447 + bvec_idx = 0; 467 448 do { 468 - sge_bytes = min_t(unsigned int, remaining, 469 - PAGE_SIZE - page_off); 470 - sg_set_page(sg, *page, sge_bytes, page_off); 471 - 472 - remaining -= sge_bytes; 473 - sg = sg_next(sg); 449 + bvec_len = min_t(unsigned int, remaining, 450 + PAGE_SIZE - page_off); 451 + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, 452 + page_off); 453 + remaining -= bvec_len; 474 454 page_off = 0; 475 - sge_no++; 455 + bvec_idx++; 476 456 page++; 477 457 } while (remaining); 478 458 479 - ctxt->rw_nents = sge_no; 459 + ctxt->rw_nents = bvec_idx; 480 460 } 481 461 482 462 /* Construct RDMA Write WRs to send a portion of an xdr_buf containing ··· 512 496 constructor(info, write_len, ctxt); 513 497 offset = seg->rs_offset + info->wi_seg_off; 514 498 ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, 515 - DMA_TO_DEVICE); 499 + write_len, DMA_TO_DEVICE); 516 500 if (ret < 0) 517 501 return -EIO; 518 502 percpu_counter_inc(&svcrdma_stat_write); ··· 551 535 const struct kvec *iov) 552 536 { 553 537 info->wi_base = iov->iov_base; 554 - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, 538 + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, 555 539 iov->iov_len); 
556 540 } 557 541 ··· 575 559 { 576 560 info->wi_xdr = xdr; 577 561 info->wi_next_off = offset - xdr->head[0].iov_len; 578 - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, 562 + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, 579 563 length); 580 564 } 581 565 ··· 750 734 { 751 735 struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); 752 736 struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; 753 - unsigned int sge_no, seg_len, len; 737 + unsigned int bvec_idx, nr_bvec, seg_len, len, total; 754 738 struct svc_rdma_rw_ctxt *ctxt; 755 - struct scatterlist *sg; 756 739 int ret; 757 740 758 741 len = segment->rs_length; 759 - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; 760 - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); 742 + if (check_add_overflow(head->rc_pageoff, len, &total)) 743 + return -EINVAL; 744 + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; 745 + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); 761 746 if (!ctxt) 762 747 return -ENOMEM; 763 - ctxt->rw_nents = sge_no; 748 + ctxt->rw_nents = nr_bvec; 764 749 765 - sg = ctxt->rw_sg_table.sgl; 766 - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { 750 + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { 767 751 seg_len = min_t(unsigned int, len, 768 752 PAGE_SIZE - head->rc_pageoff); 769 753 770 754 if (!head->rc_pageoff) 771 755 head->rc_page_count++; 772 756 773 - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], 774 - seg_len, head->rc_pageoff); 775 - sg = sg_next(sg); 757 + bvec_set_page(&ctxt->rw_bvec[bvec_idx], 758 + rqstp->rq_pages[head->rc_curpage], 759 + seg_len, head->rc_pageoff); 776 760 777 761 head->rc_pageoff += seg_len; 778 762 if (head->rc_pageoff == PAGE_SIZE) { ··· 786 770 } 787 771 788 772 ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, 789 - segment->rs_handle, DMA_FROM_DEVICE); 773 + segment->rs_handle, segment->rs_length, 774 + DMA_FROM_DEVICE); 790 775 if (ret < 0) 791 776 return -EIO; 792 777 percpu_counter_inc(&svcrdma_stat_read);