Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

svcrdma: Add Write chunk WRs to the RPC's Send WR chain

Previously, Write chunk RDMA Writes were posted via a separate
ib_post_send() call with their own completion handler. Each Write
chunk incurred a doorbell and generated a completion event.

Link Write chunk WRs onto the RPC Reply's Send WR chain so that a
single ib_post_send() call posts both the RDMA Writes and the Send
WR. A single completion event signals that all operations have
finished. This reduces both doorbell rate and completion rate, as
well as eliminating the latency of a round-trip between the Write
chunk completion and the subsequent Send WR posting.

The lifecycle of Write chunk resources changes: previously, the
svc_rdma_write_done() completion handler released Write chunk
resources when RDMA Writes completed. With WR chaining, resources
remain live until the Send completion. A new sc_write_info_list
tracks Write chunk metadata attached to each Send context, and
svc_rdma_write_chunk_release() frees these resources when the
Send context is released.

The svc_rdma_write_done() handler now acts only on error cases.
On success it records a tracepoint and returns immediately, since
the Send completion handles resource release. On failure (a WR
flush or any other error status), it closes the connection to
signal to the client that the RPC Reply is incomplete.

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

+91 -26
+10 -3
include/linux/sunrpc/svc_rdma.h
··· 216 216 */ 217 217 struct svc_rdma_write_info { 218 218 struct svcxprt_rdma *wi_rdma; 219 + struct list_head wi_list; 219 220 220 221 const struct svc_rdma_chunk *wi_chunk; 221 222 ··· 245 244 struct ib_cqe sc_cqe; 246 245 struct xdr_buf sc_hdrbuf; 247 246 struct xdr_stream sc_stream; 247 + 248 + struct list_head sc_write_info_list; 248 249 struct svc_rdma_write_info sc_reply_info; 250 + 249 251 void *sc_xprt_buf; 250 252 int sc_page_count; 251 253 int sc_cur_sge_no; ··· 281 277 extern void svc_rdma_cc_release(struct svcxprt_rdma *rdma, 282 278 struct svc_rdma_chunk_ctxt *cc, 283 279 enum dma_data_direction dir); 280 + extern void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, 281 + struct svc_rdma_send_ctxt *ctxt); 284 282 extern void svc_rdma_reply_chunk_release(struct svcxprt_rdma *rdma, 285 283 struct svc_rdma_send_ctxt *ctxt); 286 - extern int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, 287 - const struct svc_rdma_recv_ctxt *rctxt, 288 - const struct xdr_buf *xdr); 284 + extern int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, 285 + const struct svc_rdma_recv_ctxt *rctxt, 286 + struct svc_rdma_send_ctxt *sctxt, 287 + const struct xdr_buf *xdr); 289 288 extern int svc_rdma_prepare_reply_chunk(struct svcxprt_rdma *rdma, 290 289 const struct svc_rdma_pcl *write_pcl, 291 290 const struct svc_rdma_pcl *reply_pcl,
+72 -22
net/sunrpc/xprtrdma/svc_rdma_rw.c
··· 252 252 } 253 253 254 254 /** 255 + * svc_rdma_write_chunk_release - Release Write chunk I/O resources 256 + * @rdma: controlling transport 257 + * @ctxt: Send context that is being released 258 + * 259 + * Write chunk resources remain live until Send completion because 260 + * Write WRs are chained to the Send WR. This function releases all 261 + * write_info structures accumulated on @ctxt->sc_write_info_list. 262 + */ 263 + void svc_rdma_write_chunk_release(struct svcxprt_rdma *rdma, 264 + struct svc_rdma_send_ctxt *ctxt) 265 + { 266 + struct svc_rdma_write_info *info; 267 + 268 + while (!list_empty(&ctxt->sc_write_info_list)) { 269 + info = list_first_entry(&ctxt->sc_write_info_list, 270 + struct svc_rdma_write_info, wi_list); 271 + list_del(&info->wi_list); 272 + svc_rdma_write_info_free(info); 273 + } 274 + } 275 + 276 + /** 255 277 * svc_rdma_reply_chunk_release - Release Reply chunk I/O resources 256 278 * @rdma: controlling transport 257 279 * @ctxt: Send context that is being released ··· 329 307 struct ib_cqe *cqe = wc->wr_cqe; 330 308 struct svc_rdma_chunk_ctxt *cc = 331 309 container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); 332 - struct svc_rdma_write_info *info = 333 - container_of(cc, struct svc_rdma_write_info, wi_cc); 334 310 335 311 switch (wc->status) { 336 312 case IB_WC_SUCCESS: 337 313 trace_svcrdma_wc_write(&cc->cc_cid); 338 - break; 314 + return; 339 315 case IB_WC_WR_FLUSH_ERR: 340 316 trace_svcrdma_wc_write_flush(wc, &cc->cc_cid); 341 317 break; ··· 341 321 trace_svcrdma_wc_write_err(wc, &cc->cc_cid); 342 322 } 343 323 344 - svc_rdma_wake_send_waiters(rdma, cc->cc_sqecount); 345 - 346 - if (unlikely(wc->status != IB_WC_SUCCESS)) 347 - svc_xprt_deferred_close(&rdma->sc_xprt); 348 - 349 - svc_rdma_write_info_free(info); 324 + /* The RDMA Write has flushed, so the client won't get 325 + * some of the outgoing RPC message. Signal the loss 326 + * to the client by closing the connection. 
327 + */ 328 + svc_xprt_deferred_close(&rdma->sc_xprt); 350 329 } 351 330 352 331 /** ··· 619 600 return xdr->len; 620 601 } 621 602 622 - static int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, 623 - const struct svc_rdma_chunk *chunk, 624 - const struct xdr_buf *xdr) 603 + /* 604 + * svc_rdma_prepare_write_chunk - Link Write WRs for @chunk onto @sctxt's chain 605 + * 606 + * Write WRs are prepended to the Send WR chain so that a single 607 + * ib_post_send() posts both RDMA Writes and the final Send. Only 608 + * the first WR in each chunk gets a CQE for error detection; 609 + * subsequent WRs complete without individual completion events. 610 + * The Send WR's signaled completion indicates all chained 611 + * operations have finished. 612 + */ 613 + static int svc_rdma_prepare_write_chunk(struct svcxprt_rdma *rdma, 614 + struct svc_rdma_send_ctxt *sctxt, 615 + const struct svc_rdma_chunk *chunk, 616 + const struct xdr_buf *xdr) 625 617 { 626 618 struct svc_rdma_write_info *info; 627 619 struct svc_rdma_chunk_ctxt *cc; 620 + struct ib_send_wr *first_wr; 628 621 struct xdr_buf payload; 622 + struct list_head *pos; 623 + struct ib_cqe *cqe; 629 624 int ret; 630 625 631 626 if (xdr_buf_subsegment(xdr, &payload, chunk->ch_position, ··· 655 622 if (ret != payload.len) 656 623 goto out_err; 657 624 658 - trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); 659 - ret = svc_rdma_post_chunk_ctxt(rdma, cc); 660 - if (ret < 0) 625 + ret = -EINVAL; 626 + if (unlikely(sctxt->sc_sqecount + cc->cc_sqecount > rdma->sc_sq_depth)) 661 627 goto out_err; 628 + 629 + first_wr = sctxt->sc_wr_chain; 630 + cqe = &cc->cc_cqe; 631 + list_for_each(pos, &cc->cc_rwctxts) { 632 + struct svc_rdma_rw_ctxt *rwc; 633 + 634 + rwc = list_entry(pos, struct svc_rdma_rw_ctxt, rw_list); 635 + first_wr = rdma_rw_ctx_wrs(&rwc->rw_ctx, rdma->sc_qp, 636 + rdma->sc_port_num, cqe, first_wr); 637 + cqe = NULL; 638 + } 639 + sctxt->sc_wr_chain = first_wr; 640 + sctxt->sc_sqecount += 
cc->cc_sqecount; 641 + list_add(&info->wi_list, &sctxt->sc_write_info_list); 642 + 643 + trace_svcrdma_post_write_chunk(&cc->cc_cid, cc->cc_sqecount); 662 644 return 0; 663 645 664 646 out_err: ··· 682 634 } 683 635 684 636 /** 685 - * svc_rdma_send_write_list - Send all chunks on the Write list 637 + * svc_rdma_prepare_write_list - Construct WR chain for sending Write list 686 638 * @rdma: controlling RDMA transport 687 639 * @rctxt: Write list provisioned by the client 640 + * @sctxt: Send WR resources 688 641 * @xdr: xdr_buf containing an RPC Reply message 689 642 * 690 - * Returns zero on success, or a negative errno if one or more 691 - * Write chunks could not be sent. 643 + * Returns zero on success, or a negative errno if WR chain 644 + * construction fails for one or more Write chunks. 692 645 */ 693 - int svc_rdma_send_write_list(struct svcxprt_rdma *rdma, 694 - const struct svc_rdma_recv_ctxt *rctxt, 695 - const struct xdr_buf *xdr) 646 + int svc_rdma_prepare_write_list(struct svcxprt_rdma *rdma, 647 + const struct svc_rdma_recv_ctxt *rctxt, 648 + struct svc_rdma_send_ctxt *sctxt, 649 + const struct xdr_buf *xdr) 696 650 { 697 651 struct svc_rdma_chunk *chunk; 698 652 int ret; ··· 702 652 pcl_for_each_chunk(chunk, &rctxt->rc_write_pcl) { 703 653 if (!chunk->ch_payload_length) 704 654 break; 705 - ret = svc_rdma_send_write_chunk(rdma, chunk, xdr); 655 + ret = svc_rdma_prepare_write_chunk(rdma, sctxt, chunk, xdr); 706 656 if (ret < 0) 707 657 return ret; 708 658 }
+9 -1
net/sunrpc/xprtrdma/svc_rdma_sendto.c
··· 150 150 ctxt->sc_send_wr.sg_list = ctxt->sc_sges; 151 151 ctxt->sc_send_wr.send_flags = IB_SEND_SIGNALED; 152 152 ctxt->sc_cqe.done = svc_rdma_wc_send; 153 + INIT_LIST_HEAD(&ctxt->sc_write_info_list); 153 154 ctxt->sc_xprt_buf = buffer; 154 155 xdr_buf_init(&ctxt->sc_hdrbuf, ctxt->sc_xprt_buf, 155 156 rdma->sc_max_req_size); ··· 238 237 struct ib_device *device = rdma->sc_cm_id->device; 239 238 unsigned int i; 240 239 240 + svc_rdma_write_chunk_release(rdma, ctxt); 241 241 svc_rdma_reply_chunk_release(rdma, ctxt); 242 242 243 243 if (ctxt->sc_page_count) ··· 1056 1054 sctxt->sc_send_wr.num_sge = 1; 1057 1055 sctxt->sc_send_wr.opcode = IB_WR_SEND; 1058 1056 sctxt->sc_sges[0].length = sctxt->sc_hdrbuf.len; 1057 + 1058 + /* Ensure only the error message is posted, not any previously 1059 + * prepared Write chunk WRs. 1060 + */ 1061 + sctxt->sc_wr_chain = &sctxt->sc_send_wr; 1062 + sctxt->sc_sqecount = 1; 1059 1063 if (svc_rdma_post_send(rdma, sctxt)) 1060 1064 goto put_ctxt; 1061 1065 return; ··· 1109 1101 if (!p) 1110 1102 goto put_ctxt; 1111 1103 1112 - ret = svc_rdma_send_write_list(rdma, rctxt, &rqstp->rq_res); 1104 + ret = svc_rdma_prepare_write_list(rdma, rctxt, sctxt, &rqstp->rq_res); 1113 1105 if (ret < 0) 1114 1106 goto put_ctxt; 1115 1107