svcrdma: Add fair queuing for Send Queue access

+10

include/linux/sunrpc/svc_rdma.h

@@ -84,6 +84,9 @@
 
 	atomic_t sc_sq_avail; /* SQEs ready to be consumed */
 	unsigned int sc_sq_depth; /* Depth of SQ */
+	atomic_t sc_sq_ticket_head; /* Next ticket to issue */
+	atomic_t sc_sq_ticket_tail; /* Ticket currently serving */
+	wait_queue_head_t sc_sq_ticket_wait; /* Ticket ordering waitlist */
 	__be32 sc_fc_credits; /* Forward credits */
 	u32 sc_max_requests; /* Max requests */
 	u32 sc_max_bc_requests;/* Backward credits */
@@ -306,6 +309,13 @@
 				    struct svc_rdma_recv_ctxt *rctxt,
 				    int status);
 extern void svc_rdma_wake_send_waiters(struct svcxprt_rdma *rdma, int avail);
+extern int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
+			    const struct rpc_rdma_cid *cid, int sqecount);
+extern int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
+				  const struct rpc_rdma_cid *cid,
+				  const struct ib_send_wr *bad_wr,
+				  const struct ib_send_wr *first_wr,
+				  int sqecount, int ret);
 extern int svc_rdma_sendto(struct svc_rqst *);
 extern int svc_rdma_result_payload(struct svc_rqst *rqstp, unsigned int offset,
 				   unsigned int length);

+10 -27

net/sunrpc/xprtrdma/svc_rdma_rw.c

@@ -405,34 +405,17 @@
 		cqe = NULL;
 	}
 
-	do {
-		if (atomic_sub_return(cc->cc_sqecount,
-				      &rdma->sc_sq_avail) > 0) {
-			cc->cc_posttime = ktime_get();
-			ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
-			if (ret)
-				break;
-			return 0;
-		}
+	ret = svc_rdma_sq_wait(rdma, &cc->cc_cid, cc->cc_sqecount);
+	if (ret < 0)
+		return ret;
 
-		percpu_counter_inc(&svcrdma_stat_sq_starve);
-		trace_svcrdma_sq_full(rdma, &cc->cc_cid);
-		atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
-		wait_event(rdma->sc_send_wait,
-			   atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount);
-		trace_svcrdma_sq_retry(rdma, &cc->cc_cid);
-	} while (1);
-
-	trace_svcrdma_sq_post_err(rdma, &cc->cc_cid, ret);
-	svc_xprt_deferred_close(&rdma->sc_xprt);
-
-	/* If even one was posted, there will be a completion. */
-	if (bad_wr != first_wr)
-		return 0;
-
-	atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail);
-	wake_up(&rdma->sc_send_wait);
-	return -ENOTCONN;
+	cc->cc_posttime = ktime_get();
+	ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+	if (ret)
+		return svc_rdma_post_send_err(rdma, &cc->cc_cid, bad_wr,
+					      first_wr, cc->cc_sqecount,
+					      ret);
+	return 0;
 }
 
 /* Build a bvec that covers one kvec in an xdr_buf.

+120 -40

net/sunrpc/xprtrdma/svc_rdma_sendto.c

@@ -295,6 +295,117 @@
 }
 
 /**
+ * svc_rdma_sq_wait - Wait for SQ slots using fair queuing
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @sqecount: number of SQ entries needed
+ *
+ * A ticket-based system ensures fair ordering when multiple threads
+ * wait for Send Queue capacity. Each waiter takes a ticket and is
+ * served in order, preventing starvation.
+ *
+ * Protocol invariant: every ticket holder must increment
+ * sc_sq_ticket_tail exactly once, whether the reservation
+ * succeeds or the connection closes. Failing to advance the
+ * tail stalls all subsequent waiters.
+ *
+ * The ticket counters are signed 32-bit atomics. After
+ * wrapping through INT_MAX, the equality check
+ * (tail == ticket) remains correct because both counters
+ * advance monotonically and the comparison uses exact
+ * equality rather than relational operators.
+ *
+ * Return values:
+ *   %0: SQ slots were reserved successfully
+ *   %-ENOTCONN: The connection was lost
+ */
+int svc_rdma_sq_wait(struct svcxprt_rdma *rdma,
+		     const struct rpc_rdma_cid *cid, int sqecount)
+{
+	int ticket;
+
+	/* Fast path: try to reserve SQ slots without waiting.
+	 *
+	 * A failed reservation temporarily understates sc_sq_avail
+	 * until the compensating atomic_add restores it. A Send
+	 * completion arriving in that window sees a lower count
+	 * than reality, but the value self-corrects once the add
+	 * completes. No ordering guarantee is needed here because
+	 * the slow path serializes all contended waiters.
+	 */
+	if (likely(atomic_sub_return(sqecount, &rdma->sc_sq_avail) >= 0))
+		return 0;
+	atomic_add(sqecount, &rdma->sc_sq_avail);
+
+	/* Slow path: take a ticket and wait in line */
+	ticket = atomic_fetch_inc(&rdma->sc_sq_ticket_head);
+
+	percpu_counter_inc(&svcrdma_stat_sq_starve);
+	trace_svcrdma_sq_full(rdma, cid);
+
+	/* Wait until all earlier tickets have been served */
+	wait_event(rdma->sc_sq_ticket_wait,
+		   test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+		   atomic_read(&rdma->sc_sq_ticket_tail) == ticket);
+	if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+		goto out_close;
+
+	/* It's our turn. Wait for enough SQ slots to be available. */
+	while (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
+		atomic_add(sqecount, &rdma->sc_sq_avail);
+
+		wait_event(rdma->sc_send_wait,
+			   test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags) ||
+			   atomic_read(&rdma->sc_sq_avail) >= sqecount);
+		if (test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags))
+			goto out_close;
+	}
+
+	/* Slots reserved successfully. Let the next waiter proceed. */
+	atomic_inc(&rdma->sc_sq_ticket_tail);
+	wake_up(&rdma->sc_sq_ticket_wait);
+	trace_svcrdma_sq_retry(rdma, cid);
+	return 0;
+
+out_close:
+	atomic_inc(&rdma->sc_sq_ticket_tail);
+	wake_up(&rdma->sc_sq_ticket_wait);
+	return -ENOTCONN;
+}
+
+/**
+ * svc_rdma_post_send_err - Handle ib_post_send failure
+ * @rdma: controlling transport
+ * @cid: completion ID for tracing
+ * @bad_wr: first WR that was not posted
+ * @first_wr: first WR in the chain
+ * @sqecount: number of SQ entries that were reserved
+ * @ret: error code from ib_post_send
+ *
+ * Return values:
+ *   %0: At least one WR was posted; a completion handles cleanup
+ *   %-ENOTCONN: No WRs were posted; SQ slots are released
+ */
+int svc_rdma_post_send_err(struct svcxprt_rdma *rdma,
+			   const struct rpc_rdma_cid *cid,
+			   const struct ib_send_wr *bad_wr,
+			   const struct ib_send_wr *first_wr,
+			   int sqecount, int ret)
+{
+	trace_svcrdma_sq_post_err(rdma, cid, ret);
+	svc_xprt_deferred_close(&rdma->sc_xprt);
+
+	/* If even one WR was posted, a Send completion will
+	 * return the reserved SQ slots.
+	 */
+	if (bad_wr != first_wr)
+		return 0;
+
+	svc_rdma_wake_send_waiters(rdma, sqecount);
+	return -ENOTCONN;
+}
+
+/**
  * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
  * @cq: Completion Queue context
  * @wc: Work Completion object
@@ -336,11 +447,6 @@
  * that these values remain available after the ib_post_send() call.
  * In some error flow cases, svc_rdma_wc_send() releases @ctxt.
  *
- * Note there is potential for starvation when the Send Queue is
- * full because there is no order to when waiting threads are
- * awoken. The transport is typically provisioned with a deep
- * enough Send Queue that SQ exhaustion should be a rare event.
- *
  * Return values:
  *   %0: @ctxt's WR chain was posted successfully
  *   %-ENOTCONN: The connection was lost
@@ -362,42 +468,16 @@
 				      send_wr->sg_list[0].length,
 				      DMA_TO_DEVICE);
 
-	/* If the SQ is full, wait until an SQ entry is available */
-	while (!test_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags)) {
-		if (atomic_sub_return(sqecount, &rdma->sc_sq_avail) < 0) {
-			svc_rdma_wake_send_waiters(rdma, sqecount);
+	ret = svc_rdma_sq_wait(rdma, &cid, sqecount);
+	if (ret < 0)
+		return ret;
 
-			/* When the transport is torn down, assume
-			 * ib_drain_sq() will trigger enough Send
-			 * completions to wake us. The XPT_CLOSE test
-			 * above should then cause the while loop to
-			 * exit.
-			 */
-			percpu_counter_inc(&svcrdma_stat_sq_starve);
-			trace_svcrdma_sq_full(rdma, &cid);
-			wait_event(rdma->sc_send_wait,
-				   atomic_read(&rdma->sc_sq_avail) > 0);
-			trace_svcrdma_sq_retry(rdma, &cid);
-			continue;
-		}
-
-		trace_svcrdma_post_send(ctxt);
-		ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
-		if (ret) {
-			trace_svcrdma_sq_post_err(rdma, &cid, ret);
-			svc_xprt_deferred_close(&rdma->sc_xprt);
-
-			/* If even one WR was posted, there will be a
-			 * Send completion that bumps sc_sq_avail.
-			 */
-			if (bad_wr == first_wr) {
-				svc_rdma_wake_send_waiters(rdma, sqecount);
-				break;
-			}
-		}
-		return 0;
-	}
-	return -ENOTCONN;
+	trace_svcrdma_post_send(ctxt);
+	ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr);
+	if (ret)
+		return svc_rdma_post_send_err(rdma, &cid, bad_wr,
+					      first_wr, sqecount, ret);
+	return 0;
 }
 
 /**

+5 -1

net/sunrpc/xprtrdma/svc_rdma_transport.c

@@ -179,6 +179,7 @@
 	init_llist_head(&cma_xprt->sc_recv_ctxts);
 	init_llist_head(&cma_xprt->sc_rw_ctxts);
 	init_waitqueue_head(&cma_xprt->sc_send_wait);
+	init_waitqueue_head(&cma_xprt->sc_sq_ticket_wait);
 
 	spin_lock_init(&cma_xprt->sc_lock);
 	spin_lock_init(&cma_xprt->sc_rq_dto_lock);
@@ -477,6 +478,8 @@
 	if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr)
 		newxprt->sc_sq_depth = dev->attrs.max_qp_wr;
 	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
+	atomic_set(&newxprt->sc_sq_ticket_head, 0);
+	atomic_set(&newxprt->sc_sq_ticket_tail, 0);
 
 	newxprt->sc_pd = ib_alloc_pd(dev, 0);
 	if (IS_ERR(newxprt->sc_pd)) {
@@ -649,7 +652,8 @@
 	 * If there are already waiters on the SQ,
 	 * return false.
 	 */
-	if (waitqueue_active(&rdma->sc_send_wait))
+	if (waitqueue_active(&rdma->sc_send_wait) ||
+	    waitqueue_active(&rdma->sc_sq_ticket_wait))
 		return 0;
 
 	/* Otherwise return true. */

Configure Feed

Configure Feed