Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'net-rds-rds-tcp-state-machine-and-message-loss-improvements'

Allison Henderson says:

====================
net/rds: RDS-TCP state machine and message loss improvements

This is subset 2 of the larger RDS-TCP patch series I posted last
Oct. The greater series aims to correct multiple rds-tcp issues that
can cause dropped or out of sequence messages. I've broken it down into
smaller sets to make reviews more manageable.

In this set, we correct a few RDS/TCP connection handling issues, and
message loss issues.
====================

Link: https://patch.msgid.link/20260122055213.83608-1-achender@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+171 -81
+5
net/rds/connection.c
··· 395 395 if (!rds_conn_path_transition(cp, RDS_CONN_UP, 396 396 RDS_CONN_DISCONNECTING) && 397 397 !rds_conn_path_transition(cp, RDS_CONN_ERROR, 398 + RDS_CONN_DISCONNECTING) && 399 + !rds_conn_path_transition(cp, RDS_CONN_RESETTING, 398 400 RDS_CONN_DISCONNECTING)) { 399 401 rds_conn_path_error(cp, 400 402 "shutdown called in state %d\n", ··· 449 447 } else { 450 448 rcu_read_unlock(); 451 449 } 450 + 451 + if (conn->c_trans->conn_slots_available) 452 + conn->c_trans->conn_slots_available(conn); 452 453 } 453 454 454 455 /* destroy a single rds_conn_path. rds_conn_destroy() iterates over
+39 -27
net/rds/rds.h
··· 506 506 */ 507 507 #define RDS_TRANS_LOOP 3 508 508 509 - /** 510 - * struct rds_transport - transport specific behavioural hooks 511 - * 512 - * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send 513 - * part of a message. The caller serializes on the send_sem so this 514 - * doesn't need to be reentrant for a given conn. The header must be 515 - * sent before the data payload. .xmit must be prepared to send a 516 - * message with no data payload. .xmit should return the number of 517 - * bytes that were sent down the connection, including header bytes. 518 - * Returning 0 tells the caller that it doesn't need to perform any 519 - * additional work now. This is usually the case when the transport has 520 - * filled the sending queue for its connection and will handle 521 - * triggering the rds thread to continue the send when space becomes 522 - * available. Returning -EAGAIN tells the caller to retry the send 523 - * immediately. Returning -ENOMEM tells the caller to retry the send at 524 - * some point in the future. 525 - * 526 - * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once 527 - * it returns the connection can not call rds_recv_incoming(). 528 - * This will only be called once after conn_connect returns 529 - * non-zero success and will The caller serializes this with 530 - * the send and connecting paths (xmit_* and conn_*). The 531 - * transport is responsible for other serialization, including 532 - * rds_recv_incoming(). This is called in process context but 533 - * should try hard not to block. 534 - */ 535 - 536 509 struct rds_transport { 537 510 char t_name[TRANSNAMSIZ]; 538 511 struct list_head t_item; ··· 518 545 __u32 scope_id); 519 546 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); 520 547 void (*conn_free)(void *data); 548 + 549 + /* 550 + * conn_slots_available is invoked when a previously unavailable 551 + * connection slot becomes available again. rds_tcp_accept_one_path may 552 + * return -ENOBUFS if it cannot find an available slot, and then stashes 553 + * the new socket in "rds_tcp_accepted_sock". This function re-issues 554 + * `rds_tcp_accept_one_path`, which picks up the stashed socket and 555 + * continuing where it left with "-ENOBUFS" last time. This ensures 556 + * messages received on the new socket are not discarded when no 557 + * connection path was available at the time. 558 + */ 559 + void (*conn_slots_available)(struct rds_connection *conn); 521 560 int (*conn_path_connect)(struct rds_conn_path *cp); 561 + 562 + /* 563 + * conn_shutdown stops traffic on the given connection. Once 564 + * it returns the connection can not call rds_recv_incoming(). 565 + * This will only be called once after conn_connect returns 566 + * non-zero success and will The caller serializes this with 567 + * the send and connecting paths (xmit_* and conn_*). The 568 + * transport is responsible for other serialization, including 569 + * rds_recv_incoming(). This is called in process context but 570 + * should try hard not to block. 571 + */ 522 572 void (*conn_path_shutdown)(struct rds_conn_path *conn); 523 573 void (*xmit_path_prepare)(struct rds_conn_path *cp); 524 574 void (*xmit_path_complete)(struct rds_conn_path *cp); 575 + 576 + /* 577 + * .xmit is called by rds_send_xmit() to tell the transport to send 578 + * part of a message. The caller serializes on the send_sem so this 579 + * doesn't need to be reentrant for a given conn. The header must be 580 + * sent before the data payload. .xmit must be prepared to send a 581 + * message with no data payload. .xmit should return the number of 582 + * bytes that were sent down the connection, including header bytes. 583 + * Returning 0 tells the caller that it doesn't need to perform any 584 + * additional work now. This is usually the case when the transport has 585 + * filled the sending queue for its connection and will handle 586 + * triggering the rds thread to continue the send when space becomes 587 + * available. Returning -EAGAIN tells the caller to retry the send 588 + * immediately. Returning -ENOMEM tells the caller to retry the send at 589 + * some point in the future. 590 + */ 525 591 int (*xmit)(struct rds_connection *conn, struct rds_message *rm, 526 592 unsigned int hdr_off, unsigned int sg, unsigned int off); 527 593 int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
+4
net/rds/recv.c
··· 230 230 conn->c_npaths = max_t(int, conn->c_npaths, 1); 231 231 conn->c_ping_triggered = 0; 232 232 rds_conn_peer_gen_update(conn, new_peer_gen_num); 233 + 234 + if (conn->c_npaths > 1 && 235 + conn->c_trans->conn_slots_available) 236 + conn->c_trans->conn_slots_available(conn); 233 237 } 234 238 235 239 /* rds_start_mprds() will synchronously start multiple paths when appropriate.
+11 -16
net/rds/tcp.c
··· 213 213 sock->sk->sk_data_ready = sock->sk->sk_user_data; 214 214 215 215 tc->t_sock = sock; 216 + if (!tc->t_rtn) 217 + tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid); 216 218 tc->t_cpath = cp; 217 219 tc->t_orig_data_ready = sock->sk->sk_data_ready; 218 220 tc->t_orig_write_space = sock->sk->sk_write_space; ··· 380 378 } 381 379 mutex_init(&tc->t_conn_path_lock); 382 380 tc->t_sock = NULL; 381 + tc->t_rtn = NULL; 383 382 tc->t_tinc = NULL; 384 383 tc->t_tinc_hdr_rem = sizeof(struct rds_header); 385 384 tc->t_tinc_data_rem = 0; ··· 461 458 .recv_path = rds_tcp_recv_path, 462 459 .conn_alloc = rds_tcp_conn_alloc, 463 460 .conn_free = rds_tcp_conn_free, 461 + .conn_slots_available = rds_tcp_conn_slots_available, 464 462 .conn_path_connect = rds_tcp_conn_path_connect, 465 463 .conn_path_shutdown = rds_tcp_conn_path_shutdown, 466 464 .inc_copy_to_user = rds_tcp_inc_copy_to_user, ··· 477 473 .t_unloading = rds_tcp_is_unloading, 478 474 }; 479 475 480 - static unsigned int rds_tcp_netid; 481 - 482 - /* per-network namespace private data for this module */ 483 - struct rds_tcp_net { 484 - struct socket *rds_tcp_listen_sock; 485 - struct work_struct rds_tcp_accept_w; 486 - struct ctl_table_header *rds_tcp_sysctl; 487 - struct ctl_table *ctl_table; 488 - int sndbuf_size; 489 - int rcvbuf_size; 490 - }; 476 + int rds_tcp_netid; 491 477 492 478 /* All module specific customizations to the RDS-TCP socket should be done in 493 479 * rds_tcp_tune() and applied after socket creation. ··· 520 526 struct rds_tcp_net, 521 527 rds_tcp_accept_w); 522 528 523 - while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) 529 + while (rds_tcp_accept_one(rtn) == 0) 524 530 cond_resched(); 525 531 } 526 532 527 - void rds_tcp_accept_work(struct sock *sk) 533 + void rds_tcp_accept_work(struct rds_tcp_net *rtn) 528 534 { 529 - struct net *net = sock_net(sk); 530 - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); 531 - 532 535 queue_work(rds_wq, &rtn->rds_tcp_accept_w); 533 536 } 534 537 ··· 536 545 int err = 0; 537 546 538 547 memset(rtn, 0, sizeof(*rtn)); 548 + 549 + mutex_init(&rtn->rds_tcp_accept_lock); 539 550 540 551 /* {snd, rcv}buf_size default to 0, which implies we let the 541 552 * stack pick the value, and permit auto-tuning of buffer size. ··· 602 609 603 610 rtn->rds_tcp_listen_sock = NULL; 604 611 rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); 612 + if (rtn->rds_tcp_accepted_sock) 613 + sock_release(rtn->rds_tcp_accepted_sock); 605 614 spin_lock_irq(&rds_tcp_conn_lock); 606 615 list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { 607 616 struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
+20 -2
net/rds/tcp.h
··· 4 4 5 5 #define RDS_TCP_PORT 16385 6 6 7 + /* per-network namespace private data for this module */ 8 + struct rds_tcp_net { 9 + /* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock" 10 + * to protect "rds_tcp_accepted_sock" 11 + */ 12 + struct mutex rds_tcp_accept_lock; 13 + struct socket *rds_tcp_listen_sock; 14 + struct socket *rds_tcp_accepted_sock; 15 + struct work_struct rds_tcp_accept_w; 16 + struct ctl_table_header *rds_tcp_sysctl; 17 + const struct ctl_table *ctl_table; 18 + int sndbuf_size; 19 + int rcvbuf_size; 20 + }; 21 + 7 22 struct rds_tcp_incoming { 8 23 struct rds_incoming ti_inc; 9 24 struct sk_buff_head ti_skb_list; ··· 34 19 */ 35 20 struct mutex t_conn_path_lock; 36 21 struct socket *t_sock; 22 + struct rds_tcp_net *t_rtn; 37 23 void *t_orig_write_space; 38 24 void *t_orig_data_ready; 39 25 void *t_orig_state_change; ··· 65 49 }; 66 50 67 51 /* tcp.c */ 52 + extern int rds_tcp_netid; 68 53 bool rds_tcp_tune(struct socket *sock); 69 54 void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); 70 55 void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); ··· 74 57 u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); 75 58 u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); 76 59 extern struct rds_transport rds_tcp_transport; 77 - void rds_tcp_accept_work(struct sock *sk); 60 + void rds_tcp_accept_work(struct rds_tcp_net *rtn); 78 61 int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, 79 62 __u32 scope_id); 80 63 /* tcp_connect.c */ ··· 86 69 struct socket *rds_tcp_listen_init(struct net *net, bool isv6); 87 70 void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); 88 71 void rds_tcp_listen_data_ready(struct sock *sk); 89 - int rds_tcp_accept_one(struct socket *sock); 72 + void rds_tcp_conn_slots_available(struct rds_connection *conn); 73 + int rds_tcp_accept_one(struct rds_tcp_net *rtn); 90 74 void rds_tcp_keepalive(struct socket *sock); 91 75 void *rds_tcp_listen_sock_def_readable(struct net *net); 92 76
+92 -36
net/rds/tcp_listen.c
··· 35 35 #include <linux/in.h> 36 36 #include <net/tcp.h> 37 37 #include <trace/events/sock.h> 38 + #include <net/net_namespace.h> 39 + #include <net/netns/generic.h> 38 40 39 41 #include "rds.h" 40 42 #include "tcp.h" ··· 61 59 * socket and force a reconneect from smaller -> larger ip addr. The reason 62 60 * we special case cp_index 0 is to allow the rds probe ping itself to itself 63 61 * get through efficiently. 64 - * Since reconnects are only initiated from the node with the numerically 65 - * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side 66 - * by moving them to CONNECTING in this function. 67 62 */ 68 63 static 69 64 struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) ··· 68 69 int i; 69 70 int npaths = max_t(int, 1, conn->c_npaths); 70 71 71 - /* for mprds, all paths MUST be initiated by the peer 72 - * with the smaller address. 73 - */ 74 - if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { 75 - /* Make sure we initiate at least one path if this 76 - * has not already been done; rds_start_mprds() will 77 - * take care of additional paths, if necessary. 78 - */ 79 - if (npaths == 1) 80 - rds_conn_path_connect_if_down(&conn->c_path[0]); 81 - return NULL; 82 - } 83 - 84 72 for (i = 0; i < npaths; i++) { 85 73 struct rds_conn_path *cp = &conn->c_path[i]; 86 74 87 75 if (rds_conn_path_transition(cp, RDS_CONN_DOWN, 88 - RDS_CONN_CONNECTING) || 89 - rds_conn_path_transition(cp, RDS_CONN_ERROR, 90 - RDS_CONN_CONNECTING)) { 76 + RDS_CONN_CONNECTING)) 91 77 return cp->cp_transport_data; 92 - } 93 78 } 94 79 return NULL; 95 80 } 96 81 97 - int rds_tcp_accept_one(struct socket *sock) 82 + void rds_tcp_conn_slots_available(struct rds_connection *conn) 98 83 { 84 + struct rds_tcp_connection *tc; 85 + struct rds_tcp_net *rtn; 86 + 87 + if (rds_destroy_pending(conn)) 88 + return; 89 + 90 + tc = conn->c_path->cp_transport_data; 91 + rtn = tc->t_rtn; 92 + if (!rtn) 93 + return; 94 + 95 + /* As soon as a connection went down, 96 + * it is safe to schedule a "rds_tcp_accept_one" 97 + * attempt even if there are no connections pending: 98 + * Function "rds_tcp_accept_one" won't block 99 + * but simply return -EAGAIN in that case. 100 + * 101 + * Doing so is necessary to address the case where an 102 + * incoming connection on "rds_tcp_listen_sock" is ready 103 + * to be acccepted prior to a free slot being available: 104 + * the -ENOBUFS case in "rds_tcp_accept_one". 105 + */ 106 + rds_tcp_accept_work(rtn); 107 + } 108 + 109 + int rds_tcp_accept_one(struct rds_tcp_net *rtn) 110 + { 111 + struct socket *listen_sock = rtn->rds_tcp_listen_sock; 99 112 struct socket *new_sock = NULL; 100 113 struct rds_connection *conn; 101 114 int ret; ··· 121 110 #endif 122 111 int dev_if = 0; 123 112 124 - if (!sock) /* module unload or netns delete in progress */ 113 + if (!listen_sock) /* module unload or netns delete in progress */ 125 114 return -ENETUNREACH; 126 115 127 - ret = kernel_accept(sock, &new_sock, O_NONBLOCK); 128 - if (ret) 129 - return ret; 116 + mutex_lock(&rtn->rds_tcp_accept_lock); 117 + new_sock = rtn->rds_tcp_accepted_sock; 118 + rtn->rds_tcp_accepted_sock = NULL; 130 119 131 - rds_tcp_keepalive(new_sock); 132 - if (!rds_tcp_tune(new_sock)) { 133 - ret = -EINVAL; 134 - goto out; 120 + if (!new_sock) { 121 + ret = kernel_accept(listen_sock, &new_sock, O_NONBLOCK); 122 + if (ret) 123 + goto out; 124 + 125 + rds_tcp_keepalive(new_sock); 126 + if (!rds_tcp_tune(new_sock)) { 127 + ret = -EINVAL; 128 + goto out; 129 + } 135 130 } 136 131 137 132 inet = inet_sk(new_sock->sk); ··· 152 135 peer_addr = &daddr; 153 136 #endif 154 137 rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", 155 - sock->sk->sk_family, 138 + listen_sock->sk->sk_family, 156 139 my_addr, ntohs(inet->inet_sport), 157 140 peer_addr, ntohs(inet->inet_dport)); 158 141 ··· 172 155 } 173 156 #endif 174 157 175 - if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) { 158 + if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) { 176 159 /* local address connection is only allowed via loopback */ 177 160 ret = -EOPNOTSUPP; 178 161 goto out; 179 162 } 180 163 181 - conn = rds_conn_create(sock_net(sock->sk), 164 + conn = rds_conn_create(sock_net(listen_sock->sk), 182 165 my_addr, peer_addr, 183 166 &rds_tcp_transport, 0, GFP_KERNEL, dev_if); 184 167 ··· 191 174 * If the client reboots, this conn will need to be cleaned up. 192 175 * rds_tcp_state_change() will do that cleanup 193 176 */ 194 - rs_tcp = rds_tcp_accept_one_path(conn); 195 - if (!rs_tcp) 177 + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) { 178 + /* Try to obtain a free connection slot. 179 + * If unsuccessful, we need to preserve "new_sock" 180 + * that we just accepted, since its "sk_receive_queue" 181 + * may contain messages already that have been acknowledged 182 + * to and discarded by the sender. 183 + * We must not throw those away! 184 + */ 185 + rs_tcp = rds_tcp_accept_one_path(conn); 186 + if (!rs_tcp) { 187 + /* It's okay to stash "new_sock", since 188 + * "rds_tcp_conn_slots_available" triggers 189 + * "rds_tcp_accept_one" again as soon as one of the 190 + * connection slots becomes available again 191 + */ 192 + rtn->rds_tcp_accepted_sock = new_sock; 193 + new_sock = NULL; 194 + ret = -ENOBUFS; 195 + goto out; 196 + } 197 + } else { 198 + /* This connection request came from a peer with 199 + * a larger address. 200 + * Function "rds_tcp_state_change" makes sure 201 + * that the connection doesn't transition 202 + * to state "RDS_CONN_UP", and therefore 203 + * we should not have received any messages 204 + * on this socket yet. 205 + * This is the only case where it's okay to 206 + * not dequeue messages from "sk_receive_queue". 207 + */ 208 + if (conn->c_npaths <= 1) 209 + rds_conn_path_connect_if_down(&conn->c_path[0]); 210 + rs_tcp = NULL; 196 211 goto rst_nsk; 212 + } 213 + 197 214 mutex_lock(&rs_tcp->t_conn_path_lock); 198 215 cp = rs_tcp->t_cpath; 199 216 conn_state = rds_conn_path_state(cp); 200 217 WARN_ON(conn_state == RDS_CONN_UP); 201 - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) 218 + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) { 219 + rds_conn_path_drop(cp, 0); 202 220 goto rst_nsk; 221 + } 203 222 if (rs_tcp->t_sock) { 204 223 /* Duelling SYN has been handled in rds_tcp_accept_one() */ 205 224 rds_tcp_reset_callbacks(new_sock, cp); ··· 265 212 mutex_unlock(&rs_tcp->t_conn_path_lock); 266 213 if (new_sock) 267 214 sock_release(new_sock); 215 + 216 + mutex_unlock(&rtn->rds_tcp_accept_lock); 217 + 268 218 return ret; 269 219 } 270 220 ··· 295 239 * the listen socket is being torn down. 296 240 */ 297 241 if (sk->sk_state == TCP_LISTEN) 298 - rds_tcp_accept_work(sk); 242 + rds_tcp_accept_work(net_generic(sock_net(sk), rds_tcp_netid)); 299 243 else 300 244 ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); 301 245