Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'tcp-rfc-7323-compliant-window-retraction-handling'

Simon Baatz says:

====================
tcp: RFC 7323-compliant window retraction handling

This series implements the receiver-side requirements for TCP window
retraction as specified in RFC 7323 and adds packetdrill tests to
cover the new behavior.

Please see the first patch for background and implementation
details. Since MPTCP adjusts the TCP receive window on subflows, the
relevant MPTCP code paths are updated accordingly.
====================

Link: https://patch.msgid.link/20260309-tcp_rfc7323_retract_wnd_rfc-v3-0-4c7f96b1ec69@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+242 -8
+1
Documentation/networking/net_cachelines/tcp_sock.rst
··· 121 121 u32 rate_delivered read_mostly tcp_rate_gen 122 122 u32 rate_interval_us read_mostly rate_delivered,rate_app_limited 123 123 u32 rcv_wnd read_write read_mostly tcp_select_window,tcp_receive_window,tcp_fast_path_check 124 + u32 rcv_mwnd_seq read_write tcp_select_window 124 125 u32 write_seq read_write tcp_rate_check_app_limited,tcp_write_queue_empty,tcp_skb_entail,forced_push,tcp_mark_push 125 126 u32 notsent_lowat read_mostly tcp_stream_memory_free 126 127 u32 pushed_seq read_write tcp_mark_push,forced_push
+3
include/linux/tcp.h
··· 316 316 */ 317 317 u32 app_limited; /* limited until "delivered" reaches this val */ 318 318 u32 rcv_wnd; /* Current receiver window */ 319 + u32 rcv_mwnd_seq; /* Maximum window sequence number (RFC 7323, 320 + * section 2.4, receiver requirements) 321 + */ 319 322 u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ 320 323 /* 321 324 * Options received (usually on last packet, some only on SYN packets).
+22
include/net/tcp.h
··· 934 934 return (u32) win; 935 935 } 936 936 937 + /* Compute the maximum receive window we ever advertised. 938 + * Rcv_nxt can be after the window if our peer push more data 939 + * than the offered window. 940 + */ 941 + static inline u32 tcp_max_receive_window(const struct tcp_sock *tp) 942 + { 943 + s32 win = tp->rcv_mwnd_seq - tp->rcv_nxt; 944 + 945 + if (win < 0) 946 + win = 0; 947 + return (u32) win; 948 + } 949 + 950 + /* Check if we need to update the maximum receive window sequence number */ 951 + static inline void tcp_update_max_rcv_wnd_seq(struct tcp_sock *tp) 952 + { 953 + u32 wre = tp->rcv_wup + tp->rcv_wnd; 954 + 955 + if (after(wre, tp->rcv_mwnd_seq)) 956 + tp->rcv_mwnd_seq = wre; 957 + } 958 + 937 959 /* Choose a new window, without checks for shrinking, and without 938 960 * scaling applied to the result. The caller does these things 939 961 * if necessary. This is a "raw" window selection.
+2
net/ipv4/tcp.c
··· 3561 3561 3562 3562 tp->rcv_wnd = opt.rcv_wnd; 3563 3563 tp->rcv_wup = opt.rcv_wup; 3564 + tp->rcv_mwnd_seq = opt.rcv_wup + opt.rcv_wnd; 3564 3565 3565 3566 return 0; 3566 3567 } ··· 5276 5275 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); 5277 5276 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); 5278 5277 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); 5278 + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_mwnd_seq); 5279 5279 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); 5280 5280 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); 5281 5281
+1
net/ipv4/tcp_fastopen.c
··· 377 377 378 378 tcp_rsk(req)->rcv_nxt = tp->rcv_nxt; 379 379 tp->rcv_wup = tp->rcv_nxt; 380 + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; 380 381 /* tcp_conn_request() is sending the SYNACK, 381 382 * and queues the child into listener accept queue. 382 383 */
+6 -5
net/ipv4/tcp_input.c
··· 4808 4808 const struct tcphdr *th) 4809 4809 { 4810 4810 const struct tcp_sock *tp = tcp_sk(sk); 4811 - u32 seq_limit; 4812 4811 4813 4812 if (before(end_seq, tp->rcv_wup)) 4814 4813 return SKB_DROP_REASON_TCP_OLD_SEQUENCE; 4815 4814 4816 - seq_limit = tp->rcv_nxt + tcp_receive_window(tp); 4817 - if (unlikely(after(end_seq, seq_limit))) { 4815 + if (unlikely(after(end_seq, tp->rcv_nxt + tcp_max_receive_window(tp)))) { 4818 4816 /* Some stacks are known to handle FIN incorrectly; allow the 4819 4817 * FIN to extend beyond the window and check it in detail later. 4820 4818 */ 4821 - if (!after(end_seq - th->fin, seq_limit)) 4819 + if (!after(end_seq - th->fin, tp->rcv_nxt + tcp_receive_window(tp))) 4822 4820 return SKB_NOT_DROPPED_YET; 4823 4821 4824 - if (after(seq, seq_limit)) 4822 + if (after(seq, tp->rcv_nxt + tcp_max_receive_window(tp))) 4825 4823 return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; 4826 4824 4827 4825 /* Only accept this packet if receive queue is empty. */ ··· 5678 5680 if (!before(TCP_SKB_CB(skb)->seq, 5679 5681 tp->rcv_nxt + tcp_receive_window(tp))) { 5680 5682 reason = SKB_DROP_REASON_TCP_OVERWINDOW; 5683 + NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW); 5681 5684 goto out_of_window; 5682 5685 } 5683 5686 ··· 6902 6903 */ 6903 6904 WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); 6904 6905 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 6906 + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; 6905 6907 6906 6908 /* RFC1323: The window in SYN & SYN/ACK segments is 6907 6909 * never scaled. ··· 7015 7015 WRITE_ONCE(tp->rcv_nxt, TCP_SKB_CB(skb)->seq + 1); 7016 7016 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 7017 7017 tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 7018 + tp->rcv_mwnd_seq = tp->rcv_wup + tp->rcv_wnd; 7018 7019 7019 7020 /* RFC1323: The window in SYN & SYN/ACK segments is 7020 7021 * never scaled.
+1
net/ipv4/tcp_minisocks.c
··· 604 604 newtp->window_clamp = req->rsk_window_clamp; 605 605 newtp->rcv_ssthresh = req->rsk_rcv_wnd; 606 606 newtp->rcv_wnd = req->rsk_rcv_wnd; 607 + newtp->rcv_mwnd_seq = newtp->rcv_wup + req->rsk_rcv_wnd; 607 608 newtp->rx_opt.wscale_ok = ireq->wscale_ok; 608 609 if (newtp->rx_opt.wscale_ok) { 609 610 newtp->rx_opt.snd_wscale = ireq->snd_wscale;
+3
net/ipv4/tcp_output.c
··· 293 293 tp->pred_flags = 0; 294 294 tp->rcv_wnd = 0; 295 295 tp->rcv_wup = tp->rcv_nxt; 296 + tcp_update_max_rcv_wnd_seq(tp); 296 297 return 0; 297 298 } 298 299 ··· 317 316 318 317 tp->rcv_wnd = new_win; 319 318 tp->rcv_wup = tp->rcv_nxt; 319 + tcp_update_max_rcv_wnd_seq(tp); 320 320 321 321 /* Make sure we do not exceed the maximum possible 322 322 * scaled window. ··· 4167 4165 else 4168 4166 tp->rcv_tstamp = tcp_jiffies32; 4169 4167 tp->rcv_wup = tp->rcv_nxt; 4168 + tp->rcv_mwnd_seq = tp->rcv_nxt + tp->rcv_wnd; 4170 4169 WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); 4171 4170 4172 4171 inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
+4 -2
net/mptcp/options.c
··· 1076 1076 * resync. 1077 1077 */ 1078 1078 tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; 1079 + tcp_update_max_rcv_wnd_seq(tp); 1079 1080 subflow->rcv_wnd_sent = mptcp_rcv_wnd; 1080 1081 } 1081 1082 ··· 1339 1338 */ 1340 1339 rcv_wnd_new = rcv_wnd_old; 1341 1340 win = rcv_wnd_old - ack_seq; 1342 - tp->rcv_wnd = min_t(u64, win, U32_MAX); 1343 - new_win = tp->rcv_wnd; 1341 + new_win = min_t(u64, win, U32_MAX); 1342 + tp->rcv_wnd = new_win; 1343 + tcp_update_max_rcv_wnd_seq(tp); 1344 1344 1345 1345 /* Make sure we do not exceed the maximum possible 1346 1346 * scaled window.
+1 -1
tools/testing/selftests/net/packetdrill/tcp_rcv_big_endseq.pkt
··· 36 36 37 37 +0 read(4, ..., 100000) = 4000 38 38 39 - // If queue is empty, accept a packet even if its end_seq is above wup + rcv_wnd 39 + // If queue is empty, accept a packet even if its end_seq is above rcv_mwnd_seq 40 40 +0 < P. 4001:54001(50000) ack 1 win 257 41 41 * > . 1:1(0) ack 54001 win 0 42 42
+26
tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window.pkt
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + --mss=1000 4 + 5 + `./defaults.sh` 6 + 7 + // Establish a connection. 8 + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 9 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 10 + +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0 11 + +0 bind(3, ..., ...) = 0 12 + +0 listen(3, 1) = 0 13 + 14 + +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> 15 + +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0> 16 + +.1 < . 1:1(0) ack 1 win 257 17 + 18 + +0 accept(3, ..., ...) = 4 19 + 20 + // A too big packet is accepted if the receive queue is empty 21 + +0 < P. 1:20001(20000) ack 1 win 257 22 + // Send a RST immediately so that there is no rcv_wup/rcv_mwnd_seq update yet 23 + +0 < R. 20001:20001(0) ack 1 win 257 24 + 25 + +.1 %{ assert tcpi_state == TCP_CLOSE, tcpi_state }% 26 +
+40
tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed.pkt
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + --mss=1000 4 + 5 + `./defaults.sh 6 + sysctl -q net.ipv4.tcp_shrink_window=1 7 + sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"` 8 + 9 + 0 `nstat -n` 10 + 11 + // Establish a connection. 12 + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 13 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 14 + +0 bind(3, ..., ...) = 0 15 + +0 listen(3, 1) = 0 16 + 17 + +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7> 18 + +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10> 19 + +0 < . 1:1(0) ack 1 win 257 20 + 21 + +0 accept(3, ..., ...) = 4 22 + 23 + +0 < P. 1:10001(10000) ack 1 win 257 24 + * > . 1:1(0) ack 10001 win 15 25 + 26 + +0 < P. 10001:11024(1023) ack 1 win 257 27 + * > . 1:1(0) ack 11024 win 13 28 + 29 + // Max window seq advertised 10001 + 15*1024 = 25361, last advertised: 11024 + 13*1024 = 24336 30 + 31 + // Segment beyond the max window is dropped 32 + +0 < P. 11024:25362(14338) ack 1 win 257 33 + * > . 1:1(0) ack 11024 win 13 34 + 35 + // Segment using the max window is accepted 36 + +0 < P. 11024:25361(14337) ack 1 win 257 37 + * > . 1:1(0) ack 25361 win 0 38 + 39 + // Check LINUX_MIB_BEYOND_WINDOW has been incremented once 40 + +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "`
+132
tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_nomem.pkt
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + // When tcp_receive_window() < tcp_max_receive_window(), tcp_sequence() accepts 3 + // packets that would be dropped under normal conditions (i.e. tcp_receive_window() 4 + // equal to tcp_max_receive_window()). 5 + // Test that such packets are handled as expected for RWIN == 0 and for RWIN > 0. 6 + 7 + --mss=1000 8 + 9 + `./defaults.sh` 10 + 11 + 0 `nstat -n` 12 + 13 + // Establish a connection. 14 + +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 15 + +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 16 + +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0 17 + +0 bind(3, ..., ...) = 0 18 + +0 listen(3, 1) = 0 19 + 20 + +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7> 21 + +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4> 22 + +0 < . 1:1(0) ack 1 win 257 23 + 24 + +0 accept(3, ..., ...) = 4 25 + 26 + // Put 1040000 bytes into the receive buffer 27 + +0 < P. 1:65001(65000) ack 1 win 257 28 + * > . 1:1(0) ack 65001 29 + +0 < P. 65001:130001(65000) ack 1 win 257 30 + * > . 1:1(0) ack 130001 31 + +0 < P. 130001:195001(65000) ack 1 win 257 32 + * > . 1:1(0) ack 195001 33 + +0 < P. 195001:260001(65000) ack 1 win 257 34 + * > . 1:1(0) ack 260001 35 + +0 < P. 260001:325001(65000) ack 1 win 257 36 + * > . 1:1(0) ack 325001 37 + +0 < P. 325001:390001(65000) ack 1 win 257 38 + * > . 1:1(0) ack 390001 39 + +0 < P. 390001:455001(65000) ack 1 win 257 40 + * > . 1:1(0) ack 455001 41 + +0 < P. 455001:520001(65000) ack 1 win 257 42 + * > . 1:1(0) ack 520001 43 + +0 < P. 520001:585001(65000) ack 1 win 257 44 + * > . 1:1(0) ack 585001 45 + +0 < P. 585001:650001(65000) ack 1 win 257 46 + * > . 1:1(0) ack 650001 47 + +0 < P. 650001:715001(65000) ack 1 win 257 48 + * > . 1:1(0) ack 715001 49 + +0 < P. 715001:780001(65000) ack 1 win 257 50 + * > . 1:1(0) ack 780001 51 + +0 < P. 780001:845001(65000) ack 1 win 257 52 + * > . 1:1(0) ack 845001 53 + +0 < P. 
845001:910001(65000) ack 1 win 257 54 + * > . 1:1(0) ack 910001 55 + +0 < P. 910001:975001(65000) ack 1 win 257 56 + * > . 1:1(0) ack 975001 57 + +0 < P. 975001:1040001(65000) ack 1 win 257 58 + * > . 1:1(0) ack 1040001 59 + 60 + // Trigger an extreme memory squeeze by shrinking SO_RCVBUF 61 + +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0 62 + 63 + +0 < P. 1040001:1105001(65000) ack 1 win 257 64 + * > . 1:1(0) ack 1040001 win 0 65 + // Check LINUX_MIB_TCPRCVQDROP has been incremented 66 + +0 `nstat -s | grep TcpExtTCPRcvQDrop| grep -q " 1 "` 67 + 68 + // RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001 (significantly larger, typically ~1970000) 69 + 70 + // Accept pure ack with seq in max adv. window 71 + +0 write(4, ..., 1000) = 1000 72 + +0 > P. 1:1001(1000) ack 1040001 win 0 73 + +0 < . 1105001:1105001(0) ack 1001 win 257 74 + 75 + // In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW) 76 + +0 < P. 1040001:1041001(1000) ack 1001 win 257 77 + +0 > . 1001:1001(0) ack 1040001 win 0 78 + // Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW) 79 + +0 < P. 1039001:1041001(2000) ack 1001 win 257 80 + +0 > . 1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001> 81 + // Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice 82 + +0 `nstat -s | grep TcpExtTCPZeroWindowDrop| grep -q " 2 "` 83 + 84 + // Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW) 85 + +0 < P. 1105001:1106001(1000) ack 1001 win 257 86 + +0 > . 1001:1001(0) ack 1040001 win 0 87 + // Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE) 88 + +0 < P. 2000001:2001001(1000) ack 1001 win 257 89 + +0 > . 1001:1001(0) ack 1040001 win 0 90 + // Check LINUX_MIB_BEYOND_WINDOW has been incremented twice 91 + +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "` 92 + 93 + // Read all data 94 + +0 read(4, ..., 2000000) = 1040000 95 + * > . 
1001:1001(0) ack 1040001 96 + 97 + // RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001 (significantly larger, typically ~1970000) 98 + 99 + // Accept pure ack with seq in max adv. window, beyond adv. window 100 + +0 write(4, ..., 1000) = 1000 101 + +0 > P. 1001:2001(1000) ack 1040001 102 + +0 < . 1105001:1105001(0) ack 2001 win 257 103 + 104 + // In order segment, in max adv. window, in adv. window -> accept 105 + // Note: This also ensures that we cannot hit the empty queue exception in tcp_sequence() in the following tests 106 + +0 < P. 1040001:1041001(1000) ack 2001 win 257 107 + * > . 2001:2001(0) ack 1041001 108 + 109 + // Ooo partial segment, in adv. window -> accept 110 + +0 < P. 1040001:1042001(2000) ack 2001 win 257 111 + +0 > . 2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001> 112 + 113 + // Ooo segment, in max adv. window, beyond adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW) 114 + +0 < P. 1105001:1106001(1000) ack 2001 win 257 115 + +0 > . 2001:2001(0) ack 1042001 116 + // Ooo segment, beyond max adv. window, beyond adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE) 117 + +0 < P. 2000001:2001001(1000) ack 2001 win 257 118 + +0 > . 2001:2001(0) ack 1042001 119 + // Check LINUX_MIB_BEYOND_WINDOW has been incremented twice 120 + +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "` 121 + 122 + // We are allowed to go beyond the window and buffer with one packet 123 + +0 < P. 1042001:1062001(20000) ack 2001 win 257 124 + * > . 2001:2001(0) ack 1062001 125 + +0 < P. 1062001:1082001(20000) ack 2001 win 257 126 + * > . 2001:2001(0) ack 1082001 win 0 127 + 128 + // But not more: In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW) 129 + +0 < P. 1082001:1083001(1000) ack 2001 win 257 130 + * > . 2001:2001(0) ack 1082001 131 + // Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again 132 + +0 `nstat -s | grep TcpExtTCPZeroWindowDrop| grep -q " 3 "`