tcp: AccECN core · tjh.dev/kernel@542a495

tcp: AccECN core

This change implements the core of Accurate ECN (AccECN), without
negotiation and without the AccECN Option (which will be added by
later changes). It is based on the AccECN specification:
https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt

Accurate ECN allows feeding back the number of CE (congestion
experienced) marks accurately to the sender in contrast to
RFC3168 ECN that can only signal one marks-seen-yes/no per RTT.
Congestion control algorithms can take advantage of the accurate
ECN information to fine-tune their congestion response to avoid
drastic rate reduction when only mild congestion is encountered.

With Accurate ECN, tp->received_ce (r.cep in the AccECN spec) keeps
track of how many segments have arrived with a CE mark. Accurate
ECN uses the ACE field (ECE, CWR, AE) to communicate the value back
to the sender, which updates tp->delivered_ce (s.cep) based on the
feedback. This signalling channel is lossy when the ACE field
overflows.

A conservative strategy is selected here to deal with ACE field
overflow; however, some strategies that use the AccECN option later
in the overall patchset mitigate falsely detected overflows.

The ACE field values on the wire are offset by
TCP_ACCECN_CEP_INIT_OFFSET. Delivered_ce/received_ce count the
real CE marks rather than forcing all downstream users to adapt
to the wire offset.

This patch uses the first 1-byte hole and the last 4-byte hole of
the tcp_sock_write_txrx group for 'received_ce_pending' and
'received_ce'. Also, the size of the tcp_sock_write_txrx group is
increased from 91 + 4 to 95 + 4 due to the new u32 received_ce member.
Below is the trimmed pahole output before and after this patch.

[BEFORE THIS PATCH]
struct tcp_sock {
[...]
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
u8 nonagle:4; /* 2521: 0 1 */
u8 rate_app_limited:1; /* 2521: 4 1 */
/* XXX 3 bits hole, try to pack */
/* XXX 2 bytes hole, try to pack */

[...]
u32 delivered_ce; /* 2576 4 */
u32 app_limited; /* 2580 4 */
u32 rcv_wnd; /* 2584 4 */
struct tcp_options_received rx_opt; /* 2588 24 */
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2612 0 */
/* XXX 4 bytes hole, try to pack */

[...]
/* size: 3200, cachelines: 50, members: 161 */
}

[AFTER THIS PATCH]
struct tcp_sock {
[...]
__cacheline_group_begin__tcp_sock_write_txrx[0]; /* 2521 0 */
u8 nonagle:4; /* 2521: 0 1 */
u8 rate_app_limited:1; /* 2521: 4 1 */
/* XXX 3 bits hole, try to pack */

/* Force alignment to the next boundary: */
u8 :0;
u8 received_ce_pending:4;/* 2522: 0 1 */
u8 unused2:4; /* 2522: 4 1 */
/* XXX 1 byte hole, try to pack */

[...]
u32 delivered_ce; /* 2576 4 */
u32 received_ce; /* 2580 4 */
u32 app_limited; /* 2584 4 */
u32 rcv_wnd; /* 2588 4 */
struct tcp_options_received rx_opt; /* 2592 24 */
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */

[...]
/* size: 3200, cachelines: 50, members: 164 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Co-developed-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-2-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Ilpo Järvinen and committed by

Paolo Abeni 9 months ago 542a495c 152ba35c

+175 -12

7 changed files

expand all

Documentation

networking

net_cachelines

tcp_sock.rst

include

linux

tcp.h

net

tcp.h

tcp_ecn.h

net

ipv4

tcp.c

tcp_input.c

tcp_output.c

Documentation/networking/net_cachelines/tcp_sock.rst

··· 101 101 u32 prr_out read_mostly read_mostly tcp_rate_skb_sent,tcp_newly_delivered(tx);tcp_ack,tcp_rate_gen,tcp_clean_rtx_queue(rx) 102 102 u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) 103 103 u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) 104 + u32 received_ce read_mostly read_write 105 + u8:4 received_ce_pending read_mostly read_write 104 106 u32 lost read_mostly tcp_ack 105 107 u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) 106 108 u64 first_tx_mstamp read_write tcp_rate_skb_sent

include/linux/tcp.h

··· 287 287 */ 288 288 u8 nonagle : 4,/* Disable Nagle algorithm? */ 289 289 rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ 290 + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ 291 + unused2:4; 290 292 __be32 pred_flags; 291 293 u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ 292 294 u64 tcp_mstamp; /* most recent packet received/sent */ ··· 301 299 u32 snd_up; /* Urgent pointer */ 302 300 u32 delivered; /* Total data packets delivered incl. rexmits */ 303 301 u32 delivered_ce; /* Like the above but only ECE marked packets */ 302 + u32 received_ce; /* Like the above but for rcvd CE marked pkts */ 304 303 u32 app_limited; /* limited until "delivered" reaches this val */ 305 304 u32 rcv_wnd; /* Current receiver window */ 306 305 /*

+15

include/net/tcp.h

··· 973 973 #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) 974 974 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) 975 975 976 + #define TCP_ACCECN_CEP_ACE_MASK 0x7 977 + #define TCP_ACCECN_ACE_MAX_DELTA 6 978 + 979 + /* To avoid/detect middlebox interference, not all counters start at 0. 980 + * See draft-ietf-tcpm-accurate-ecn for the latest values. 981 + */ 982 + #define TCP_ACCECN_CEP_INIT_OFFSET 5 983 + 976 984 /* State flags for sacked in struct tcp_skb_cb */ 977 985 enum tcp_skb_cb_sacked_flags { 978 986 TCPCB_SACKED_ACKED = (1 << 0), /* SKB ACK'd by a SACK block */ ··· 1790 1782 1791 1783 static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) 1792 1784 { 1785 + u32 ace; 1786 + 1793 1787 /* mptcp hooks are only on the slow path */ 1794 1788 if (sk_is_mptcp((struct sock *)tp)) 1795 1789 return; 1796 1790 1791 + ace = tcp_ecn_mode_accecn(tp) ? 1792 + ((tp->delivered_ce + TCP_ACCECN_CEP_INIT_OFFSET) & 1793 + TCP_ACCECN_CEP_ACE_MASK) : 0; 1794 + 1797 1795 tp->pred_flags = htonl((tp->tcp_header_len << 26) | 1796 + (ace << 22) | 1798 1797 ntohl(TCP_FLAG_ACK) | 1799 1798 snd_wnd); 1800 1799 }

+51 -2

include/net/tcp_ecn.h

··· 12 12 13 13 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) 14 14 { 15 + /* Do not set CWR if in AccECN mode! */ 15 16 if (tcp_ecn_mode_rfc3168(tp)) 16 17 tp->ecn_flags |= TCP_ECN_QUEUE_CWR; 17 18 } ··· 20 19 static inline void tcp_ecn_accept_cwr(struct sock *sk, 21 20 const struct sk_buff *skb) 22 21 { 23 - if (tcp_hdr(skb)->cwr) { 24 - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 22 + struct tcp_sock *tp = tcp_sk(sk); 23 + 24 + if (tcp_ecn_mode_rfc3168(tp) && tcp_hdr(skb)->cwr) { 25 + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; 25 26 26 27 /* If the sender is telling us it has entered CWR, then its 27 28 * cwnd may be very low (even just 1 packet), so we should ACK ··· 37 34 static inline void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) 38 35 { 39 36 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; 37 + } 38 + 39 + static inline u8 tcp_accecn_ace(const struct tcphdr *th) 40 + { 41 + return (th->ae << 2) | (th->cwr << 1) | th->ece; 42 + } 43 + 44 + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) 45 + { 46 + tp->received_ce = 0; 47 + tp->received_ce_pending = 0; 48 + } 49 + 50 + /* Updates Accurate ECN received counters from the received IP ECN field */ 51 + static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) 52 + { 53 + u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; 54 + u8 is_ce = INET_ECN_is_ce(ecnfield); 55 + struct tcp_sock *tp = tcp_sk(sk); 56 + 57 + if (!INET_ECN_is_not_ect(ecnfield)) { 58 + u32 pcount = is_ce * max_t(u16, 1, skb_shinfo(skb)->gso_segs); 59 + 60 + /* As for accurate ECN, the TCP_ECN_SEEN flag is set by 61 + * tcp_ecn_received_counters() when the ECN codepoint of 62 + * received TCP data or ACK contains ECT(0), ECT(1), or CE. 
63 + */ 64 + if (!tcp_ecn_mode_rfc3168(tp)) 65 + tp->ecn_flags |= TCP_ECN_SEEN; 66 + 67 + /* ACE counter tracks *all* segments including pure ACKs */ 68 + tp->received_ce += pcount; 69 + tp->received_ce_pending = min(tp->received_ce_pending + pcount, 70 + 0xfU); 71 + } 72 + } 73 + 74 + static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) 75 + { 76 + u32 wire_ace; 77 + 78 + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; 79 + th->ece = !!(wire_ace & 0x1); 80 + th->cwr = !!(wire_ace & 0x2); 81 + th->ae = !!(wire_ace & 0x4); 82 + tp->received_ce_pending = 0; 40 83 } 41 84 42 85 static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp,

+4 -1

net/ipv4/tcp.c

··· 271 271 #include <net/icmp.h> 272 272 #include <net/inet_common.h> 273 273 #include <net/tcp.h> 274 + #include <net/tcp_ecn.h> 274 275 #include <net/mptcp.h> 275 276 #include <net/proto_memory.h> 276 277 #include <net/xfrm.h> ··· 3407 3406 tp->window_clamp = 0; 3408 3407 tp->delivered = 0; 3409 3408 tp->delivered_ce = 0; 3409 + tcp_accecn_init_counters(tp); 3410 3410 if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) 3411 3411 icsk->icsk_ca_ops->release(sk); 3412 3412 memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); ··· 5140 5138 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); 5141 5139 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); 5142 5140 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); 5141 + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); 5143 5142 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); 5144 5143 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); 5145 5144 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); ··· 5148 5145 /* 32bit arches with 8byte alignment on u64 fields might need padding 5149 5146 * before tcp_clock_cache. 5150 5147 */ 5151 - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 91 + 4); 5148 + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); 5152 5149 5153 5150 /* RX read-write hotpath cache lines */ 5154 5151 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);

+92 -8

net/ipv4/tcp_input.c

··· 360 360 if (tcp_ca_needs_ecn(sk)) 361 361 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); 362 362 363 - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { 363 + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && 364 + tcp_ecn_mode_rfc3168(tp)) { 364 365 /* Better not delay acks, sender can have a very low cwnd */ 365 366 tcp_enter_quickack_mode(sk, 2); 366 367 tp->ecn_flags |= TCP_ECN_DEMAND_CWR; 367 368 } 369 + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by 370 + * tcp_data_ecn_check() when the ECN codepoint of 371 + * received TCP data contains ECT(0), ECT(1), or CE. 372 + */ 373 + if (!tcp_ecn_mode_rfc3168(tp)) 374 + break; 368 375 tp->ecn_flags |= TCP_ECN_SEEN; 369 376 break; 370 377 default: 371 378 if (tcp_ca_needs_ecn(sk)) 372 379 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); 380 + if (!tcp_ecn_mode_rfc3168(tp)) 381 + break; 373 382 tp->ecn_flags |= TCP_ECN_SEEN; 374 383 break; 375 384 } ··· 394 385 bool ece_ack) 395 386 { 396 387 tp->delivered += delivered; 397 - if (ece_ack) 388 + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) 398 389 tcp_count_delivered_ce(tp, delivered); 390 + } 391 + 392 + /* Returns the ECN CE delta */ 393 + static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, 394 + u32 delivered_pkts, int flag) 395 + { 396 + const struct tcphdr *th = tcp_hdr(skb); 397 + struct tcp_sock *tp = tcp_sk(sk); 398 + u32 delta, safe_delta; 399 + u32 corrected_ace; 400 + 401 + /* Reordered ACK or uncertain due to lack of data to send and ts */ 402 + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) 403 + return 0; 404 + 405 + if (!(flag & FLAG_SLOWPATH)) { 406 + /* AccECN counter might overflow on large ACKs */ 407 + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) 408 + return 0; 409 + } 410 + 411 + /* ACE field is not available during handshake */ 412 + if (flag & FLAG_SYN_ACKED) 413 + return 0; 414 + 415 + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) 416 + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; 417 + 418 + corrected_ace = 
tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; 419 + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; 420 + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) 421 + return delta; 422 + 423 + safe_delta = delivered_pkts - 424 + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); 425 + 426 + return safe_delta; 427 + } 428 + 429 + static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, 430 + u32 delivered_pkts, int *flag) 431 + { 432 + struct tcp_sock *tp = tcp_sk(sk); 433 + u32 delta; 434 + 435 + delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); 436 + if (delta > 0) { 437 + tcp_count_delivered_ce(tp, delta); 438 + *flag |= FLAG_ECE; 439 + /* Recalculate header predictor */ 440 + if (tp->pred_flags) 441 + tcp_fast_path_on(tp); 442 + } 443 + return delta; 399 444 } 400 445 401 446 /* Buffer size and advertised window tuning. ··· 3807 3744 } 3808 3745 3809 3746 /* Returns the number of packets newly acked or sacked by the current ACK */ 3810 - static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) 3747 + static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, 3748 + u32 ecn_count, int flag) 3811 3749 { 3812 3750 const struct net *net = sock_net(sk); 3813 3751 struct tcp_sock *tp = tcp_sk(sk); ··· 3816 3752 3817 3753 delivered = tp->delivered - prior_delivered; 3818 3754 NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); 3819 - if (flag & FLAG_ECE) 3820 - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); 3755 + 3756 + if (flag & FLAG_ECE) { 3757 + if (tcp_ecn_mode_rfc3168(tp)) 3758 + ecn_count = delivered; 3759 + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); 3760 + } 3821 3761 3822 3762 return delivered; 3823 3763 } ··· 3842 3774 u32 delivered = tp->delivered; 3843 3775 u32 lost = tp->lost; 3844 3776 int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ 3777 + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? 
*/ 3845 3778 u32 prior_fack; 3846 3779 3847 3780 sack_state.first_sackt = 0; ··· 3950 3881 3951 3882 tcp_rack_update_reo_wnd(sk, &rs); 3952 3883 3884 + if (tcp_ecn_mode_accecn(tp)) 3885 + ecn_count = tcp_accecn_process(sk, skb, 3886 + tp->delivered - delivered, 3887 + &flag); 3888 + 3953 3889 tcp_in_ack_event(sk, flag); 3954 3890 3955 3891 if (tp->tlp_high_seq) ··· 3979 3905 if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) 3980 3906 sk_dst_confirm(sk); 3981 3907 3982 - delivered = tcp_newly_delivered(sk, delivered, flag); 3908 + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); 3909 + 3983 3910 lost = tp->lost - lost; /* freshly marked lost */ 3984 3911 rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); 3985 3912 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); ··· 3989 3914 return 1; 3990 3915 3991 3916 no_queue: 3917 + if (tcp_ecn_mode_accecn(tp)) 3918 + ecn_count = tcp_accecn_process(sk, skb, 3919 + tp->delivered - delivered, 3920 + &flag); 3992 3921 tcp_in_ack_event(sk, flag); 3993 3922 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3994 3923 if (flag & FLAG_DSACKING_ACK) { 3995 3924 tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, 3996 3925 &rexmit); 3997 - tcp_newly_delivered(sk, delivered, flag); 3926 + tcp_newly_delivered(sk, delivered, ecn_count, flag); 3998 3927 } 3999 3928 /* If this ack opens up a zero window, clear backoff. 
It was 4000 3929 * being used to time the probes, and is probably far higher than ··· 4019 3940 &sack_state); 4020 3941 tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, 4021 3942 &rexmit); 4022 - tcp_newly_delivered(sk, delivered, flag); 3943 + tcp_newly_delivered(sk, delivered, ecn_count, flag); 4023 3944 tcp_xmit_recovery(sk, rexmit); 4024 3945 } 4025 3946 ··· 6150 6071 flag |= __tcp_replace_ts_recent(tp, 6151 6072 delta); 6152 6073 6074 + tcp_ecn_received_counters(sk, skb); 6075 + 6153 6076 /* We know that such packets are checksummed 6154 6077 * on entry. 6155 6078 */ ··· 6200 6119 /* Bulk data transfer: receiver */ 6201 6120 tcp_cleanup_skb(skb); 6202 6121 __skb_pull(skb, tcp_header_len); 6122 + tcp_ecn_received_counters(sk, skb); 6203 6123 eaten = tcp_queue_rcv(sk, skb, &fragstolen); 6204 6124 6205 6125 tcp_event_data_recv(sk, skb); ··· 6241 6159 return; 6242 6160 6243 6161 step5: 6162 + tcp_ecn_received_counters(sk, skb); 6163 + 6244 6164 reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); 6245 6165 if ((int)reason < 0) { 6246 6166 reason = -reason;

+8 -1

net/ipv4/tcp_output.c

··· 328 328 { 329 329 struct tcp_sock *tp = tcp_sk(sk); 330 330 331 - if (tcp_ecn_mode_rfc3168(tp)) { 331 + if (!tcp_ecn_mode_any(tp)) 332 + return; 333 + 334 + if (tcp_ecn_mode_accecn(tp)) { 335 + INET_ECN_xmit(sk); 336 + tcp_accecn_set_ace(th, tp); 337 + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; 338 + } else { 332 339 /* Not-retransmitted data segment: set ECT and inject CWR. */ 333 340 if (skb->len != tcp_header_len && 334 341 !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {

Configure Feed

Configure Feed