Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tcp: accecn: AccECN negotiation

Accurate ECN negotiation parts based on the specification:
https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt

Accurate ECN is negotiated using ECE, CWR and AE flags in the
TCP header. TCP falls back into using RFC3168 ECN if one of the
ends supports only RFC3168-style ECN.

The AccECN negotiation includes reflecting IP ECN field value
seen in SYN and SYNACK back using the same bits as negotiation
to allow responding to SYN CE marks and to detect ECN field
mangling. CE marks should not occur currently because SYN=1
segments are sent with Non-ECT in IP ECN field (but a proposal
exists to remove this restriction).

Reflecting SYN IP ECN field in SYNACK is relatively simple.
Reflecting SYNACK IP ECN field in the final/third ACK of
the handshake is more challenging. Linux TCP code is not well
prepared for using the final/third ACK as a signalling channel
which makes things somewhat complicated here.

tcp_ecn sysctl can be used to select the highest ECN variant
(Accurate ECN, ECN, No ECN) that is attempted to be negotiated and
requested for incoming connection and outgoing connection:
TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN,
TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN,
TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN.

After this patch, the size of tcp_request_sock remains unchanged
and no new holes are added. Below are the pahole outcomes before
and after this patch:

[BEFORE THIS PATCH]
struct tcp_request_sock {
[...]
u32 rcv_nxt; /* 352 4 */
u8 syn_tos; /* 356 1 */

/* size: 360, cachelines: 6, members: 16 */
}

[AFTER THIS PATCH]
struct tcp_request_sock {
[...]
u32 rcv_nxt; /* 352 4 */
u8 syn_tos; /* 356 1 */
bool accecn_ok; /* 357 1 */
u8 syn_ect_snt:2; /* 358: 0 1 */
u8 syn_ect_rcv:2; /* 358: 2 1 */
u8 accecn_fail_mode:4; /* 358: 4 1 */

/* size: 360, cachelines: 6, members: 20 */
}

After this patch, the size of tcp_sock remains unchanged and no new
holes are added. Also, 4 bits of the existing 2-byte hole are exploited.
Below are the pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
[...]
u8 dup_ack_counter:2; /* 2761: 0 1 */
u8 tlp_retrans:1; /* 2761: 2 1 */
u8 unused:5; /* 2761: 3 1 */
u8 thin_lto:1; /* 2762: 0 1 */
u8 fastopen_connect:1; /* 2762: 1 1 */
u8 fastopen_no_cookie:1; /* 2762: 2 1 */
u8 fastopen_client_fail:2; /* 2762: 3 1 */
u8 frto:1; /* 2762: 5 1 */
/* XXX 2 bits hole, try to pack */

[...]
u8 keepalive_probes; /* 2765 1 */
/* XXX 2 bytes hole, try to pack */

[...]
/* size: 3200, cachelines: 50, members: 164 */
}

[AFTER THIS PATCH]
struct tcp_sock {
[...]
u8 dup_ack_counter:2; /* 2761: 0 1 */
u8 tlp_retrans:1; /* 2761: 2 1 */
u8 syn_ect_snt:2; /* 2761: 3 1 */
u8 syn_ect_rcv:2; /* 2761: 5 1 */
u8 thin_lto:1; /* 2761: 7 1 */
u8 fastopen_connect:1; /* 2762: 0 1 */
u8 fastopen_no_cookie:1; /* 2762: 1 1 */
u8 fastopen_client_fail:2; /* 2762: 2 1 */
u8 frto:1; /* 2762: 4 1 */
/* XXX 3 bits hole, try to pack */

[...]
u8 keepalive_probes; /* 2765 1 */
u8 accecn_fail_mode:4; /* 2766: 0 1 */
/* XXX 4 bits hole, try to pack */
/* XXX 1 byte hole, try to pack */

[...]
/* size: 3200, cachelines: 50, members: 166 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Co-developed-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Signed-off-by: Olivier Tilmans <olivier.tilmans@nokia.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Acked-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-3-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Ilpo Järvinen and committed by
Paolo Abeni
3cae3427 542a495c

+399 -58
+20 -12
Documentation/networking/ip-sysctl.rst
··· 443 443 444 444 tcp_ecn - INTEGER 445 445 Control use of Explicit Congestion Notification (ECN) by TCP. 446 - ECN is used only when both ends of the TCP connection indicate 447 - support for it. This feature is useful in avoiding losses due 448 - to congestion by allowing supporting routers to signal 449 - congestion before having to drop packets. 446 + ECN is used only when both ends of the TCP connection indicate support 447 + for it. This feature is useful in avoiding losses due to congestion by 448 + allowing supporting routers to signal congestion before having to drop 449 + packets. A host that supports ECN both sends ECN at the IP layer and 450 + feeds back ECN at the TCP layer. The highest variant of ECN feedback 451 + that both peers support is chosen by the ECN negotiation (Accurate ECN, 452 + ECN, or no ECN). 450 453 451 - Possible values are: 454 + The highest negotiated variant for incoming connection requests 455 + and the highest variant requested by outgoing connection 456 + attempts: 452 457 453 - = ===================================================== 454 - 0 Disable ECN. Neither initiate nor accept ECN. 455 - 1 Enable ECN when requested by incoming connections and 456 - also request ECN on outgoing connection attempts. 457 - 2 Enable ECN when requested by incoming connections 458 - but do not request ECN on outgoing connections. 459 - = ===================================================== 458 + ===== ==================== ==================== 459 + Value Incoming connections Outgoing connections 460 + ===== ==================== ==================== 461 + 0 No ECN No ECN 462 + 1 ECN ECN 463 + 2 ECN No ECN 464 + 3 AccECN AccECN 465 + 4 AccECN ECN 466 + 5 AccECN No ECN 467 + ===== ==================== ==================== 460 468 461 469 Default: 2 462 470
+3
Documentation/networking/net_cachelines/tcp_sock.rst
··· 103 103 u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) 104 104 u32 received_ce read_mostly read_write 105 105 u8:4 received_ce_pending read_mostly read_write 106 + u8:2 syn_ect_snt write_mostly read_write 107 + u8:2 syn_ect_rcv read_mostly read_write 108 + u8:4 accecn_fail_mode 106 109 u32 lost read_mostly tcp_ack 107 110 u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx) 108 111 u64 first_tx_mstamp read_write tcp_rate_skb_sent
+7 -1
include/linux/tcp.h
··· 168 168 * after data-in-SYN. 169 169 */ 170 170 u8 syn_tos; 171 + bool accecn_ok; 172 + u8 syn_ect_snt: 2, 173 + syn_ect_rcv: 2, 174 + accecn_fail_mode:4; 171 175 #ifdef CONFIG_TCP_AO 172 176 u8 ao_keyid; 173 177 u8 ao_rcv_next; ··· 379 375 u8 compressed_ack; 380 376 u8 dup_ack_counter:2, 381 377 tlp_retrans:1, /* TLP is a retransmission */ 382 - unused:5; 378 + syn_ect_snt:2, /* AccECN ECT memory, only */ 379 + syn_ect_rcv:2; /* ... needed during 3WHS + first seqno */ 383 380 u8 thin_lto : 1,/* Use linear timeouts for thin streams */ 384 381 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ 385 382 fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ ··· 396 391 syn_fastopen_child:1; /* created TFO passive child socket */ 397 392 398 393 u8 keepalive_probes; /* num of allowed keep alive probes */ 394 + u8 accecn_fail_mode:4; /* AccECN failure handling */ 399 395 u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ 400 396 401 397 /* RTT measurement */
+1
include/net/tcp.h
··· 972 972 973 973 #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE) 974 974 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) 975 + #define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR) 975 976 976 977 #define TCP_ACCECN_CEP_ACE_MASK 0x7 977 978 #define TCP_ACCECN_ACE_MAX_DELTA 6
+289 -23
include/net/tcp_ecn.h
··· 4 4 5 5 #include <linux/tcp.h> 6 6 #include <linux/skbuff.h> 7 + #include <linux/bitfield.h> 7 8 8 9 #include <net/inet_connection_sock.h> 9 10 #include <net/sock.h> 10 11 #include <net/tcp.h> 11 12 #include <net/inet_ecn.h> 13 + 14 + /* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is 15 + * attemped to be negotiated and requested for incoming connection 16 + * and outgoing connection, respectively. 17 + */ 18 + enum tcp_ecn_mode { 19 + TCP_ECN_IN_NOECN_OUT_NOECN = 0, 20 + TCP_ECN_IN_ECN_OUT_ECN = 1, 21 + TCP_ECN_IN_ECN_OUT_NOECN = 2, 22 + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3, 23 + TCP_ECN_IN_ACCECN_OUT_ECN = 4, 24 + TCP_ECN_IN_ACCECN_OUT_NOECN = 5, 25 + }; 12 26 13 27 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) 14 28 { ··· 53 39 tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; 54 40 } 55 41 42 + /* tp->accecn_fail_mode */ 43 + #define TCP_ACCECN_ACE_FAIL_SEND BIT(0) 44 + #define TCP_ACCECN_ACE_FAIL_RECV BIT(1) 45 + #define TCP_ACCECN_OPT_FAIL_SEND BIT(2) 46 + #define TCP_ACCECN_OPT_FAIL_RECV BIT(3) 47 + 48 + static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp) 49 + { 50 + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND; 51 + } 52 + 53 + static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp) 54 + { 55 + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV; 56 + } 57 + 58 + static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp) 59 + { 60 + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND; 61 + } 62 + 63 + static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp) 64 + { 65 + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV; 66 + } 67 + 68 + static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode) 69 + { 70 + tp->accecn_fail_mode |= mode; 71 + } 72 + 56 73 static inline u8 tcp_accecn_ace(const struct tcphdr *th) 57 74 { 58 75 return (th->ae << 2) | (th->cwr << 1) | th->ece; 59 76 } 60 77 61 - static inline void tcp_accecn_init_counters(struct tcp_sock 
*tp) 78 + /* Infer the ECT value our SYN arrived with from the echoed ACE field */ 79 + static inline int tcp_accecn_extract_syn_ect(u8 ace) 62 80 { 63 - tp->received_ce = 0; 64 - tp->received_ce_pending = 0; 81 + /* Below is an excerpt from the 1st block of Table 2 of AccECN spec */ 82 + static const int ace_to_ecn[8] = { 83 + INET_ECN_ECT_0, /* 0b000 (Undefined) */ 84 + INET_ECN_ECT_1, /* 0b001 (Undefined) */ 85 + INET_ECN_NOT_ECT, /* 0b010 (Not-ECT is received) */ 86 + INET_ECN_ECT_1, /* 0b011 (ECT-1 is received) */ 87 + INET_ECN_ECT_0, /* 0b100 (ECT-0 is received) */ 88 + INET_ECN_ECT_1, /* 0b101 (Reserved) */ 89 + INET_ECN_CE, /* 0b110 (CE is received) */ 90 + INET_ECN_ECT_1 /* 0b111 (Undefined) */ 91 + }; 92 + 93 + return ace_to_ecn[ace & 0x7]; 94 + } 95 + 96 + /* Check ECN field transition to detect invalid transitions */ 97 + static inline bool tcp_ect_transition_valid(u8 snt, u8 rcv) 98 + { 99 + if (rcv == snt) 100 + return true; 101 + 102 + /* Non-ECT altered to something or something became non-ECT */ 103 + if (snt == INET_ECN_NOT_ECT || rcv == INET_ECN_NOT_ECT) 104 + return false; 105 + /* CE -> ECT(0/1)? 
*/ 106 + if (snt == INET_ECN_CE) 107 + return false; 108 + return true; 109 + } 110 + 111 + static inline bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, 112 + u8 sent_ect) 113 + { 114 + u8 ect = tcp_accecn_extract_syn_ect(ace); 115 + struct tcp_sock *tp = tcp_sk(sk); 116 + 117 + if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) 118 + return true; 119 + 120 + if (!tcp_ect_transition_valid(sent_ect, ect)) { 121 + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); 122 + return false; 123 + } 124 + 125 + return true; 126 + } 127 + 128 + /* Validate the 3rd ACK based on the ACE field, see Table 4 of AccECN spec */ 129 + static inline void tcp_accecn_third_ack(struct sock *sk, 130 + const struct sk_buff *skb, u8 sent_ect) 131 + { 132 + u8 ace = tcp_accecn_ace(tcp_hdr(skb)); 133 + struct tcp_sock *tp = tcp_sk(sk); 134 + 135 + switch (ace) { 136 + case 0x0: 137 + /* Invalid value */ 138 + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); 139 + break; 140 + case 0x7: 141 + case 0x5: 142 + case 0x1: 143 + /* Unused but legal values */ 144 + break; 145 + default: 146 + /* Validation only applies to first non-data packet */ 147 + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq && 148 + !TCP_SKB_CB(skb)->sacked && 149 + tcp_accecn_validate_syn_feedback(sk, ace, sent_ect)) { 150 + if ((tcp_accecn_extract_syn_ect(ace) == INET_ECN_CE) && 151 + !tp->delivered_ce) 152 + tp->delivered_ce++; 153 + } 154 + break; 155 + } 65 156 } 66 157 67 158 /* Updates Accurate ECN received counters from the received IP ECN field */ 68 - static inline void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb) 159 + static inline void tcp_ecn_received_counters(struct sock *sk, 160 + const struct sk_buff *skb) 69 161 { 70 162 u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; 71 163 u8 is_ce = INET_ECN_is_ce(ecnfield); ··· 194 74 } 195 75 } 196 76 197 - static inline void tcp_accecn_set_ace(struct tcphdr *th, struct tcp_sock *tp) 77 + /* 
AccECN specification, 5.1: [...] a server can determine that it 78 + * negotiated AccECN as [...] if the ACK contains an ACE field with 79 + * the value 0b010 to 0b111 (decimal 2 to 7). 80 + */ 81 + static inline bool cookie_accecn_ok(const struct tcphdr *th) 198 82 { 199 - u32 wire_ace; 83 + return tcp_accecn_ace(th) > 0x1; 84 + } 200 85 201 - wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; 202 - th->ece = !!(wire_ace & 0x1); 203 - th->cwr = !!(wire_ace & 0x2); 204 - th->ae = !!(wire_ace & 0x4); 86 + /* Used to form the ACE flags for SYN/ACK */ 87 + static inline u16 tcp_accecn_reflector_flags(u8 ect) 88 + { 89 + /* TCP ACE flags of SYN/ACK are set based on IP-ECN received from SYN. 90 + * Below is an excerpt from the 1st block of Table 2 of AccECN spec, 91 + * in which TCP ACE flags are encoded as: (AE << 2) | (CWR << 1) | ECE 92 + */ 93 + static const u8 ecn_to_ace_flags[4] = { 94 + 0b010, /* Not-ECT is received */ 95 + 0b011, /* ECT(1) is received */ 96 + 0b100, /* ECT(0) is received */ 97 + 0b110 /* CE is received */ 98 + }; 99 + 100 + return FIELD_PREP(TCPHDR_ACE, ecn_to_ace_flags[ect & 0x3]); 101 + } 102 + 103 + /* AccECN specification, 3.1.2: If a TCP server that implements AccECN 104 + * receives a SYN with the three TCP header flags (AE, CWR and ECE) set 105 + * to any combination other than 000, 011 or 111, it MUST negotiate the 106 + * use of AccECN as if they had been set to 111. 
107 + */ 108 + static inline bool tcp_accecn_syn_requested(const struct tcphdr *th) 109 + { 110 + u8 ace = tcp_accecn_ace(th); 111 + 112 + return ace && ace != 0x3; 113 + } 114 + 115 + static inline void tcp_accecn_init_counters(struct tcp_sock *tp) 116 + { 117 + tp->received_ce = 0; 205 118 tp->received_ce_pending = 0; 206 119 } 207 120 208 - static inline void tcp_ecn_rcv_synack(struct tcp_sock *tp, 209 - const struct tcphdr *th) 121 + /* Used for make_synack to form the ACE flags */ 122 + static inline void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) 210 123 { 211 - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) 212 - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); 124 + /* TCP ACE flags of SYN/ACK are set based on IP-ECN codepoint received 125 + * from SYN. Below is an excerpt from Table 2 of the AccECN spec: 126 + * +====================+====================================+ 127 + * | IP-ECN codepoint | Respective ACE falgs on SYN/ACK | 128 + * | received on SYN | AE CWR ECE | 129 + * +====================+====================================+ 130 + * | Not-ECT | 0 1 0 | 131 + * | ECT(1) | 0 1 1 | 132 + * | ECT(0) | 1 0 0 | 133 + * | CE | 1 1 0 | 134 + * +====================+====================================+ 135 + */ 136 + th->ae = !!(ect & INET_ECN_ECT_0); 137 + th->cwr = ect != INET_ECN_ECT_0; 138 + th->ece = ect == INET_ECN_ECT_1; 213 139 } 214 140 215 - static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, 216 - const struct tcphdr *th) 141 + static inline void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, 142 + struct tcphdr *th) 217 143 { 144 + u32 wire_ace; 145 + 146 + /* The final packet of the 3WHS or anything like it must reflect 147 + * the SYN/ACK ECT instead of putting CEP into ACE field, such 148 + * case show up in tcp_flags. 
149 + */ 150 + if (likely(!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACE))) { 151 + wire_ace = tp->received_ce + TCP_ACCECN_CEP_INIT_OFFSET; 152 + th->ece = !!(wire_ace & 0x1); 153 + th->cwr = !!(wire_ace & 0x2); 154 + th->ae = !!(wire_ace & 0x4); 155 + tp->received_ce_pending = 0; 156 + } 157 + } 158 + 159 + /* See Table 2 of the AccECN draft */ 160 + static inline void tcp_ecn_rcv_synack(struct sock *sk, const struct tcphdr *th, 161 + u8 ip_dsfield) 162 + { 163 + struct tcp_sock *tp = tcp_sk(sk); 164 + u8 ace = tcp_accecn_ace(th); 165 + 166 + switch (ace) { 167 + case 0x0: 168 + case 0x7: 169 + /* +========+========+============+=============+ 170 + * | A | B | SYN/ACK | Feedback | 171 + * | | | B->A | Mode of A | 172 + * | | | AE CWR ECE | | 173 + * +========+========+============+=============+ 174 + * | AccECN | No ECN | 0 0 0 | Not ECN | 175 + * | AccECN | Broken | 1 1 1 | Not ECN | 176 + * +========+========+============+=============+ 177 + */ 178 + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); 179 + break; 180 + case 0x1: 181 + case 0x5: 182 + /* +========+========+============+=============+ 183 + * | A | B | SYN/ACK | Feedback | 184 + * | | | B->A | Mode of A | 185 + * | | | AE CWR ECE | | 186 + * +========+========+============+=============+ 187 + * | AccECN | Nonce | 1 0 1 | (Reserved) | 188 + * | AccECN | ECN | 0 0 1 | Classic ECN | 189 + * | Nonce | AccECN | 0 0 1 | Classic ECN | 190 + * | ECN | AccECN | 0 0 1 | Classic ECN | 191 + * +========+========+============+=============+ 192 + */ 193 + if (tcp_ecn_mode_pending(tp)) 194 + /* Downgrade from AccECN, or requested initially */ 195 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 196 + break; 197 + default: 198 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 199 + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; 200 + if (INET_ECN_is_ce(ip_dsfield) && 201 + tcp_accecn_validate_syn_feedback(sk, ace, 202 + tp->syn_ect_snt)) { 203 + tp->received_ce++; 204 + tp->received_ce_pending++; 205 + } 206 + break; 207 + } 208 + 
} 209 + 210 + static inline void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th, 211 + const struct sk_buff *skb) 212 + { 213 + if (tcp_ecn_mode_pending(tp)) { 214 + if (!tcp_accecn_syn_requested(th)) { 215 + /* Downgrade to classic ECN feedback */ 216 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 217 + } else { 218 + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & 219 + INET_ECN_MASK; 220 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 221 + } 222 + } 218 223 if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) 219 224 tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); 220 225 } ··· 355 110 /* Packet ECN state for a SYN-ACK */ 356 111 static inline void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) 357 112 { 358 - const struct tcp_sock *tp = tcp_sk(sk); 113 + struct tcp_sock *tp = tcp_sk(sk); 359 114 360 115 TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; 361 116 if (tcp_ecn_disabled(tp)) ··· 363 118 else if (tcp_ca_needs_ecn(sk) || 364 119 tcp_bpf_ca_needs_ecn(sk)) 365 120 INET_ECN_xmit(sk); 121 + 122 + if (tp->ecn_flags & TCP_ECN_MODE_ACCECN) { 123 + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; 124 + TCP_SKB_CB(skb)->tcp_flags |= 125 + tcp_accecn_reflector_flags(tp->syn_ect_rcv); 126 + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 127 + } 366 128 } 367 129 368 130 /* Packet ECN state for a SYN. 
*/ ··· 377 125 { 378 126 struct tcp_sock *tp = tcp_sk(sk); 379 127 bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); 380 - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || 381 - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; 128 + bool use_ecn, use_accecn; 129 + u8 tcp_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn); 130 + 131 + use_accecn = tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ACCECN; 132 + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || 133 + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || 134 + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; 382 135 383 136 if (!use_ecn) { 384 137 const struct dst_entry *dst = __sk_dst_get(sk); ··· 399 142 INET_ECN_xmit(sk); 400 143 401 144 TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; 402 - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 145 + if (use_accecn) { 146 + TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_AE; 147 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_PENDING); 148 + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 149 + } else { 150 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); 151 + } 403 152 } 404 153 } 405 154 406 155 static inline void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) 407 156 { 408 - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) 157 + if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) { 409 158 /* tp->ecn_flags are cleared at a later point in time when 410 159 * SYN ACK is ultimatively being received. 411 160 */ 412 - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); 161 + TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ACE; 162 + } 413 163 } 414 164 415 165 static inline void 416 166 tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) 417 167 { 418 - if (inet_rsk(req)->ecn_ok) 168 + if (tcp_rsk(req)->accecn_ok) 169 + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); 170 + else if (inet_rsk(req)->ecn_ok) 419 171 th->ece = 1; 420 172 } 421 173
+4
net/ipv4/syncookies.c
··· 12 12 #include <linux/export.h> 13 13 #include <net/secure_seq.h> 14 14 #include <net/tcp.h> 15 + #include <net/tcp_ecn.h> 15 16 #include <net/route.h> 16 17 17 18 static siphash_aligned_key_t syncookie_secret[2]; ··· 404 403 struct tcp_sock *tp = tcp_sk(sk); 405 404 struct inet_request_sock *ireq; 406 405 struct net *net = sock_net(sk); 406 + struct tcp_request_sock *treq; 407 407 struct request_sock *req; 408 408 struct sock *ret = sk; 409 409 struct flowi4 fl4; ··· 430 428 } 431 429 432 430 ireq = inet_rsk(req); 431 + treq = tcp_rsk(req); 433 432 434 433 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); 435 434 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ··· 486 483 if (!req->syncookie) 487 484 ireq->rcv_wscale = rcv_wscale; 488 485 ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); 486 + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); 489 487 490 488 ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); 491 489 /* ip_queue_xmit() depends on our flow being setup
+2 -1
net/ipv4/sysctl_net_ipv4.c
··· 47 47 static int tcp_plb_max_rounds = 31; 48 48 static int tcp_plb_max_cong_thresh = 256; 49 49 static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; 50 + static int tcp_ecn_mode_max = 2; 50 51 51 52 /* obsolete */ 52 53 static int sysctl_tcp_low_latency __read_mostly; ··· 729 728 .mode = 0644, 730 729 .proc_handler = proc_dou8vec_minmax, 731 730 .extra1 = SYSCTL_ZERO, 732 - .extra2 = SYSCTL_TWO, 731 + .extra2 = &tcp_ecn_mode_max, 733 732 }, 734 733 { 735 734 .procname = "tcp_ecn_fallback",
+1
net/ipv4/tcp.c
··· 3407 3407 tp->window_clamp = 0; 3408 3408 tp->delivered = 0; 3409 3409 tp->delivered_ce = 0; 3410 + tp->accecn_fail_mode = 0; 3410 3411 tcp_accecn_init_counters(tp); 3411 3412 if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) 3412 3413 icsk->icsk_ca_ops->release(sk);
+40 -10
net/ipv4/tcp_input.c
··· 3665 3665 return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); 3666 3666 } 3667 3667 3668 + static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) 3669 + { 3670 + struct tcp_sock *tp = tcp_sk(sk); 3671 + u16 flags = 0; 3672 + 3673 + if (accecn_reflector) 3674 + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); 3675 + __tcp_send_ack(sk, tp->rcv_nxt, flags); 3676 + } 3677 + 3668 3678 /* RFC 5961 7 [ACK Throttling] */ 3669 - static void tcp_send_challenge_ack(struct sock *sk) 3679 + static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) 3670 3680 { 3671 3681 struct tcp_sock *tp = tcp_sk(sk); 3672 3682 struct net *net = sock_net(sk); ··· 3706 3696 WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); 3707 3697 send_ack: 3708 3698 NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); 3709 - tcp_send_ack(sk); 3699 + tcp_send_ack_reflect_ect(sk, accecn_reflector); 3710 3700 } 3711 3701 } 3712 3702 ··· 3873 3863 /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ 3874 3864 if (before(ack, prior_snd_una - max_window)) { 3875 3865 if (!(flag & FLAG_NO_CHALLENGE_ACK)) 3876 - tcp_send_challenge_ack(sk); 3866 + tcp_send_challenge_ack(sk, false); 3877 3867 return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; 3878 3868 } 3879 3869 goto old_ack; ··· 5917 5907 const struct tcphdr *th, int syn_inerr) 5918 5908 { 5919 5909 struct tcp_sock *tp = tcp_sk(sk); 5910 + bool accecn_reflector = false; 5920 5911 SKB_DR(reason); 5921 5912 5922 5913 /* RFC1323: H1. Apply PAWS check first. 
*/ ··· 6015 6004 if (tp->syn_fastopen && !tp->data_segs_in && 6016 6005 sk->sk_state == TCP_ESTABLISHED) 6017 6006 tcp_fastopen_active_disable(sk); 6018 - tcp_send_challenge_ack(sk); 6007 + tcp_send_challenge_ack(sk, false); 6019 6008 SKB_DR_SET(reason, TCP_RESET); 6020 6009 goto discard; 6021 6010 } ··· 6026 6015 * RFC 5961 4.2 : Send a challenge ack 6027 6016 */ 6028 6017 if (th->syn) { 6018 + if (tcp_ecn_mode_accecn(tp)) 6019 + accecn_reflector = true; 6029 6020 if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && 6030 6021 TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && 6031 6022 TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && ··· 6037 6024 if (syn_inerr) 6038 6025 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); 6039 6026 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); 6040 - tcp_send_challenge_ack(sk); 6027 + tcp_send_challenge_ack(sk, accecn_reflector); 6041 6028 SKB_DR_SET(reason, TCP_INVALID_SYN); 6042 6029 goto discard; 6043 6030 } ··· 6506 6493 * state to ESTABLISHED..." 
6507 6494 */ 6508 6495 6509 - tcp_ecn_rcv_synack(tp, th); 6496 + if (tcp_ecn_mode_any(tp)) 6497 + tcp_ecn_rcv_synack(sk, th, TCP_SKB_CB(skb)->ip_dsfield); 6510 6498 6511 6499 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); 6512 6500 tcp_try_undo_spurious_syn(sk); ··· 6579 6565 TCP_DELACK_MAX, false); 6580 6566 goto consume; 6581 6567 } 6582 - tcp_send_ack(sk); 6568 + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); 6583 6569 return -1; 6584 6570 } 6585 6571 ··· 6638 6624 tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 6639 6625 tp->max_window = tp->snd_wnd; 6640 6626 6641 - tcp_ecn_rcv_syn(tp, th); 6627 + tcp_ecn_rcv_syn(tp, th, skb); 6642 6628 6643 6629 tcp_mtup_init(sk); 6644 6630 tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); ··· 6820 6806 } 6821 6807 /* accept old ack during closing */ 6822 6808 if ((int)reason < 0) { 6823 - tcp_send_challenge_ack(sk); 6809 + tcp_send_challenge_ack(sk, false); 6824 6810 reason = -reason; 6825 6811 goto discard; 6826 6812 } ··· 6867 6853 tp->lsndtime = tcp_jiffies32; 6868 6854 6869 6855 tcp_initialize_rcv_mss(sk); 6856 + if (tcp_ecn_mode_accecn(tp)) 6857 + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); 6870 6858 tcp_fast_path_on(tp); 6871 6859 if (sk->sk_shutdown & SEND_SHUTDOWN) 6872 6860 tcp_shutdown(sk, SEND_SHUTDOWN); 6861 + 6873 6862 break; 6874 6863 6875 6864 case TCP_FIN_WAIT1: { ··· 7042 7025 bool ect, ecn_ok; 7043 7026 u32 ecn_ok_dst; 7044 7027 7028 + if (tcp_accecn_syn_requested(th) && 7029 + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { 7030 + inet_rsk(req)->ecn_ok = 1; 7031 + tcp_rsk(req)->accecn_ok = 1; 7032 + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & 7033 + INET_ECN_MASK; 7034 + return; 7035 + } 7036 + 7045 7037 if (!th_ecn) 7046 7038 return; 7047 7039 ··· 7058 7032 ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); 7059 7033 ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; 7060 7034 7061 - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || 7035 + if (((!ect || th->res1 || th->ae) && 
ecn_ok) || 7036 + tcp_ca_needs_ecn(listen_sk) || 7062 7037 (ecn_ok_dst & DST_FEATURE_ECN_CA) || 7063 7038 tcp_bpf_ca_needs_ecn((struct sock *)req)) 7064 7039 inet_rsk(req)->ecn_ok = 1; ··· 7077 7050 tcp_rsk(req)->snt_synack = 0; 7078 7051 tcp_rsk(req)->snt_tsval_first = 0; 7079 7052 tcp_rsk(req)->last_oow_ack_time = 0; 7053 + tcp_rsk(req)->accecn_ok = 0; 7054 + tcp_rsk(req)->syn_ect_rcv = 0; 7055 + tcp_rsk(req)->syn_ect_snt = 0; 7080 7056 req->mss = rx_opt->mss_clamp; 7081 7057 req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; 7082 7058 ireq->tstamp_ok = rx_opt->tstamp_ok;
+4 -2
net/ipv4/tcp_ipv4.c
··· 65 65 #include <net/icmp.h> 66 66 #include <net/inet_hashtables.h> 67 67 #include <net/tcp.h> 68 + #include <net/tcp_ecn.h> 68 69 #include <net/transp_v6.h> 69 70 #include <net/ipv6.h> 70 71 #include <net/inet_common.h> ··· 1190 1189 enum tcp_synack_type synack_type, 1191 1190 struct sk_buff *syn_skb) 1192 1191 { 1193 - const struct inet_request_sock *ireq = inet_rsk(req); 1192 + struct inet_request_sock *ireq = inet_rsk(req); 1194 1193 struct flowi4 fl4; 1195 1194 int err = -1; 1196 1195 struct sk_buff *skb; ··· 1203 1202 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 1204 1203 1205 1204 if (skb) { 1205 + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; 1206 1206 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); 1207 1207 1208 1208 tos = READ_ONCE(inet_sk(sk)->tos); ··· 3560 3558 3561 3559 static int __net_init tcp_sk_init(struct net *net) 3562 3560 { 3563 - net->ipv4.sysctl_tcp_ecn = 2; 3561 + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3564 3562 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3565 3563 3566 3564 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+18 -6
net/ipv4/tcp_minisocks.c
··· 20 20 */ 21 21 22 22 #include <net/tcp.h> 23 + #include <net/tcp_ecn.h> 23 24 #include <net/xfrm.h> 24 25 #include <net/busy_poll.h> 25 26 #include <net/rstreason.h> ··· 452 451 ireq->rcv_wscale = rcv_wscale; 453 452 } 454 453 455 - static void tcp_ecn_openreq_child(struct tcp_sock *tp, 456 - const struct request_sock *req) 454 + static void tcp_ecn_openreq_child(struct sock *sk, 455 + const struct request_sock *req, 456 + const struct sk_buff *skb) 457 457 { 458 - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 459 - TCP_ECN_MODE_RFC3168 : 460 - TCP_ECN_DISABLED); 458 + const struct tcp_request_sock *treq = tcp_rsk(req); 459 + struct tcp_sock *tp = tcp_sk(sk); 460 + 461 + if (treq->accecn_ok) { 462 + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 463 + tp->syn_ect_snt = treq->syn_ect_snt; 464 + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); 465 + tcp_ecn_received_counters(sk, skb); 466 + } else { 467 + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 468 + TCP_ECN_MODE_RFC3168 : 469 + TCP_ECN_DISABLED); 470 + } 461 471 } 462 472 463 473 void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) ··· 633 621 if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) 634 622 newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; 635 623 newtp->rx_opt.mss_clamp = req->mss; 636 - tcp_ecn_openreq_child(newtp, req); 624 + tcp_ecn_openreq_child(newsk, req, skb); 637 625 newtp->fastopen_req = NULL; 638 626 RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); 639 627
+7 -3
net/ipv4/tcp_output.c
··· 332 332 return; 333 333 334 334 if (tcp_ecn_mode_accecn(tp)) { 335 - INET_ECN_xmit(sk); 336 - tcp_accecn_set_ace(th, tp); 335 + if (!tcp_accecn_ace_fail_recv(tp)) 336 + INET_ECN_xmit(sk); 337 + tcp_accecn_set_ace(tp, skb, th); 337 338 skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; 338 339 } else { 339 340 /* Not-retransmitted data segment: set ECT and inject CWR. */ ··· 3357 3356 tcp_retrans_try_collapse(sk, skb, avail_wnd); 3358 3357 } 3359 3358 3360 - /* RFC3168, section 6.1.1.1. ECN fallback */ 3359 + /* RFC3168, section 6.1.1.1. ECN fallback 3360 + * As AccECN uses the same SYN flags (+ AE), this check covers both 3361 + * cases. 3362 + */ 3361 3363 if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) 3362 3364 tcp_ecn_clear_syn(sk, skb); 3363 3365
+2
net/ipv6/syncookies.c
··· 16 16 #include <net/secure_seq.h> 17 17 #include <net/ipv6.h> 18 18 #include <net/tcp.h> 19 + #include <net/tcp_ecn.h> 19 20 20 21 #define COOKIEBITS 24 /* Upper bits store count */ 21 22 #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) ··· 265 264 if (!req->syncookie) 266 265 ireq->rcv_wscale = rcv_wscale; 267 266 ireq->ecn_ok &= cookie_ecn_ok(net, dst); 267 + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); 268 268 269 269 ret = tcp_get_cookie_sock(sk, skb, req, dst); 270 270 if (!ret) {
+1
net/ipv6/tcp_ipv6.c
··· 544 544 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); 545 545 546 546 if (skb) { 547 + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; 547 548 __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, 548 549 &ireq->ir_v6_rmt_addr); 549 550