Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tcp: accecn: AccECN option

Accurate ECN (AccECN) allows echoing back the sum of bytes for
each IP ECN field value in the received packets using the
AccECN option. This change implements AccECN option tx & rx
side processing without the option send control related features
that are added by a later change.

Based on specification:
https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt
(Some features of the spec will be added in the later changes
rather than in this one).

A full-length AccECN option is always attempted but if it does
not fit, the minimum length is selected based on the counters
that have changed since the last update. The AccECN option
(with 24-bit fields) often ends in odd sizes so the option
write code tries to take advantage of some nop used to pad
the other TCP options.

The delivered_ecn_bytes pairs with received_ecn_bytes similar
to how delivered_ce pairs with received_ce. In contrast to
ACE field, however, the option is not always available to update
delivered_ecn_bytes. For ACK w/o AccECN option, the delivered
bytes calculated based on the cumulative ACK+SACK information
are assigned to one of the counters using an estimation
heuristic to select the most likely ECN byte counter. Any
estimation error is corrected when the next AccECN option
arrives. It may occur that the heuristic gets too confused
when there are enough different byte counter deltas between
ACKs with the AccECN option in which case the heuristic just
gives up on updating the counters for a while.

The tcp_ecn_option sysctl can be used to select the option sending
mode for AccECN: TCP_ACCECN_OPTION_DISABLED (0),
TCP_ACCECN_OPTION_MINIMUM (1), and TCP_ACCECN_OPTION_FULL (2).

This patch increases the size of tcp_info struct, as there are
no existing holes for new u32 variables. Below are the pahole
outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_info {
[...]
__u32 tcpi_total_rto_time; /* 244 4 */

/* size: 248, cachelines: 4, members: 61 */
}

[AFTER THIS PATCH]
struct tcp_info {
[...]
__u32 tcpi_total_rto_time; /* 244 4 */
__u32 tcpi_received_ce; /* 248 4 */
__u32 tcpi_delivered_e1_bytes; /* 252 4 */
__u32 tcpi_delivered_e0_bytes; /* 256 4 */
__u32 tcpi_delivered_ce_bytes; /* 260 4 */
__u32 tcpi_received_e1_bytes; /* 264 4 */
__u32 tcpi_received_e0_bytes; /* 268 4 */
__u32 tcpi_received_ce_bytes; /* 272 4 */

/* size: 280, cachelines: 5, members: 68 */
}

This patch uses the existing 1-byte holes in the tcp_sock_write_txrx
group for new u8 members, but adds a 4-byte hole in tcp_sock_write_rx
group after the new u32 delivered_ecn_bytes[3] member. Therefore, the
group size of tcp_sock_write_rx is increased from 96 to 112. Below
are the pahole outcomes before and after this patch:

[BEFORE THIS PATCH]
struct tcp_sock {
[...]
u8 received_ce_pending:4; /* 2522: 0 1 */
u8 unused2:4; /* 2522: 4 1 */
/* XXX 1 byte hole, try to pack */

[...]
u32 rcv_rtt_last_tsecr; /* 2668 4 */

[...]
__cacheline_group_end__tcp_sock_write_rx[0]; /* 2728 0 */

[...]
/* size: 3200, cachelines: 50, members: 167 */
}

[AFTER THIS PATCH]
struct tcp_sock {
[...]
u8 received_ce_pending:4;/* 2522: 0 1 */
u8 unused2:4; /* 2522: 4 1 */
u8 accecn_minlen:2; /* 2523: 0 1 */
u8 est_ecnfield:2; /* 2523: 2 1 */
u8 unused3:4; /* 2523: 4 1 */

[...]
u32 rcv_rtt_last_tsecr; /* 2668 4 */
u32 delivered_ecn_bytes[3];/* 2672 12 */
/* XXX 4 bytes hole, try to pack */

[...]
__cacheline_group_end__tcp_sock_write_rx[0]; /* 2744 0 */

[...]
/* size: 3200, cachelines: 50, members: 171 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-7-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Ilpo Järvinen and committed by
Paolo Abeni
b5e74132 77a4fdf4

+412 -13
+19
Documentation/networking/ip-sysctl.rst
··· 468 468 469 469 Default: 2 470 470 471 + tcp_ecn_option - INTEGER 472 + Control Accurate ECN (AccECN) option sending when AccECN has been 473 + successfully negotiated during handshake. Send logic inhibits 474 + sending AccECN options regardless of this setting when no AccECN 475 + option has been seen for the reverse direction. 476 + 477 + Possible values are: 478 + 479 + = ============================================================ 480 + 0 Never send AccECN option. This also disables sending AccECN 481 + option in SYN/ACK during handshake. 482 + 1 Send AccECN option sparingly according to the minimum option 483 + rules outlined in draft-ietf-tcpm-accurate-ecn. 484 + 2 Send AccECN option on every packet whenever it fits into TCP 485 + option space. 486 + = ============================================================ 487 + 488 + Default: 2 489 + 471 490 tcp_ecn_fallback - BOOLEAN 472 491 If the kernel detects that ECN connection misbehaves, enable fall 473 492 back to non-ECN. Currently, this knob implements the fallback
+3
Documentation/networking/net_cachelines/tcp_sock.rst
··· 104 104 u32 received_ce read_mostly read_write 105 105 u32[3] received_ecn_bytes read_mostly read_write 106 106 u8:4 received_ce_pending read_mostly read_write 107 + u32[3] delivered_ecn_bytes read_write 107 108 u8:2 syn_ect_snt write_mostly read_write 108 109 u8:2 syn_ect_rcv read_mostly read_write 110 + u8:2 accecn_minlen write_mostly read_write 111 + u8:2 est_ecnfield read_write 109 112 u8:4 accecn_fail_mode 110 113 u32 lost read_mostly tcp_ack 111 114 u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
+7 -2
include/linux/tcp.h
··· 122 122 smc_ok : 1, /* SMC seen on SYN packet */ 123 123 snd_wscale : 4, /* Window scaling received from sender */ 124 124 rcv_wscale : 4; /* Window scaling to send to receiver */ 125 - u8 saw_unknown:1, /* Received unknown option */ 126 - unused:7; 125 + u8 accecn:6, /* AccECN index in header, 0=no options */ 126 + saw_unknown:1, /* Received unknown option */ 127 + unused:1; 127 128 u8 num_sacks; /* Number of SACK blocks */ 128 129 u16 user_mss; /* mss requested by user in ioctl */ 129 130 u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ ··· 294 293 rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ 295 294 u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ 296 295 unused2:4; 296 + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ 297 + est_ecnfield:2,/* ECN field for AccECN delivered estimates */ 298 + unused3:4; 297 299 __be32 pred_flags; 298 300 u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ 299 301 u64 tcp_mstamp; /* most recent packet received/sent */ ··· 341 337 u32 rate_delivered; /* saved rate sample: packets delivered */ 342 338 u32 rate_interval_us; /* saved rate sample: time elapsed */ 343 339 u32 rcv_rtt_last_tsecr; 340 + u32 delivered_ecn_bytes[3]; 344 341 u64 first_tx_mstamp; /* start of window send phase */ 345 342 u64 delivered_mstamp; /* time we reached "delivered" */ 346 343 u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked
+1
include/net/netns/ipv4.h
··· 148 148 struct local_ports ip_local_ports; 149 149 150 150 u8 sysctl_tcp_ecn; 151 + u8 sysctl_tcp_ecn_option; 151 152 u8 sysctl_tcp_ecn_fallback; 152 153 153 154 u8 sysctl_ip_default_ttl;
+13
include/net/tcp.h
··· 213 213 #define TCPOPT_AO 29 /* Authentication Option (RFC5925) */ 214 214 #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ 215 215 #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ 216 + #define TCPOPT_ACCECN0 172 /* 0xAC: Accurate ECN Order 0 */ 217 + #define TCPOPT_ACCECN1 174 /* 0xAE: Accurate ECN Order 1 */ 216 218 #define TCPOPT_EXP 254 /* Experimental */ 217 219 /* Magic number to be after the option value for sharing TCP 218 220 * experimental options. See draft-ietf-tcpm-experimental-options-00.txt ··· 232 230 #define TCPOLEN_TIMESTAMP 10 233 231 #define TCPOLEN_MD5SIG 18 234 232 #define TCPOLEN_FASTOPEN_BASE 2 233 + #define TCPOLEN_ACCECN_BASE 2 235 234 #define TCPOLEN_EXP_FASTOPEN_BASE 4 236 235 #define TCPOLEN_EXP_SMC_BASE 6 237 236 ··· 246 243 #define TCPOLEN_MD5SIG_ALIGNED 20 247 244 #define TCPOLEN_MSS_ALIGNED 4 248 245 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 246 + #define TCPOLEN_ACCECN_PERFIELD 3 247 + 248 + /* Maximum number of byte counters in AccECN option + size */ 249 + #define TCP_ACCECN_NUMFIELDS 3 250 + #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ 251 + TCPOLEN_ACCECN_PERFIELD * \ 252 + TCP_ACCECN_NUMFIELDS) 249 253 250 254 /* Flags in tp->nonagle */ 251 255 #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ ··· 991 981 * See draft-ietf-tcpm-accurate-ecn for the latest values. 992 982 */ 993 983 #define TCP_ACCECN_CEP_INIT_OFFSET 5 984 + #define TCP_ACCECN_E1B_INIT_OFFSET 1 985 + #define TCP_ACCECN_E0B_INIT_OFFSET 1 986 + #define TCP_ACCECN_CEB_INIT_OFFSET 0 994 987 995 988 /* State flags for sacked in struct tcp_skb_cb */ 996 989 enum tcp_skb_cb_sacked_flags {
+88 -1
include/net/tcp_ecn.h
··· 24 24 TCP_ECN_IN_ACCECN_OUT_NOECN = 5, 25 25 }; 26 26 27 + /* AccECN option sending when AccECN has been successfully negotiated */ 28 + enum tcp_accecn_option { 29 + TCP_ACCECN_OPTION_DISABLED = 0, 30 + TCP_ACCECN_OPTION_MINIMUM = 1, 31 + TCP_ACCECN_OPTION_FULL = 2, 32 + }; 33 + 27 34 static inline void tcp_ecn_queue_cwr(struct tcp_sock *tp) 28 35 { 29 36 /* Do not set CWR if in AccECN mode! */ ··· 176 169 } 177 170 } 178 171 172 + /* Maps IP ECN field ECT/CE code point to AccECN option field number, given 173 + * we are sending fields with Accurate ECN Order 1: ECT(1), CE, ECT(0). 174 + */ 175 + static inline u8 tcp_ecnfield_to_accecn_optfield(u8 ecnfield) 176 + { 177 + switch (ecnfield & INET_ECN_MASK) { 178 + case INET_ECN_NOT_ECT: 179 + return 0; /* AccECN does not send counts of NOT_ECT */ 180 + case INET_ECN_ECT_1: 181 + return 1; 182 + case INET_ECN_CE: 183 + return 2; 184 + case INET_ECN_ECT_0: 185 + return 3; 186 + } 187 + return 0; 188 + } 189 + 190 + /* Maps IP ECN field ECT/CE code point to AccECN option field value offset. 191 + * Some fields do not start from zero, to detect zeroing by middleboxes. 192 + */ 193 + static inline u32 tcp_accecn_field_init_offset(u8 ecnfield) 194 + { 195 + switch (ecnfield & INET_ECN_MASK) { 196 + case INET_ECN_NOT_ECT: 197 + return 0; /* AccECN does not send counts of NOT_ECT */ 198 + case INET_ECN_ECT_1: 199 + return TCP_ACCECN_E1B_INIT_OFFSET; 200 + case INET_ECN_CE: 201 + return TCP_ACCECN_CEB_INIT_OFFSET; 202 + case INET_ECN_ECT_0: 203 + return TCP_ACCECN_E0B_INIT_OFFSET; 204 + } 205 + return 0; 206 + } 207 + 208 + /* Maps AccECN option field #nr to IP ECN field ECT/CE bits */ 209 + static inline unsigned int tcp_accecn_optfield_to_ecnfield(unsigned int option, 210 + bool order) 211 + { 212 + /* Based on Table 5 of the AccECN spec to map (option, order) to 213 + * the corresponding ECN counters (ECT-1, ECT-0, or CE).
214 + */ 215 + static const u8 optfield_lookup[2][3] = { 216 + /* order = 0: 1st field ECT-0, 2nd field CE, 3rd field ECT-1 */ 217 + { INET_ECN_ECT_0, INET_ECN_CE, INET_ECN_ECT_1 }, 218 + /* order = 1: 1st field ECT-1, 2nd field CE, 3rd field ECT-0 */ 219 + { INET_ECN_ECT_1, INET_ECN_CE, INET_ECN_ECT_0 } 220 + }; 221 + 222 + return optfield_lookup[order][option % 3]; 223 + } 224 + 225 + /* Handles AccECN option ECT and CE 24-bit byte counters update into 226 + * the u32 value in tcp_sock. As we're processing TCP options, it is 227 + * safe to access from - 1. 228 + */ 229 + static inline s32 tcp_update_ecn_bytes(u32 *cnt, const char *from, 230 + u32 init_offset) 231 + { 232 + u32 truncated = (get_unaligned_be32(from - 1) - init_offset) & 233 + 0xFFFFFFU; 234 + u32 delta = (truncated - *cnt) & 0xFFFFFFU; 235 + 236 + /* If delta has the highest bit set (24th bit) indicating 237 + * negative, sign extend to correct an estimation using 238 + * sign_extend32(delta, 24 - 1) 239 + */ 240 + delta = sign_extend32(delta, 23); 241 + *cnt += delta; 242 + return (s32)delta; 243 + } 244 + 179 245 /* Updates Accurate ECN received counters from the received IP ECN field */ 180 246 static inline void tcp_ecn_received_counters(struct sock *sk, 181 247 const struct sk_buff *skb, u32 len) ··· 272 192 tp->received_ce_pending = min(tp->received_ce_pending + pcount, 273 193 0xfU); 274 194 275 - if (len > 0) 195 + if (len > 0) { 196 + u8 minlen = tcp_ecnfield_to_accecn_optfield(ecnfield); 276 197 tp->received_ecn_bytes[ecnfield - 1] += len; 198 + tp->accecn_minlen = max_t(u8, tp->accecn_minlen, 199 + minlen); 200 + } 277 201 } 278 202 } 279 203 ··· 347 263 tp->received_ce = 0; 348 264 tp->received_ce_pending = 0; 349 265 __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); 266 + __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes); 267 + tp->accecn_minlen = 0; 268 + tp->est_ecnfield = 0; 350 269 } 351 270 352 271 /* Used for make_synack to form the ACE flags */
+7
include/uapi/linux/tcp.h
··· 316 316 * in milliseconds, including any 317 317 * unfinished recovery. 318 318 */ 319 + __u32 tcpi_received_ce; /* # of CE marks received */ 320 + __u32 tcpi_delivered_e1_bytes; /* Accurate ECN byte counters */ 321 + __u32 tcpi_delivered_e0_bytes; 322 + __u32 tcpi_delivered_ce_bytes; 323 + __u32 tcpi_received_e1_bytes; 324 + __u32 tcpi_received_e0_bytes; 325 + __u32 tcpi_received_ce_bytes; 319 326 }; 320 327 321 328 /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */
+9
net/ipv4/sysctl_net_ipv4.c
··· 732 732 .extra2 = &tcp_ecn_mode_max, 733 733 }, 734 734 { 735 + .procname = "tcp_ecn_option", 736 + .data = &init_net.ipv4.sysctl_tcp_ecn_option, 737 + .maxlen = sizeof(u8), 738 + .mode = 0644, 739 + .proc_handler = proc_dou8vec_minmax, 740 + .extra1 = SYSCTL_ZERO, 741 + .extra2 = SYSCTL_TWO, 742 + }, 743 + { 735 744 .procname = "tcp_ecn_fallback", 736 745 .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, 737 746 .maxlen = sizeof(u8),
+14 -1
net/ipv4/tcp.c
··· 270 270 271 271 #include <net/icmp.h> 272 272 #include <net/inet_common.h> 273 + #include <net/inet_ecn.h> 273 274 #include <net/tcp.h> 274 275 #include <net/tcp_ecn.h> 275 276 #include <net/mptcp.h> ··· 4156 4155 { 4157 4156 const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ 4158 4157 const struct inet_connection_sock *icsk = inet_csk(sk); 4158 + const u8 ect1_idx = INET_ECN_ECT_1 - 1; 4159 + const u8 ect0_idx = INET_ECN_ECT_0 - 1; 4160 + const u8 ce_idx = INET_ECN_CE - 1; 4159 4161 unsigned long rate; 4160 4162 u32 now; 4161 4163 u64 rate64; ··· 4284 4280 info->tcpi_total_rto_time = tp->total_rto_time; 4285 4281 if (tp->rto_stamp) 4286 4282 info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; 4283 + 4284 + info->tcpi_received_ce = tp->received_ce; 4285 + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; 4286 + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; 4287 + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; 4288 + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; 4289 + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; 4290 + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; 4287 4291 4288 4292 unlock_sock_fast(sk, slow); 4289 4293 } ··· 5174 5162 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); 5175 5163 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); 5176 5164 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); 5165 + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); 5177 5166 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); 5178 5167 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); 5179 5168 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); 5180 5169 CACHELINE_ASSERT_GROUP_MEMBER(struct 
tcp_sock, tcp_sock_write_rx, rcv_rtt_est); 5181 5170 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); 5182 - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 96); 5171 + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 112); 5183 5172 } 5184 5173 5185 5174 void __init tcp_init(void)
+90 -4
net/ipv4/tcp_input.c
··· 70 70 #include <linux/sysctl.h> 71 71 #include <linux/kernel.h> 72 72 #include <linux/prefetch.h> 73 + #include <linux/bitops.h> 73 74 #include <net/dst.h> 74 75 #include <net/tcp.h> 75 76 #include <net/tcp_ecn.h> ··· 385 384 } 386 385 } 387 386 387 + /* Returns true if the byte counters can be used */ 388 + static bool tcp_accecn_process_option(struct tcp_sock *tp, 389 + const struct sk_buff *skb, 390 + u32 delivered_bytes, int flag) 391 + { 392 + u8 estimate_ecnfield = tp->est_ecnfield; 393 + bool ambiguous_ecn_bytes_incr = false; 394 + bool first_changed = false; 395 + unsigned int optlen; 396 + bool order1, res; 397 + unsigned int i; 398 + u8 *ptr; 399 + 400 + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { 401 + if (estimate_ecnfield) { 402 + u8 ecnfield = estimate_ecnfield - 1; 403 + 404 + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; 405 + return true; 406 + } 407 + return false; 408 + } 409 + 410 + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; 411 + optlen = ptr[1] - 2; 412 + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) 413 + return false; 414 + order1 = (ptr[0] == TCPOPT_ACCECN1); 415 + ptr += 2; 416 + 417 + res = !!estimate_ecnfield; 418 + for (i = 0; i < 3; i++) { 419 + u32 init_offset; 420 + u8 ecnfield; 421 + s32 delta; 422 + u32 *cnt; 423 + 424 + if (optlen < TCPOLEN_ACCECN_PERFIELD) 425 + break; 426 + 427 + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); 428 + init_offset = tcp_accecn_field_init_offset(ecnfield); 429 + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; 430 + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); 431 + if (delta && delta < 0) { 432 + res = false; 433 + ambiguous_ecn_bytes_incr = true; 434 + } 435 + if (delta && ecnfield != estimate_ecnfield) { 436 + if (!first_changed) { 437 + tp->est_ecnfield = ecnfield; 438 + first_changed = true; 439 + } else { 440 + res = false; 441 + ambiguous_ecn_bytes_incr = true; 442 + } 443 + } 444 + 445 + optlen -= TCPOLEN_ACCECN_PERFIELD; 
446 + ptr += TCPOLEN_ACCECN_PERFIELD; 447 + } 448 + if (ambiguous_ecn_bytes_incr) 449 + tp->est_ecnfield = 0; 450 + 451 + return res; 452 + } 453 + 388 454 static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) 389 455 { 390 456 tp->delivered_ce += ecn_count; ··· 468 400 469 401 /* Returns the ECN CE delta */ 470 402 static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, 471 - u32 delivered_pkts, int flag) 403 + u32 delivered_pkts, u32 delivered_bytes, 404 + int flag) 472 405 { 473 406 const struct tcphdr *th = tcp_hdr(skb); 474 407 struct tcp_sock *tp = tcp_sk(sk); ··· 479 410 /* Reordered ACK or uncertain due to lack of data to send and ts */ 480 411 if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) 481 412 return 0; 413 + 414 + tcp_accecn_process_option(tp, skb, delivered_bytes, flag); 482 415 483 416 if (!(flag & FLAG_SLOWPATH)) { 484 417 /* AccECN counter might overflow on large ACKs */ ··· 507 436 } 508 437 509 438 static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, 510 - u32 delivered_pkts, int *flag) 439 + u32 delivered_pkts, u32 delivered_bytes, 440 + int *flag) 511 441 { 512 442 struct tcp_sock *tp = tcp_sk(sk); 513 443 u32 delta; 514 444 515 - delta = __tcp_accecn_process(sk, skb, delivered_pkts, *flag); 445 + delta = __tcp_accecn_process(sk, skb, delivered_pkts, 446 + delivered_bytes, *flag); 516 447 if (delta > 0) { 517 448 tcp_count_delivered_ce(tp, delta); 518 449 *flag |= FLAG_ECE; ··· 4046 3973 if (tcp_ecn_mode_accecn(tp)) 4047 3974 ecn_count = tcp_accecn_process(sk, skb, 4048 3975 tp->delivered - delivered, 3976 + sack_state.delivered_bytes, 4049 3977 &flag); 4050 3978 4051 3979 tcp_in_ack_event(sk, flag); ··· 4086 4012 if (tcp_ecn_mode_accecn(tp)) 4087 4013 ecn_count = tcp_accecn_process(sk, skb, 4088 4014 tp->delivered - delivered, 4015 + sack_state.delivered_bytes, 4089 4016 &flag); 4090 4017 tcp_in_ack_event(sk, flag); 4091 4018 /* If data was DSACKed, see if we can undo a cwnd 
reduction. */ ··· 4214 4139 4215 4140 ptr = (const unsigned char *)(th + 1); 4216 4141 opt_rx->saw_tstamp = 0; 4142 + opt_rx->accecn = 0; 4217 4143 opt_rx->saw_unknown = 0; 4218 4144 4219 4145 while (length > 0) { ··· 4306 4230 ptr, th->syn, foc, false); 4307 4231 break; 4308 4232 4233 + case TCPOPT_ACCECN0: 4234 + case TCPOPT_ACCECN1: 4235 + /* Save offset of AccECN option in TCP header */ 4236 + opt_rx->accecn = (ptr - 2) - (__u8 *)th; 4237 + break; 4238 + 4309 4239 case TCPOPT_EXP: 4310 4240 /* Fast Open option shares code 254 using a 4311 4241 * 16 bits magic number. ··· 4372 4290 */ 4373 4291 if (th->doff == (sizeof(*th) / 4)) { 4374 4292 tp->rx_opt.saw_tstamp = 0; 4293 + tp->rx_opt.accecn = 0; 4375 4294 return false; 4376 4295 } else if (tp->rx_opt.tstamp_ok && 4377 4296 th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { 4378 - if (tcp_parse_aligned_timestamp(tp, th)) 4297 + if (tcp_parse_aligned_timestamp(tp, th)) { 4298 + tp->rx_opt.accecn = 0; 4379 4299 return true; 4300 + } 4380 4301 } 4381 4302 4382 4303 tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); ··· 6204 6119 */ 6205 6120 6206 6121 tp->rx_opt.saw_tstamp = 0; 6122 + tp->rx_opt.accecn = 0; 6207 6123 6208 6124 /* pred_flags is 0xS?10 << 16 + snd_wnd 6209 6125 * if header_prediction is to be made
+1
net/ipv4/tcp_ipv4.c
··· 3561 3561 static int __net_init tcp_sk_init(struct net *net) 3562 3562 { 3563 3563 net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; 3564 + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; 3564 3565 net->ipv4.sysctl_tcp_ecn_fallback = 1; 3565 3566 3566 3567 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+160 -5
net/ipv4/tcp_output.c
··· 385 385 #define OPTION_SMC BIT(9) 386 386 #define OPTION_MPTCP BIT(10) 387 387 #define OPTION_AO BIT(11) 388 + #define OPTION_ACCECN BIT(12) 388 389 389 390 static void smc_options_write(__be32 *ptr, u16 *options) 390 391 { ··· 407 406 u16 mss; /* 0 to disable */ 408 407 u8 ws; /* window scale, 0 to disable */ 409 408 u8 num_sack_blocks; /* number of SACK blocks to include */ 409 + u8 num_accecn_fields:7, /* number of AccECN fields needed */ 410 + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ 410 411 u8 hash_size; /* bytes in hash_location */ 411 412 u8 bpf_opt_len; /* length of BPF hdr option */ 412 413 __u8 *hash_location; /* temporary pointer, overloaded */ ··· 606 603 return ptr; 607 604 } 608 605 606 + /* Initial values for AccECN option, ordered is based on ECN field bits 607 + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. 608 + */ 609 + static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; 610 + 609 611 /* Write previously computed TCP options to the packet. 610 612 * 611 613 * Beware: Something in the Internet is very sensitive to the ordering of ··· 629 621 struct tcp_out_options *opts, 630 622 struct tcp_key *key) 631 623 { 624 + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ 625 + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ 632 626 __be32 *ptr = (__be32 *)(th + 1); 633 627 u16 options = opts->options; /* mungable copy */ 634 628 ··· 666 656 *ptr++ = htonl(opts->tsecr); 667 657 } 668 658 659 + if (OPTION_ACCECN & options) { 660 + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? 
661 + synack_ecn_bytes : 662 + tp->received_ecn_bytes; 663 + const u8 ect0_idx = INET_ECN_ECT_0 - 1; 664 + const u8 ect1_idx = INET_ECN_ECT_1 - 1; 665 + const u8 ce_idx = INET_ECN_CE - 1; 666 + u32 e0b; 667 + u32 e1b; 668 + u32 ceb; 669 + u8 len; 670 + 671 + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; 672 + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; 673 + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; 674 + len = TCPOLEN_ACCECN_BASE + 675 + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; 676 + 677 + if (opts->num_accecn_fields == 2) { 678 + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | 679 + ((e1b >> 8) & 0xffff)); 680 + *ptr++ = htonl(((e1b & 0xff) << 24) | 681 + (ceb & 0xffffff)); 682 + } else if (opts->num_accecn_fields == 1) { 683 + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | 684 + ((e1b >> 8) & 0xffff)); 685 + leftover_highbyte = e1b & 0xff; 686 + leftover_lowbyte = TCPOPT_NOP; 687 + } else if (opts->num_accecn_fields == 0) { 688 + leftover_highbyte = TCPOPT_ACCECN1; 689 + leftover_lowbyte = len; 690 + } else if (opts->num_accecn_fields == 3) { 691 + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | 692 + ((e1b >> 8) & 0xffff)); 693 + *ptr++ = htonl(((e1b & 0xff) << 24) | 694 + (ceb & 0xffffff)); 695 + *ptr++ = htonl(((e0b & 0xffffff) << 8) | 696 + TCPOPT_NOP); 697 + } 698 + if (tp) 699 + tp->accecn_minlen = 0; 700 + } 701 + 669 702 if (unlikely(OPTION_SACK_ADVERTISE & options)) { 670 - *ptr++ = htonl((TCPOPT_NOP << 24) | 671 - (TCPOPT_NOP << 16) | 703 + *ptr++ = htonl((leftover_highbyte << 24) | 704 + (leftover_lowbyte << 16) | 672 705 (TCPOPT_SACK_PERM << 8) | 673 706 TCPOLEN_SACK_PERM); 707 + leftover_highbyte = TCPOPT_NOP; 708 + leftover_lowbyte = TCPOPT_NOP; 674 709 } 675 710 676 711 if (unlikely(OPTION_WSCALE & options)) { 677 - *ptr++ = htonl((TCPOPT_NOP << 24) | 712 + u8 highbyte = TCPOPT_NOP; 713 + 714 + /* Do not split the leftover 2-byte to fit into a single 715 + * NOP, i.e., replace this NOP 
only when 1 byte is leftover 716 + * within leftover_highbyte. 717 + */ 718 + if (unlikely(leftover_highbyte != TCPOPT_NOP && 719 + leftover_lowbyte == TCPOPT_NOP)) { 720 + highbyte = leftover_highbyte; 721 + leftover_highbyte = TCPOPT_NOP; 722 + } 723 + *ptr++ = htonl((highbyte << 24) | 678 724 (TCPOPT_WINDOW << 16) | 679 725 (TCPOLEN_WINDOW << 8) | 680 726 opts->ws); ··· 741 675 tp->duplicate_sack : tp->selective_acks; 742 676 int this_sack; 743 677 744 - *ptr++ = htonl((TCPOPT_NOP << 24) | 745 - (TCPOPT_NOP << 16) | 678 + *ptr++ = htonl((leftover_highbyte << 24) | 679 + (leftover_lowbyte << 16) | 746 680 (TCPOPT_SACK << 8) | 747 681 (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * 748 682 TCPOLEN_SACK_PERBLOCK))); 683 + leftover_highbyte = TCPOPT_NOP; 684 + leftover_lowbyte = TCPOPT_NOP; 749 685 750 686 for (this_sack = 0; this_sack < opts->num_sack_blocks; 751 687 ++this_sack) { ··· 756 688 } 757 689 758 690 tp->rx_opt.dsack = 0; 691 + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || 692 + leftover_lowbyte != TCPOPT_NOP)) { 693 + *ptr++ = htonl((leftover_highbyte << 24) | 694 + (leftover_lowbyte << 16) | 695 + (TCPOPT_NOP << 8) | 696 + TCPOPT_NOP); 697 + leftover_highbyte = TCPOPT_NOP; 698 + leftover_lowbyte = TCPOPT_NOP; 759 699 } 760 700 761 701 if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { ··· 842 766 } 843 767 } 844 768 } 769 + } 770 + 771 + static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) 772 + { 773 + /* How much there's room for combining with the alignment padding? */ 774 + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == 775 + OPTION_SACK_ADVERTISE) 776 + return 2; 777 + else if (opts->options & OPTION_WSCALE) 778 + return 1; 779 + return 0; 780 + } 781 + 782 + /* Calculates how long AccECN option will fit to @remaining option space. 783 + * 784 + * AccECN option can sometimes replace NOPs used for alignment of other 785 + * TCP options (up to @max_combine_saving available). 
786 + * 787 + * Only solutions with at least @required AccECN fields are accepted. 788 + * 789 + * Returns: The size of the AccECN option excluding space repurposed from 790 + * the alignment of the other options. 791 + */ 792 + static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, 793 + int remaining) 794 + { 795 + int size = TCP_ACCECN_MAXSIZE; 796 + int max_combine_saving; 797 + int align_size; 798 + 799 + if (opts->use_synack_ecn_bytes) 800 + max_combine_saving = tcp_synack_options_combine_saving(opts); 801 + else 802 + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; 803 + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; 804 + while (opts->num_accecn_fields >= required) { 805 + /* Pad to dword if cannot combine */ 806 + if ((size & 0x3) > max_combine_saving) 807 + align_size = ALIGN(size, 4); 808 + else 809 + align_size = ALIGN_DOWN(size, 4); 810 + 811 + if (remaining >= align_size) { 812 + size = align_size; 813 + break; 814 + } 815 + 816 + opts->num_accecn_fields--; 817 + size -= TCPOLEN_ACCECN_PERFIELD; 818 + } 819 + if (opts->num_accecn_fields < required) 820 + return 0; 821 + 822 + opts->options |= OPTION_ACCECN; 823 + return size; 845 824 } 846 825 847 826 /* Compute TCP options for SYN packets. 
This is not the final ··· 981 850 } 982 851 } 983 852 853 + /* Simultaneous open SYN/ACK needs AccECN option but not SYN */ 854 + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && 855 + tcp_ecn_mode_accecn(tp) && 856 + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && 857 + remaining >= TCPOLEN_ACCECN_BASE)) { 858 + opts->use_synack_ecn_bytes = 1; 859 + remaining -= tcp_options_fit_accecn(opts, 0, remaining); 860 + } 861 + 984 862 bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); 985 863 986 864 return MAX_TCP_OPTION_SPACE - remaining; ··· 1007 867 { 1008 868 struct inet_request_sock *ireq = inet_rsk(req); 1009 869 unsigned int remaining = MAX_TCP_OPTION_SPACE; 870 + struct tcp_request_sock *treq = tcp_rsk(req); 1010 871 1011 872 if (tcp_key_is_md5(key)) { 1012 873 opts->options |= OPTION_MD5; ··· 1069 928 mptcp_set_option_cond(req, opts, &remaining); 1070 929 1071 930 smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); 931 + 932 + if (treq->accecn_ok && 933 + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && 934 + remaining >= TCPOLEN_ACCECN_BASE) { 935 + opts->use_synack_ecn_bytes = 1; 936 + remaining -= tcp_options_fit_accecn(opts, 0, remaining); 937 + } 1072 938 1073 939 bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, 1074 940 synack_type, opts, &remaining); ··· 1147 999 } 1148 1000 } else { 1149 1001 opts->num_sack_blocks = 0; 1002 + } 1003 + 1004 + if (tcp_ecn_mode_accecn(tp) && 1005 + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option)) { 1006 + opts->use_synack_ecn_bytes = 0; 1007 + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, 1008 + MAX_TCP_OPTION_SPACE - size); 1150 1009 } 1151 1010 1152 1011 if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,