Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tcp: accecn: add AccECN rx byte counters

These three byte counters track the payload byte sums, per IP ECN
field value (ECT0, ECT1, and CE), of all arriving (acceptable)
packets. The AccECN option (added by a later patch in the series)
echoes these counters back to the sender side; therefore, they are
placed within the tcp_sock_write_txrx group.

Below are the pahole outputs before and after this patch. The size
of the tcp_sock_write_txrx group increases from 95 + 4 to 107 + 4,
and an extra 4-byte hole is created, which will be used by later
patches:

[BEFORE THIS PATCH]
struct tcp_sock {
[...]
u32 delivered_ce; /* 2576 4 */
u32 received_ce; /* 2580 4 */
u32 app_limited; /* 2584 4 */
u32 rcv_wnd; /* 2588 4 */
struct tcp_options_received rx_opt; /* 2592 24 */
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2616 0 */

[...]
/* size: 3200, cachelines: 50, members: 166 */
}

[AFTER THIS PATCH]
struct tcp_sock {
[...]
u32 delivered_ce; /* 2576 4 */
u32 received_ce; /* 2580 4 */
u32 received_ecn_bytes[3];/* 2584 12 */
u32 app_limited; /* 2596 4 */
u32 rcv_wnd; /* 2600 4 */
struct tcp_options_received rx_opt; /* 2604 24 */
__cacheline_group_end__tcp_sock_write_txrx[0]; /* 2628 0 */
/* XXX 4 bytes hole, try to pack */

[...]
/* size: 3200, cachelines: 50, members: 167 */
}

Signed-off-by: Ilpo Järvinen <ij@kernel.org>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Co-developed-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Signed-off-by: Chia-Yu Chang <chia-yu.chang@nokia-bell-labs.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250916082434.100722-4-chia-yu.chang@nokia-bell-labs.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Ilpo Järvinen and committed by
Paolo Abeni
9a011277 3cae3427

+40 -6
+1
Documentation/networking/net_cachelines/tcp_sock.rst
··· 102 102 u32 delivered read_mostly read_write tcp_rate_skb_sent, tcp_newly_delivered(tx);tcp_ack, tcp_rate_gen, tcp_clean_rtx_queue (rx) 103 103 u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx) 104 104 u32 received_ce read_mostly read_write 105 + u32[3] received_ecn_bytes read_mostly read_write 105 106 u8:4 received_ce_pending read_mostly read_write 106 107 u8:2 syn_ect_snt write_mostly read_write 107 108 u8:2 syn_ect_rcv read_mostly read_write
+4
include/linux/tcp.h
··· 306 306 u32 delivered; /* Total data packets delivered incl. rexmits */ 307 307 u32 delivered_ce; /* Like the above but only ECE marked packets */ 308 308 u32 received_ce; /* Like the above but for rcvd CE marked pkts */ 309 + u32 received_ecn_bytes[3]; /* received byte counters for three ECN 310 + * types: INET_ECN_ECT_1, INET_ECN_ECT_0, 311 + * and INET_ECN_CE 312 + */ 309 313 u32 app_limited; /* limited until "delivered" reaches this val */ 310 314 u32 rcv_wnd; /* Current receiver window */ 311 315 /*
+28 -1
include/net/tcp_ecn.h
··· 171 171 172 172 /* Updates Accurate ECN received counters from the received IP ECN field */ 173 173 static inline void tcp_ecn_received_counters(struct sock *sk, 174 - const struct sk_buff *skb) 174 + const struct sk_buff *skb, u32 len) 175 175 { 176 176 u8 ecnfield = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; 177 177 u8 is_ce = INET_ECN_is_ce(ecnfield); ··· 191 191 tp->received_ce += pcount; 192 192 tp->received_ce_pending = min(tp->received_ce_pending + pcount, 193 193 0xfU); 194 + 195 + if (len > 0) 196 + tp->received_ecn_bytes[ecnfield - 1] += len; 194 197 } 198 + } 199 + 200 + /* AccECN specification, 2.2: [...] A Data Receiver maintains four counters 201 + * initialized at the start of the half-connection. [...] These byte counters 202 + * reflect only the TCP payload length, excluding TCP header and TCP options. 203 + */ 204 + static inline void tcp_ecn_received_counters_payload(struct sock *sk, 205 + const struct sk_buff *skb) 206 + { 207 + const struct tcphdr *th = (const struct tcphdr *)skb->data; 208 + 209 + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); 195 210 } 196 211 197 212 /* AccECN specification, 5.1: [...] a server can determine that it ··· 247 232 return ace && ace != 0x3; 248 233 } 249 234 235 + static inline void __tcp_accecn_init_bytes_counters(int *counter_array) 236 + { 237 + BUILD_BUG_ON(INET_ECN_ECT_1 != 0x1); 238 + BUILD_BUG_ON(INET_ECN_ECT_0 != 0x2); 239 + BUILD_BUG_ON(INET_ECN_CE != 0x3); 240 + 241 + counter_array[INET_ECN_ECT_1 - 1] = 0; 242 + counter_array[INET_ECN_ECT_0 - 1] = 0; 243 + counter_array[INET_ECN_CE - 1] = 0; 244 + } 245 + 250 246 static inline void tcp_accecn_init_counters(struct tcp_sock *tp) 251 247 { 252 248 tp->received_ce = 0; 253 249 tp->received_ce_pending = 0; 250 + __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes); 254 251 } 255 252 256 253 /* Used for make_synack to form the ACE flags */
+2 -1
net/ipv4/tcp.c
··· 5142 5142 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); 5143 5143 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); 5144 5144 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); 5145 + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); 5145 5146 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); 5146 5147 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); 5147 5148 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); ··· 5150 5149 /* 32bit arches with 8byte alignment on u64 fields might need padding 5151 5150 * before tcp_clock_cache. 5152 5151 */ 5153 - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 95 + 4); 5152 + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 107 + 4); 5154 5153 5155 5154 /* RX read-write hotpath cache lines */ 5156 5155 CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
+4 -3
net/ipv4/tcp_input.c
··· 6163 6163 flag |= __tcp_replace_ts_recent(tp, 6164 6164 delta); 6165 6165 6166 - tcp_ecn_received_counters(sk, skb); 6166 + tcp_ecn_received_counters(sk, skb, 0); 6167 6167 6168 6168 /* We know that such packets are checksummed 6169 6169 * on entry. ··· 6213 6213 /* Bulk data transfer: receiver */ 6214 6214 tcp_cleanup_skb(skb); 6215 6215 __skb_pull(skb, tcp_header_len); 6216 - tcp_ecn_received_counters(sk, skb); 6216 + tcp_ecn_received_counters(sk, skb, 6217 + len - tcp_header_len); 6217 6218 eaten = tcp_queue_rcv(sk, skb, &fragstolen); 6218 6219 6219 6220 tcp_event_data_recv(sk, skb); ··· 6255 6254 return; 6256 6255 6257 6256 step5: 6258 - tcp_ecn_received_counters(sk, skb); 6257 + tcp_ecn_received_counters_payload(sk, skb); 6259 6258 6260 6259 reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); 6261 6260 if ((int)reason < 0) {
+1 -1
net/ipv4/tcp_minisocks.c
··· 463 463 tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); 464 464 tp->syn_ect_snt = treq->syn_ect_snt; 465 465 tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); 466 - tcp_ecn_received_counters(sk, skb); 466 + tcp_ecn_received_counters_payload(sk, skb); 467 467 } else { 468 468 tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 469 469 TCP_ECN_MODE_RFC3168 :