Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'add-tcp-fraglist-gro-support'

Felix Fietkau says:

====================
Add TCP fraglist GRO support

When forwarding TCP after GRO, software segmentation is very expensive,
especially when the checksum needs to be recalculated.
One case where that's currently unavoidable is when routing packets over
PPPoE. Performance improves significantly when using fraglist GRO
implemented in the same way as for UDP.

When NETIF_F_GRO_FRAGLIST is enabled, perform a lookup for an established
socket in the same netns as the receiving device. While this may not
cover all relevant use cases in multi-netns configurations, it should be
good enough for most configurations that need this.

Here's a measurement of running 2 TCP streams through a MediaTek MT7622
device (2-core Cortex-A53), which runs NAT with flow offload enabled from
one Ethernet port to PPPoE on another Ethernet port, with the cake qdisc
set to 1 Gbit/s.

rx-gro-list off: 630 Mbit/s, CPU 35% idle
rx-gro-list on: 770 Mbit/s, CPU 40% idle

Changes since v4:
- add likely() to prefer the non-fraglist path in check

Changes since v3:
- optimize __tcpv4_gso_segment_csum
- add unlikely()
- reorder dev_net/skb_gro_network_header calls after NETIF_F_GRO_FRAGLIST
check
- add support for ipv6 nat
- drop redundant pskb_may_pull check

Changes since v2:
- create tcp_gro_pull_header helper function to pull tcp header only once
- optimize __tcpv4_gso_segment_list_csum, drop obsolete flags check

Changes since v1:
- revert bogus tcp flags overwrite on segmentation
- fix kbuild issue with !CONFIG_IPV6
- only perform socket lookup for the first skb in the GRO train

Changes since RFC:
- split up patches
- handle TCP flags mutations
====================

Link: https://lore.kernel.org/r/20240502084450.44009-1-nbd@nbd.name
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+334 -78
+1
include/net/gro.h
··· 438 438 } 439 439 440 440 int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb); 441 + int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb); 441 442 442 443 /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */ 443 444 static inline void gro_normal_list(struct napi_struct *napi)
+4 -1
include/net/tcp.h
··· 2191 2191 2192 2192 struct sk_buff *tcp_gso_segment(struct sk_buff *skb, 2193 2193 netdev_features_t features); 2194 - struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); 2194 + struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb); 2195 + struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th); 2196 + struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, 2197 + struct tcphdr *th); 2195 2198 INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); 2196 2199 INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); 2197 2200 INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff));
+27
net/core/gro.c
··· 231 231 return 0; 232 232 } 233 233 234 + int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) 235 + { 236 + if (unlikely(p->len + skb->len >= 65536)) 237 + return -E2BIG; 238 + 239 + if (NAPI_GRO_CB(p)->last == p) 240 + skb_shinfo(p)->frag_list = skb; 241 + else 242 + NAPI_GRO_CB(p)->last->next = skb; 243 + 244 + skb_pull(skb, skb_gro_offset(skb)); 245 + 246 + NAPI_GRO_CB(p)->last = skb; 247 + NAPI_GRO_CB(p)->count++; 248 + p->data_len += skb->len; 249 + 250 + /* sk ownership - if any - completely transferred to the aggregated packet */ 251 + skb->destructor = NULL; 252 + skb->sk = NULL; 253 + p->truesize += skb->truesize; 254 + p->len += skb->len; 255 + 256 + NAPI_GRO_CB(skb)->same_flow = 1; 257 + 258 + return 0; 259 + } 260 + 234 261 235 262 static void napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb) 236 263 {
+187 -45
net/ipv4/tcp_offload.c
··· 28 28 } 29 29 } 30 30 31 + static void __tcpv4_gso_segment_csum(struct sk_buff *seg, 32 + __be32 *oldip, __be32 newip, 33 + __be16 *oldport, __be16 newport) 34 + { 35 + struct tcphdr *th; 36 + struct iphdr *iph; 37 + 38 + if (*oldip == newip && *oldport == newport) 39 + return; 40 + 41 + th = tcp_hdr(seg); 42 + iph = ip_hdr(seg); 43 + 44 + inet_proto_csum_replace4(&th->check, seg, *oldip, newip, true); 45 + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); 46 + *oldport = newport; 47 + 48 + csum_replace4(&iph->check, *oldip, newip); 49 + *oldip = newip; 50 + } 51 + 52 + static struct sk_buff *__tcpv4_gso_segment_list_csum(struct sk_buff *segs) 53 + { 54 + const struct tcphdr *th; 55 + const struct iphdr *iph; 56 + struct sk_buff *seg; 57 + struct tcphdr *th2; 58 + struct iphdr *iph2; 59 + 60 + seg = segs; 61 + th = tcp_hdr(seg); 62 + iph = ip_hdr(seg); 63 + th2 = tcp_hdr(seg->next); 64 + iph2 = ip_hdr(seg->next); 65 + 66 + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && 67 + iph->daddr == iph2->daddr && iph->saddr == iph2->saddr) 68 + return segs; 69 + 70 + while ((seg = seg->next)) { 71 + th2 = tcp_hdr(seg); 72 + iph2 = ip_hdr(seg); 73 + 74 + __tcpv4_gso_segment_csum(seg, 75 + &iph2->saddr, iph->saddr, 76 + &th2->source, th->source); 77 + __tcpv4_gso_segment_csum(seg, 78 + &iph2->daddr, iph->daddr, 79 + &th2->dest, th->dest); 80 + } 81 + 82 + return segs; 83 + } 84 + 85 + static struct sk_buff *__tcp4_gso_segment_list(struct sk_buff *skb, 86 + netdev_features_t features) 87 + { 88 + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); 89 + if (IS_ERR(skb)) 90 + return skb; 91 + 92 + return __tcpv4_gso_segment_list_csum(skb); 93 + } 94 + 31 95 static struct sk_buff *tcp4_gso_segment(struct sk_buff *skb, 32 96 netdev_features_t features) 33 97 { ··· 100 36 101 37 if (!pskb_may_pull(skb, sizeof(struct tcphdr))) 102 38 return ERR_PTR(-EINVAL); 39 + 40 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) 41 + return 
__tcp4_gso_segment_list(skb, features); 103 42 104 43 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 105 44 const struct iphdr *iph = ip_hdr(skb); ··· 245 178 return segs; 246 179 } 247 180 248 - struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb) 181 + struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) 249 182 { 250 - struct sk_buff *pp = NULL; 251 - struct sk_buff *p; 252 - struct tcphdr *th; 253 183 struct tcphdr *th2; 254 - unsigned int len; 255 - unsigned int thlen; 256 - __be32 flags; 257 - unsigned int mss = 1; 258 - unsigned int hlen; 259 - unsigned int off; 260 - int flush = 1; 261 - int i; 262 - 263 - off = skb_gro_offset(skb); 264 - hlen = off + sizeof(*th); 265 - th = skb_gro_header(skb, hlen, off); 266 - if (unlikely(!th)) 267 - goto out; 268 - 269 - thlen = th->doff * 4; 270 - if (thlen < sizeof(*th)) 271 - goto out; 272 - 273 - hlen = off + thlen; 274 - if (!skb_gro_may_pull(skb, hlen)) { 275 - th = skb_gro_header_slow(skb, hlen, off); 276 - if (unlikely(!th)) 277 - goto out; 278 - } 279 - 280 - skb_gro_pull(skb, thlen); 281 - 282 - len = skb_gro_len(skb); 283 - flags = tcp_flag_word(th); 184 + struct sk_buff *p; 284 185 285 186 list_for_each_entry(p, head, list) { 286 187 if (!NAPI_GRO_CB(p)->same_flow) 287 188 continue; 288 189 289 190 th2 = tcp_hdr(p); 290 - 291 191 if (*(u32 *)&th->source ^ *(u32 *)&th2->source) { 292 192 NAPI_GRO_CB(p)->same_flow = 0; 293 193 continue; 294 194 } 295 195 296 - goto found; 196 + return p; 297 197 } 298 - p = NULL; 299 - goto out_check_final; 300 198 301 - found: 199 + return NULL; 200 + } 201 + 202 + struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) 203 + { 204 + unsigned int thlen, hlen, off; 205 + struct tcphdr *th; 206 + 207 + off = skb_gro_offset(skb); 208 + hlen = off + sizeof(*th); 209 + th = skb_gro_header(skb, hlen, off); 210 + if (unlikely(!th)) 211 + return NULL; 212 + 213 + thlen = th->doff * 4; 214 + if (thlen < sizeof(*th)) 215 + return NULL; 
216 + 217 + hlen = off + thlen; 218 + if (!skb_gro_may_pull(skb, hlen)) { 219 + th = skb_gro_header_slow(skb, hlen, off); 220 + if (unlikely(!th)) 221 + return NULL; 222 + } 223 + 224 + skb_gro_pull(skb, thlen); 225 + 226 + return th; 227 + } 228 + 229 + struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, 230 + struct tcphdr *th) 231 + { 232 + unsigned int thlen = th->doff * 4; 233 + struct sk_buff *pp = NULL; 234 + struct sk_buff *p; 235 + struct tcphdr *th2; 236 + unsigned int len; 237 + __be32 flags; 238 + unsigned int mss = 1; 239 + int flush = 1; 240 + int i; 241 + 242 + len = skb_gro_len(skb); 243 + flags = tcp_flag_word(th); 244 + 245 + p = tcp_gro_lookup(head, th); 246 + if (!p) 247 + goto out_check_final; 248 + 302 249 /* Include the IP ID check below from the inner most IP hdr */ 250 + th2 = tcp_hdr(p); 303 251 flush = NAPI_GRO_CB(p)->flush; 304 252 flush |= (__force int)(flags & TCP_FLAG_CWR); 305 253 flush |= (__force int)((flags ^ tcp_flag_word(th2)) & ··· 349 267 flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); 350 268 flush |= skb_cmp_decrypted(p, skb); 351 269 270 + if (unlikely(NAPI_GRO_CB(p)->is_flist)) { 271 + flush |= (__force int)(flags ^ tcp_flag_word(th2)); 272 + flush |= skb->ip_summed != p->ip_summed; 273 + flush |= skb->csum_level != p->csum_level; 274 + flush |= NAPI_GRO_CB(p)->count >= 64; 275 + 276 + if (flush || skb_gro_receive_list(p, skb)) 277 + mss = 1; 278 + 279 + goto out_check_final; 280 + } 281 + 352 282 if (flush || skb_gro_receive(p, skb)) { 353 283 mss = 1; 354 284 goto out_check_final; ··· 382 288 if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) 383 289 pp = p; 384 290 385 - out: 386 291 NAPI_GRO_CB(skb)->flush |= (flush != 0); 387 292 388 293 return pp; ··· 407 314 } 408 315 EXPORT_SYMBOL(tcp_gro_complete); 409 316 317 + static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, 318 + struct tcphdr *th) 319 + { 320 + const struct iphdr *iph; 321 + struct sk_buff 
*p; 322 + struct sock *sk; 323 + struct net *net; 324 + int iif, sdif; 325 + 326 + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) 327 + return; 328 + 329 + p = tcp_gro_lookup(head, th); 330 + if (p) { 331 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; 332 + return; 333 + } 334 + 335 + inet_get_iif_sdif(skb, &iif, &sdif); 336 + iph = skb_gro_network_header(skb); 337 + net = dev_net(skb->dev); 338 + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 339 + iph->saddr, th->source, 340 + iph->daddr, ntohs(th->dest), 341 + iif, sdif); 342 + NAPI_GRO_CB(skb)->is_flist = !sk; 343 + if (sk) 344 + sock_put(sk); 345 + } 346 + 410 347 INDIRECT_CALLABLE_SCOPE 411 348 struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb) 412 349 { 350 + struct tcphdr *th; 351 + 413 352 /* Don't bother verifying checksum if we're going to flush anyway. */ 414 353 if (!NAPI_GRO_CB(skb)->flush && 415 354 skb_gro_checksum_validate(skb, IPPROTO_TCP, 416 - inet_gro_compute_pseudo)) { 417 - NAPI_GRO_CB(skb)->flush = 1; 418 - return NULL; 419 - } 355 + inet_gro_compute_pseudo)) 356 + goto flush; 420 357 421 - return tcp_gro_receive(head, skb); 358 + th = tcp_gro_pull_header(skb); 359 + if (!th) 360 + goto flush; 361 + 362 + tcp4_check_fraglist_gro(head, skb, th); 363 + 364 + return tcp_gro_receive(head, skb, th); 365 + 366 + flush: 367 + NAPI_GRO_CB(skb)->flush = 1; 368 + return NULL; 422 369 } 423 370 424 371 INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) 425 372 { 426 373 const struct iphdr *iph = ip_hdr(skb); 427 374 struct tcphdr *th = tcp_hdr(skb); 375 + 376 + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { 377 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV4; 378 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; 379 + 380 + __skb_incr_checksum_unnecessary(skb); 381 + 382 + return 0; 383 + } 428 384 429 385 th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, 430 386 iph->daddr, 0);
-27
net/ipv4/udp_offload.c
··· 433 433 return segs; 434 434 } 435 435 436 - static int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) 437 - { 438 - if (unlikely(p->len + skb->len >= 65536)) 439 - return -E2BIG; 440 - 441 - if (NAPI_GRO_CB(p)->last == p) 442 - skb_shinfo(p)->frag_list = skb; 443 - else 444 - NAPI_GRO_CB(p)->last->next = skb; 445 - 446 - skb_pull(skb, skb_gro_offset(skb)); 447 - 448 - NAPI_GRO_CB(p)->last = skb; 449 - NAPI_GRO_CB(p)->count++; 450 - p->data_len += skb->len; 451 - 452 - /* sk ownership - if any - completely transferred to the aggregated packet */ 453 - skb->destructor = NULL; 454 - skb->sk = NULL; 455 - p->truesize += skb->truesize; 456 - p->len += skb->len; 457 - 458 - NAPI_GRO_CB(skb)->same_flow = 1; 459 - 460 - return 0; 461 - } 462 - 463 436 464 437 #define UDP_GRO_CNT_MAX 64 465 438 static struct sk_buff *udp_gro_receive_segment(struct list_head *head,
+115 -5
net/ipv6/tcpv6_offload.c
··· 7 7 */ 8 8 #include <linux/indirect_call_wrapper.h> 9 9 #include <linux/skbuff.h> 10 + #include <net/inet6_hashtables.h> 10 11 #include <net/gro.h> 11 12 #include <net/protocol.h> 12 13 #include <net/tcp.h> 13 14 #include <net/ip6_checksum.h> 14 15 #include "ip6_offload.h" 15 16 17 + static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, 18 + struct tcphdr *th) 19 + { 20 + #if IS_ENABLED(CONFIG_IPV6) 21 + const struct ipv6hdr *hdr; 22 + struct sk_buff *p; 23 + struct sock *sk; 24 + struct net *net; 25 + int iif, sdif; 26 + 27 + if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) 28 + return; 29 + 30 + p = tcp_gro_lookup(head, th); 31 + if (p) { 32 + NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; 33 + return; 34 + } 35 + 36 + inet6_get_iif_sdif(skb, &iif, &sdif); 37 + hdr = skb_gro_network_header(skb); 38 + net = dev_net(skb->dev); 39 + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, 40 + &hdr->saddr, th->source, 41 + &hdr->daddr, ntohs(th->dest), 42 + iif, sdif); 43 + NAPI_GRO_CB(skb)->is_flist = !sk; 44 + if (sk) 45 + sock_put(sk); 46 + #endif /* IS_ENABLED(CONFIG_IPV6) */ 47 + } 48 + 16 49 INDIRECT_CALLABLE_SCOPE 17 50 struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) 18 51 { 52 + struct tcphdr *th; 53 + 19 54 /* Don't bother verifying checksum if we're going to flush anyway. 
*/ 20 55 if (!NAPI_GRO_CB(skb)->flush && 21 56 skb_gro_checksum_validate(skb, IPPROTO_TCP, 22 - ip6_gro_compute_pseudo)) { 23 - NAPI_GRO_CB(skb)->flush = 1; 24 - return NULL; 25 - } 57 + ip6_gro_compute_pseudo)) 58 + goto flush; 26 59 27 - return tcp_gro_receive(head, skb); 60 + th = tcp_gro_pull_header(skb); 61 + if (!th) 62 + goto flush; 63 + 64 + tcp6_check_fraglist_gro(head, skb, th); 65 + 66 + return tcp_gro_receive(head, skb, th); 67 + 68 + flush: 69 + NAPI_GRO_CB(skb)->flush = 1; 70 + return NULL; 28 71 } 29 72 30 73 INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) ··· 75 32 const struct ipv6hdr *iph = ipv6_hdr(skb); 76 33 struct tcphdr *th = tcp_hdr(skb); 77 34 35 + if (unlikely(NAPI_GRO_CB(skb)->is_flist)) { 36 + skb_shinfo(skb)->gso_type |= SKB_GSO_FRAGLIST | SKB_GSO_TCPV6; 37 + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; 38 + 39 + __skb_incr_checksum_unnecessary(skb); 40 + 41 + return 0; 42 + } 43 + 78 44 th->check = ~tcp_v6_check(skb->len - thoff, &iph->saddr, 79 45 &iph->daddr, 0); 80 46 skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV6; 81 47 82 48 tcp_gro_complete(skb); 83 49 return 0; 50 + } 51 + 52 + static void __tcpv6_gso_segment_csum(struct sk_buff *seg, 53 + __be16 *oldport, __be16 newport) 54 + { 55 + struct tcphdr *th; 56 + 57 + if (*oldport == newport) 58 + return; 59 + 60 + th = tcp_hdr(seg); 61 + inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false); 62 + *oldport = newport; 63 + } 64 + 65 + static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs) 66 + { 67 + const struct tcphdr *th; 68 + const struct ipv6hdr *iph; 69 + struct sk_buff *seg; 70 + struct tcphdr *th2; 71 + struct ipv6hdr *iph2; 72 + 73 + seg = segs; 74 + th = tcp_hdr(seg); 75 + iph = ipv6_hdr(seg); 76 + th2 = tcp_hdr(seg->next); 77 + iph2 = ipv6_hdr(seg->next); 78 + 79 + if (!(*(const u32 *)&th->source ^ *(const u32 *)&th2->source) && 80 + ipv6_addr_equal(&iph->saddr, &iph2->saddr) && 81 + 
ipv6_addr_equal(&iph->daddr, &iph2->daddr)) 82 + return segs; 83 + 84 + while ((seg = seg->next)) { 85 + th2 = tcp_hdr(seg); 86 + iph2 = ipv6_hdr(seg); 87 + 88 + iph2->saddr = iph->saddr; 89 + iph2->daddr = iph->daddr; 90 + __tcpv6_gso_segment_csum(seg, &th2->source, th->source); 91 + __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest); 92 + } 93 + 94 + return segs; 95 + } 96 + 97 + static struct sk_buff *__tcp6_gso_segment_list(struct sk_buff *skb, 98 + netdev_features_t features) 99 + { 100 + skb = skb_segment_list(skb, features, skb_mac_header_len(skb)); 101 + if (IS_ERR(skb)) 102 + return skb; 103 + 104 + return __tcpv6_gso_segment_list_csum(skb); 84 105 } 85 106 86 107 static struct sk_buff *tcp6_gso_segment(struct sk_buff *skb, ··· 157 50 158 51 if (!pskb_may_pull(skb, sizeof(*th))) 159 52 return ERR_PTR(-EINVAL); 53 + 54 + if (skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST) 55 + return __tcp6_gso_segment_list(skb, features); 160 56 161 57 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) { 162 58 const struct ipv6hdr *ipv6h = ipv6_hdr(skb);