Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'preserve-msg_zerocopy-with-forwarding'

Willem de Bruijn says:

====================
preserve MSG_ZEROCOPY with forwarding

Avoid false positive copying of zerocopy skb frags when entering the
ingress path if the skb is not queued locally but forwarded.

See patch 1 for more details on the feature.

Patch 2 converts the existing selftest to a pass/fail test and adds
coverage for this new feature.
====================

Link: https://patch.msgid.link/20250630194312.1571410-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+90 -33
-2
net/core/dev.c
··· 5937 5937 } 5938 5938 5939 5939 if (pt_prev) { 5940 - if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) 5941 - goto drop; 5942 5940 *ppt_prev = pt_prev; 5943 5941 } else { 5944 5942 drop:
+6
net/ipv4/ip_input.c
··· 226 226 227 227 static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 228 228 { 229 + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { 230 + __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); 231 + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); 232 + return 0; 233 + } 234 + 229 235 skb_clear_delivery_time(skb); 230 236 __skb_pull(skb, skb_network_header_len(skb)); 231 237
+7
net/ipv6/ip6_input.c
··· 478 478 479 479 static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 480 480 { 481 + if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) { 482 + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 483 + IPSTATS_MIB_INDISCARDS); 484 + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); 485 + return 0; 486 + } 487 + 481 488 skb_clear_delivery_time(skb); 482 489 ip6_protocol_deliver_rcu(net, skb, 0, false); 483 490
+15 -9
tools/testing/selftests/net/msg_zerocopy.c
··· 77 77 static int cfg_cork; 78 78 static bool cfg_cork_mixed; 79 79 static int cfg_cpu = -1; /* default: pin to last cpu */ 80 + static int cfg_expect_zerocopy = -1; 80 81 static int cfg_family = PF_UNSPEC; 81 82 static int cfg_ifindex = 1; 82 83 static int cfg_payload_len; ··· 93 92 static struct sockaddr_storage cfg_dst_addr; 94 93 static struct sockaddr_storage cfg_src_addr; 95 94 95 + static int exitcode; 96 96 static char payload[IP_MAXPACKET]; 97 97 static long packets, bytes, completions, expected_completions; 98 - static int zerocopied = -1; 99 98 static uint32_t next_completion; 100 99 static uint32_t sends_since_notify; 101 100 ··· 445 444 next_completion = hi + 1; 446 445 447 446 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED); 448 - if (zerocopied == -1) 449 - zerocopied = zerocopy; 450 - else if (zerocopied != zerocopy) { 451 - fprintf(stderr, "serr: inconsistent\n"); 452 - zerocopied = zerocopy; 447 + if (cfg_expect_zerocopy != -1 && 448 + cfg_expect_zerocopy != zerocopy) { 449 + fprintf(stderr, "serr: ee_code: %u != expected %u\n", 450 + zerocopy, cfg_expect_zerocopy); 451 + exitcode = 1; 452 + /* suppress repeated messages */ 453 + cfg_expect_zerocopy = zerocopy; 453 454 } 454 455 455 456 if (cfg_verbose >= 2) ··· 574 571 575 572 fprintf(stderr, "tx=%lu (%lu MB) txc=%lu zc=%c\n", 576 573 packets, bytes >> 20, completions, 577 - zerocopied == 1 ? 'y' : 'n'); 574 + cfg_zerocopy && cfg_expect_zerocopy == 1 ? 
'y' : 'n'); 578 575 } 579 576 580 577 static int do_setup_rx(int domain, int type, int protocol) ··· 718 715 719 716 cfg_payload_len = max_payload_len; 720 717 721 - while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) { 718 + while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vzZ:")) != -1) { 722 719 switch (c) { 723 720 case '4': 724 721 if (cfg_family != PF_UNSPEC) ··· 773 770 case 'z': 774 771 cfg_zerocopy = true; 775 772 break; 773 + case 'Z': 774 + cfg_expect_zerocopy = !!atoi(optarg); 775 + break; 776 776 } 777 777 } 778 778 ··· 823 817 else 824 818 error(1, 0, "unknown cfg_test %s", cfg_test); 825 819 826 - return 0; 820 + return exitcode; 827 821 }
+62 -22
tools/testing/selftests/net/msg_zerocopy.sh
··· 6 6 set -e 7 7 8 8 readonly DEV="veth0" 9 + readonly DUMMY_DEV="dummy0" 9 10 readonly DEV_MTU=65535 10 11 readonly BIN="./msg_zerocopy" 11 12 ··· 15 14 readonly NS1="${NSPREFIX}1" 16 15 readonly NS2="${NSPREFIX}2" 17 16 18 - readonly SADDR4='192.168.1.1' 19 - readonly DADDR4='192.168.1.2' 20 - readonly SADDR6='fd::1' 21 - readonly DADDR6='fd::2' 17 + readonly LPREFIX4='192.168.1' 18 + readonly RPREFIX4='192.168.2' 19 + readonly LPREFIX6='fd' 20 + readonly RPREFIX6='fc' 21 + 22 22 23 23 readonly path_sysctl_mem="net.core.optmem_max" 24 24 25 25 # No arguments: automated test 26 26 if [[ "$#" -eq "0" ]]; then 27 - $0 4 tcp -t 1 28 - $0 6 tcp -t 1 29 - $0 4 udp -t 1 30 - $0 6 udp -t 1 31 - echo "OK. All tests passed" 32 - exit 0 27 + ret=0 28 + 29 + $0 4 tcp -t 1 || ret=1 30 + $0 6 tcp -t 1 || ret=1 31 + $0 4 udp -t 1 || ret=1 32 + $0 6 udp -t 1 || ret=1 33 + 34 + [[ "$ret" == "0" ]] && echo "OK. All tests passed" 35 + exit $ret 33 36 fi 34 37 35 38 # Argument parsing ··· 50 45 51 46 # Argument parsing: configure addresses 52 47 if [[ "${IP}" == "4" ]]; then 53 - readonly SADDR="${SADDR4}" 54 - readonly DADDR="${DADDR4}" 48 + readonly SADDR="${LPREFIX4}.1" 49 + readonly DADDR="${LPREFIX4}.2" 50 + readonly DUMMY_ADDR="${RPREFIX4}.1" 51 + readonly DADDR_TXONLY="${RPREFIX4}.2" 52 + readonly MASK="24" 55 53 elif [[ "${IP}" == "6" ]]; then 56 - readonly SADDR="${SADDR6}" 57 - readonly DADDR="${DADDR6}" 54 + readonly SADDR="${LPREFIX6}::1" 55 + readonly DADDR="${LPREFIX6}::2" 56 + readonly DUMMY_ADDR="${RPREFIX6}::1" 57 + readonly DADDR_TXONLY="${RPREFIX6}::2" 58 + readonly MASK="64" 59 + readonly NODAD="nodad" 58 60 else 59 61 echo "Invalid IP version ${IP}" 60 62 exit 1 ··· 101 89 ip link add "${DEV}" mtu "${DEV_MTU}" netns "${NS1}" type veth \ 102 90 peer name "${DEV}" mtu "${DEV_MTU}" netns "${NS2}" 103 91 92 + ip link add "${DUMMY_DEV}" mtu "${DEV_MTU}" netns "${NS2}" type dummy 93 + 104 94 # Bring the devices up 105 95 ip -netns "${NS1}" link set "${DEV}" up 106 
96 ip -netns "${NS2}" link set "${DEV}" up 97 + ip -netns "${NS2}" link set "${DUMMY_DEV}" up 107 98 108 99 # Set fixed MAC addresses on the devices 109 100 ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 110 101 ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 111 102 112 103 # Add fixed IP addresses to the devices 113 - ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" 114 - ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" 115 - ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad 116 - ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad 104 + ip -netns "${NS1}" addr add "${SADDR}/${MASK}" dev "${DEV}" ${NODAD} 105 + ip -netns "${NS2}" addr add "${DADDR}/${MASK}" dev "${DEV}" ${NODAD} 106 + ip -netns "${NS2}" addr add "${DUMMY_ADDR}/${MASK}" dev "${DUMMY_DEV}" ${NODAD} 107 + 108 + ip -netns "${NS1}" route add default via "${DADDR}" dev "${DEV}" 109 + ip -netns "${NS2}" route add default via "${DADDR_TXONLY}" dev "${DUMMY_DEV}" 110 + 111 + ip netns exec "${NS2}" sysctl -wq net.ipv4.ip_forward=1 112 + ip netns exec "${NS2}" sysctl -wq net.ipv6.conf.all.forwarding=1 117 113 118 114 # Optionally disable sg or csum offload to test edge cases 119 115 # ip netns exec "${NS1}" ethtool -K "${DEV}" sg off 120 116 117 + ret=0 118 + 121 119 do_test() { 122 120 local readonly ARGS="$1" 123 121 124 - echo "ipv${IP} ${TXMODE} ${ARGS}" 125 - ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & 122 + # tx-rx test 123 + # packets queued to a local socket are copied, 124 + # sender notification has SO_EE_CODE_ZEROCOPY_COPIED. 
125 + 126 + echo -e "\nipv${IP} ${TXMODE} ${ARGS} tx-rx\n" 127 + ip netns exec "${NS2}" "${BIN}" "-${IP}" -i "${DEV}" -t 2 -C 2 \ 128 + -S "${SADDR}" -D "${DADDR}" ${ARGS} -r "${RXMODE}" & 126 129 sleep 0.2 127 - ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" 130 + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 \ 131 + -S "${SADDR}" -D "${DADDR}" ${ARGS} "${TXMODE}" -Z 0 || ret=1 128 132 wait 133 + 134 + # next test is unconnected tx to dummy0, cannot exercise with tcp 135 + [[ "${TXMODE}" == "tcp" ]] && return 136 + 137 + # tx-only test: send out dummy0 138 + # packets leaving the host are not copied, 139 + # sender notification does not have SO_EE_CODE_ZEROCOPY_COPIED. 140 + 141 + echo -e "\nipv${IP} ${TXMODE} ${ARGS} tx-only\n" 142 + ip netns exec "${NS1}" "${BIN}" "-${IP}" -i "${DEV}" -t 1 -C 3 \ 143 + -S "${SADDR}" -D "${DADDR_TXONLY}" ${ARGS} "${TXMODE}" -Z 1 || ret=1 129 144 } 130 145 131 146 do_test "${EXTRA_ARGS}" 132 147 do_test "-z ${EXTRA_ARGS}" 133 - echo ok 148 + 149 + [[ "$ret" == "0" ]] && echo "OK"