Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ip-improve-tcp-sock-multipath-routing'

Willem de Bruijn says:

====================
ip: improve tcp sock multipath routing

From: Willem de Bruijn <willemb@google.com>

Improve layer 4 multipath hash policy for local tcp connections:

patch 1: Select a source address that matches the nexthop device.
Due to tcp_v4_connect making separate route lookups for saddr
and route, the two can currently be inconsistent.

patch 2: Use all paths when opening multiple local tcp connections to
the same ip address and port.

patch 3: Test the behavior. Extend the fib_tests.sh test suite with a
test that opens many connections and counts SYNs on both egress
devices, for packets matching the source address of each device.

Changelog in the individual patches
====================

Link: https://patch.msgid.link/20250424143549.669426-1-willemdebruijn.kernel@gmail.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+197 -23
+1
include/net/flow.h
··· 39 39 #define FLOWI_FLAG_ANYSRC 0x01 40 40 #define FLOWI_FLAG_KNOWN_NH 0x02 41 41 #define FLOWI_FLAG_L3MDEV_OIF 0x04 42 + #define FLOWI_FLAG_ANY_SPORT 0x08 42 43 __u32 flowic_secid; 43 44 kuid_t flowic_uid; 44 45 __u32 flowic_multipath_hash;
+2 -1
include/net/ip_fib.h
··· 574 574 575 575 int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, 576 576 struct netlink_ext_ack *extack); 577 - void fib_select_multipath(struct fib_result *res, int hash); 577 + void fib_select_multipath(struct fib_result *res, int hash, 578 + const struct flowi4 *fl4); 578 579 void fib_select_path(struct net *net, struct fib_result *res, 579 580 struct flowi4 *fl4, const struct sk_buff *skb); 580 581
+3
include/net/route.h
··· 326 326 if (inet_test_bit(TRANSPARENT, sk)) 327 327 flow_flags |= FLOWI_FLAG_ANYSRC; 328 328 329 + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !sport) 330 + flow_flags |= FLOWI_FLAG_ANY_SPORT; 331 + 329 332 flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), 330 333 ip_sock_rt_scope(sk), protocol, flow_flags, dst, 331 334 src, dport, sport, sk->sk_uid);
+25 -14
net/ipv4/fib_semantics.c
··· 2170 2170 return !!(state & NUD_VALID); 2171 2171 } 2172 2172 2173 - void fib_select_multipath(struct fib_result *res, int hash) 2173 + void fib_select_multipath(struct fib_result *res, int hash, 2174 + const struct flowi4 *fl4) 2174 2175 { 2175 2176 struct fib_info *fi = res->fi; 2176 2177 struct net *net = fi->fib_net; 2177 - bool first = false; 2178 + bool found = false; 2179 + bool use_neigh; 2180 + __be32 saddr; 2178 2181 2179 2182 if (unlikely(res->fi->nh)) { 2180 2183 nexthop_path_fib_result(res, hash); 2181 2184 return; 2182 2185 } 2183 2186 2187 + use_neigh = READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh); 2188 + saddr = fl4 ? fl4->saddr : 0; 2189 + 2184 2190 change_nexthops(fi) { 2185 - if (READ_ONCE(net->ipv4.sysctl_fib_multipath_use_neigh)) { 2186 - if (!fib_good_nh(nexthop_nh)) 2187 - continue; 2188 - if (!first) { 2189 - res->nh_sel = nhsel; 2190 - res->nhc = &nexthop_nh->nh_common; 2191 - first = true; 2192 - } 2191 + if (use_neigh && !fib_good_nh(nexthop_nh)) 2192 + continue; 2193 + 2194 + if (!found) { 2195 + res->nh_sel = nhsel; 2196 + res->nhc = &nexthop_nh->nh_common; 2197 + found = !saddr || nexthop_nh->nh_saddr == saddr; 2193 2198 } 2194 2199 2195 2200 if (hash > atomic_read(&nexthop_nh->fib_nh_upper_bound)) 2196 2201 continue; 2197 2202 2198 - res->nh_sel = nhsel; 2199 - res->nhc = &nexthop_nh->nh_common; 2200 - return; 2203 + if (!saddr || nexthop_nh->nh_saddr == saddr) { 2204 + res->nh_sel = nhsel; 2205 + res->nhc = &nexthop_nh->nh_common; 2206 + return; 2207 + } 2208 + 2209 + if (found) 2210 + return; 2211 + 2201 2212 } endfor_nexthops(fi); 2202 2213 } 2203 2214 #endif ··· 2223 2212 if (fib_info_num_path(res->fi) > 1) { 2224 2213 int h = fib_multipath_hash(net, fl4, skb, NULL); 2225 2214 2226 - fib_select_multipath(res, h); 2215 + fib_select_multipath(res, h, fl4); 2227 2216 } 2228 2217 else 2229 2218 #endif
+11 -4
net/ipv4/route.c
··· 2037 2037 hash_keys.addrs.v4addrs.dst = fl4->daddr; 2038 2038 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) 2039 2039 hash_keys.basic.ip_proto = fl4->flowi4_proto; 2040 - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2041 - hash_keys.ports.src = fl4->fl4_sport; 2040 + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { 2041 + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) 2042 + hash_keys.ports.src = (__force __be16)get_random_u16(); 2043 + else 2044 + hash_keys.ports.src = fl4->fl4_sport; 2045 + } 2042 2046 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2043 2047 hash_keys.ports.dst = fl4->fl4_dport; 2044 2048 ··· 2097 2093 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; 2098 2094 hash_keys.addrs.v4addrs.src = fl4->saddr; 2099 2095 hash_keys.addrs.v4addrs.dst = fl4->daddr; 2100 - hash_keys.ports.src = fl4->fl4_sport; 2096 + if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) 2097 + hash_keys.ports.src = (__force __be16)get_random_u16(); 2098 + else 2099 + hash_keys.ports.src = fl4->fl4_sport; 2101 2100 hash_keys.ports.dst = fl4->fl4_dport; 2102 2101 hash_keys.basic.ip_proto = fl4->flowi4_proto; 2103 2102 } ··· 2161 2154 if (res->fi && fib_info_num_path(res->fi) > 1) { 2162 2155 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); 2163 2156 2164 - fib_select_multipath(res, h); 2157 + fib_select_multipath(res, h, NULL); 2165 2158 IPCB(skb)->flags |= IPSKB_MULTIPATH; 2166 2159 } 2167 2160 #endif
+10 -3
net/ipv6/route.c
··· 2492 2492 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2493 2493 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_FLOWLABEL) 2494 2494 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6); 2495 - if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) 2496 - hash_keys.ports.src = fl6->fl6_sport; 2495 + if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { 2496 + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) 2497 + hash_keys.ports.src = (__force __be16)get_random_u16(); 2498 + else 2499 + hash_keys.ports.src = fl6->fl6_sport; 2500 + } 2497 2501 if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) 2498 2502 hash_keys.ports.dst = fl6->fl6_dport; 2499 2503 ··· 2551 2547 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; 2552 2548 hash_keys.addrs.v6addrs.src = fl6->saddr; 2553 2549 hash_keys.addrs.v6addrs.dst = fl6->daddr; 2554 - hash_keys.ports.src = fl6->fl6_sport; 2550 + if (fl6->flowi6_flags & FLOWI_FLAG_ANY_SPORT) 2551 + hash_keys.ports.src = (__force __be16)get_random_u16(); 2552 + else 2553 + hash_keys.ports.src = fl6->fl6_sport; 2555 2554 hash_keys.ports.dst = fl6->fl6_dport; 2556 2555 hash_keys.basic.ip_proto = fl6->flowi6_proto; 2557 2556 }
+2
net/ipv6/tcp_ipv6.c
··· 267 267 fl6.flowi6_mark = sk->sk_mark; 268 268 fl6.fl6_dport = usin->sin6_port; 269 269 fl6.fl6_sport = inet->inet_sport; 270 + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) 271 + fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; 270 272 fl6.flowi6_uid = sk->sk_uid; 271 273 272 274 opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
+119 -1
tools/testing/selftests/net/fib_tests.sh
··· 11 11 ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric ipv6_route_metrics \ 12 12 ipv4_route_metrics ipv4_route_v6_gw rp_filter ipv4_del_addr \ 13 13 ipv6_del_addr ipv4_mangle ipv6_mangle ipv4_bcast_neigh fib6_gc_test \ 14 - ipv4_mpath_list ipv6_mpath_list" 14 + ipv4_mpath_list ipv6_mpath_list ipv4_mpath_balance ipv6_mpath_balance" 15 15 16 16 VERBOSE=0 17 17 PAUSE_ON_FAIL=no ··· 1083 1083 ip -netns $ns2 addr add 172.16.104.1/24 dev dummy1 1084 1084 1085 1085 set +e 1086 + } 1087 + 1088 + forwarding_cleanup() 1089 + { 1090 + cleanup_ns $ns3 1091 + 1092 + route_cleanup 1093 + } 1094 + 1095 + # extend route_setup with an ns3 reachable through ns2 over both devices 1096 + forwarding_setup() 1097 + { 1098 + forwarding_cleanup 1099 + 1100 + route_setup 1101 + 1102 + setup_ns ns3 1103 + 1104 + ip link add veth5 netns $ns3 type veth peer name veth6 netns $ns2 1105 + ip -netns $ns3 link set veth5 up 1106 + ip -netns $ns2 link set veth6 up 1107 + 1108 + ip -netns $ns3 -4 addr add dev veth5 172.16.105.1/24 1109 + ip -netns $ns2 -4 addr add dev veth6 172.16.105.2/24 1110 + ip -netns $ns3 -4 route add 172.16.100.0/22 via 172.16.105.2 1111 + 1112 + ip -netns $ns3 -6 addr add dev veth5 2001:db8:105::1/64 nodad 1113 + ip -netns $ns2 -6 addr add dev veth6 2001:db8:105::2/64 nodad 1114 + ip -netns $ns3 -6 route add 2001:db8:101::/33 via 2001:db8:105::2 1086 1115 } 1087 1116 1088 1117 # assumption is that basic add of a single path route works ··· 2629 2600 route_cleanup 2630 2601 } 2631 2602 2603 + tc_set_flower_counter__saddr_syn() { 2604 + tc_set_flower_counter $1 $2 $3 "src_ip $4 ip_proto tcp tcp_flags 0x2" 2605 + } 2606 + 2607 + ip_mpath_balance_dep_check() 2608 + { 2609 + if [ ! -x "$(command -v socat)" ]; then 2610 + echo "socat command not found. Skipping test" 2611 + return 1 2612 + fi 2613 + 2614 + if [ ! -x "$(command -v jq)" ]; then 2615 + echo "jq command not found. 
Skipping test" 2616 + return 1 2617 + fi 2618 + } 2619 + 2620 + ip_mpath_balance() { 2621 + local -r ipver=$1 2622 + local -r daddr=$2 2623 + local -r num_conn=20 2624 + 2625 + for i in $(seq 1 $num_conn); do 2626 + ip netns exec $ns3 socat $ipver TCP-LISTEN:8000 STDIO >/dev/null & 2627 + sleep 0.02 2628 + echo -n a | ip netns exec $ns1 socat $ipver STDIO TCP:$daddr:8000 2629 + done 2630 + 2631 + local -r syn0="$(tc_get_flower_counter $ns1 veth1)" 2632 + local -r syn1="$(tc_get_flower_counter $ns1 veth3)" 2633 + local -r syns=$((syn0+syn1)) 2634 + 2635 + [ "$VERBOSE" = "1" ] && echo "multipath: syns seen: ($syn0,$syn1)" 2636 + 2637 + [[ $syns -ge $num_conn ]] && [[ $syn0 -gt 0 ]] && [[ $syn1 -gt 0 ]] 2638 + } 2639 + 2640 + ipv4_mpath_balance_test() 2641 + { 2642 + echo 2643 + echo "IPv4 multipath load balance test" 2644 + 2645 + ip_mpath_balance_dep_check || return 1 2646 + forwarding_setup 2647 + 2648 + $IP route add 172.16.105.1 \ 2649 + nexthop via 172.16.101.2 \ 2650 + nexthop via 172.16.103.2 2651 + 2652 + ip netns exec $ns1 \ 2653 + sysctl -q -w net.ipv4.fib_multipath_hash_policy=1 2654 + 2655 + tc_set_flower_counter__saddr_syn $ns1 4 veth1 172.16.101.1 2656 + tc_set_flower_counter__saddr_syn $ns1 4 veth3 172.16.103.1 2657 + 2658 + ip_mpath_balance -4 172.16.105.1 2659 + 2660 + log_test $? 
0 "IPv4 multipath loadbalance" 2661 + 2662 + forwarding_cleanup 2663 + } 2664 + 2665 + ipv6_mpath_balance_test() 2666 + { 2667 + echo 2668 + echo "IPv6 multipath load balance test" 2669 + 2670 + ip_mpath_balance_dep_check || return 1 2671 + forwarding_setup 2672 + 2673 + $IP route add 2001:db8:105::1\ 2674 + nexthop via 2001:db8:101::2 \ 2675 + nexthop via 2001:db8:103::2 2676 + 2677 + ip netns exec $ns1 \ 2678 + sysctl -q -w net.ipv6.fib_multipath_hash_policy=1 2679 + 2680 + tc_set_flower_counter__saddr_syn $ns1 6 veth1 2001:db8:101::1 2681 + tc_set_flower_counter__saddr_syn $ns1 6 veth3 2001:db8:103::1 2682 + 2683 + ip_mpath_balance -6 "[2001:db8:105::1]" 2684 + 2685 + log_test $? 0 "IPv6 multipath loadbalance" 2686 + 2687 + forwarding_cleanup 2688 + } 2689 + 2632 2690 ################################################################################ 2633 2691 # usage 2634 2692 ··· 2799 2683 fib6_gc_test|ipv6_gc) fib6_gc_test;; 2800 2684 ipv4_mpath_list) ipv4_mpath_list_test;; 2801 2685 ipv6_mpath_list) ipv6_mpath_list_test;; 2686 + ipv4_mpath_balance) ipv4_mpath_balance_test;; 2687 + ipv6_mpath_balance) ipv6_mpath_balance_test;; 2802 2688 2803 2689 help) echo "Test names: $TESTS"; exit 0;; 2804 2690 esac
+24
tools/testing/selftests/net/lib.sh
··· 270 270 .options.actions[0].stats$selector" 271 271 } 272 272 273 + # attach a qdisc with two children match/no-match and a flower filter to match 274 + tc_set_flower_counter() { 275 + local -r ns=$1 276 + local -r ipver=$2 277 + local -r dev=$3 278 + local -r flower_expr=$4 279 + 280 + tc -n $ns qdisc add dev $dev root handle 1: prio bands 2 \ 281 + priomap 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 282 + 283 + tc -n $ns qdisc add dev $dev parent 1:1 handle 11: pfifo 284 + tc -n $ns qdisc add dev $dev parent 1:2 handle 12: pfifo 285 + 286 + tc -n $ns filter add dev $dev parent 1: protocol ipv$ipver \ 287 + flower $flower_expr classid 1:2 288 + } 289 + 290 + tc_get_flower_counter() { 291 + local -r ns=$1 292 + local -r dev=$2 293 + 294 + tc -n $ns -j -s qdisc show dev $dev handle 12: | jq .[0].packets 295 + } 296 + 273 297 ret_set_ksft_status() 274 298 { 275 299 local ksft_status=$1; shift