Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

geneve: Allow users to specify source port range

Recently, in the case of Cilium, we ran into users on Azure who need to use
tunneling for east/west traffic because they would hit IPAM API limits for
Kubernetes Pods if they had gone with publicly routable IPs for Pods. For
tunneling, Cilium supports either vxlan or geneve. In order to spread flows
among remote CPUs via RSS, both derive a source port hash via
udp_flow_src_port(), which takes the inner packet's skb->hash into account.
For clusters with many nodes, this can then hit a new limitation [0]: "Today,
the Azure networking stack supports 1M total flows (500k inbound and 500k
outbound) for a VM. [...] Once this limit is hit, other connections are
dropped. [...] Each flow is distinguished by a 5-tuple (protocol, local IP
address, remote IP address, local port, and remote port) information. [...]"

For vxlan and geneve, this can create a massive number of UDP flows, which
then run into these limits if stale flows are not evicted fast enough. One
option to mitigate this for vxlan is to narrow the source port range via
IFLA_VXLAN_PORT_RANGE while still being able to benefit from RSS. However,
geneve currently does not have this option and spreads traffic across
the full source port range of [1, USHRT_MAX]. To overcome this limitation
for geneve as well, add an equivalent IFLA_GENEVE_PORT_RANGE setting for users.

Note that struct geneve_config still occupies 2 cachelines on x86-64, both
before and after this change. The low/high members of struct
ifla_geneve_port_range (which is uapi exposed) are of type __be16. While
__u16 would have been perfectly fine for them, the consensus was that it
would be good to be consistent with the existing struct
ifla_vxlan_port_range from a uapi consumer PoV.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://learn.microsoft.com/en-us/azure/virtual-network/virtual-machine-network-throughput [0]
Link: https://patch.msgid.link/20250226182030.89440-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Daniel Borkmann and committed by
Jakub Kicinski
e1f95b19 bf08fd32

+54 -4
+48 -4
drivers/net/geneve.c
··· 57 57 bool ttl_inherit; 58 58 enum ifla_geneve_df df; 59 59 bool inner_proto_inherit; 60 + u16 port_min; 61 + u16 port_max; 60 62 }; 61 63 62 64 /* Pseudo network device */ ··· 837 835 838 836 use_cache = ip_tunnel_dst_cache_usable(skb, info); 839 837 tos = geneve_get_dsfield(skb, dev, info, &use_cache); 840 - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); 838 + sport = udp_flow_src_port(geneve->net, skb, 839 + geneve->cfg.port_min, 840 + geneve->cfg.port_max, true); 841 841 842 842 rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, 843 843 &info->key, ··· 949 945 950 946 use_cache = ip_tunnel_dst_cache_usable(skb, info); 951 947 prio = geneve_get_dsfield(skb, dev, info, &use_cache); 952 - sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true); 948 + sport = udp_flow_src_port(geneve->net, skb, 949 + geneve->cfg.port_min, 950 + geneve->cfg.port_max, true); 953 951 954 952 dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, 955 953 &saddr, key, sport, ··· 1090 1084 use_cache = ip_tunnel_dst_cache_usable(skb, info); 1091 1085 tos = geneve_get_dsfield(skb, dev, info, &use_cache); 1092 1086 sport = udp_flow_src_port(geneve->net, skb, 1093 - 1, USHRT_MAX, true); 1087 + geneve->cfg.port_min, 1088 + geneve->cfg.port_max, true); 1094 1089 1095 1090 rt = udp_tunnel_dst_lookup(skb, dev, geneve->net, 0, &saddr, 1096 1091 &info->key, ··· 1117 1110 use_cache = ip_tunnel_dst_cache_usable(skb, info); 1118 1111 prio = geneve_get_dsfield(skb, dev, info, &use_cache); 1119 1112 sport = udp_flow_src_port(geneve->net, skb, 1120 - 1, USHRT_MAX, true); 1113 + geneve->cfg.port_min, 1114 + geneve->cfg.port_max, true); 1121 1115 1122 1116 dst = udp_tunnel6_dst_lookup(skb, dev, geneve->net, gs6->sock, 0, 1123 1117 &saddr, &info->key, sport, ··· 1242 1234 [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, 1243 1235 [IFLA_GENEVE_DF] = { .type = NLA_U8 }, 1244 1236 [IFLA_GENEVE_INNER_PROTO_INHERIT] = { .type = NLA_FLAG }, 1237 + 
[IFLA_GENEVE_PORT_RANGE] = NLA_POLICY_EXACT_LEN(sizeof(struct ifla_geneve_port_range)), 1245 1238 }; 1246 1239 1247 1240 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], ··· 1284 1275 if (df < 0 || df > GENEVE_DF_MAX) { 1285 1276 NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_DF], 1286 1277 "Invalid DF attribute"); 1278 + return -EINVAL; 1279 + } 1280 + } 1281 + 1282 + if (data[IFLA_GENEVE_PORT_RANGE]) { 1283 + const struct ifla_geneve_port_range *p; 1284 + 1285 + p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); 1286 + if (ntohs(p->high) < ntohs(p->low)) { 1287 + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_GENEVE_PORT_RANGE], 1288 + "Invalid source port range"); 1287 1289 return -EINVAL; 1288 1290 } 1289 1291 } ··· 1526 1506 info->key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]); 1527 1507 } 1528 1508 1509 + if (data[IFLA_GENEVE_PORT_RANGE]) { 1510 + const struct ifla_geneve_port_range *p; 1511 + 1512 + if (changelink) { 1513 + attrtype = IFLA_GENEVE_PORT_RANGE; 1514 + goto change_notsup; 1515 + } 1516 + p = nla_data(data[IFLA_GENEVE_PORT_RANGE]); 1517 + cfg->port_min = ntohs(p->low); 1518 + cfg->port_max = ntohs(p->high); 1519 + } 1520 + 1529 1521 if (data[IFLA_GENEVE_COLLECT_METADATA]) { 1530 1522 if (changelink) { 1531 1523 attrtype = IFLA_GENEVE_COLLECT_METADATA; ··· 1658 1626 .use_udp6_rx_checksums = false, 1659 1627 .ttl_inherit = false, 1660 1628 .collect_md = false, 1629 + .port_min = 1, 1630 + .port_max = USHRT_MAX, 1661 1631 }; 1662 1632 int err; 1663 1633 ··· 1778 1744 nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ 1779 1745 nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ 1780 1746 nla_total_size(0) + /* IFLA_GENEVE_INNER_PROTO_INHERIT */ 1747 + nla_total_size(sizeof(struct ifla_geneve_port_range)) + /* IFLA_GENEVE_PORT_RANGE */ 1781 1748 0; 1782 1749 } 1783 1750 ··· 1788 1753 struct ip_tunnel_info *info = &geneve->cfg.info; 1789 1754 bool ttl_inherit = geneve->cfg.ttl_inherit; 1790 1755 bool metadata = 
geneve->cfg.collect_md; 1756 + struct ifla_geneve_port_range ports = { 1757 + .low = htons(geneve->cfg.port_min), 1758 + .high = htons(geneve->cfg.port_max), 1759 + }; 1791 1760 __u8 tmp_vni[3]; 1792 1761 __u32 vni; 1793 1762 ··· 1848 1809 nla_put_flag(skb, IFLA_GENEVE_INNER_PROTO_INHERIT)) 1849 1810 goto nla_put_failure; 1850 1811 1812 + if (nla_put(skb, IFLA_GENEVE_PORT_RANGE, sizeof(ports), &ports)) 1813 + goto nla_put_failure; 1814 + 1851 1815 return 0; 1852 1816 1853 1817 nla_put_failure: ··· 1883 1841 .use_udp6_rx_checksums = true, 1884 1842 .ttl_inherit = false, 1885 1843 .collect_md = true, 1844 + .port_min = 1, 1845 + .port_max = USHRT_MAX, 1886 1846 }; 1887 1847 1888 1848 memset(tb, 0, sizeof(tb));
+6
include/uapi/linux/if_link.h
··· 1438 1438 IFLA_GENEVE_TTL_INHERIT, 1439 1439 IFLA_GENEVE_DF, 1440 1440 IFLA_GENEVE_INNER_PROTO_INHERIT, 1441 + IFLA_GENEVE_PORT_RANGE, 1441 1442 __IFLA_GENEVE_MAX 1442 1443 }; 1443 1444 #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) ··· 1449 1448 GENEVE_DF_INHERIT, 1450 1449 __GENEVE_DF_END, 1451 1450 GENEVE_DF_MAX = __GENEVE_DF_END - 1, 1451 + }; 1452 + 1453 + struct ifla_geneve_port_range { 1454 + __be16 low; 1455 + __be16 high; 1452 1456 }; 1453 1457 1454 1458 /* Bareudp section */