Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

netfilter: flowtable: Add IPIP rx sw acceleration

Introduce sw acceleration for rx path of IPIP tunnels relying on the
netfilter flowtable infrastructure. Subsequent patches will add sw
acceleration for IPIP tunnels tx path.
This series introduces basic infrastructure to accelerate other tunnel
types (e.g. IP6IP6).
IPIP rx sw acceleration can be tested by running the following scenario where
the traffic is forwarded between two NICs (eth0 and eth1) and an IPIP
tunnel is used to access a remote site (using eth1 as the underlay device):

ETH0 -- TUN0 <==> ETH1 -- [IP network] -- TUN1 (192.168.100.2)

$ip addr show
6: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:00:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet 192.168.0.2/24 scope global eth0
valid_lft forever preferred_lft forever
7: eth1: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc noqueue state UP group default qlen 1000
link/ether 00:11:22:33:11:55 brd ff:ff:ff:ff:ff:ff
inet 192.168.1.1/24 scope global eth1
valid_lft forever preferred_lft forever
8: tun0@NONE: <POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1480 qdisc noqueue state UNKNOWN group default qlen 1000
link/ipip 192.168.1.1 peer 192.168.1.2
inet 192.168.100.1/24 scope global tun0
valid_lft forever preferred_lft forever

$ip route show
default via 192.168.100.2 dev tun0
192.168.0.0/24 dev eth0 proto kernel scope link src 192.168.0.2
192.168.1.0/24 dev eth1 proto kernel scope link src 192.168.1.1
192.168.100.0/24 dev tun0 proto kernel scope link src 192.168.100.1

$nft list ruleset
table inet filter {
flowtable ft {
hook ingress priority filter
devices = { eth0, eth1 }
}

chain forward {
type filter hook forward priority filter; policy accept;
meta l4proto { tcp, udp } flow add @ft
}
}

Reproducing the scenario described above using veths I got the following
results:
- TCP stream received from the IPIP tunnel:
- net-next: (baseline) ~ 71Gbps
- net-next + IPIP flowtable support: ~101Gbps

Signed-off-by: Lorenzo Bianconi <lorenzo@kernel.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Lorenzo Bianconi and committed by
Pablo Neira Ayuso
ab427db1 a0d98b64

+153 -13
+13
include/linux/netdevice.h
··· 877 877 DEV_PATH_PPPOE, 878 878 DEV_PATH_DSA, 879 879 DEV_PATH_MTK_WDMA, 880 + DEV_PATH_TUN, 880 881 }; 881 882 882 883 struct net_device_path { ··· 889 888 __be16 proto; 890 889 u8 h_dest[ETH_ALEN]; 891 890 } encap; 891 + struct { 892 + union { 893 + struct in_addr src_v4; 894 + struct in6_addr src_v6; 895 + }; 896 + union { 897 + struct in_addr dst_v4; 898 + struct in6_addr dst_v6; 899 + }; 900 + 901 + u8 l3_proto; 902 + } tun; 892 903 struct { 893 904 enum { 894 905 DEV_PATH_BR_VLAN_KEEP,
+18
include/net/netfilter/nf_flow_table.h
··· 107 107 108 108 #define NF_FLOW_TABLE_ENCAP_MAX 2 109 109 110 + struct flow_offload_tunnel { 111 + union { 112 + struct in_addr src_v4; 113 + struct in6_addr src_v6; 114 + }; 115 + union { 116 + struct in_addr dst_v4; 117 + struct in6_addr dst_v6; 118 + }; 119 + 120 + u8 l3_proto; 121 + }; 122 + 110 123 struct flow_offload_tuple { 111 124 union { 112 125 struct in_addr src_v4; ··· 143 130 __be16 proto; 144 131 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 145 132 133 + struct flow_offload_tunnel tun; 134 + 146 135 /* All members above are keys for lookups, see flow_offload_hash(). */ 147 136 struct { } __hash; 148 137 149 138 u8 dir:2, 150 139 xmit_type:3, 151 140 encap_num:2, 141 + tun_num:2, 152 142 in_vlan_ingress:2; 153 143 u16 mtu; 154 144 union { ··· 222 206 u16 id; 223 207 __be16 proto; 224 208 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 209 + struct flow_offload_tunnel tun; 225 210 u8 num_encaps:2, 211 + num_tuns:2, 226 212 ingress_vlans:2; 227 213 } in; 228 214 struct {
+25
net/ipv4/ipip.c
··· 353 353 return ip_tunnel_ctl(dev, p, cmd); 354 354 } 355 355 356 + static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, 357 + struct net_device_path *path) 358 + { 359 + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); 360 + const struct iphdr *tiph = &tunnel->parms.iph; 361 + struct rtable *rt; 362 + 363 + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, 364 + RT_SCOPE_UNIVERSE); 365 + if (IS_ERR(rt)) 366 + return PTR_ERR(rt); 367 + 368 + path->type = DEV_PATH_TUN; 369 + path->tun.src_v4.s_addr = tiph->saddr; 370 + path->tun.dst_v4.s_addr = tiph->daddr; 371 + path->tun.l3_proto = IPPROTO_IPIP; 372 + path->dev = ctx->dev; 373 + 374 + ctx->dev = rt->dst.dev; 375 + ip_rt_put(rt); 376 + 377 + return 0; 378 + } 379 + 356 380 static const struct net_device_ops ipip_netdev_ops = { 357 381 .ndo_init = ipip_tunnel_init, 358 382 .ndo_uninit = ip_tunnel_uninit, ··· 386 362 .ndo_get_stats64 = dev_get_tstats64, 387 363 .ndo_get_iflink = ip_tunnel_get_iflink, 388 364 .ndo_tunnel_ctl = ipip_tunnel_ctl, 365 + .ndo_fill_forward_path = ipip_fill_forward_path, 389 366 }; 390 367 391 368 #define IPIP_FEATURES (NETIF_F_SG | \
+3
net/netfilter/nf_flow_table_core.c
··· 118 118 flow_tuple->in_vlan_ingress |= BIT(j); 119 119 j++; 120 120 } 121 + 122 + flow_tuple->tun = route->tuple[dir].in.tun; 121 123 flow_tuple->encap_num = route->tuple[dir].in.num_encaps; 124 + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; 122 125 123 126 switch (route->tuple[dir].xmit_type) { 124 127 case FLOW_OFFLOAD_XMIT_DIRECT:
+63 -6
net/netfilter/nf_flow_table_ip.c
··· 145 145 static void nf_flow_tuple_encap(struct sk_buff *skb, 146 146 struct flow_offload_tuple *tuple) 147 147 { 148 + __be16 inner_proto = skb->protocol; 148 149 struct vlan_ethhdr *veth; 149 150 struct pppoe_hdr *phdr; 151 + struct iphdr *iph; 152 + u16 offset = 0; 150 153 int i = 0; 151 154 152 155 if (skb_vlan_tag_present(skb)) { ··· 162 159 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 163 160 tuple->encap[i].id = ntohs(veth->h_vlan_TCI); 164 161 tuple->encap[i].proto = skb->protocol; 162 + inner_proto = veth->h_vlan_encapsulated_proto; 163 + offset += VLAN_HLEN; 165 164 break; 166 165 case htons(ETH_P_PPP_SES): 167 166 phdr = (struct pppoe_hdr *)skb_network_header(skb); 168 167 tuple->encap[i].id = ntohs(phdr->sid); 169 168 tuple->encap[i].proto = skb->protocol; 169 + inner_proto = *((__be16 *)(phdr + 1)); 170 + offset += PPPOE_SES_HLEN; 170 171 break; 172 + } 173 + 174 + if (inner_proto == htons(ETH_P_IP)) { 175 + iph = (struct iphdr *)(skb_network_header(skb) + offset); 176 + if (iph->protocol == IPPROTO_IPIP) { 177 + tuple->tun.dst_v4.s_addr = iph->daddr; 178 + tuple->tun.src_v4.s_addr = iph->saddr; 179 + tuple->tun.l3_proto = IPPROTO_IPIP; 180 + } 171 181 } 172 182 } 173 183 ··· 293 277 return NF_STOLEN; 294 278 } 295 279 280 + static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) 281 + { 282 + struct iphdr *iph; 283 + u16 size; 284 + 285 + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) 286 + return false; 287 + 288 + iph = (struct iphdr *)(skb_network_header(skb) + *psize); 289 + size = iph->ihl << 2; 290 + 291 + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) 292 + return false; 293 + 294 + if (iph->ttl <= 1) 295 + return false; 296 + 297 + if (iph->protocol == IPPROTO_IPIP) 298 + *psize += size; 299 + 300 + return true; 301 + } 302 + 303 + static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) 304 + { 305 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 306 + 307 + if (iph->protocol != IPPROTO_IPIP) 
308 + return; 309 + 310 + skb_pull(skb, iph->ihl << 2); 311 + skb_reset_network_header(skb); 312 + } 313 + 296 314 static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, 297 315 u32 *offset) 298 316 { 317 + __be16 inner_proto = skb->protocol; 299 318 struct vlan_ethhdr *veth; 300 - __be16 inner_proto; 319 + bool ret = false; 301 320 302 321 switch (skb->protocol) { 303 322 case htons(ETH_P_8021Q): ··· 342 291 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 343 292 if (veth->h_vlan_encapsulated_proto == proto) { 344 293 *offset += VLAN_HLEN; 345 - return true; 294 + inner_proto = proto; 295 + ret = true; 346 296 } 347 297 break; 348 298 case htons(ETH_P_PPP_SES): 349 299 if (nf_flow_pppoe_proto(skb, &inner_proto) && 350 300 inner_proto == proto) { 351 301 *offset += PPPOE_SES_HLEN; 352 - return true; 302 + ret = true; 353 303 } 354 304 break; 355 305 } 356 306 357 - return false; 307 + if (inner_proto == htons(ETH_P_IP)) 308 + ret = nf_flow_ip4_tunnel_proto(skb, offset); 309 + 310 + return ret; 358 311 } 359 312 360 313 static void nf_flow_encap_pop(struct sk_buff *skb, ··· 386 331 break; 387 332 } 388 333 } 334 + 335 + if (skb->protocol == htons(ETH_P_IP)) 336 + nf_flow_ip4_tunnel_pop(skb); 389 337 } 390 338 391 339 struct nf_flow_xmit { ··· 414 356 { 415 357 struct flow_offload_tuple tuple = {}; 416 358 417 - if (skb->protocol != htons(ETH_P_IP) && 418 - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 359 + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 419 360 return NULL; 420 361 421 362 if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
+31 -7
net/netfilter/nf_flow_table_path.c
··· 80 80 __be16 proto; 81 81 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 82 82 u8 num_encaps; 83 + struct flow_offload_tunnel tun; 84 + u8 num_tuns; 83 85 u8 ingress_vlans; 84 86 u8 h_source[ETH_ALEN]; 85 87 u8 h_dest[ETH_ALEN]; ··· 104 102 case DEV_PATH_DSA: 105 103 case DEV_PATH_VLAN: 106 104 case DEV_PATH_PPPOE: 105 + case DEV_PATH_TUN: 107 106 info->indev = path->dev; 108 107 if (is_zero_ether_addr(info->h_source)) 109 108 memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); ··· 116 113 break; 117 114 } 118 115 119 - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ 120 - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 121 - info->indev = NULL; 122 - break; 116 + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ 117 + if (path->type == DEV_PATH_TUN) { 118 + if (info->num_tuns) { 119 + info->indev = NULL; 120 + break; 121 + } 122 + info->tun.src_v6 = path->tun.src_v6; 123 + info->tun.dst_v6 = path->tun.dst_v6; 124 + info->tun.l3_proto = path->tun.l3_proto; 125 + info->num_tuns++; 126 + } else { 127 + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 128 + info->indev = NULL; 129 + break; 130 + } 131 + info->encap[info->num_encaps].id = 132 + path->encap.id; 133 + info->encap[info->num_encaps].proto = 134 + path->encap.proto; 135 + info->num_encaps++; 123 136 } 124 - info->encap[info->num_encaps].id = path->encap.id; 125 - info->encap[info->num_encaps].proto = path->encap.proto; 126 - info->num_encaps++; 127 137 if (path->type == DEV_PATH_PPPOE) 128 138 memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 129 139 break; ··· 219 203 route->tuple[!dir].in.encap[i].id = info.encap[i].id; 220 204 route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 221 205 } 206 + 207 + if (info.num_tuns) { 208 + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; 209 + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; 210 + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; 211 + route->tuple[!dir].in.num_tuns = info.num_tuns; 212 + } 213 + 222 214 route->tuple[!dir].in.num_encaps 
= info.num_encaps; 223 215 route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 224 216 route->tuple[dir].out.ifindex = info.outdev->ifindex;