Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following batch contains Netfilter updates for net-next:

0) Add sanity check for maximum encapsulations in bridge vlan,
reported by the new AI robot.

1) Move the flowtable path discovery code to its own file, the
nft_flow_offload.c mixes the nf_tables evaluation with the path
discovery logic, just split this in two for clarity.

2) Consolidate flowtable xmit path by using dev_queue_xmit() and the
real device behind the layer 2 vlan/pppoe device. This allows the
encapsulation to be inlined. After this update, hw_ifidx can be removed
since both ifidx and hw_ifidx now point to the same device.

3) Support for IPIP encapsulation in the flowtable, extend selftest
to cover this new layer 3 offload, from Lorenzo Bianconi.

4) Push down the skb into the conncount API to fix duplicates in the
conncount list for packets with non-confirmed conntrack entries,
this is due to an optimization introduced in d265929930e2
("netfilter: nf_conncount: reduce unnecessary GC").
From Fernando Fernandez Mancera.

5) In conncount, disable BH when performing garbage collection
to consolidate existing behaviour in the conncount API, also
from Fernando.

6) A matching packet with a confirmed conntrack invokes GC if
conncount reaches the limit in an attempt to release slots.
This allows the existing extensions to be used for real conntrack
counting, not just limiting new connections, from Fernando.

7) Support for updating ct count objects in nf_tables, from Fernando.

8) Extend nft_flowtables.sh selftest to send IPv6 TCP traffic,
from Lorenzo Bianconi.

9) Fixes for UAPI kernel-doc documentation, from Randy Dunlap.

* tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nf_tables: improve UAPI kernel-doc comments
netfilter: ip6t_srh: fix UAPI kernel-doc comments format
selftests: netfilter: nft_flowtable.sh: Add the capability to send IPv6 TCP traffic
netfilter: nft_connlimit: add support to object update operation
netfilter: nft_connlimit: update the count if add was skipped
netfilter: nf_conncount: make nf_conncount_gc_list() to disable BH
netfilter: nf_conncount: rework API to use sk_buff directly
selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
netfilter: flowtable: Add IPIP tx sw acceleration
netfilter: flowtable: Add IPIP rx sw acceleration
netfilter: flowtable: use tuple address to calculate next hop
netfilter: flowtable: remove hw_ifidx
netfilter: flowtable: inline pppoe encapsulation in xmit path
netfilter: flowtable: inline vlan encapsulation in xmit path
netfilter: flowtable: consolidate xmit path
netfilter: flowtable: move path discovery infrastructure to its own file
netfilter: flowtable: check for maximum number of encapsulations in bridge vlan
====================

Link: https://patch.msgid.link/20251128002345.29378-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+983 -462
+13
include/linux/netdevice.h
··· 877 877 DEV_PATH_PPPOE, 878 878 DEV_PATH_DSA, 879 879 DEV_PATH_MTK_WDMA, 880 + DEV_PATH_TUN, 880 881 }; 881 882 882 883 struct net_device_path { ··· 889 888 __be16 proto; 890 889 u8 h_dest[ETH_ALEN]; 891 890 } encap; 891 + struct { 892 + union { 893 + struct in_addr src_v4; 894 + struct in6_addr src_v6; 895 + }; 896 + union { 897 + struct in_addr dst_v4; 898 + struct in6_addr dst_v6; 899 + }; 900 + 901 + u8 l3_proto; 902 + } tun; 892 903 struct { 893 904 enum { 894 905 DEV_PATH_BR_VLAN_KEEP,
+7 -8
include/net/netfilter/nf_conntrack_count.h
··· 18 18 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); 19 19 void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); 20 20 21 - unsigned int nf_conncount_count(struct net *net, 22 - struct nf_conncount_data *data, 23 - const u32 *key, 24 - const struct nf_conntrack_tuple *tuple, 25 - const struct nf_conntrack_zone *zone); 21 + unsigned int nf_conncount_count_skb(struct net *net, 22 + const struct sk_buff *skb, 23 + u16 l3num, 24 + struct nf_conncount_data *data, 25 + const u32 *key); 26 26 27 - int nf_conncount_add(struct net *net, struct nf_conncount_list *list, 28 - const struct nf_conntrack_tuple *tuple, 29 - const struct nf_conntrack_zone *zone); 27 + int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, 28 + u16 l3num, struct nf_conncount_list *list); 30 29 31 30 void nf_conncount_list_init(struct nf_conncount_list *list); 32 31
+25 -1
include/net/netfilter/nf_flow_table.h
··· 107 107 108 108 #define NF_FLOW_TABLE_ENCAP_MAX 2 109 109 110 + struct flow_offload_tunnel { 111 + union { 112 + struct in_addr src_v4; 113 + struct in6_addr src_v6; 114 + }; 115 + union { 116 + struct in_addr dst_v4; 117 + struct in6_addr dst_v6; 118 + }; 119 + 120 + u8 l3_proto; 121 + }; 122 + 110 123 struct flow_offload_tuple { 111 124 union { 112 125 struct in_addr src_v4; ··· 143 130 __be16 proto; 144 131 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 145 132 133 + struct flow_offload_tunnel tun; 134 + 146 135 /* All members above are keys for lookups, see flow_offload_hash(). */ 147 136 struct { } __hash; 148 137 149 138 u8 dir:2, 150 139 xmit_type:3, 151 140 encap_num:2, 141 + tun_num:2, 152 142 in_vlan_ingress:2; 153 143 u16 mtu; 154 144 union { 155 145 struct { 156 146 struct dst_entry *dst_cache; 147 + u32 ifidx; 157 148 u32 dst_cookie; 158 149 }; 159 150 struct { 160 151 u32 ifidx; 161 - u32 hw_ifidx; 162 152 u8 h_source[ETH_ALEN]; 163 153 u8 h_dest[ETH_ALEN]; 164 154 } out; ··· 222 206 u16 id; 223 207 __be16 proto; 224 208 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 209 + struct flow_offload_tunnel tun; 225 210 u8 num_encaps:2, 211 + num_tuns:2, 226 212 ingress_vlans:2; 227 213 } in; 228 214 struct { ··· 239 221 240 222 struct flow_offload *flow_offload_alloc(struct nf_conn *ct); 241 223 void flow_offload_free(struct flow_offload *flow); 224 + 225 + struct nft_flowtable; 226 + struct nft_pktinfo; 227 + int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, 228 + struct nf_flow_route *route, enum ip_conntrack_dir dir, 229 + struct nft_flowtable *ft); 242 230 243 231 static inline int 244 232 nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
+7 -7
include/uapi/linux/netfilter/nf_tables.h
··· 881 881 * enum nft_exthdr_op - nf_tables match options 882 882 * 883 883 * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers 884 - * @NFT_EXTHDR_OP_TCP: match against tcp options 884 + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options 885 885 * @NFT_EXTHDR_OP_IPV4: match against ipv4 options 886 886 * @NFT_EXTHDR_OP_SCTP: match against sctp chunks 887 887 * @NFT_EXTHDR_OP_DCCP: match against dccp otions ··· 1200 1200 #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) 1201 1201 1202 1202 /** 1203 - * enum nft_flow_attributes - ct offload expression attributes 1203 + * enum nft_offload_attributes - ct offload expression attributes 1204 1204 * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) 1205 1205 */ 1206 1206 enum nft_offload_attributes { ··· 1410 1410 }; 1411 1411 1412 1412 /** 1413 - * enum nft_reject_code - Generic reject codes for IPv4/IPv6 1413 + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 1414 1414 * 1415 1415 * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable 1416 1416 * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable ··· 1480 1480 /** 1481 1481 * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes 1482 1482 * 1483 - * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) 1484 - * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) 1485 - * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) 1483 + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) 1484 + * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) 1485 + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) 1486 1486 */ 1487 1487 enum nft_tproxy_attributes { 1488 1488 NFTA_TPROXY_UNSPEC, ··· 1783 1783 #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) 1784 1784 1785 1785 /** 1786 - * enum nft_device_attributes - nf_tables device netlink attributes 1786 + * enum nft_devices_attributes - nf_tables device netlink attributes 
1787 1787 * 1788 1788 * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) 1789 1789 * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING)
+20 -20
include/uapi/linux/netfilter_ipv6/ip6t_srh.h
··· 41 41 42 42 /** 43 43 * struct ip6t_srh - SRH match options 44 - * @ next_hdr: Next header field of SRH 45 - * @ hdr_len: Extension header length field of SRH 46 - * @ segs_left: Segments left field of SRH 47 - * @ last_entry: Last entry field of SRH 48 - * @ tag: Tag field of SRH 49 - * @ mt_flags: match options 50 - * @ mt_invflags: Invert the sense of match options 44 + * @next_hdr: Next header field of SRH 45 + * @hdr_len: Extension header length field of SRH 46 + * @segs_left: Segments left field of SRH 47 + * @last_entry: Last entry field of SRH 48 + * @tag: Tag field of SRH 49 + * @mt_flags: match options 50 + * @mt_invflags: Invert the sense of match options 51 51 */ 52 52 53 53 struct ip6t_srh { ··· 62 62 63 63 /** 64 64 * struct ip6t_srh1 - SRH match options (revision 1) 65 - * @ next_hdr: Next header field of SRH 66 - * @ hdr_len: Extension header length field of SRH 67 - * @ segs_left: Segments left field of SRH 68 - * @ last_entry: Last entry field of SRH 69 - * @ tag: Tag field of SRH 70 - * @ psid_addr: Address of previous SID in SRH SID list 71 - * @ nsid_addr: Address of NEXT SID in SRH SID list 72 - * @ lsid_addr: Address of LAST SID in SRH SID list 73 - * @ psid_msk: Mask of previous SID in SRH SID list 74 - * @ nsid_msk: Mask of next SID in SRH SID list 75 - * @ lsid_msk: MAsk of last SID in SRH SID list 76 - * @ mt_flags: match options 77 - * @ mt_invflags: Invert the sense of match options 65 + * @next_hdr: Next header field of SRH 66 + * @hdr_len: Extension header length field of SRH 67 + * @segs_left: Segments left field of SRH 68 + * @last_entry: Last entry field of SRH 69 + * @tag: Tag field of SRH 70 + * @psid_addr: Address of previous SID in SRH SID list 71 + * @nsid_addr: Address of NEXT SID in SRH SID list 72 + * @lsid_addr: Address of LAST SID in SRH SID list 73 + * @psid_msk: Mask of previous SID in SRH SID list 74 + * @nsid_msk: Mask of next SID in SRH SID list 75 + * @lsid_msk: MAsk of last SID in SRH SID list 76 + * @mt_flags: 
match options 77 + * @mt_invflags: Invert the sense of match options 78 78 */ 79 79 80 80 struct ip6t_srh1 {
+25
net/ipv4/ipip.c
··· 353 353 return ip_tunnel_ctl(dev, p, cmd); 354 354 } 355 355 356 + static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, 357 + struct net_device_path *path) 358 + { 359 + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); 360 + const struct iphdr *tiph = &tunnel->parms.iph; 361 + struct rtable *rt; 362 + 363 + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, 364 + RT_SCOPE_UNIVERSE); 365 + if (IS_ERR(rt)) 366 + return PTR_ERR(rt); 367 + 368 + path->type = DEV_PATH_TUN; 369 + path->tun.src_v4.s_addr = tiph->saddr; 370 + path->tun.dst_v4.s_addr = tiph->daddr; 371 + path->tun.l3_proto = IPPROTO_IPIP; 372 + path->dev = ctx->dev; 373 + 374 + ctx->dev = rt->dst.dev; 375 + ip_rt_put(rt); 376 + 377 + return 0; 378 + } 379 + 356 380 static const struct net_device_ops ipip_netdev_ops = { 357 381 .ndo_init = ipip_tunnel_init, 358 382 .ndo_uninit = ip_tunnel_uninit, ··· 386 362 .ndo_get_stats64 = dev_get_tstats64, 387 363 .ndo_get_iflink = ip_tunnel_get_iflink, 388 364 .ndo_tunnel_ctl = ipip_tunnel_ctl, 365 + .ndo_fill_forward_path = ipip_fill_forward_path, 389 366 }; 390 367 391 368 #define IPIP_FEATURES (NETIF_F_SG | \
+1
net/netfilter/Makefile
··· 141 141 # flow table infrastructure 142 142 obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o 143 143 nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ 144 + nf_flow_table_path.o \ 144 145 nf_flow_table_offload.o nf_flow_table_xdp.o 145 146 nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o 146 147 ifeq ($(CONFIG_NF_FLOW_TABLE),m)
+148 -71
net/netfilter/nf_conncount.c
··· 122 122 return ERR_PTR(-EAGAIN); 123 123 } 124 124 125 - static int __nf_conncount_add(struct net *net, 126 - struct nf_conncount_list *list, 127 - const struct nf_conntrack_tuple *tuple, 128 - const struct nf_conntrack_zone *zone) 125 + static bool get_ct_or_tuple_from_skb(struct net *net, 126 + const struct sk_buff *skb, 127 + u16 l3num, 128 + struct nf_conn **ct, 129 + struct nf_conntrack_tuple *tuple, 130 + const struct nf_conntrack_zone **zone, 131 + bool *refcounted) 129 132 { 133 + const struct nf_conntrack_tuple_hash *h; 134 + enum ip_conntrack_info ctinfo; 135 + struct nf_conn *found_ct; 136 + 137 + found_ct = nf_ct_get(skb, &ctinfo); 138 + if (found_ct && !nf_ct_is_template(found_ct)) { 139 + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 140 + *zone = nf_ct_zone(found_ct); 141 + *ct = found_ct; 142 + return true; 143 + } 144 + 145 + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) 146 + return false; 147 + 148 + if (found_ct) 149 + *zone = nf_ct_zone(found_ct); 150 + 151 + h = nf_conntrack_find_get(net, *zone, tuple); 152 + if (!h) 153 + return true; 154 + 155 + found_ct = nf_ct_tuplehash_to_ctrack(h); 156 + *refcounted = true; 157 + *ct = found_ct; 158 + 159 + return true; 160 + } 161 + 162 + static int __nf_conncount_add(struct net *net, 163 + const struct sk_buff *skb, 164 + u16 l3num, 165 + struct nf_conncount_list *list) 166 + { 167 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 130 168 const struct nf_conntrack_tuple_hash *found; 131 169 struct nf_conncount_tuple *conn, *conn_n; 170 + struct nf_conntrack_tuple tuple; 171 + struct nf_conn *ct = NULL; 132 172 struct nf_conn *found_ct; 133 173 unsigned int collect = 0; 174 + bool refcounted = false; 175 + 176 + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) 177 + return -ENOENT; 178 + 179 + if (ct && nf_ct_is_confirmed(ct)) { 180 + if (refcounted) 181 + nf_ct_put(ct); 182 + return -EEXIST; 183 + } 134 184 135 185 if 
((u32)jiffies == list->last_gc) 136 186 goto add_new_node; ··· 194 144 if (IS_ERR(found)) { 195 145 /* Not found, but might be about to be confirmed */ 196 146 if (PTR_ERR(found) == -EAGAIN) { 197 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 147 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 198 148 nf_ct_zone_id(&conn->zone, conn->zone.dir) == 199 149 nf_ct_zone_id(zone, zone->dir)) 200 - return 0; /* already exists */ 150 + goto out_put; /* already exists */ 201 151 } else { 202 152 collect++; 203 153 } ··· 206 156 207 157 found_ct = nf_ct_tuplehash_to_ctrack(found); 208 158 209 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 159 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 210 160 nf_ct_zone_equal(found_ct, zone, zone->dir)) { 211 161 /* 212 162 * We should not see tuples twice unless someone hooks ··· 215 165 * Attempt to avoid a re-add in this case. 216 166 */ 217 167 nf_ct_put(found_ct); 218 - return 0; 168 + goto out_put; 219 169 } else if (already_closed(found_ct)) { 220 170 /* 221 171 * we do not care about connections which are ··· 238 188 if (conn == NULL) 239 189 return -ENOMEM; 240 190 241 - conn->tuple = *tuple; 191 + conn->tuple = tuple; 242 192 conn->zone = *zone; 243 193 conn->cpu = raw_smp_processor_id(); 244 194 conn->jiffies32 = (u32)jiffies; 245 195 list_add_tail(&conn->node, &list->head); 246 196 list->count++; 247 197 list->last_gc = (u32)jiffies; 198 + 199 + out_put: 200 + if (refcounted) 201 + nf_ct_put(ct); 248 202 return 0; 249 203 } 250 204 251 - int nf_conncount_add(struct net *net, 252 - struct nf_conncount_list *list, 253 - const struct nf_conntrack_tuple *tuple, 254 - const struct nf_conntrack_zone *zone) 205 + int nf_conncount_add_skb(struct net *net, 206 + const struct sk_buff *skb, 207 + u16 l3num, 208 + struct nf_conncount_list *list) 255 209 { 256 210 int ret; 257 211 258 212 /* check the saved connections */ 259 213 spin_lock_bh(&list->list_lock); 260 - ret = __nf_conncount_add(net, list, tuple, zone); 214 + ret = 
__nf_conncount_add(net, skb, l3num, list); 261 215 spin_unlock_bh(&list->list_lock); 262 216 263 217 return ret; 264 218 } 265 - EXPORT_SYMBOL_GPL(nf_conncount_add); 219 + EXPORT_SYMBOL_GPL(nf_conncount_add_skb); 266 220 267 221 void nf_conncount_list_init(struct nf_conncount_list *list) 268 222 { ··· 278 224 EXPORT_SYMBOL_GPL(nf_conncount_list_init); 279 225 280 226 /* Return true if the list is empty. Must be called with BH disabled. */ 281 - bool nf_conncount_gc_list(struct net *net, 282 - struct nf_conncount_list *list) 227 + static bool __nf_conncount_gc_list(struct net *net, 228 + struct nf_conncount_list *list) 283 229 { 284 230 const struct nf_conntrack_tuple_hash *found; 285 231 struct nf_conncount_tuple *conn, *conn_n; ··· 289 235 290 236 /* don't bother if we just did GC */ 291 237 if ((u32)jiffies == READ_ONCE(list->last_gc)) 292 - return false; 293 - 294 - /* don't bother if other cpu is already doing GC */ 295 - if (!spin_trylock(&list->list_lock)) 296 238 return false; 297 239 298 240 list_for_each_entry_safe(conn, conn_n, &list->head, node) { ··· 319 269 if (!list->count) 320 270 ret = true; 321 271 list->last_gc = (u32)jiffies; 322 - spin_unlock(&list->list_lock); 272 + 273 + return ret; 274 + } 275 + 276 + bool nf_conncount_gc_list(struct net *net, 277 + struct nf_conncount_list *list) 278 + { 279 + bool ret; 280 + 281 + /* don't bother if other cpu is already doing GC */ 282 + if (!spin_trylock_bh(&list->list_lock)) 283 + return false; 284 + 285 + ret = __nf_conncount_gc_list(net, list); 286 + spin_unlock_bh(&list->list_lock); 323 287 324 288 return ret; 325 289 } ··· 373 309 374 310 static unsigned int 375 311 insert_tree(struct net *net, 312 + const struct sk_buff *skb, 313 + u16 l3num, 376 314 struct nf_conncount_data *data, 377 315 struct rb_root *root, 378 316 unsigned int hash, 379 - const u32 *key, 380 - const struct nf_conntrack_tuple *tuple, 381 - const struct nf_conntrack_zone *zone) 317 + const u32 *key) 382 318 { 383 319 struct 
nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 384 - struct rb_node **rbnode, *parent; 385 - struct nf_conncount_rb *rbconn; 386 - struct nf_conncount_tuple *conn; 320 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 321 + bool do_gc = true, refcounted = false; 387 322 unsigned int count = 0, gc_count = 0; 388 - bool do_gc = true; 323 + struct rb_node **rbnode, *parent; 324 + struct nf_conntrack_tuple tuple; 325 + struct nf_conncount_tuple *conn; 326 + struct nf_conncount_rb *rbconn; 327 + struct nf_conn *ct = NULL; 389 328 390 329 spin_lock_bh(&nf_conncount_locks[hash]); 391 330 restart: ··· 407 340 } else { 408 341 int ret; 409 342 410 - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); 411 - if (ret) 343 + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); 344 + if (ret && ret != -EEXIST) 412 345 count = 0; /* hotdrop */ 413 346 else 414 347 count = rbconn->list.count; ··· 431 364 goto restart; 432 365 } 433 366 434 - /* expected case: match, insert new node */ 435 - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 436 - if (rbconn == NULL) 437 - goto out_unlock; 367 + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { 368 + /* expected case: match, insert new node */ 369 + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 370 + if (rbconn == NULL) 371 + goto out_unlock; 438 372 439 - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 440 - if (conn == NULL) { 441 - kmem_cache_free(conncount_rb_cachep, rbconn); 442 - goto out_unlock; 373 + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 374 + if (conn == NULL) { 375 + kmem_cache_free(conncount_rb_cachep, rbconn); 376 + goto out_unlock; 377 + } 378 + 379 + conn->tuple = tuple; 380 + conn->zone = *zone; 381 + conn->cpu = raw_smp_processor_id(); 382 + conn->jiffies32 = (u32)jiffies; 383 + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 384 + 385 + nf_conncount_list_init(&rbconn->list); 386 + 
list_add(&conn->node, &rbconn->list.head); 387 + count = 1; 388 + rbconn->list.count = count; 389 + 390 + rb_link_node_rcu(&rbconn->node, parent, rbnode); 391 + rb_insert_color(&rbconn->node, root); 392 + 393 + if (refcounted) 394 + nf_ct_put(ct); 443 395 } 444 - 445 - conn->tuple = *tuple; 446 - conn->zone = *zone; 447 - conn->cpu = raw_smp_processor_id(); 448 - conn->jiffies32 = (u32)jiffies; 449 - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 450 - 451 - nf_conncount_list_init(&rbconn->list); 452 - list_add(&conn->node, &rbconn->list.head); 453 - count = 1; 454 - rbconn->list.count = count; 455 - 456 - rb_link_node_rcu(&rbconn->node, parent, rbnode); 457 - rb_insert_color(&rbconn->node, root); 458 396 out_unlock: 459 397 spin_unlock_bh(&nf_conncount_locks[hash]); 460 398 return count; ··· 467 395 468 396 static unsigned int 469 397 count_tree(struct net *net, 398 + const struct sk_buff *skb, 399 + u16 l3num, 470 400 struct nf_conncount_data *data, 471 - const u32 *key, 472 - const struct nf_conntrack_tuple *tuple, 473 - const struct nf_conntrack_zone *zone) 401 + const u32 *key) 474 402 { 475 403 struct rb_root *root; 476 404 struct rb_node *parent; ··· 494 422 } else { 495 423 int ret; 496 424 497 - if (!tuple) { 425 + if (!skb) { 498 426 nf_conncount_gc_list(net, &rbconn->list); 499 427 return rbconn->list.count; 500 428 } ··· 509 437 } 510 438 511 439 /* same source network -> be counted! 
*/ 512 - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); 440 + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); 513 441 spin_unlock_bh(&rbconn->list.list_lock); 514 - if (ret) 442 + if (ret && ret != -EEXIST) { 515 443 return 0; /* hotdrop */ 516 - else 444 + } else { 445 + /* -EEXIST means add was skipped, update the list */ 446 + if (ret == -EEXIST) 447 + nf_conncount_gc_list(net, &rbconn->list); 517 448 return rbconn->list.count; 449 + } 518 450 } 519 451 } 520 452 521 - if (!tuple) 453 + if (!skb) 522 454 return 0; 523 455 524 - return insert_tree(net, data, root, hash, key, tuple, zone); 456 + return insert_tree(net, skb, l3num, data, root, hash, key); 525 457 } 526 458 527 459 static void tree_gc_worker(struct work_struct *work) ··· 587 511 } 588 512 589 513 /* Count and return number of conntrack entries in 'net' with particular 'key'. 590 - * If 'tuple' is not null, insert it into the accounting data structure. 591 - * Call with RCU read lock. 514 + * If 'skb' is not null, insert the corresponding tuple into the accounting 515 + * data structure. Call with RCU read lock. 592 516 */ 593 - unsigned int nf_conncount_count(struct net *net, 594 - struct nf_conncount_data *data, 595 - const u32 *key, 596 - const struct nf_conntrack_tuple *tuple, 597 - const struct nf_conntrack_zone *zone) 517 + unsigned int nf_conncount_count_skb(struct net *net, 518 + const struct sk_buff *skb, 519 + u16 l3num, 520 + struct nf_conncount_data *data, 521 + const u32 *key) 598 522 { 599 - return count_tree(net, data, key, tuple, zone); 523 + return count_tree(net, skb, l3num, data, key); 524 + 600 525 } 601 - EXPORT_SYMBOL_GPL(nf_conncount_count); 526 + EXPORT_SYMBOL_GPL(nf_conncount_count_skb); 602 527 603 528 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) 604 529 {
+4 -1
net/netfilter/nf_flow_table_core.c
··· 118 118 flow_tuple->in_vlan_ingress |= BIT(j); 119 119 j++; 120 120 } 121 + 122 + flow_tuple->tun = route->tuple[dir].in.tun; 121 123 flow_tuple->encap_num = route->tuple[dir].in.num_encaps; 124 + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; 122 125 123 126 switch (route->tuple[dir].xmit_type) { 124 127 case FLOW_OFFLOAD_XMIT_DIRECT: ··· 130 127 memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, 131 128 ETH_ALEN); 132 129 flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; 133 - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; 134 130 dst_release(dst); 135 131 break; 136 132 case FLOW_OFFLOAD_XMIT_XFRM: 137 133 case FLOW_OFFLOAD_XMIT_NEIGH: 134 + flow_tuple->ifidx = route->tuple[dir].out.ifindex; 138 135 flow_tuple->dst_cache = dst; 139 136 flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); 140 137 break;
+252 -41
net/netfilter/nf_flow_table_ip.c
··· 145 145 static void nf_flow_tuple_encap(struct sk_buff *skb, 146 146 struct flow_offload_tuple *tuple) 147 147 { 148 + __be16 inner_proto = skb->protocol; 148 149 struct vlan_ethhdr *veth; 149 150 struct pppoe_hdr *phdr; 151 + struct iphdr *iph; 152 + u16 offset = 0; 150 153 int i = 0; 151 154 152 155 if (skb_vlan_tag_present(skb)) { ··· 162 159 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 163 160 tuple->encap[i].id = ntohs(veth->h_vlan_TCI); 164 161 tuple->encap[i].proto = skb->protocol; 162 + inner_proto = veth->h_vlan_encapsulated_proto; 163 + offset += VLAN_HLEN; 165 164 break; 166 165 case htons(ETH_P_PPP_SES): 167 166 phdr = (struct pppoe_hdr *)skb_network_header(skb); 168 167 tuple->encap[i].id = ntohs(phdr->sid); 169 168 tuple->encap[i].proto = skb->protocol; 169 + inner_proto = *((__be16 *)(phdr + 1)); 170 + offset += PPPOE_SES_HLEN; 170 171 break; 172 + } 173 + 174 + if (inner_proto == htons(ETH_P_IP)) { 175 + iph = (struct iphdr *)(skb_network_header(skb) + offset); 176 + if (iph->protocol == IPPROTO_IPIP) { 177 + tuple->tun.dst_v4.s_addr = iph->daddr; 178 + tuple->tun.src_v4.s_addr = iph->saddr; 179 + tuple->tun.l3_proto = IPPROTO_IPIP; 180 + } 171 181 } 172 182 } 173 183 ··· 293 277 return NF_STOLEN; 294 278 } 295 279 280 + static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) 281 + { 282 + struct iphdr *iph; 283 + u16 size; 284 + 285 + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) 286 + return false; 287 + 288 + iph = (struct iphdr *)(skb_network_header(skb) + *psize); 289 + size = iph->ihl << 2; 290 + 291 + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) 292 + return false; 293 + 294 + if (iph->ttl <= 1) 295 + return false; 296 + 297 + if (iph->protocol == IPPROTO_IPIP) 298 + *psize += size; 299 + 300 + return true; 301 + } 302 + 303 + static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) 304 + { 305 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 306 + 307 + if (iph->protocol != IPPROTO_IPIP) 
308 + return; 309 + 310 + skb_pull(skb, iph->ihl << 2); 311 + skb_reset_network_header(skb); 312 + } 313 + 296 314 static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, 297 315 u32 *offset) 298 316 { 317 + __be16 inner_proto = skb->protocol; 299 318 struct vlan_ethhdr *veth; 300 - __be16 inner_proto; 319 + bool ret = false; 301 320 302 321 switch (skb->protocol) { 303 322 case htons(ETH_P_8021Q): ··· 342 291 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 343 292 if (veth->h_vlan_encapsulated_proto == proto) { 344 293 *offset += VLAN_HLEN; 345 - return true; 294 + inner_proto = proto; 295 + ret = true; 346 296 } 347 297 break; 348 298 case htons(ETH_P_PPP_SES): 349 299 if (nf_flow_pppoe_proto(skb, &inner_proto) && 350 300 inner_proto == proto) { 351 301 *offset += PPPOE_SES_HLEN; 352 - return true; 302 + ret = true; 353 303 } 354 304 break; 355 305 } 356 306 357 - return false; 307 + if (inner_proto == htons(ETH_P_IP)) 308 + ret = nf_flow_ip4_tunnel_proto(skb, offset); 309 + 310 + return ret; 358 311 } 359 312 360 313 static void nf_flow_encap_pop(struct sk_buff *skb, ··· 386 331 break; 387 332 } 388 333 } 334 + 335 + if (skb->protocol == htons(ETH_P_IP)) 336 + nf_flow_ip4_tunnel_pop(skb); 389 337 } 390 338 339 + struct nf_flow_xmit { 340 + const void *dest; 341 + const void *source; 342 + struct net_device *outdev; 343 + }; 344 + 391 345 static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, 392 - const struct flow_offload_tuple_rhash *tuplehash, 393 - unsigned short type) 346 + struct nf_flow_xmit *xmit) 394 347 { 395 - struct net_device *outdev; 396 - 397 - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); 398 - if (!outdev) 399 - return NF_DROP; 400 - 401 - skb->dev = outdev; 402 - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, 403 - tuplehash->tuple.out.h_source, skb->len); 348 + skb->dev = xmit->outdev; 349 + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), 350 + xmit->dest, 
xmit->source, skb->len); 404 351 dev_queue_xmit(skb); 405 352 406 353 return NF_STOLEN; ··· 414 357 { 415 358 struct flow_offload_tuple tuple = {}; 416 359 417 - if (skb->protocol != htons(ETH_P_IP) && 418 - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 360 + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 419 361 return NULL; 420 362 421 363 if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) ··· 437 381 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 438 382 439 383 mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; 384 + if (flow->tuplehash[!dir].tuple.tun_num) 385 + mtu -= sizeof(*iph); 386 + 440 387 if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) 441 388 return 0; 442 389 ··· 473 414 return 1; 474 415 } 475 416 417 + static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) 418 + { 419 + int data_len = skb->len + sizeof(__be16); 420 + struct ppp_hdr { 421 + struct pppoe_hdr hdr; 422 + __be16 proto; 423 + } *ph; 424 + __be16 proto; 425 + 426 + if (skb_cow_head(skb, PPPOE_SES_HLEN)) 427 + return -1; 428 + 429 + switch (skb->protocol) { 430 + case htons(ETH_P_IP): 431 + proto = htons(PPP_IP); 432 + break; 433 + case htons(ETH_P_IPV6): 434 + proto = htons(PPP_IPV6); 435 + break; 436 + default: 437 + return -1; 438 + } 439 + 440 + __skb_push(skb, PPPOE_SES_HLEN); 441 + skb_reset_network_header(skb); 442 + 443 + ph = (struct ppp_hdr *)(skb->data); 444 + ph->hdr.ver = 1; 445 + ph->hdr.type = 1; 446 + ph->hdr.code = 0; 447 + ph->hdr.sid = htons(id); 448 + ph->hdr.length = htons(data_len); 449 + ph->proto = proto; 450 + skb->protocol = htons(ETH_P_PPP_SES); 451 + 452 + return 0; 453 + } 454 + 455 + static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, 456 + struct flow_offload_tuple *tuple, 457 + __be32 *ip_daddr) 458 + { 459 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 460 + struct rtable *rt = dst_rtable(tuple->dst_cache); 461 + u8 tos = iph->tos, ttl = iph->ttl; 462 + __be16 
frag_off = iph->frag_off; 463 + u32 headroom = sizeof(*iph); 464 + int err; 465 + 466 + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); 467 + if (err) 468 + return err; 469 + 470 + skb_set_inner_ipproto(skb, IPPROTO_IPIP); 471 + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 472 + err = skb_cow_head(skb, headroom); 473 + if (err) 474 + return err; 475 + 476 + skb_scrub_packet(skb, true); 477 + skb_clear_hash_if_not_l4(skb); 478 + 479 + /* Push down and install the IP header. */ 480 + skb_push(skb, sizeof(*iph)); 481 + skb_reset_network_header(skb); 482 + 483 + iph = ip_hdr(skb); 484 + iph->version = 4; 485 + iph->ihl = sizeof(*iph) >> 2; 486 + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; 487 + iph->protocol = tuple->tun.l3_proto; 488 + iph->tos = tos; 489 + iph->daddr = tuple->tun.src_v4.s_addr; 490 + iph->saddr = tuple->tun.dst_v4.s_addr; 491 + iph->ttl = ttl; 492 + iph->tot_len = htons(skb->len); 493 + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); 494 + ip_send_check(iph); 495 + 496 + *ip_daddr = tuple->tun.src_v4.s_addr; 497 + 498 + return 0; 499 + } 500 + 501 + static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, 502 + struct flow_offload_tuple *tuple, 503 + __be32 *ip_daddr) 504 + { 505 + if (tuple->tun_num) 506 + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); 507 + 508 + return 0; 509 + } 510 + 511 + static int nf_flow_encap_push(struct sk_buff *skb, 512 + struct flow_offload_tuple *tuple) 513 + { 514 + int i; 515 + 516 + for (i = 0; i < tuple->encap_num; i++) { 517 + switch (tuple->encap[i].proto) { 518 + case htons(ETH_P_8021Q): 519 + case htons(ETH_P_8021AD): 520 + if (skb_vlan_push(skb, tuple->encap[i].proto, 521 + tuple->encap[i].id) < 0) 522 + return -1; 523 + break; 524 + case htons(ETH_P_PPP_SES): 525 + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) 526 + return -1; 527 + break; 528 + } 529 + } 530 + 531 + return 0; 532 + } 533 + 476 534 unsigned int 477 535 
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, 478 536 const struct nf_hook_state *state) 479 537 { 480 538 struct flow_offload_tuple_rhash *tuplehash; 481 539 struct nf_flowtable *flow_table = priv; 540 + struct flow_offload_tuple *other_tuple; 482 541 enum flow_offload_tuple_dir dir; 483 542 struct nf_flowtable_ctx ctx = { 484 543 .in = state->in, 485 544 }; 545 + struct nf_flow_xmit xmit = {}; 486 546 struct flow_offload *flow; 487 - struct net_device *outdev; 547 + struct neighbour *neigh; 488 548 struct rtable *rt; 489 - __be32 nexthop; 549 + __be32 ip_daddr; 490 550 int ret; 491 551 492 552 tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); ··· 628 450 629 451 dir = tuplehash->tuple.dir; 630 452 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 453 + other_tuple = &flow->tuplehash[!dir].tuple; 454 + ip_daddr = other_tuple->src_v4.s_addr; 455 + 456 + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) 457 + return NF_DROP; 458 + 459 + if (nf_flow_encap_push(skb, other_tuple) < 0) 460 + return NF_DROP; 631 461 632 462 switch (tuplehash->tuple.xmit_type) { 633 463 case FLOW_OFFLOAD_XMIT_NEIGH: 634 464 rt = dst_rtable(tuplehash->tuple.dst_cache); 635 - outdev = rt->dst.dev; 636 - skb->dev = outdev; 637 - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); 465 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); 466 + if (!xmit.outdev) { 467 + flow_offload_teardown(flow); 468 + return NF_DROP; 469 + } 470 + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); 471 + if (IS_ERR(neigh)) { 472 + flow_offload_teardown(flow); 473 + return NF_DROP; 474 + } 475 + xmit.dest = neigh->ha; 638 476 skb_dst_set_noref(skb, &rt->dst); 639 - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); 640 - ret = NF_STOLEN; 641 477 break; 642 478 case FLOW_OFFLOAD_XMIT_DIRECT: 643 - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); 644 - if (ret == NF_DROP) 479 + 
xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); 480 + if (!xmit.outdev) { 645 481 flow_offload_teardown(flow); 482 + return NF_DROP; 483 + } 484 + xmit.dest = tuplehash->tuple.out.h_dest; 485 + xmit.source = tuplehash->tuple.out.h_source; 646 486 break; 647 487 default: 648 488 WARN_ON_ONCE(1); 649 - ret = NF_DROP; 650 - break; 489 + return NF_DROP; 651 490 } 652 491 653 - return ret; 492 + return nf_flow_queue_xmit(state->net, skb, &xmit); 654 493 } 655 494 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); 656 495 ··· 910 715 { 911 716 struct flow_offload_tuple_rhash *tuplehash; 912 717 struct nf_flowtable *flow_table = priv; 718 + struct flow_offload_tuple *other_tuple; 913 719 enum flow_offload_tuple_dir dir; 914 720 struct nf_flowtable_ctx ctx = { 915 721 .in = state->in, 916 722 }; 917 - const struct in6_addr *nexthop; 723 + struct nf_flow_xmit xmit = {}; 724 + struct in6_addr *ip6_daddr; 918 725 struct flow_offload *flow; 919 - struct net_device *outdev; 726 + struct neighbour *neigh; 920 727 struct rt6_info *rt; 921 728 int ret; 922 729 ··· 942 745 943 746 dir = tuplehash->tuple.dir; 944 747 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 748 + other_tuple = &flow->tuplehash[!dir].tuple; 749 + ip6_daddr = &other_tuple->src_v6; 750 + 751 + if (nf_flow_encap_push(skb, other_tuple) < 0) 752 + return NF_DROP; 945 753 946 754 switch (tuplehash->tuple.xmit_type) { 947 755 case FLOW_OFFLOAD_XMIT_NEIGH: 948 756 rt = dst_rt6_info(tuplehash->tuple.dst_cache); 949 - outdev = rt->dst.dev; 950 - skb->dev = outdev; 951 - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); 757 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); 758 + if (!xmit.outdev) { 759 + flow_offload_teardown(flow); 760 + return NF_DROP; 761 + } 762 + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); 763 + if (IS_ERR(neigh)) { 764 + flow_offload_teardown(flow); 765 + return NF_DROP; 766 + } 767 + xmit.dest = 
neigh->ha; 952 768 skb_dst_set_noref(skb, &rt->dst); 953 - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); 954 - ret = NF_STOLEN; 955 769 break; 956 770 case FLOW_OFFLOAD_XMIT_DIRECT: 957 - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); 958 - if (ret == NF_DROP) 771 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); 772 + if (!xmit.outdev) { 959 773 flow_offload_teardown(flow); 774 + return NF_DROP; 775 + } 776 + xmit.dest = tuplehash->tuple.out.h_dest; 777 + xmit.source = tuplehash->tuple.out.h_source; 960 778 break; 961 779 default: 962 780 WARN_ON_ONCE(1); 963 - ret = NF_DROP; 964 - break; 781 + return NF_DROP; 965 782 } 966 783 967 - return ret; 784 + return nf_flow_queue_xmit(state->net, skb, &xmit); 968 785 } 969 786 EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
+1 -1
net/netfilter/nf_flow_table_offload.c
··· 555 555 switch (this_tuple->xmit_type) { 556 556 case FLOW_OFFLOAD_XMIT_DIRECT: 557 557 this_tuple = &flow->tuplehash[dir].tuple; 558 - ifindex = this_tuple->out.hw_ifidx; 558 + ifindex = this_tuple->out.ifidx; 559 559 break; 560 560 case FLOW_OFFLOAD_XMIT_NEIGH: 561 561 other_tuple = &flow->tuplehash[!dir].tuple;
+330
net/netfilter/nf_flow_table_path.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <linux/kernel.h> 3 + #include <linux/module.h> 4 + #include <linux/init.h> 5 + #include <linux/netlink.h> 6 + #include <linux/netfilter.h> 7 + #include <linux/spinlock.h> 8 + #include <linux/netfilter/nf_conntrack_common.h> 9 + #include <linux/netfilter/nf_tables.h> 10 + #include <net/ip.h> 11 + #include <net/inet_dscp.h> 12 + #include <net/netfilter/nf_tables.h> 13 + #include <net/netfilter/nf_tables_core.h> 14 + #include <net/netfilter/nf_conntrack_core.h> 15 + #include <net/netfilter/nf_conntrack_extend.h> 16 + #include <net/netfilter/nf_flow_table.h> 17 + 18 + static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) 19 + { 20 + if (dst_xfrm(dst)) 21 + return FLOW_OFFLOAD_XMIT_XFRM; 22 + 23 + return FLOW_OFFLOAD_XMIT_NEIGH; 24 + } 25 + 26 + static void nft_default_forward_path(struct nf_flow_route *route, 27 + struct dst_entry *dst_cache, 28 + enum ip_conntrack_dir dir) 29 + { 30 + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; 31 + route->tuple[dir].dst = dst_cache; 32 + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); 33 + } 34 + 35 + static bool nft_is_valid_ether_device(const struct net_device *dev) 36 + { 37 + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || 38 + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) 39 + return false; 40 + 41 + return true; 42 + } 43 + 44 + static int nft_dev_fill_forward_path(const struct nf_flow_route *route, 45 + const struct dst_entry *dst_cache, 46 + const struct nf_conn *ct, 47 + enum ip_conntrack_dir dir, u8 *ha, 48 + struct net_device_path_stack *stack) 49 + { 50 + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; 51 + struct net_device *dev = dst_cache->dev; 52 + struct neighbour *n; 53 + u8 nud_state; 54 + 55 + if (!nft_is_valid_ether_device(dev)) 56 + goto out; 57 + 58 + n = dst_neigh_lookup(dst_cache, daddr); 59 + if (!n) 60 + return -1; 61 + 62 + read_lock_bh(&n->lock); 63 + 
nud_state = n->nud_state; 64 + ether_addr_copy(ha, n->ha); 65 + read_unlock_bh(&n->lock); 66 + neigh_release(n); 67 + 68 + if (!(nud_state & NUD_VALID)) 69 + return -1; 70 + 71 + out: 72 + return dev_fill_forward_path(dev, ha, stack); 73 + } 74 + 75 + struct nft_forward_info { 76 + const struct net_device *indev; 77 + const struct net_device *outdev; 78 + struct id { 79 + __u16 id; 80 + __be16 proto; 81 + } encap[NF_FLOW_TABLE_ENCAP_MAX]; 82 + u8 num_encaps; 83 + struct flow_offload_tunnel tun; 84 + u8 num_tuns; 85 + u8 ingress_vlans; 86 + u8 h_source[ETH_ALEN]; 87 + u8 h_dest[ETH_ALEN]; 88 + enum flow_offload_xmit_type xmit_type; 89 + }; 90 + 91 + static void nft_dev_path_info(const struct net_device_path_stack *stack, 92 + struct nft_forward_info *info, 93 + unsigned char *ha, struct nf_flowtable *flowtable) 94 + { 95 + const struct net_device_path *path; 96 + int i; 97 + 98 + memcpy(info->h_dest, ha, ETH_ALEN); 99 + 100 + for (i = 0; i < stack->num_paths; i++) { 101 + path = &stack->path[i]; 102 + switch (path->type) { 103 + case DEV_PATH_ETHERNET: 104 + case DEV_PATH_DSA: 105 + case DEV_PATH_VLAN: 106 + case DEV_PATH_PPPOE: 107 + case DEV_PATH_TUN: 108 + info->indev = path->dev; 109 + if (is_zero_ether_addr(info->h_source)) 110 + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 111 + 112 + if (path->type == DEV_PATH_ETHERNET) 113 + break; 114 + if (path->type == DEV_PATH_DSA) { 115 + i = stack->num_paths; 116 + break; 117 + } 118 + 119 + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ 120 + if (path->type == DEV_PATH_TUN) { 121 + if (info->num_tuns) { 122 + info->indev = NULL; 123 + break; 124 + } 125 + info->tun.src_v6 = path->tun.src_v6; 126 + info->tun.dst_v6 = path->tun.dst_v6; 127 + info->tun.l3_proto = path->tun.l3_proto; 128 + info->num_tuns++; 129 + } else { 130 + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 131 + info->indev = NULL; 132 + break; 133 + } 134 + info->encap[info->num_encaps].id = 135 + path->encap.id; 136 + 
info->encap[info->num_encaps].proto = 137 + path->encap.proto; 138 + info->num_encaps++; 139 + } 140 + if (path->type == DEV_PATH_PPPOE) 141 + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 142 + break; 143 + case DEV_PATH_BRIDGE: 144 + if (is_zero_ether_addr(info->h_source)) 145 + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 146 + 147 + switch (path->bridge.vlan_mode) { 148 + case DEV_PATH_BR_VLAN_UNTAG_HW: 149 + info->ingress_vlans |= BIT(info->num_encaps - 1); 150 + break; 151 + case DEV_PATH_BR_VLAN_TAG: 152 + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 153 + info->indev = NULL; 154 + break; 155 + } 156 + info->encap[info->num_encaps].id = path->bridge.vlan_id; 157 + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; 158 + info->num_encaps++; 159 + break; 160 + case DEV_PATH_BR_VLAN_UNTAG: 161 + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { 162 + info->indev = NULL; 163 + break; 164 + } 165 + break; 166 + case DEV_PATH_BR_VLAN_KEEP: 167 + break; 168 + } 169 + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 170 + break; 171 + default: 172 + info->indev = NULL; 173 + break; 174 + } 175 + } 176 + info->outdev = info->indev; 177 + 178 + if (nf_flowtable_hw_offload(flowtable) && 179 + nft_is_valid_ether_device(info->indev)) 180 + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 181 + } 182 + 183 + static bool nft_flowtable_find_dev(const struct net_device *dev, 184 + struct nft_flowtable *ft) 185 + { 186 + struct nft_hook *hook; 187 + bool found = false; 188 + 189 + list_for_each_entry_rcu(hook, &ft->hook_list, list) { 190 + if (!nft_hook_find_ops_rcu(hook, dev)) 191 + continue; 192 + 193 + found = true; 194 + break; 195 + } 196 + 197 + return found; 198 + } 199 + 200 + static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, 201 + struct flow_offload_tunnel *tun, 202 + struct nf_flow_route *route, 203 + enum ip_conntrack_dir dir) 204 + { 205 + struct dst_entry *cur_dst = route->tuple[dir].dst; 206 + struct dst_entry *tun_dst = 
NULL; 207 + struct flowi fl = {}; 208 + 209 + switch (nft_pf(pkt)) { 210 + case NFPROTO_IPV4: 211 + fl.u.ip4.daddr = tun->dst_v4.s_addr; 212 + fl.u.ip4.saddr = tun->src_v4.s_addr; 213 + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; 214 + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 215 + fl.u.ip4.flowi4_mark = pkt->skb->mark; 216 + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 217 + break; 218 + case NFPROTO_IPV6: 219 + fl.u.ip6.daddr = tun->dst_v6; 220 + fl.u.ip6.saddr = tun->src_v6; 221 + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; 222 + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 223 + fl.u.ip6.flowi6_mark = pkt->skb->mark; 224 + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 225 + break; 226 + } 227 + 228 + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); 229 + if (!tun_dst) 230 + return -ENOENT; 231 + 232 + route->tuple[dir].dst = tun_dst; 233 + dst_release(cur_dst); 234 + 235 + return 0; 236 + } 237 + 238 + static void nft_dev_forward_path(const struct nft_pktinfo *pkt, 239 + struct nf_flow_route *route, 240 + const struct nf_conn *ct, 241 + enum ip_conntrack_dir dir, 242 + struct nft_flowtable *ft) 243 + { 244 + const struct dst_entry *dst = route->tuple[dir].dst; 245 + struct net_device_path_stack stack; 246 + struct nft_forward_info info = {}; 247 + unsigned char ha[ETH_ALEN]; 248 + int i; 249 + 250 + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) 251 + nft_dev_path_info(&stack, &info, ha, &ft->data); 252 + 253 + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) 254 + return; 255 + 256 + route->tuple[!dir].in.ifindex = info.indev->ifindex; 257 + for (i = 0; i < info.num_encaps; i++) { 258 + route->tuple[!dir].in.encap[i].id = info.encap[i].id; 259 + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 260 + } 261 + 262 + if (info.num_tuns && 263 + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { 264 + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; 265 + 
route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; 266 + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; 267 + route->tuple[!dir].in.num_tuns = info.num_tuns; 268 + } 269 + 270 + route->tuple[!dir].in.num_encaps = info.num_encaps; 271 + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 272 + route->tuple[dir].out.ifindex = info.outdev->ifindex; 273 + 274 + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { 275 + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); 276 + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); 277 + route->tuple[dir].xmit_type = info.xmit_type; 278 + } 279 + } 280 + 281 + int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, 282 + struct nf_flow_route *route, enum ip_conntrack_dir dir, 283 + struct nft_flowtable *ft) 284 + { 285 + struct dst_entry *this_dst = skb_dst(pkt->skb); 286 + struct dst_entry *other_dst = NULL; 287 + struct flowi fl; 288 + 289 + memset(&fl, 0, sizeof(fl)); 290 + switch (nft_pf(pkt)) { 291 + case NFPROTO_IPV4: 292 + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; 293 + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; 294 + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; 295 + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; 296 + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 297 + fl.u.ip4.flowi4_mark = pkt->skb->mark; 298 + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 299 + break; 300 + case NFPROTO_IPV6: 301 + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; 302 + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; 303 + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; 304 + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; 305 + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 306 + fl.u.ip6.flowi6_mark = pkt->skb->mark; 307 + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 308 + break; 309 + } 310 + 311 + if (!dst_hold_safe(this_dst)) 312 + return -ENOENT; 313 + 314 + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); 315 + if 
(!other_dst) { 316 + dst_release(this_dst); 317 + return -ENOENT; 318 + } 319 + 320 + nft_default_forward_path(route, this_dst, dir); 321 + nft_default_forward_path(route, other_dst, !dir); 322 + 323 + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 324 + nft_dev_forward_path(pkt, route, ct, dir, ft); 325 + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 326 + nft_dev_forward_path(pkt, route, ct, !dir, ft); 327 + 328 + return 0; 329 + } 330 + EXPORT_SYMBOL_GPL(nft_flow_route);
+27 -27
net/netfilter/nft_connlimit.c
··· 24 24 const struct nft_pktinfo *pkt, 25 25 const struct nft_set_ext *ext) 26 26 { 27 - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 28 - const struct nf_conntrack_tuple *tuple_ptr; 29 - struct nf_conntrack_tuple tuple; 30 - enum ip_conntrack_info ctinfo; 31 - const struct nf_conn *ct; 32 27 unsigned int count; 28 + int err; 33 29 34 - tuple_ptr = &tuple; 35 - 36 - ct = nf_ct_get(pkt->skb, &ctinfo); 37 - if (ct != NULL) { 38 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 39 - zone = nf_ct_zone(ct); 40 - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), 41 - nft_pf(pkt), nft_net(pkt), &tuple)) { 42 - regs->verdict.code = NF_DROP; 43 - return; 44 - } 45 - 46 - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { 47 - regs->verdict.code = NF_DROP; 48 - return; 30 + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); 31 + if (err) { 32 + if (err == -EEXIST) { 33 + /* Call gc to update the list count if any connection has 34 + * been closed already. This is useful for softlimit 35 + * connections like limiting bandwidth based on a number 36 + * of open connections. 
37 + */ 38 + nf_conncount_gc_list(nft_net(pkt), priv->list); 39 + } else { 40 + regs->verdict.code = NF_DROP; 41 + return; 42 + } 49 43 } 50 44 51 45 count = READ_ONCE(priv->list->count); 52 46 53 - if ((count > priv->limit) ^ priv->invert) { 47 + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { 54 48 regs->verdict.code = NFT_BREAK; 55 49 return; 56 50 } ··· 131 137 return nft_connlimit_do_init(ctx, tb, priv); 132 138 } 133 139 140 + static void nft_connlimit_obj_update(struct nft_object *obj, 141 + struct nft_object *newobj) 142 + { 143 + struct nft_connlimit *newpriv = nft_obj_data(newobj); 144 + struct nft_connlimit *priv = nft_obj_data(obj); 145 + 146 + WRITE_ONCE(priv->limit, newpriv->limit); 147 + WRITE_ONCE(priv->invert, newpriv->invert); 148 + } 149 + 134 150 static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, 135 151 struct nft_object *obj) 136 152 { ··· 170 166 .init = nft_connlimit_obj_init, 171 167 .destroy = nft_connlimit_obj_destroy, 172 168 .dump = nft_connlimit_obj_dump, 169 + .update = nft_connlimit_obj_update, 173 170 }; 174 171 175 172 static struct nft_object_type nft_connlimit_obj_type __read_mostly = { ··· 243 238 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) 244 239 { 245 240 struct nft_connlimit *priv = nft_expr_priv(expr); 246 - bool ret; 247 241 248 - local_bh_disable(); 249 - ret = nf_conncount_gc_list(net, priv->list); 250 - local_bh_enable(); 251 - 252 - return ret; 242 + return nf_conncount_gc_list(net, priv->list); 253 243 } 254 244 255 245 static struct nft_expr_type nft_connlimit_type;
-252
net/netfilter/nft_flow_offload.c
··· 20 20 struct nft_flowtable *flowtable; 21 21 }; 22 22 23 - static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) 24 - { 25 - if (dst_xfrm(dst)) 26 - return FLOW_OFFLOAD_XMIT_XFRM; 27 - 28 - return FLOW_OFFLOAD_XMIT_NEIGH; 29 - } 30 - 31 - static void nft_default_forward_path(struct nf_flow_route *route, 32 - struct dst_entry *dst_cache, 33 - enum ip_conntrack_dir dir) 34 - { 35 - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; 36 - route->tuple[dir].dst = dst_cache; 37 - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); 38 - } 39 - 40 - static bool nft_is_valid_ether_device(const struct net_device *dev) 41 - { 42 - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || 43 - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) 44 - return false; 45 - 46 - return true; 47 - } 48 - 49 - static int nft_dev_fill_forward_path(const struct nf_flow_route *route, 50 - const struct dst_entry *dst_cache, 51 - const struct nf_conn *ct, 52 - enum ip_conntrack_dir dir, u8 *ha, 53 - struct net_device_path_stack *stack) 54 - { 55 - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; 56 - struct net_device *dev = dst_cache->dev; 57 - struct neighbour *n; 58 - u8 nud_state; 59 - 60 - if (!nft_is_valid_ether_device(dev)) 61 - goto out; 62 - 63 - n = dst_neigh_lookup(dst_cache, daddr); 64 - if (!n) 65 - return -1; 66 - 67 - read_lock_bh(&n->lock); 68 - nud_state = n->nud_state; 69 - ether_addr_copy(ha, n->ha); 70 - read_unlock_bh(&n->lock); 71 - neigh_release(n); 72 - 73 - if (!(nud_state & NUD_VALID)) 74 - return -1; 75 - 76 - out: 77 - return dev_fill_forward_path(dev, ha, stack); 78 - } 79 - 80 - struct nft_forward_info { 81 - const struct net_device *indev; 82 - const struct net_device *outdev; 83 - const struct net_device *hw_outdev; 84 - struct id { 85 - __u16 id; 86 - __be16 proto; 87 - } encap[NF_FLOW_TABLE_ENCAP_MAX]; 88 - u8 num_encaps; 89 - u8 ingress_vlans; 90 - u8 h_source[ETH_ALEN]; 91 - u8 
h_dest[ETH_ALEN]; 92 - enum flow_offload_xmit_type xmit_type; 93 - }; 94 - 95 - static void nft_dev_path_info(const struct net_device_path_stack *stack, 96 - struct nft_forward_info *info, 97 - unsigned char *ha, struct nf_flowtable *flowtable) 98 - { 99 - const struct net_device_path *path; 100 - int i; 101 - 102 - memcpy(info->h_dest, ha, ETH_ALEN); 103 - 104 - for (i = 0; i < stack->num_paths; i++) { 105 - path = &stack->path[i]; 106 - switch (path->type) { 107 - case DEV_PATH_ETHERNET: 108 - case DEV_PATH_DSA: 109 - case DEV_PATH_VLAN: 110 - case DEV_PATH_PPPOE: 111 - info->indev = path->dev; 112 - if (is_zero_ether_addr(info->h_source)) 113 - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 114 - 115 - if (path->type == DEV_PATH_ETHERNET) 116 - break; 117 - if (path->type == DEV_PATH_DSA) { 118 - i = stack->num_paths; 119 - break; 120 - } 121 - 122 - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ 123 - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 124 - info->indev = NULL; 125 - break; 126 - } 127 - if (!info->outdev) 128 - info->outdev = path->dev; 129 - info->encap[info->num_encaps].id = path->encap.id; 130 - info->encap[info->num_encaps].proto = path->encap.proto; 131 - info->num_encaps++; 132 - if (path->type == DEV_PATH_PPPOE) 133 - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 134 - break; 135 - case DEV_PATH_BRIDGE: 136 - if (is_zero_ether_addr(info->h_source)) 137 - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 138 - 139 - switch (path->bridge.vlan_mode) { 140 - case DEV_PATH_BR_VLAN_UNTAG_HW: 141 - info->ingress_vlans |= BIT(info->num_encaps - 1); 142 - break; 143 - case DEV_PATH_BR_VLAN_TAG: 144 - info->encap[info->num_encaps].id = path->bridge.vlan_id; 145 - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; 146 - info->num_encaps++; 147 - break; 148 - case DEV_PATH_BR_VLAN_UNTAG: 149 - info->num_encaps--; 150 - break; 151 - case DEV_PATH_BR_VLAN_KEEP: 152 - break; 153 - } 154 - info->xmit_type = 
FLOW_OFFLOAD_XMIT_DIRECT; 155 - break; 156 - default: 157 - info->indev = NULL; 158 - break; 159 - } 160 - } 161 - if (!info->outdev) 162 - info->outdev = info->indev; 163 - 164 - info->hw_outdev = info->indev; 165 - 166 - if (nf_flowtable_hw_offload(flowtable) && 167 - nft_is_valid_ether_device(info->indev)) 168 - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 169 - } 170 - 171 - static bool nft_flowtable_find_dev(const struct net_device *dev, 172 - struct nft_flowtable *ft) 173 - { 174 - struct nft_hook *hook; 175 - bool found = false; 176 - 177 - list_for_each_entry_rcu(hook, &ft->hook_list, list) { 178 - if (!nft_hook_find_ops_rcu(hook, dev)) 179 - continue; 180 - 181 - found = true; 182 - break; 183 - } 184 - 185 - return found; 186 - } 187 - 188 - static void nft_dev_forward_path(struct nf_flow_route *route, 189 - const struct nf_conn *ct, 190 - enum ip_conntrack_dir dir, 191 - struct nft_flowtable *ft) 192 - { 193 - const struct dst_entry *dst = route->tuple[dir].dst; 194 - struct net_device_path_stack stack; 195 - struct nft_forward_info info = {}; 196 - unsigned char ha[ETH_ALEN]; 197 - int i; 198 - 199 - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) 200 - nft_dev_path_info(&stack, &info, ha, &ft->data); 201 - 202 - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) 203 - return; 204 - 205 - route->tuple[!dir].in.ifindex = info.indev->ifindex; 206 - for (i = 0; i < info.num_encaps; i++) { 207 - route->tuple[!dir].in.encap[i].id = info.encap[i].id; 208 - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 209 - } 210 - route->tuple[!dir].in.num_encaps = info.num_encaps; 211 - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 212 - 213 - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { 214 - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); 215 - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); 216 - route->tuple[dir].out.ifindex = info.outdev->ifindex; 217 - 
route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; 218 - route->tuple[dir].xmit_type = info.xmit_type; 219 - } 220 - } 221 - 222 - static int nft_flow_route(const struct nft_pktinfo *pkt, 223 - const struct nf_conn *ct, 224 - struct nf_flow_route *route, 225 - enum ip_conntrack_dir dir, 226 - struct nft_flowtable *ft) 227 - { 228 - struct dst_entry *this_dst = skb_dst(pkt->skb); 229 - struct dst_entry *other_dst = NULL; 230 - struct flowi fl; 231 - 232 - memset(&fl, 0, sizeof(fl)); 233 - switch (nft_pf(pkt)) { 234 - case NFPROTO_IPV4: 235 - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; 236 - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; 237 - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; 238 - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; 239 - fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 240 - fl.u.ip4.flowi4_mark = pkt->skb->mark; 241 - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 242 - break; 243 - case NFPROTO_IPV6: 244 - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; 245 - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; 246 - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; 247 - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; 248 - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 249 - fl.u.ip6.flowi6_mark = pkt->skb->mark; 250 - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 251 - break; 252 - } 253 - 254 - if (!dst_hold_safe(this_dst)) 255 - return -ENOENT; 256 - 257 - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); 258 - if (!other_dst) { 259 - dst_release(this_dst); 260 - return -ENOENT; 261 - } 262 - 263 - nft_default_forward_path(route, this_dst, dir); 264 - nft_default_forward_path(route, other_dst, !dir); 265 - 266 - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && 267 - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { 268 - nft_dev_forward_path(route, ct, dir, ft); 269 - nft_dev_forward_path(route, ct, !dir, ft); 270 - } 271 - 272 - return 0; 273 - } 274 - 275 23 static bool 
nft_flow_offload_skip(struct sk_buff *skb, int family) 276 24 { 277 25 if (skb_sec_path(skb))
+3 -11
net/netfilter/xt_connlimit.c
··· 31 31 { 32 32 struct net *net = xt_net(par); 33 33 const struct xt_connlimit_info *info = par->matchinfo; 34 - struct nf_conntrack_tuple tuple; 35 - const struct nf_conntrack_tuple *tuple_ptr = &tuple; 36 34 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 37 35 enum ip_conntrack_info ctinfo; 38 36 const struct nf_conn *ct; ··· 38 40 u32 key[5]; 39 41 40 42 ct = nf_ct_get(skb, &ctinfo); 41 - if (ct != NULL) { 42 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 43 + if (ct) 43 44 zone = nf_ct_zone(ct); 44 - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 45 - xt_family(par), net, &tuple)) { 46 - goto hotdrop; 47 - } 48 45 49 46 if (xt_family(par) == NFPROTO_IPV6) { 50 47 const struct ipv6hdr *iph = ipv6_hdr(skb); ··· 62 69 key[1] = zone->id; 63 70 } 64 71 65 - connections = nf_conncount_count(net, info->data, key, tuple_ptr, 66 - zone); 72 + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); 67 73 if (connections == 0) 68 - /* kmalloc failed, drop it entirely */ 74 + /* kmalloc failed or tuple couldn't be found, drop it entirely */ 69 75 goto hotdrop; 70 76 71 77 return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
+8 -8
net/openvswitch/conntrack.c
··· 928 928 } 929 929 930 930 static int ovs_ct_check_limit(struct net *net, 931 - const struct ovs_conntrack_info *info, 932 - const struct nf_conntrack_tuple *tuple) 931 + const struct sk_buff *skb, 932 + const struct ovs_conntrack_info *info) 933 933 { 934 934 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 935 935 const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; ··· 942 942 if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) 943 943 return 0; 944 944 945 - connections = nf_conncount_count(net, ct_limit_info->data, 946 - &conncount_key, tuple, &info->zone); 945 + connections = nf_conncount_count_skb(net, skb, info->family, 946 + ct_limit_info->data, 947 + &conncount_key); 947 948 if (connections > per_zone_limit) 948 949 return -ENOMEM; 949 950 ··· 973 972 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 974 973 if (static_branch_unlikely(&ovs_ct_limit_enabled)) { 975 974 if (!nf_ct_is_confirmed(ct)) { 976 - err = ovs_ct_check_limit(net, info, 977 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 975 + err = ovs_ct_check_limit(net, skb, info); 978 976 if (err) { 979 977 net_warn_ratelimited("openvswitch: zone: %u " 980 978 "exceeds conntrack limit\n", ··· 1770 1770 zone_limit.limit = limit; 1771 1771 nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); 1772 1772 1773 - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, 1774 - &ct_zone); 1773 + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, 1774 + &conncount_key); 1775 1775 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); 1776 1776 } 1777 1777
+112 -14
tools/testing/selftests/net/netfilter/nft_flowtable.sh
··· 127 127 ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 128 128 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad 129 129 130 + ip netns exec "$nsr1" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 131 + ip netns exec "$nsr2" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 130 132 for i in 0 1; do 131 133 ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null 132 134 ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null ··· 155 153 ip -net "$ns2" route add default via dead:2::1 156 154 157 155 ip -net "$nsr1" route add default via 192.168.10.2 156 + ip -6 -net "$nsr1" route add default via fee1:2::2 158 157 ip -net "$nsr2" route add default via 192.168.10.1 158 + ip -6 -net "$nsr2" route add default via fee1:2::1 159 159 160 160 ip netns exec "$nsr1" nft -f - <<EOF 161 161 table inet filter { ··· 356 352 local nsa=$1 357 353 local nsb=$2 358 354 local pmtu=$3 359 - local dstip=$4 360 - local dstport=$5 355 + local proto=$4 356 + local dstip=$5 357 + local dstport=$6 361 358 local lret=0 362 359 local socatc 363 360 local socatl ··· 368 363 infile="$nsin_small" 369 364 fi 370 365 371 - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & 366 + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -${proto} \ 367 + TCP"${proto}"-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & 372 368 lpid=$! 373 369 374 370 busywait 1000 listener_ready 375 371 376 - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" 372 + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -${proto} \ 373 + TCP"${proto}":"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" 377 374 socatc=$? 
378 375 379 376 wait $lpid ··· 401 394 test_tcp_forwarding() 402 395 { 403 396 local pmtu="$3" 397 + local proto="$4" 398 + local dstip="$5" 399 + local dstport="$6" 404 400 405 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 401 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 406 402 407 403 return $? 408 404 } ··· 413 403 test_tcp_forwarding_set_dscp() 414 404 { 415 405 local pmtu="$3" 406 + local proto="$4" 407 + local dstip="$5" 408 + local dstport="$6" 416 409 417 410 ip netns exec "$nsr1" nft -f - <<EOF 418 411 table netdev dscpmangle { ··· 426 413 } 427 414 EOF 428 415 if [ $? -eq 0 ]; then 429 - test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 416 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 430 417 check_dscp "dscp_ingress" "$pmtu" 431 418 432 419 ip netns exec "$nsr1" nft delete table netdev dscpmangle ··· 443 430 } 444 431 EOF 445 432 if [ $? -eq 0 ]; then 446 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 433 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 447 434 check_dscp "dscp_egress" "$pmtu" 448 435 449 436 ip netns exec "$nsr1" nft delete table netdev dscpmangle ··· 454 441 # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 455 442 # counters should have seen packets (before and after ft offload kicks in). 456 443 ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 457 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 444 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 458 445 check_dscp "dscp_fwd" "$pmtu" 459 446 } 460 447 ··· 468 455 469 456 [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" 470 457 471 - test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 458 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 4 10.0.2.99 12345 472 459 lret=$? 
473 460 474 461 if [ "$lret" -eq 0 ] ; then ··· 478 465 echo "PASS: flow offload for ns1/ns2 with masquerade $what" 479 466 fi 480 467 481 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 468 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 4 10.6.6.6 1666 482 469 lret=$? 483 470 if [ "$pmtu" -eq 1 ] ;then 484 471 check_counters "flow offload for ns1/ns2 with dnat $what" ··· 500 487 # Due to MTU mismatch in both directions, all packets (except small packets like pure 501 488 # acks) have to be handled by normal forwarding path. Therefore, packet counters 502 489 # are not checked. 503 - if test_tcp_forwarding "$ns1" "$ns2" 0; then 490 + if test_tcp_forwarding "$ns1" "$ns2" 0 4 10.0.2.99 12345; then 504 491 echo "PASS: flow offloaded for ns1/ns2" 505 492 else 506 493 echo "FAIL: flow offload for ns1/ns2:" 1>&2 494 + ip netns exec "$nsr1" nft list ruleset 495 + ret=1 496 + fi 497 + 498 + if test_tcp_forwarding "$ns1" "$ns2" 0 6 "[dead:2::99]" 12345; then 499 + echo "PASS: IPv6 flow offloaded for ns1/ns2" 500 + else 501 + echo "FAIL: IPv6 flow offload for ns1/ns2:" 1>&2 507 502 ip netns exec "$nsr1" nft list ruleset 508 503 ret=1 509 504 fi ··· 541 520 EOF 542 521 543 522 check_dscp "dscp_none" "0" 544 - if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then 523 + if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 4 10.0.2.99 12345; then 545 524 echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 546 525 exit 0 547 526 fi ··· 567 546 ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null 568 547 ip netns exec "$ns2" nft reset counters table inet filter >/dev/null 569 548 570 - if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then 549 + if ! 
test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 4 10.0.2.99 12345; then 571 550 echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 572 551 exit 0 573 552 fi ··· 578 557 echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 579 558 ip netns exec "$nsr1" nft list ruleset 580 559 fi 560 + 561 + # IPIP tunnel test: 562 + # Add IPIP tunnel interfaces and check flowtable acceleration. 563 + test_ipip() { 564 + if ! ip -net "$nsr1" link add name tun0 type ipip \ 565 + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then 566 + echo "SKIP: could not add ipip tunnel" 567 + [ "$ret" -eq 0 ] && ret=$ksft_skip 568 + return 569 + fi 570 + ip -net "$nsr1" link set tun0 up 571 + ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 572 + ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null 573 + 574 + ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 575 + ip -net "$nsr2" link set tun0 up 576 + ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 577 + ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null 578 + 579 + ip -net "$nsr1" route change default via 192.168.100.2 580 + ip -net "$nsr2" route change default via 192.168.100.1 581 + ip -net "$ns2" route add default via 10.0.2.1 582 + 583 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' 584 + ip netns exec "$nsr1" nft -a insert rule inet filter forward \ 585 + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' 586 + 587 + if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then 588 + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 589 + ip netns exec "$nsr1" nft list ruleset 590 + ret=1 591 + fi 592 + 593 + # Create vlan tagged devices for IPIP traffic. 
594 + ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 595 + ip -net "$nsr1" link set veth1.10 up 596 + ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 597 + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null 598 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' 599 + ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 600 + ip -net "$nsr1" link set tun1 up 601 + ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 602 + ip -net "$nsr1" route change default via 192.168.200.2 603 + ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null 604 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' 605 + 606 + ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 607 + ip -net "$nsr2" link set veth0.10 up 608 + ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 609 + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null 610 + ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 611 + ip -net "$nsr2" link set tun1 up 612 + ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 613 + ip -net "$nsr2" route change default via 192.168.200.1 614 + ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null 615 + 616 + if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then 617 + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 618 + ip netns exec "$nsr1" nft list ruleset 619 + ret=1 620 + fi 621 + 622 + # Restore the previous configuration 623 + ip -net "$nsr1" route change default via 192.168.10.2 624 + ip -net "$nsr2" route change default via 192.168.10.1 625 + ip -net "$ns2" route del default via 10.0.2.1 626 + } 581 627 582 628 # Another test: 583 629 # Add bridge interface br0 to Router1, with NAT enabled. 
··· 731 643 ip -net "$nsr1" link set up dev veth0 732 644 } 733 645 646 + test_ipip 647 + 734 648 test_bridge 735 649 736 650 KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) ··· 773 683 ip -net "$ns2" route add default via 10.0.2.1 774 684 ip -net "$ns2" route add default via dead:2::1 775 685 776 - if test_tcp_forwarding "$ns1" "$ns2" 1; then 686 + if test_tcp_forwarding "$ns1" "$ns2" 1 4 10.0.2.99 12345; then 777 687 check_counters "ipsec tunnel mode for ns1/ns2" 778 688 else 779 689 echo "FAIL: ipsec tunnel mode for ns1/ns2" 690 + ip netns exec "$nsr1" nft list ruleset 1>&2 691 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 692 + fi 693 + 694 + if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then 695 + check_counters "IPv6 ipsec tunnel mode for ns1/ns2" 696 + else 697 + echo "FAIL: IPv6 ipsec tunnel mode for ns1/ns2" 780 698 ip netns exec "$nsr1" nft list ruleset 1>&2 781 699 ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 782 700 fi