Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following batch contains Netfilter updates for net-next:

0) Add sanity check for maximum encapsulations in bridge vlan,
reported by the new AI robot.

1) Move the flowtable path discovery code to its own file, the
nft_flow_offload.c mixes the nf_tables evaluation with the path
discovery logic, just split this in two for clarity.

2) Consolidate flowtable xmit path by using dev_queue_xmit() and the
real device behind the layer 2 vlan/pppoe device. This allows the
encapsulation to be inlined. After this update, hw_ifidx can be removed
since both ifidx and hw_ifidx now point to the same device.

3) Support for IPIP encapsulation in the flowtable, extend selftest
to cover this new layer 3 offload, from Lorenzo Bianconi.

4) Push down the skb into the conncount API to fix duplicates in the
conncount list for packets with non-confirmed conntrack entries,
this is due to an optimization introduced in d265929930e2
("netfilter: nf_conncount: reduce unnecessary GC").
From Fernando Fernandez Mancera.

5) In conncount, disable BH when performing garbage collection
to consolidate existing behaviour in the conncount API, also
from Fernando.

6) A matching packet with a confirmed conntrack invokes GC if
conncount reaches the limit in an attempt to release slots.
This allows the existing extensions to be used for real conntrack
counting, not just limiting new connections, from Fernando.

7) Support for updating ct count objects in nf_tables, from Fernando.

8) Extend nft_flowtables.sh selftest to send IPv6 TCP traffic,
from Lorenzo Bianconi.

9) Fixes for UAPI kernel-doc documentation, from Randy Dunlap.

* tag 'nf-next-25-11-28' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf-next:
netfilter: nf_tables: improve UAPI kernel-doc comments
netfilter: ip6t_srh: fix UAPI kernel-doc comments format
selftests: netfilter: nft_flowtable.sh: Add the capability to send IPv6 TCP traffic
netfilter: nft_connlimit: add support to object update operation
netfilter: nft_connlimit: update the count if add was skipped
netfilter: nf_conncount: make nf_conncount_gc_list() to disable BH
netfilter: nf_conncount: rework API to use sk_buff directly
selftests: netfilter: nft_flowtable.sh: Add IPIP flowtable selftest
netfilter: flowtable: Add IPIP tx sw acceleration
netfilter: flowtable: Add IPIP rx sw acceleration
netfilter: flowtable: use tuple address to calculate next hop
netfilter: flowtable: remove hw_ifidx
netfilter: flowtable: inline pppoe encapsulation in xmit path
netfilter: flowtable: inline vlan encapsulation in xmit path
netfilter: flowtable: consolidate xmit path
netfilter: flowtable: move path discovery infrastructure to its own file
netfilter: flowtable: check for maximum number of encapsulations in bridge vlan
====================

Link: https://patch.msgid.link/20251128002345.29378-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+983 -462
+13
include/linux/netdevice.h
··· 877 877 DEV_PATH_PPPOE, 878 878 DEV_PATH_DSA, 879 879 DEV_PATH_MTK_WDMA, 880 + DEV_PATH_TUN, 880 881 }; 881 882 882 883 struct net_device_path { ··· 889 888 __be16 proto; 890 889 u8 h_dest[ETH_ALEN]; 891 890 } encap; 891 + struct { 892 + union { 893 + struct in_addr src_v4; 894 + struct in6_addr src_v6; 895 + }; 896 + union { 897 + struct in_addr dst_v4; 898 + struct in6_addr dst_v6; 899 + }; 900 + 901 + u8 l3_proto; 902 + } tun; 892 903 struct { 893 904 enum { 894 905 DEV_PATH_BR_VLAN_KEEP,
+7 -8
include/net/netfilter/nf_conntrack_count.h
··· 18 18 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen); 19 19 void nf_conncount_destroy(struct net *net, struct nf_conncount_data *data); 20 20 21 - unsigned int nf_conncount_count(struct net *net, 22 - struct nf_conncount_data *data, 23 - const u32 *key, 24 - const struct nf_conntrack_tuple *tuple, 25 - const struct nf_conntrack_zone *zone); 21 + unsigned int nf_conncount_count_skb(struct net *net, 22 + const struct sk_buff *skb, 23 + u16 l3num, 24 + struct nf_conncount_data *data, 25 + const u32 *key); 26 26 27 - int nf_conncount_add(struct net *net, struct nf_conncount_list *list, 28 - const struct nf_conntrack_tuple *tuple, 29 - const struct nf_conntrack_zone *zone); 27 + int nf_conncount_add_skb(struct net *net, const struct sk_buff *skb, 28 + u16 l3num, struct nf_conncount_list *list); 30 29 31 30 void nf_conncount_list_init(struct nf_conncount_list *list); 32 31
+25 -1
include/net/netfilter/nf_flow_table.h
··· 107 107 108 108 #define NF_FLOW_TABLE_ENCAP_MAX 2 109 109 110 + struct flow_offload_tunnel { 111 + union { 112 + struct in_addr src_v4; 113 + struct in6_addr src_v6; 114 + }; 115 + union { 116 + struct in_addr dst_v4; 117 + struct in6_addr dst_v6; 118 + }; 119 + 120 + u8 l3_proto; 121 + }; 122 + 110 123 struct flow_offload_tuple { 111 124 union { 112 125 struct in_addr src_v4; ··· 143 130 __be16 proto; 144 131 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 145 132 133 + struct flow_offload_tunnel tun; 134 + 146 135 /* All members above are keys for lookups, see flow_offload_hash(). */ 147 136 struct { } __hash; 148 137 149 138 u8 dir:2, 150 139 xmit_type:3, 151 140 encap_num:2, 141 + tun_num:2, 152 142 in_vlan_ingress:2; 153 143 u16 mtu; 154 144 union { 155 145 struct { 156 146 struct dst_entry *dst_cache; 147 + u32 ifidx; 157 148 u32 dst_cookie; 158 149 }; 159 150 struct { 160 151 u32 ifidx; 161 - u32 hw_ifidx; 162 152 u8 h_source[ETH_ALEN]; 163 153 u8 h_dest[ETH_ALEN]; 164 154 } out; ··· 222 206 u16 id; 223 207 __be16 proto; 224 208 } encap[NF_FLOW_TABLE_ENCAP_MAX]; 209 + struct flow_offload_tunnel tun; 225 210 u8 num_encaps:2, 211 + num_tuns:2, 226 212 ingress_vlans:2; 227 213 } in; 228 214 struct { ··· 239 221 240 222 struct flow_offload *flow_offload_alloc(struct nf_conn *ct); 241 223 void flow_offload_free(struct flow_offload *flow); 224 + 225 + struct nft_flowtable; 226 + struct nft_pktinfo; 227 + int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, 228 + struct nf_flow_route *route, enum ip_conntrack_dir dir, 229 + struct nft_flowtable *ft); 242 230 243 231 static inline int 244 232 nf_flow_table_offload_add_cb(struct nf_flowtable *flow_table,
+7 -7
include/uapi/linux/netfilter/nf_tables.h
··· 881 881 * enum nft_exthdr_op - nf_tables match options 882 882 * 883 883 * @NFT_EXTHDR_OP_IPV6: match against ipv6 extension headers 884 - * @NFT_EXTHDR_OP_TCP: match against tcp options 884 + * @NFT_EXTHDR_OP_TCPOPT: match against tcp options 885 885 * @NFT_EXTHDR_OP_IPV4: match against ipv4 options 886 886 * @NFT_EXTHDR_OP_SCTP: match against sctp chunks 887 887 * @NFT_EXTHDR_OP_DCCP: match against dccp otions ··· 1200 1200 #define NFTA_CT_MAX (__NFTA_CT_MAX - 1) 1201 1201 1202 1202 /** 1203 - * enum nft_flow_attributes - ct offload expression attributes 1203 + * enum nft_offload_attributes - ct offload expression attributes 1204 1204 * @NFTA_FLOW_TABLE_NAME: flow table name (NLA_STRING) 1205 1205 */ 1206 1206 enum nft_offload_attributes { ··· 1410 1410 }; 1411 1411 1412 1412 /** 1413 - * enum nft_reject_code - Generic reject codes for IPv4/IPv6 1413 + * enum nft_reject_inet_code - Generic reject codes for IPv4/IPv6 1414 1414 * 1415 1415 * @NFT_REJECT_ICMPX_NO_ROUTE: no route to host / network unreachable 1416 1416 * @NFT_REJECT_ICMPX_PORT_UNREACH: port unreachable ··· 1480 1480 /** 1481 1481 * enum nft_tproxy_attributes - nf_tables tproxy expression netlink attributes 1482 1482 * 1483 - * NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) 1484 - * NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) 1485 - * NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) 1483 + * @NFTA_TPROXY_FAMILY: Target address family (NLA_U32: nft_registers) 1484 + * @NFTA_TPROXY_REG_ADDR: Target address register (NLA_U32: nft_registers) 1485 + * @NFTA_TPROXY_REG_PORT: Target port register (NLA_U32: nft_registers) 1486 1486 */ 1487 1487 enum nft_tproxy_attributes { 1488 1488 NFTA_TPROXY_UNSPEC, ··· 1783 1783 #define NFTA_SYNPROXY_MAX (__NFTA_SYNPROXY_MAX - 1) 1784 1784 1785 1785 /** 1786 - * enum nft_device_attributes - nf_tables device netlink attributes 1786 + * enum nft_devices_attributes - nf_tables device netlink attributes 
1787 1787 * 1788 1788 * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) 1789 1789 * @NFTA_DEVICE_PREFIX: device name prefix, a simple wildcard (NLA_STRING)
+20 -20
include/uapi/linux/netfilter_ipv6/ip6t_srh.h
··· 41 41 42 42 /** 43 43 * struct ip6t_srh - SRH match options 44 - * @ next_hdr: Next header field of SRH 45 - * @ hdr_len: Extension header length field of SRH 46 - * @ segs_left: Segments left field of SRH 47 - * @ last_entry: Last entry field of SRH 48 - * @ tag: Tag field of SRH 49 - * @ mt_flags: match options 50 - * @ mt_invflags: Invert the sense of match options 44 + * @next_hdr: Next header field of SRH 45 + * @hdr_len: Extension header length field of SRH 46 + * @segs_left: Segments left field of SRH 47 + * @last_entry: Last entry field of SRH 48 + * @tag: Tag field of SRH 49 + * @mt_flags: match options 50 + * @mt_invflags: Invert the sense of match options 51 51 */ 52 52 53 53 struct ip6t_srh { ··· 62 62 63 63 /** 64 64 * struct ip6t_srh1 - SRH match options (revision 1) 65 - * @ next_hdr: Next header field of SRH 66 - * @ hdr_len: Extension header length field of SRH 67 - * @ segs_left: Segments left field of SRH 68 - * @ last_entry: Last entry field of SRH 69 - * @ tag: Tag field of SRH 70 - * @ psid_addr: Address of previous SID in SRH SID list 71 - * @ nsid_addr: Address of NEXT SID in SRH SID list 72 - * @ lsid_addr: Address of LAST SID in SRH SID list 73 - * @ psid_msk: Mask of previous SID in SRH SID list 74 - * @ nsid_msk: Mask of next SID in SRH SID list 75 - * @ lsid_msk: MAsk of last SID in SRH SID list 76 - * @ mt_flags: match options 77 - * @ mt_invflags: Invert the sense of match options 65 + * @next_hdr: Next header field of SRH 66 + * @hdr_len: Extension header length field of SRH 67 + * @segs_left: Segments left field of SRH 68 + * @last_entry: Last entry field of SRH 69 + * @tag: Tag field of SRH 70 + * @psid_addr: Address of previous SID in SRH SID list 71 + * @nsid_addr: Address of NEXT SID in SRH SID list 72 + * @lsid_addr: Address of LAST SID in SRH SID list 73 + * @psid_msk: Mask of previous SID in SRH SID list 74 + * @nsid_msk: Mask of next SID in SRH SID list 75 + * @lsid_msk: MAsk of last SID in SRH SID list 76 + * @mt_flags: 
match options 77 + * @mt_invflags: Invert the sense of match options 78 78 */ 79 79 80 80 struct ip6t_srh1 {
+25
net/ipv4/ipip.c
··· 353 353 return ip_tunnel_ctl(dev, p, cmd); 354 354 } 355 355 356 + static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, 357 + struct net_device_path *path) 358 + { 359 + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); 360 + const struct iphdr *tiph = &tunnel->parms.iph; 361 + struct rtable *rt; 362 + 363 + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, 364 + RT_SCOPE_UNIVERSE); 365 + if (IS_ERR(rt)) 366 + return PTR_ERR(rt); 367 + 368 + path->type = DEV_PATH_TUN; 369 + path->tun.src_v4.s_addr = tiph->saddr; 370 + path->tun.dst_v4.s_addr = tiph->daddr; 371 + path->tun.l3_proto = IPPROTO_IPIP; 372 + path->dev = ctx->dev; 373 + 374 + ctx->dev = rt->dst.dev; 375 + ip_rt_put(rt); 376 + 377 + return 0; 378 + } 379 + 356 380 static const struct net_device_ops ipip_netdev_ops = { 357 381 .ndo_init = ipip_tunnel_init, 358 382 .ndo_uninit = ip_tunnel_uninit, ··· 386 362 .ndo_get_stats64 = dev_get_tstats64, 387 363 .ndo_get_iflink = ip_tunnel_get_iflink, 388 364 .ndo_tunnel_ctl = ipip_tunnel_ctl, 365 + .ndo_fill_forward_path = ipip_fill_forward_path, 389 366 }; 390 367 391 368 #define IPIP_FEATURES (NETIF_F_SG | \
+1
net/netfilter/Makefile
··· 141 141 # flow table infrastructure 142 142 obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o 143 143 nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ 144 + nf_flow_table_path.o \ 144 145 nf_flow_table_offload.o nf_flow_table_xdp.o 145 146 nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o 146 147 ifeq ($(CONFIG_NF_FLOW_TABLE),m)
+148 -71
net/netfilter/nf_conncount.c
··· 122 122 return ERR_PTR(-EAGAIN); 123 123 } 124 124 125 - static int __nf_conncount_add(struct net *net, 126 - struct nf_conncount_list *list, 127 - const struct nf_conntrack_tuple *tuple, 128 - const struct nf_conntrack_zone *zone) 125 + static bool get_ct_or_tuple_from_skb(struct net *net, 126 + const struct sk_buff *skb, 127 + u16 l3num, 128 + struct nf_conn **ct, 129 + struct nf_conntrack_tuple *tuple, 130 + const struct nf_conntrack_zone **zone, 131 + bool *refcounted) 129 132 { 133 + const struct nf_conntrack_tuple_hash *h; 134 + enum ip_conntrack_info ctinfo; 135 + struct nf_conn *found_ct; 136 + 137 + found_ct = nf_ct_get(skb, &ctinfo); 138 + if (found_ct && !nf_ct_is_template(found_ct)) { 139 + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 140 + *zone = nf_ct_zone(found_ct); 141 + *ct = found_ct; 142 + return true; 143 + } 144 + 145 + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) 146 + return false; 147 + 148 + if (found_ct) 149 + *zone = nf_ct_zone(found_ct); 150 + 151 + h = nf_conntrack_find_get(net, *zone, tuple); 152 + if (!h) 153 + return true; 154 + 155 + found_ct = nf_ct_tuplehash_to_ctrack(h); 156 + *refcounted = true; 157 + *ct = found_ct; 158 + 159 + return true; 160 + } 161 + 162 + static int __nf_conncount_add(struct net *net, 163 + const struct sk_buff *skb, 164 + u16 l3num, 165 + struct nf_conncount_list *list) 166 + { 167 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 130 168 const struct nf_conntrack_tuple_hash *found; 131 169 struct nf_conncount_tuple *conn, *conn_n; 170 + struct nf_conntrack_tuple tuple; 171 + struct nf_conn *ct = NULL; 132 172 struct nf_conn *found_ct; 133 173 unsigned int collect = 0; 174 + bool refcounted = false; 175 + 176 + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) 177 + return -ENOENT; 178 + 179 + if (ct && nf_ct_is_confirmed(ct)) { 180 + if (refcounted) 181 + nf_ct_put(ct); 182 + return -EEXIST; 183 + } 134 184 135 185 if 
((u32)jiffies == list->last_gc) 136 186 goto add_new_node; ··· 194 144 if (IS_ERR(found)) { 195 145 /* Not found, but might be about to be confirmed */ 196 146 if (PTR_ERR(found) == -EAGAIN) { 197 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 147 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 198 148 nf_ct_zone_id(&conn->zone, conn->zone.dir) == 199 149 nf_ct_zone_id(zone, zone->dir)) 200 - return 0; /* already exists */ 150 + goto out_put; /* already exists */ 201 151 } else { 202 152 collect++; 203 153 } ··· 206 156 207 157 found_ct = nf_ct_tuplehash_to_ctrack(found); 208 158 209 - if (nf_ct_tuple_equal(&conn->tuple, tuple) && 159 + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && 210 160 nf_ct_zone_equal(found_ct, zone, zone->dir)) { 211 161 /* 212 162 * We should not see tuples twice unless someone hooks ··· 215 165 * Attempt to avoid a re-add in this case. 216 166 */ 217 167 nf_ct_put(found_ct); 218 - return 0; 168 + goto out_put; 219 169 } else if (already_closed(found_ct)) { 220 170 /* 221 171 * we do not care about connections which are ··· 238 188 if (conn == NULL) 239 189 return -ENOMEM; 240 190 241 - conn->tuple = *tuple; 191 + conn->tuple = tuple; 242 192 conn->zone = *zone; 243 193 conn->cpu = raw_smp_processor_id(); 244 194 conn->jiffies32 = (u32)jiffies; 245 195 list_add_tail(&conn->node, &list->head); 246 196 list->count++; 247 197 list->last_gc = (u32)jiffies; 198 + 199 + out_put: 200 + if (refcounted) 201 + nf_ct_put(ct); 248 202 return 0; 249 203 } 250 204 251 - int nf_conncount_add(struct net *net, 252 - struct nf_conncount_list *list, 253 - const struct nf_conntrack_tuple *tuple, 254 - const struct nf_conntrack_zone *zone) 205 + int nf_conncount_add_skb(struct net *net, 206 + const struct sk_buff *skb, 207 + u16 l3num, 208 + struct nf_conncount_list *list) 255 209 { 256 210 int ret; 257 211 258 212 /* check the saved connections */ 259 213 spin_lock_bh(&list->list_lock); 260 - ret = __nf_conncount_add(net, list, tuple, zone); 214 + ret = 
__nf_conncount_add(net, skb, l3num, list); 261 215 spin_unlock_bh(&list->list_lock); 262 216 263 217 return ret; 264 218 } 265 - EXPORT_SYMBOL_GPL(nf_conncount_add); 219 + EXPORT_SYMBOL_GPL(nf_conncount_add_skb); 266 220 267 221 void nf_conncount_list_init(struct nf_conncount_list *list) 268 222 { ··· 278 224 EXPORT_SYMBOL_GPL(nf_conncount_list_init); 279 225 280 226 /* Return true if the list is empty. Must be called with BH disabled. */ 281 - bool nf_conncount_gc_list(struct net *net, 282 - struct nf_conncount_list *list) 227 + static bool __nf_conncount_gc_list(struct net *net, 228 + struct nf_conncount_list *list) 283 229 { 284 230 const struct nf_conntrack_tuple_hash *found; 285 231 struct nf_conncount_tuple *conn, *conn_n; ··· 289 235 290 236 /* don't bother if we just did GC */ 291 237 if ((u32)jiffies == READ_ONCE(list->last_gc)) 292 - return false; 293 - 294 - /* don't bother if other cpu is already doing GC */ 295 - if (!spin_trylock(&list->list_lock)) 296 238 return false; 297 239 298 240 list_for_each_entry_safe(conn, conn_n, &list->head, node) { ··· 319 269 if (!list->count) 320 270 ret = true; 321 271 list->last_gc = (u32)jiffies; 322 - spin_unlock(&list->list_lock); 272 + 273 + return ret; 274 + } 275 + 276 + bool nf_conncount_gc_list(struct net *net, 277 + struct nf_conncount_list *list) 278 + { 279 + bool ret; 280 + 281 + /* don't bother if other cpu is already doing GC */ 282 + if (!spin_trylock_bh(&list->list_lock)) 283 + return false; 284 + 285 + ret = __nf_conncount_gc_list(net, list); 286 + spin_unlock_bh(&list->list_lock); 323 287 324 288 return ret; 325 289 } ··· 373 309 374 310 static unsigned int 375 311 insert_tree(struct net *net, 312 + const struct sk_buff *skb, 313 + u16 l3num, 376 314 struct nf_conncount_data *data, 377 315 struct rb_root *root, 378 316 unsigned int hash, 379 - const u32 *key, 380 - const struct nf_conntrack_tuple *tuple, 381 - const struct nf_conntrack_zone *zone) 317 + const u32 *key) 382 318 { 383 319 struct 
nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; 384 - struct rb_node **rbnode, *parent; 385 - struct nf_conncount_rb *rbconn; 386 - struct nf_conncount_tuple *conn; 320 + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 321 + bool do_gc = true, refcounted = false; 387 322 unsigned int count = 0, gc_count = 0; 388 - bool do_gc = true; 323 + struct rb_node **rbnode, *parent; 324 + struct nf_conntrack_tuple tuple; 325 + struct nf_conncount_tuple *conn; 326 + struct nf_conncount_rb *rbconn; 327 + struct nf_conn *ct = NULL; 389 328 390 329 spin_lock_bh(&nf_conncount_locks[hash]); 391 330 restart: ··· 407 340 } else { 408 341 int ret; 409 342 410 - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); 411 - if (ret) 343 + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); 344 + if (ret && ret != -EEXIST) 412 345 count = 0; /* hotdrop */ 413 346 else 414 347 count = rbconn->list.count; ··· 431 364 goto restart; 432 365 } 433 366 434 - /* expected case: match, insert new node */ 435 - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 436 - if (rbconn == NULL) 437 - goto out_unlock; 367 + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { 368 + /* expected case: match, insert new node */ 369 + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); 370 + if (rbconn == NULL) 371 + goto out_unlock; 438 372 439 - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 440 - if (conn == NULL) { 441 - kmem_cache_free(conncount_rb_cachep, rbconn); 442 - goto out_unlock; 373 + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); 374 + if (conn == NULL) { 375 + kmem_cache_free(conncount_rb_cachep, rbconn); 376 + goto out_unlock; 377 + } 378 + 379 + conn->tuple = tuple; 380 + conn->zone = *zone; 381 + conn->cpu = raw_smp_processor_id(); 382 + conn->jiffies32 = (u32)jiffies; 383 + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 384 + 385 + nf_conncount_list_init(&rbconn->list); 386 + 
list_add(&conn->node, &rbconn->list.head); 387 + count = 1; 388 + rbconn->list.count = count; 389 + 390 + rb_link_node_rcu(&rbconn->node, parent, rbnode); 391 + rb_insert_color(&rbconn->node, root); 392 + 393 + if (refcounted) 394 + nf_ct_put(ct); 443 395 } 444 - 445 - conn->tuple = *tuple; 446 - conn->zone = *zone; 447 - conn->cpu = raw_smp_processor_id(); 448 - conn->jiffies32 = (u32)jiffies; 449 - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); 450 - 451 - nf_conncount_list_init(&rbconn->list); 452 - list_add(&conn->node, &rbconn->list.head); 453 - count = 1; 454 - rbconn->list.count = count; 455 - 456 - rb_link_node_rcu(&rbconn->node, parent, rbnode); 457 - rb_insert_color(&rbconn->node, root); 458 396 out_unlock: 459 397 spin_unlock_bh(&nf_conncount_locks[hash]); 460 398 return count; ··· 467 395 468 396 static unsigned int 469 397 count_tree(struct net *net, 398 + const struct sk_buff *skb, 399 + u16 l3num, 470 400 struct nf_conncount_data *data, 471 - const u32 *key, 472 - const struct nf_conntrack_tuple *tuple, 473 - const struct nf_conntrack_zone *zone) 401 + const u32 *key) 474 402 { 475 403 struct rb_root *root; 476 404 struct rb_node *parent; ··· 494 422 } else { 495 423 int ret; 496 424 497 - if (!tuple) { 425 + if (!skb) { 498 426 nf_conncount_gc_list(net, &rbconn->list); 499 427 return rbconn->list.count; 500 428 } ··· 509 437 } 510 438 511 439 /* same source network -> be counted! 
*/ 512 - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); 440 + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); 513 441 spin_unlock_bh(&rbconn->list.list_lock); 514 - if (ret) 442 + if (ret && ret != -EEXIST) { 515 443 return 0; /* hotdrop */ 516 - else 444 + } else { 445 + /* -EEXIST means add was skipped, update the list */ 446 + if (ret == -EEXIST) 447 + nf_conncount_gc_list(net, &rbconn->list); 517 448 return rbconn->list.count; 449 + } 518 450 } 519 451 } 520 452 521 - if (!tuple) 453 + if (!skb) 522 454 return 0; 523 455 524 - return insert_tree(net, data, root, hash, key, tuple, zone); 456 + return insert_tree(net, skb, l3num, data, root, hash, key); 525 457 } 526 458 527 459 static void tree_gc_worker(struct work_struct *work) ··· 587 511 } 588 512 589 513 /* Count and return number of conntrack entries in 'net' with particular 'key'. 590 - * If 'tuple' is not null, insert it into the accounting data structure. 591 - * Call with RCU read lock. 514 + * If 'skb' is not null, insert the corresponding tuple into the accounting 515 + * data structure. Call with RCU read lock. 592 516 */ 593 - unsigned int nf_conncount_count(struct net *net, 594 - struct nf_conncount_data *data, 595 - const u32 *key, 596 - const struct nf_conntrack_tuple *tuple, 597 - const struct nf_conntrack_zone *zone) 517 + unsigned int nf_conncount_count_skb(struct net *net, 518 + const struct sk_buff *skb, 519 + u16 l3num, 520 + struct nf_conncount_data *data, 521 + const u32 *key) 598 522 { 599 - return count_tree(net, data, key, tuple, zone); 523 + return count_tree(net, skb, l3num, data, key); 524 + 600 525 } 601 - EXPORT_SYMBOL_GPL(nf_conncount_count); 526 + EXPORT_SYMBOL_GPL(nf_conncount_count_skb); 602 527 603 528 struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) 604 529 {
+4 -1
net/netfilter/nf_flow_table_core.c
··· 118 118 flow_tuple->in_vlan_ingress |= BIT(j); 119 119 j++; 120 120 } 121 + 122 + flow_tuple->tun = route->tuple[dir].in.tun; 121 123 flow_tuple->encap_num = route->tuple[dir].in.num_encaps; 124 + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; 122 125 123 126 switch (route->tuple[dir].xmit_type) { 124 127 case FLOW_OFFLOAD_XMIT_DIRECT: ··· 130 127 memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, 131 128 ETH_ALEN); 132 129 flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; 133 - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; 134 130 dst_release(dst); 135 131 break; 136 132 case FLOW_OFFLOAD_XMIT_XFRM: 137 133 case FLOW_OFFLOAD_XMIT_NEIGH: 134 + flow_tuple->ifidx = route->tuple[dir].out.ifindex; 138 135 flow_tuple->dst_cache = dst; 139 136 flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); 140 137 break;
+252 -41
net/netfilter/nf_flow_table_ip.c
··· 145 145 static void nf_flow_tuple_encap(struct sk_buff *skb, 146 146 struct flow_offload_tuple *tuple) 147 147 { 148 + __be16 inner_proto = skb->protocol; 148 149 struct vlan_ethhdr *veth; 149 150 struct pppoe_hdr *phdr; 151 + struct iphdr *iph; 152 + u16 offset = 0; 150 153 int i = 0; 151 154 152 155 if (skb_vlan_tag_present(skb)) { ··· 162 159 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 163 160 tuple->encap[i].id = ntohs(veth->h_vlan_TCI); 164 161 tuple->encap[i].proto = skb->protocol; 162 + inner_proto = veth->h_vlan_encapsulated_proto; 163 + offset += VLAN_HLEN; 165 164 break; 166 165 case htons(ETH_P_PPP_SES): 167 166 phdr = (struct pppoe_hdr *)skb_network_header(skb); 168 167 tuple->encap[i].id = ntohs(phdr->sid); 169 168 tuple->encap[i].proto = skb->protocol; 169 + inner_proto = *((__be16 *)(phdr + 1)); 170 + offset += PPPOE_SES_HLEN; 170 171 break; 172 + } 173 + 174 + if (inner_proto == htons(ETH_P_IP)) { 175 + iph = (struct iphdr *)(skb_network_header(skb) + offset); 176 + if (iph->protocol == IPPROTO_IPIP) { 177 + tuple->tun.dst_v4.s_addr = iph->daddr; 178 + tuple->tun.src_v4.s_addr = iph->saddr; 179 + tuple->tun.l3_proto = IPPROTO_IPIP; 180 + } 171 181 } 172 182 } 173 183 ··· 293 277 return NF_STOLEN; 294 278 } 295 279 280 + static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) 281 + { 282 + struct iphdr *iph; 283 + u16 size; 284 + 285 + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) 286 + return false; 287 + 288 + iph = (struct iphdr *)(skb_network_header(skb) + *psize); 289 + size = iph->ihl << 2; 290 + 291 + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) 292 + return false; 293 + 294 + if (iph->ttl <= 1) 295 + return false; 296 + 297 + if (iph->protocol == IPPROTO_IPIP) 298 + *psize += size; 299 + 300 + return true; 301 + } 302 + 303 + static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) 304 + { 305 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 306 + 307 + if (iph->protocol != IPPROTO_IPIP) 
308 + return; 309 + 310 + skb_pull(skb, iph->ihl << 2); 311 + skb_reset_network_header(skb); 312 + } 313 + 296 314 static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, 297 315 u32 *offset) 298 316 { 317 + __be16 inner_proto = skb->protocol; 299 318 struct vlan_ethhdr *veth; 300 - __be16 inner_proto; 319 + bool ret = false; 301 320 302 321 switch (skb->protocol) { 303 322 case htons(ETH_P_8021Q): ··· 342 291 veth = (struct vlan_ethhdr *)skb_mac_header(skb); 343 292 if (veth->h_vlan_encapsulated_proto == proto) { 344 293 *offset += VLAN_HLEN; 345 - return true; 294 + inner_proto = proto; 295 + ret = true; 346 296 } 347 297 break; 348 298 case htons(ETH_P_PPP_SES): 349 299 if (nf_flow_pppoe_proto(skb, &inner_proto) && 350 300 inner_proto == proto) { 351 301 *offset += PPPOE_SES_HLEN; 352 - return true; 302 + ret = true; 353 303 } 354 304 break; 355 305 } 356 306 357 - return false; 307 + if (inner_proto == htons(ETH_P_IP)) 308 + ret = nf_flow_ip4_tunnel_proto(skb, offset); 309 + 310 + return ret; 358 311 } 359 312 360 313 static void nf_flow_encap_pop(struct sk_buff *skb, ··· 386 331 break; 387 332 } 388 333 } 334 + 335 + if (skb->protocol == htons(ETH_P_IP)) 336 + nf_flow_ip4_tunnel_pop(skb); 389 337 } 390 338 339 + struct nf_flow_xmit { 340 + const void *dest; 341 + const void *source; 342 + struct net_device *outdev; 343 + }; 344 + 391 345 static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, 392 - const struct flow_offload_tuple_rhash *tuplehash, 393 - unsigned short type) 346 + struct nf_flow_xmit *xmit) 394 347 { 395 - struct net_device *outdev; 396 - 397 - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); 398 - if (!outdev) 399 - return NF_DROP; 400 - 401 - skb->dev = outdev; 402 - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, 403 - tuplehash->tuple.out.h_source, skb->len); 348 + skb->dev = xmit->outdev; 349 + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), 350 + xmit->dest, 
xmit->source, skb->len); 404 351 dev_queue_xmit(skb); 405 352 406 353 return NF_STOLEN; ··· 414 357 { 415 358 struct flow_offload_tuple tuple = {}; 416 359 417 - if (skb->protocol != htons(ETH_P_IP) && 418 - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 360 + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) 419 361 return NULL; 420 362 421 363 if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) ··· 437 381 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 438 382 439 383 mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; 384 + if (flow->tuplehash[!dir].tuple.tun_num) 385 + mtu -= sizeof(*iph); 386 + 440 387 if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) 441 388 return 0; 442 389 ··· 473 414 return 1; 474 415 } 475 416 417 + static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) 418 + { 419 + int data_len = skb->len + sizeof(__be16); 420 + struct ppp_hdr { 421 + struct pppoe_hdr hdr; 422 + __be16 proto; 423 + } *ph; 424 + __be16 proto; 425 + 426 + if (skb_cow_head(skb, PPPOE_SES_HLEN)) 427 + return -1; 428 + 429 + switch (skb->protocol) { 430 + case htons(ETH_P_IP): 431 + proto = htons(PPP_IP); 432 + break; 433 + case htons(ETH_P_IPV6): 434 + proto = htons(PPP_IPV6); 435 + break; 436 + default: 437 + return -1; 438 + } 439 + 440 + __skb_push(skb, PPPOE_SES_HLEN); 441 + skb_reset_network_header(skb); 442 + 443 + ph = (struct ppp_hdr *)(skb->data); 444 + ph->hdr.ver = 1; 445 + ph->hdr.type = 1; 446 + ph->hdr.code = 0; 447 + ph->hdr.sid = htons(id); 448 + ph->hdr.length = htons(data_len); 449 + ph->proto = proto; 450 + skb->protocol = htons(ETH_P_PPP_SES); 451 + 452 + return 0; 453 + } 454 + 455 + static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, 456 + struct flow_offload_tuple *tuple, 457 + __be32 *ip_daddr) 458 + { 459 + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); 460 + struct rtable *rt = dst_rtable(tuple->dst_cache); 461 + u8 tos = iph->tos, ttl = iph->ttl; 462 + __be16 
frag_off = iph->frag_off; 463 + u32 headroom = sizeof(*iph); 464 + int err; 465 + 466 + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); 467 + if (err) 468 + return err; 469 + 470 + skb_set_inner_ipproto(skb, IPPROTO_IPIP); 471 + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; 472 + err = skb_cow_head(skb, headroom); 473 + if (err) 474 + return err; 475 + 476 + skb_scrub_packet(skb, true); 477 + skb_clear_hash_if_not_l4(skb); 478 + 479 + /* Push down and install the IP header. */ 480 + skb_push(skb, sizeof(*iph)); 481 + skb_reset_network_header(skb); 482 + 483 + iph = ip_hdr(skb); 484 + iph->version = 4; 485 + iph->ihl = sizeof(*iph) >> 2; 486 + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; 487 + iph->protocol = tuple->tun.l3_proto; 488 + iph->tos = tos; 489 + iph->daddr = tuple->tun.src_v4.s_addr; 490 + iph->saddr = tuple->tun.dst_v4.s_addr; 491 + iph->ttl = ttl; 492 + iph->tot_len = htons(skb->len); 493 + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); 494 + ip_send_check(iph); 495 + 496 + *ip_daddr = tuple->tun.src_v4.s_addr; 497 + 498 + return 0; 499 + } 500 + 501 + static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, 502 + struct flow_offload_tuple *tuple, 503 + __be32 *ip_daddr) 504 + { 505 + if (tuple->tun_num) 506 + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); 507 + 508 + return 0; 509 + } 510 + 511 + static int nf_flow_encap_push(struct sk_buff *skb, 512 + struct flow_offload_tuple *tuple) 513 + { 514 + int i; 515 + 516 + for (i = 0; i < tuple->encap_num; i++) { 517 + switch (tuple->encap[i].proto) { 518 + case htons(ETH_P_8021Q): 519 + case htons(ETH_P_8021AD): 520 + if (skb_vlan_push(skb, tuple->encap[i].proto, 521 + tuple->encap[i].id) < 0) 522 + return -1; 523 + break; 524 + case htons(ETH_P_PPP_SES): 525 + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) 526 + return -1; 527 + break; 528 + } 529 + } 530 + 531 + return 0; 532 + } 533 + 476 534 unsigned int 477 535 
nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, 478 536 const struct nf_hook_state *state) 479 537 { 480 538 struct flow_offload_tuple_rhash *tuplehash; 481 539 struct nf_flowtable *flow_table = priv; 540 + struct flow_offload_tuple *other_tuple; 482 541 enum flow_offload_tuple_dir dir; 483 542 struct nf_flowtable_ctx ctx = { 484 543 .in = state->in, 485 544 }; 545 + struct nf_flow_xmit xmit = {}; 486 546 struct flow_offload *flow; 487 - struct net_device *outdev; 547 + struct neighbour *neigh; 488 548 struct rtable *rt; 489 - __be32 nexthop; 549 + __be32 ip_daddr; 490 550 int ret; 491 551 492 552 tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); ··· 628 450 629 451 dir = tuplehash->tuple.dir; 630 452 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 453 + other_tuple = &flow->tuplehash[!dir].tuple; 454 + ip_daddr = other_tuple->src_v4.s_addr; 455 + 456 + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) 457 + return NF_DROP; 458 + 459 + if (nf_flow_encap_push(skb, other_tuple) < 0) 460 + return NF_DROP; 631 461 632 462 switch (tuplehash->tuple.xmit_type) { 633 463 case FLOW_OFFLOAD_XMIT_NEIGH: 634 464 rt = dst_rtable(tuplehash->tuple.dst_cache); 635 - outdev = rt->dst.dev; 636 - skb->dev = outdev; 637 - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); 465 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); 466 + if (!xmit.outdev) { 467 + flow_offload_teardown(flow); 468 + return NF_DROP; 469 + } 470 + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); 471 + if (IS_ERR(neigh)) { 472 + flow_offload_teardown(flow); 473 + return NF_DROP; 474 + } 475 + xmit.dest = neigh->ha; 638 476 skb_dst_set_noref(skb, &rt->dst); 639 - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); 640 - ret = NF_STOLEN; 641 477 break; 642 478 case FLOW_OFFLOAD_XMIT_DIRECT: 643 - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); 644 - if (ret == NF_DROP) 479 + 
xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); 480 + if (!xmit.outdev) { 645 481 flow_offload_teardown(flow); 482 + return NF_DROP; 483 + } 484 + xmit.dest = tuplehash->tuple.out.h_dest; 485 + xmit.source = tuplehash->tuple.out.h_source; 646 486 break; 647 487 default: 648 488 WARN_ON_ONCE(1); 649 - ret = NF_DROP; 650 - break; 489 + return NF_DROP; 651 490 } 652 491 653 - return ret; 492 + return nf_flow_queue_xmit(state->net, skb, &xmit); 654 493 } 655 494 EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); 656 495 ··· 910 715 { 911 716 struct flow_offload_tuple_rhash *tuplehash; 912 717 struct nf_flowtable *flow_table = priv; 718 + struct flow_offload_tuple *other_tuple; 913 719 enum flow_offload_tuple_dir dir; 914 720 struct nf_flowtable_ctx ctx = { 915 721 .in = state->in, 916 722 }; 917 - const struct in6_addr *nexthop; 723 + struct nf_flow_xmit xmit = {}; 724 + struct in6_addr *ip6_daddr; 918 725 struct flow_offload *flow; 919 - struct net_device *outdev; 726 + struct neighbour *neigh; 920 727 struct rt6_info *rt; 921 728 int ret; 922 729 ··· 942 745 943 746 dir = tuplehash->tuple.dir; 944 747 flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); 748 + other_tuple = &flow->tuplehash[!dir].tuple; 749 + ip6_daddr = &other_tuple->src_v6; 750 + 751 + if (nf_flow_encap_push(skb, other_tuple) < 0) 752 + return NF_DROP; 945 753 946 754 switch (tuplehash->tuple.xmit_type) { 947 755 case FLOW_OFFLOAD_XMIT_NEIGH: 948 756 rt = dst_rt6_info(tuplehash->tuple.dst_cache); 949 - outdev = rt->dst.dev; 950 - skb->dev = outdev; 951 - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); 757 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); 758 + if (!xmit.outdev) { 759 + flow_offload_teardown(flow); 760 + return NF_DROP; 761 + } 762 + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); 763 + if (IS_ERR(neigh)) { 764 + flow_offload_teardown(flow); 765 + return NF_DROP; 766 + } 767 + xmit.dest = 
neigh->ha; 952 768 skb_dst_set_noref(skb, &rt->dst); 953 - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); 954 - ret = NF_STOLEN; 955 769 break; 956 770 case FLOW_OFFLOAD_XMIT_DIRECT: 957 - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); 958 - if (ret == NF_DROP) 771 + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); 772 + if (!xmit.outdev) { 959 773 flow_offload_teardown(flow); 774 + return NF_DROP; 775 + } 776 + xmit.dest = tuplehash->tuple.out.h_dest; 777 + xmit.source = tuplehash->tuple.out.h_source; 960 778 break; 961 779 default: 962 780 WARN_ON_ONCE(1); 963 - ret = NF_DROP; 964 - break; 781 + return NF_DROP; 965 782 } 966 783 967 - return ret; 784 + return nf_flow_queue_xmit(state->net, skb, &xmit); 968 785 } 969 786 EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
+1 -1
net/netfilter/nf_flow_table_offload.c
··· 555 555 switch (this_tuple->xmit_type) { 556 556 case FLOW_OFFLOAD_XMIT_DIRECT: 557 557 this_tuple = &flow->tuplehash[dir].tuple; 558 - ifindex = this_tuple->out.hw_ifidx; 558 + ifindex = this_tuple->out.ifidx; 559 559 break; 560 560 case FLOW_OFFLOAD_XMIT_NEIGH: 561 561 other_tuple = &flow->tuplehash[!dir].tuple;
+330
net/netfilter/nf_flow_table_path.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + #include <linux/kernel.h> 3 + #include <linux/module.h> 4 + #include <linux/init.h> 5 + #include <linux/netlink.h> 6 + #include <linux/netfilter.h> 7 + #include <linux/spinlock.h> 8 + #include <linux/netfilter/nf_conntrack_common.h> 9 + #include <linux/netfilter/nf_tables.h> 10 + #include <net/ip.h> 11 + #include <net/inet_dscp.h> 12 + #include <net/netfilter/nf_tables.h> 13 + #include <net/netfilter/nf_tables_core.h> 14 + #include <net/netfilter/nf_conntrack_core.h> 15 + #include <net/netfilter/nf_conntrack_extend.h> 16 + #include <net/netfilter/nf_flow_table.h> 17 + 18 + static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) 19 + { 20 + if (dst_xfrm(dst)) 21 + return FLOW_OFFLOAD_XMIT_XFRM; 22 + 23 + return FLOW_OFFLOAD_XMIT_NEIGH; 24 + } 25 + 26 + static void nft_default_forward_path(struct nf_flow_route *route, 27 + struct dst_entry *dst_cache, 28 + enum ip_conntrack_dir dir) 29 + { 30 + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; 31 + route->tuple[dir].dst = dst_cache; 32 + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); 33 + } 34 + 35 + static bool nft_is_valid_ether_device(const struct net_device *dev) 36 + { 37 + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || 38 + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) 39 + return false; 40 + 41 + return true; 42 + } 43 + 44 + static int nft_dev_fill_forward_path(const struct nf_flow_route *route, 45 + const struct dst_entry *dst_cache, 46 + const struct nf_conn *ct, 47 + enum ip_conntrack_dir dir, u8 *ha, 48 + struct net_device_path_stack *stack) 49 + { 50 + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; 51 + struct net_device *dev = dst_cache->dev; 52 + struct neighbour *n; 53 + u8 nud_state; 54 + 55 + if (!nft_is_valid_ether_device(dev)) 56 + goto out; 57 + 58 + n = dst_neigh_lookup(dst_cache, daddr); 59 + if (!n) 60 + return -1; 61 + 62 + read_lock_bh(&n->lock); 63 + 
nud_state = n->nud_state; 64 + ether_addr_copy(ha, n->ha); 65 + read_unlock_bh(&n->lock); 66 + neigh_release(n); 67 + 68 + if (!(nud_state & NUD_VALID)) 69 + return -1; 70 + 71 + out: 72 + return dev_fill_forward_path(dev, ha, stack); 73 + } 74 + 75 + struct nft_forward_info { 76 + const struct net_device *indev; 77 + const struct net_device *outdev; 78 + struct id { 79 + __u16 id; 80 + __be16 proto; 81 + } encap[NF_FLOW_TABLE_ENCAP_MAX]; 82 + u8 num_encaps; 83 + struct flow_offload_tunnel tun; 84 + u8 num_tuns; 85 + u8 ingress_vlans; 86 + u8 h_source[ETH_ALEN]; 87 + u8 h_dest[ETH_ALEN]; 88 + enum flow_offload_xmit_type xmit_type; 89 + }; 90 + 91 + static void nft_dev_path_info(const struct net_device_path_stack *stack, 92 + struct nft_forward_info *info, 93 + unsigned char *ha, struct nf_flowtable *flowtable) 94 + { 95 + const struct net_device_path *path; 96 + int i; 97 + 98 + memcpy(info->h_dest, ha, ETH_ALEN); 99 + 100 + for (i = 0; i < stack->num_paths; i++) { 101 + path = &stack->path[i]; 102 + switch (path->type) { 103 + case DEV_PATH_ETHERNET: 104 + case DEV_PATH_DSA: 105 + case DEV_PATH_VLAN: 106 + case DEV_PATH_PPPOE: 107 + case DEV_PATH_TUN: 108 + info->indev = path->dev; 109 + if (is_zero_ether_addr(info->h_source)) 110 + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 111 + 112 + if (path->type == DEV_PATH_ETHERNET) 113 + break; 114 + if (path->type == DEV_PATH_DSA) { 115 + i = stack->num_paths; 116 + break; 117 + } 118 + 119 + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ 120 + if (path->type == DEV_PATH_TUN) { 121 + if (info->num_tuns) { 122 + info->indev = NULL; 123 + break; 124 + } 125 + info->tun.src_v6 = path->tun.src_v6; 126 + info->tun.dst_v6 = path->tun.dst_v6; 127 + info->tun.l3_proto = path->tun.l3_proto; 128 + info->num_tuns++; 129 + } else { 130 + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 131 + info->indev = NULL; 132 + break; 133 + } 134 + info->encap[info->num_encaps].id = 135 + path->encap.id; 136 + 
info->encap[info->num_encaps].proto = 137 + path->encap.proto; 138 + info->num_encaps++; 139 + } 140 + if (path->type == DEV_PATH_PPPOE) 141 + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 142 + break; 143 + case DEV_PATH_BRIDGE: 144 + if (is_zero_ether_addr(info->h_source)) 145 + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 146 + 147 + switch (path->bridge.vlan_mode) { 148 + case DEV_PATH_BR_VLAN_UNTAG_HW: 149 + info->ingress_vlans |= BIT(info->num_encaps - 1); 150 + break; 151 + case DEV_PATH_BR_VLAN_TAG: 152 + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 153 + info->indev = NULL; 154 + break; 155 + } 156 + info->encap[info->num_encaps].id = path->bridge.vlan_id; 157 + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; 158 + info->num_encaps++; 159 + break; 160 + case DEV_PATH_BR_VLAN_UNTAG: 161 + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { 162 + info->indev = NULL; 163 + break; 164 + } 165 + break; 166 + case DEV_PATH_BR_VLAN_KEEP: 167 + break; 168 + } 169 + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 170 + break; 171 + default: 172 + info->indev = NULL; 173 + break; 174 + } 175 + } 176 + info->outdev = info->indev; 177 + 178 + if (nf_flowtable_hw_offload(flowtable) && 179 + nft_is_valid_ether_device(info->indev)) 180 + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 181 + } 182 + 183 + static bool nft_flowtable_find_dev(const struct net_device *dev, 184 + struct nft_flowtable *ft) 185 + { 186 + struct nft_hook *hook; 187 + bool found = false; 188 + 189 + list_for_each_entry_rcu(hook, &ft->hook_list, list) { 190 + if (!nft_hook_find_ops_rcu(hook, dev)) 191 + continue; 192 + 193 + found = true; 194 + break; 195 + } 196 + 197 + return found; 198 + } 199 + 200 + static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, 201 + struct flow_offload_tunnel *tun, 202 + struct nf_flow_route *route, 203 + enum ip_conntrack_dir dir) 204 + { 205 + struct dst_entry *cur_dst = route->tuple[dir].dst; 206 + struct dst_entry *tun_dst = 
NULL; 207 + struct flowi fl = {}; 208 + 209 + switch (nft_pf(pkt)) { 210 + case NFPROTO_IPV4: 211 + fl.u.ip4.daddr = tun->dst_v4.s_addr; 212 + fl.u.ip4.saddr = tun->src_v4.s_addr; 213 + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; 214 + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 215 + fl.u.ip4.flowi4_mark = pkt->skb->mark; 216 + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 217 + break; 218 + case NFPROTO_IPV6: 219 + fl.u.ip6.daddr = tun->dst_v6; 220 + fl.u.ip6.saddr = tun->src_v6; 221 + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; 222 + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 223 + fl.u.ip6.flowi6_mark = pkt->skb->mark; 224 + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 225 + break; 226 + } 227 + 228 + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); 229 + if (!tun_dst) 230 + return -ENOENT; 231 + 232 + route->tuple[dir].dst = tun_dst; 233 + dst_release(cur_dst); 234 + 235 + return 0; 236 + } 237 + 238 + static void nft_dev_forward_path(const struct nft_pktinfo *pkt, 239 + struct nf_flow_route *route, 240 + const struct nf_conn *ct, 241 + enum ip_conntrack_dir dir, 242 + struct nft_flowtable *ft) 243 + { 244 + const struct dst_entry *dst = route->tuple[dir].dst; 245 + struct net_device_path_stack stack; 246 + struct nft_forward_info info = {}; 247 + unsigned char ha[ETH_ALEN]; 248 + int i; 249 + 250 + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) 251 + nft_dev_path_info(&stack, &info, ha, &ft->data); 252 + 253 + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) 254 + return; 255 + 256 + route->tuple[!dir].in.ifindex = info.indev->ifindex; 257 + for (i = 0; i < info.num_encaps; i++) { 258 + route->tuple[!dir].in.encap[i].id = info.encap[i].id; 259 + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 260 + } 261 + 262 + if (info.num_tuns && 263 + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { 264 + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; 265 + 
route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; 266 + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; 267 + route->tuple[!dir].in.num_tuns = info.num_tuns; 268 + } 269 + 270 + route->tuple[!dir].in.num_encaps = info.num_encaps; 271 + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 272 + route->tuple[dir].out.ifindex = info.outdev->ifindex; 273 + 274 + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { 275 + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); 276 + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); 277 + route->tuple[dir].xmit_type = info.xmit_type; 278 + } 279 + } 280 + 281 + int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, 282 + struct nf_flow_route *route, enum ip_conntrack_dir dir, 283 + struct nft_flowtable *ft) 284 + { 285 + struct dst_entry *this_dst = skb_dst(pkt->skb); 286 + struct dst_entry *other_dst = NULL; 287 + struct flowi fl; 288 + 289 + memset(&fl, 0, sizeof(fl)); 290 + switch (nft_pf(pkt)) { 291 + case NFPROTO_IPV4: 292 + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; 293 + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; 294 + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; 295 + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; 296 + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 297 + fl.u.ip4.flowi4_mark = pkt->skb->mark; 298 + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 299 + break; 300 + case NFPROTO_IPV6: 301 + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; 302 + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; 303 + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; 304 + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; 305 + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 306 + fl.u.ip6.flowi6_mark = pkt->skb->mark; 307 + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 308 + break; 309 + } 310 + 311 + if (!dst_hold_safe(this_dst)) 312 + return -ENOENT; 313 + 314 + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); 315 + if 
(!other_dst) { 316 + dst_release(this_dst); 317 + return -ENOENT; 318 + } 319 + 320 + nft_default_forward_path(route, this_dst, dir); 321 + nft_default_forward_path(route, other_dst, !dir); 322 + 323 + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 324 + nft_dev_forward_path(pkt, route, ct, dir, ft); 325 + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) 326 + nft_dev_forward_path(pkt, route, ct, !dir, ft); 327 + 328 + return 0; 329 + } 330 + EXPORT_SYMBOL_GPL(nft_flow_route);
+27 -27
net/netfilter/nft_connlimit.c
··· 24 24 const struct nft_pktinfo *pkt, 25 25 const struct nft_set_ext *ext) 26 26 { 27 - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 28 - const struct nf_conntrack_tuple *tuple_ptr; 29 - struct nf_conntrack_tuple tuple; 30 - enum ip_conntrack_info ctinfo; 31 - const struct nf_conn *ct; 32 27 unsigned int count; 28 + int err; 33 29 34 - tuple_ptr = &tuple; 35 - 36 - ct = nf_ct_get(pkt->skb, &ctinfo); 37 - if (ct != NULL) { 38 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 39 - zone = nf_ct_zone(ct); 40 - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), 41 - nft_pf(pkt), nft_net(pkt), &tuple)) { 42 - regs->verdict.code = NF_DROP; 43 - return; 44 - } 45 - 46 - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { 47 - regs->verdict.code = NF_DROP; 48 - return; 30 + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); 31 + if (err) { 32 + if (err == -EEXIST) { 33 + /* Call gc to update the list count if any connection has 34 + * been closed already. This is useful for softlimit 35 + * connections like limiting bandwidth based on a number 36 + * of open connections. 
37 + */ 38 + nf_conncount_gc_list(nft_net(pkt), priv->list); 39 + } else { 40 + regs->verdict.code = NF_DROP; 41 + return; 42 + } 49 43 } 50 44 51 45 count = READ_ONCE(priv->list->count); 52 46 53 - if ((count > priv->limit) ^ priv->invert) { 47 + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { 54 48 regs->verdict.code = NFT_BREAK; 55 49 return; 56 50 } ··· 131 137 return nft_connlimit_do_init(ctx, tb, priv); 132 138 } 133 139 140 + static void nft_connlimit_obj_update(struct nft_object *obj, 141 + struct nft_object *newobj) 142 + { 143 + struct nft_connlimit *newpriv = nft_obj_data(newobj); 144 + struct nft_connlimit *priv = nft_obj_data(obj); 145 + 146 + WRITE_ONCE(priv->limit, newpriv->limit); 147 + WRITE_ONCE(priv->invert, newpriv->invert); 148 + } 149 + 134 150 static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, 135 151 struct nft_object *obj) 136 152 { ··· 170 166 .init = nft_connlimit_obj_init, 171 167 .destroy = nft_connlimit_obj_destroy, 172 168 .dump = nft_connlimit_obj_dump, 169 + .update = nft_connlimit_obj_update, 173 170 }; 174 171 175 172 static struct nft_object_type nft_connlimit_obj_type __read_mostly = { ··· 243 238 static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) 244 239 { 245 240 struct nft_connlimit *priv = nft_expr_priv(expr); 246 - bool ret; 247 241 248 - local_bh_disable(); 249 - ret = nf_conncount_gc_list(net, priv->list); 250 - local_bh_enable(); 251 - 252 - return ret; 242 + return nf_conncount_gc_list(net, priv->list); 253 243 } 254 244 255 245 static struct nft_expr_type nft_connlimit_type;
-252
net/netfilter/nft_flow_offload.c
··· 20 20 struct nft_flowtable *flowtable; 21 21 }; 22 22 23 - static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) 24 - { 25 - if (dst_xfrm(dst)) 26 - return FLOW_OFFLOAD_XMIT_XFRM; 27 - 28 - return FLOW_OFFLOAD_XMIT_NEIGH; 29 - } 30 - 31 - static void nft_default_forward_path(struct nf_flow_route *route, 32 - struct dst_entry *dst_cache, 33 - enum ip_conntrack_dir dir) 34 - { 35 - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; 36 - route->tuple[dir].dst = dst_cache; 37 - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); 38 - } 39 - 40 - static bool nft_is_valid_ether_device(const struct net_device *dev) 41 - { 42 - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || 43 - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) 44 - return false; 45 - 46 - return true; 47 - } 48 - 49 - static int nft_dev_fill_forward_path(const struct nf_flow_route *route, 50 - const struct dst_entry *dst_cache, 51 - const struct nf_conn *ct, 52 - enum ip_conntrack_dir dir, u8 *ha, 53 - struct net_device_path_stack *stack) 54 - { 55 - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; 56 - struct net_device *dev = dst_cache->dev; 57 - struct neighbour *n; 58 - u8 nud_state; 59 - 60 - if (!nft_is_valid_ether_device(dev)) 61 - goto out; 62 - 63 - n = dst_neigh_lookup(dst_cache, daddr); 64 - if (!n) 65 - return -1; 66 - 67 - read_lock_bh(&n->lock); 68 - nud_state = n->nud_state; 69 - ether_addr_copy(ha, n->ha); 70 - read_unlock_bh(&n->lock); 71 - neigh_release(n); 72 - 73 - if (!(nud_state & NUD_VALID)) 74 - return -1; 75 - 76 - out: 77 - return dev_fill_forward_path(dev, ha, stack); 78 - } 79 - 80 - struct nft_forward_info { 81 - const struct net_device *indev; 82 - const struct net_device *outdev; 83 - const struct net_device *hw_outdev; 84 - struct id { 85 - __u16 id; 86 - __be16 proto; 87 - } encap[NF_FLOW_TABLE_ENCAP_MAX]; 88 - u8 num_encaps; 89 - u8 ingress_vlans; 90 - u8 h_source[ETH_ALEN]; 91 - u8 
h_dest[ETH_ALEN]; 92 - enum flow_offload_xmit_type xmit_type; 93 - }; 94 - 95 - static void nft_dev_path_info(const struct net_device_path_stack *stack, 96 - struct nft_forward_info *info, 97 - unsigned char *ha, struct nf_flowtable *flowtable) 98 - { 99 - const struct net_device_path *path; 100 - int i; 101 - 102 - memcpy(info->h_dest, ha, ETH_ALEN); 103 - 104 - for (i = 0; i < stack->num_paths; i++) { 105 - path = &stack->path[i]; 106 - switch (path->type) { 107 - case DEV_PATH_ETHERNET: 108 - case DEV_PATH_DSA: 109 - case DEV_PATH_VLAN: 110 - case DEV_PATH_PPPOE: 111 - info->indev = path->dev; 112 - if (is_zero_ether_addr(info->h_source)) 113 - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 114 - 115 - if (path->type == DEV_PATH_ETHERNET) 116 - break; 117 - if (path->type == DEV_PATH_DSA) { 118 - i = stack->num_paths; 119 - break; 120 - } 121 - 122 - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ 123 - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { 124 - info->indev = NULL; 125 - break; 126 - } 127 - if (!info->outdev) 128 - info->outdev = path->dev; 129 - info->encap[info->num_encaps].id = path->encap.id; 130 - info->encap[info->num_encaps].proto = path->encap.proto; 131 - info->num_encaps++; 132 - if (path->type == DEV_PATH_PPPOE) 133 - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); 134 - break; 135 - case DEV_PATH_BRIDGE: 136 - if (is_zero_ether_addr(info->h_source)) 137 - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); 138 - 139 - switch (path->bridge.vlan_mode) { 140 - case DEV_PATH_BR_VLAN_UNTAG_HW: 141 - info->ingress_vlans |= BIT(info->num_encaps - 1); 142 - break; 143 - case DEV_PATH_BR_VLAN_TAG: 144 - info->encap[info->num_encaps].id = path->bridge.vlan_id; 145 - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; 146 - info->num_encaps++; 147 - break; 148 - case DEV_PATH_BR_VLAN_UNTAG: 149 - info->num_encaps--; 150 - break; 151 - case DEV_PATH_BR_VLAN_KEEP: 152 - break; 153 - } 154 - info->xmit_type = 
FLOW_OFFLOAD_XMIT_DIRECT; 155 - break; 156 - default: 157 - info->indev = NULL; 158 - break; 159 - } 160 - } 161 - if (!info->outdev) 162 - info->outdev = info->indev; 163 - 164 - info->hw_outdev = info->indev; 165 - 166 - if (nf_flowtable_hw_offload(flowtable) && 167 - nft_is_valid_ether_device(info->indev)) 168 - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; 169 - } 170 - 171 - static bool nft_flowtable_find_dev(const struct net_device *dev, 172 - struct nft_flowtable *ft) 173 - { 174 - struct nft_hook *hook; 175 - bool found = false; 176 - 177 - list_for_each_entry_rcu(hook, &ft->hook_list, list) { 178 - if (!nft_hook_find_ops_rcu(hook, dev)) 179 - continue; 180 - 181 - found = true; 182 - break; 183 - } 184 - 185 - return found; 186 - } 187 - 188 - static void nft_dev_forward_path(struct nf_flow_route *route, 189 - const struct nf_conn *ct, 190 - enum ip_conntrack_dir dir, 191 - struct nft_flowtable *ft) 192 - { 193 - const struct dst_entry *dst = route->tuple[dir].dst; 194 - struct net_device_path_stack stack; 195 - struct nft_forward_info info = {}; 196 - unsigned char ha[ETH_ALEN]; 197 - int i; 198 - 199 - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) 200 - nft_dev_path_info(&stack, &info, ha, &ft->data); 201 - 202 - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) 203 - return; 204 - 205 - route->tuple[!dir].in.ifindex = info.indev->ifindex; 206 - for (i = 0; i < info.num_encaps; i++) { 207 - route->tuple[!dir].in.encap[i].id = info.encap[i].id; 208 - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; 209 - } 210 - route->tuple[!dir].in.num_encaps = info.num_encaps; 211 - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; 212 - 213 - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { 214 - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); 215 - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); 216 - route->tuple[dir].out.ifindex = info.outdev->ifindex; 217 - 
route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; 218 - route->tuple[dir].xmit_type = info.xmit_type; 219 - } 220 - } 221 - 222 - static int nft_flow_route(const struct nft_pktinfo *pkt, 223 - const struct nf_conn *ct, 224 - struct nf_flow_route *route, 225 - enum ip_conntrack_dir dir, 226 - struct nft_flowtable *ft) 227 - { 228 - struct dst_entry *this_dst = skb_dst(pkt->skb); 229 - struct dst_entry *other_dst = NULL; 230 - struct flowi fl; 231 - 232 - memset(&fl, 0, sizeof(fl)); 233 - switch (nft_pf(pkt)) { 234 - case NFPROTO_IPV4: 235 - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; 236 - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; 237 - fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; 238 - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; 239 - fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); 240 - fl.u.ip4.flowi4_mark = pkt->skb->mark; 241 - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; 242 - break; 243 - case NFPROTO_IPV6: 244 - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; 245 - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; 246 - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; 247 - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; 248 - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); 249 - fl.u.ip6.flowi6_mark = pkt->skb->mark; 250 - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; 251 - break; 252 - } 253 - 254 - if (!dst_hold_safe(this_dst)) 255 - return -ENOENT; 256 - 257 - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); 258 - if (!other_dst) { 259 - dst_release(this_dst); 260 - return -ENOENT; 261 - } 262 - 263 - nft_default_forward_path(route, this_dst, dir); 264 - nft_default_forward_path(route, other_dst, !dir); 265 - 266 - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && 267 - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { 268 - nft_dev_forward_path(route, ct, dir, ft); 269 - nft_dev_forward_path(route, ct, !dir, ft); 270 - } 271 - 272 - return 0; 273 - } 274 - 275 23 static bool 
nft_flow_offload_skip(struct sk_buff *skb, int family) 276 24 { 277 25 if (skb_sec_path(skb))
+3 -11
net/netfilter/xt_connlimit.c
··· 31 31 { 32 32 struct net *net = xt_net(par); 33 33 const struct xt_connlimit_info *info = par->matchinfo; 34 - struct nf_conntrack_tuple tuple; 35 - const struct nf_conntrack_tuple *tuple_ptr = &tuple; 36 34 const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; 37 35 enum ip_conntrack_info ctinfo; 38 36 const struct nf_conn *ct; ··· 38 40 u32 key[5]; 39 41 40 42 ct = nf_ct_get(skb, &ctinfo); 41 - if (ct != NULL) { 42 - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 43 + if (ct) 43 44 zone = nf_ct_zone(ct); 44 - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), 45 - xt_family(par), net, &tuple)) { 46 - goto hotdrop; 47 - } 48 45 49 46 if (xt_family(par) == NFPROTO_IPV6) { 50 47 const struct ipv6hdr *iph = ipv6_hdr(skb); ··· 62 69 key[1] = zone->id; 63 70 } 64 71 65 - connections = nf_conncount_count(net, info->data, key, tuple_ptr, 66 - zone); 72 + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); 67 73 if (connections == 0) 68 - /* kmalloc failed, drop it entirely */ 74 + /* kmalloc failed or tuple couldn't be found, drop it entirely */ 69 75 goto hotdrop; 70 76 71 77 return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT);
+8 -8
net/openvswitch/conntrack.c
··· 928 928 } 929 929 930 930 static int ovs_ct_check_limit(struct net *net, 931 - const struct ovs_conntrack_info *info, 932 - const struct nf_conntrack_tuple *tuple) 931 + const struct sk_buff *skb, 932 + const struct ovs_conntrack_info *info) 933 933 { 934 934 struct ovs_net *ovs_net = net_generic(net, ovs_net_id); 935 935 const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; ··· 942 942 if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) 943 943 return 0; 944 944 945 - connections = nf_conncount_count(net, ct_limit_info->data, 946 - &conncount_key, tuple, &info->zone); 945 + connections = nf_conncount_count_skb(net, skb, info->family, 946 + ct_limit_info->data, 947 + &conncount_key); 947 948 if (connections > per_zone_limit) 948 949 return -ENOMEM; 949 950 ··· 973 972 #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) 974 973 if (static_branch_unlikely(&ovs_ct_limit_enabled)) { 975 974 if (!nf_ct_is_confirmed(ct)) { 976 - err = ovs_ct_check_limit(net, info, 977 - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); 975 + err = ovs_ct_check_limit(net, skb, info); 978 976 if (err) { 979 977 net_warn_ratelimited("openvswitch: zone: %u " 980 978 "exceeds conntrack limit\n", ··· 1770 1770 zone_limit.limit = limit; 1771 1771 nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); 1772 1772 1773 - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, 1774 - &ct_zone); 1773 + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, 1774 + &conncount_key); 1775 1775 return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); 1776 1776 } 1777 1777
+112 -14
tools/testing/selftests/net/netfilter/nft_flowtable.sh
··· 127 127 ip -net "$nsr2" addr add 192.168.10.2/24 dev veth0 128 128 ip -net "$nsr2" addr add fee1:2::2/64 dev veth0 nodad 129 129 130 + ip netns exec "$nsr1" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 131 + ip netns exec "$nsr2" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 130 132 for i in 0 1; do 131 133 ip netns exec "$nsr1" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null 132 134 ip netns exec "$nsr2" sysctl net.ipv4.conf.veth$i.forwarding=1 > /dev/null ··· 155 153 ip -net "$ns2" route add default via dead:2::1 156 154 157 155 ip -net "$nsr1" route add default via 192.168.10.2 156 + ip -6 -net "$nsr1" route add default via fee1:2::2 158 157 ip -net "$nsr2" route add default via 192.168.10.1 158 + ip -6 -net "$nsr2" route add default via fee1:2::1 159 159 160 160 ip netns exec "$nsr1" nft -f - <<EOF 161 161 table inet filter { ··· 356 352 local nsa=$1 357 353 local nsb=$2 358 354 local pmtu=$3 359 - local dstip=$4 360 - local dstport=$5 355 + local proto=$4 356 + local dstip=$5 357 + local dstport=$6 361 358 local lret=0 362 359 local socatc 363 360 local socatl ··· 368 363 infile="$nsin_small" 369 364 fi 370 365 371 - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -4 TCP-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & 366 + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsb" socat -${proto} \ 367 + TCP"${proto}"-LISTEN:12345,reuseaddr STDIO < "$infile" > "$ns2out" & 372 368 lpid=$! 373 369 374 370 busywait 1000 listener_ready 375 371 376 - timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -4 TCP:"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" 372 + timeout "$SOCAT_TIMEOUT" ip netns exec "$nsa" socat -${proto} \ 373 + TCP"${proto}":"$dstip":"$dstport" STDIO < "$infile" > "$ns1out" 377 374 socatc=$? 
378 375 379 376 wait $lpid ··· 401 394 test_tcp_forwarding() 402 395 { 403 396 local pmtu="$3" 397 + local proto="$4" 398 + local dstip="$5" 399 + local dstport="$6" 404 400 405 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 401 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 406 402 407 403 return $? 408 404 } ··· 413 403 test_tcp_forwarding_set_dscp() 414 404 { 415 405 local pmtu="$3" 406 + local proto="$4" 407 + local dstip="$5" 408 + local dstport="$6" 416 409 417 410 ip netns exec "$nsr1" nft -f - <<EOF 418 411 table netdev dscpmangle { ··· 426 413 } 427 414 EOF 428 415 if [ $? -eq 0 ]; then 429 - test_tcp_forwarding_ip "$1" "$2" "$3" 10.0.2.99 12345 416 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 430 417 check_dscp "dscp_ingress" "$pmtu" 431 418 432 419 ip netns exec "$nsr1" nft delete table netdev dscpmangle ··· 443 430 } 444 431 EOF 445 432 if [ $? -eq 0 ]; then 446 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 433 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 447 434 check_dscp "dscp_egress" "$pmtu" 448 435 449 436 ip netns exec "$nsr1" nft delete table netdev dscpmangle ··· 454 441 # partial. If flowtable really works, then both dscp-is-0 and dscp-is-cs3 455 442 # counters should have seen packets (before and after ft offload kicks in). 456 443 ip netns exec "$nsr1" nft -a insert rule inet filter forward ip dscp set cs3 457 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.0.2.99 12345 444 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" "$proto" "$dstip" "$dstport" 458 445 check_dscp "dscp_fwd" "$pmtu" 459 446 } 460 447 ··· 468 455 469 456 [ "$pmtu" -eq 0 ] && what="$what (pmtu disabled)" 470 457 471 - test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 10.0.2.99 12345 458 + test_tcp_forwarding_ip "$nsa" "$nsb" "$pmtu" 4 10.0.2.99 12345 472 459 lret=$? 
473 460 474 461 if [ "$lret" -eq 0 ] ; then ··· 478 465 echo "PASS: flow offload for ns1/ns2 with masquerade $what" 479 466 fi 480 467 481 - test_tcp_forwarding_ip "$1" "$2" "$pmtu" 10.6.6.6 1666 468 + test_tcp_forwarding_ip "$1" "$2" "$pmtu" 4 10.6.6.6 1666 482 469 lret=$? 483 470 if [ "$pmtu" -eq 1 ] ;then 484 471 check_counters "flow offload for ns1/ns2 with dnat $what" ··· 500 487 # Due to MTU mismatch in both directions, all packets (except small packets like pure 501 488 # acks) have to be handled by normal forwarding path. Therefore, packet counters 502 489 # are not checked. 503 - if test_tcp_forwarding "$ns1" "$ns2" 0; then 490 + if test_tcp_forwarding "$ns1" "$ns2" 0 4 10.0.2.99 12345; then 504 491 echo "PASS: flow offloaded for ns1/ns2" 505 492 else 506 493 echo "FAIL: flow offload for ns1/ns2:" 1>&2 494 + ip netns exec "$nsr1" nft list ruleset 495 + ret=1 496 + fi 497 + 498 + if test_tcp_forwarding "$ns1" "$ns2" 0 6 "[dead:2::99]" 12345; then 499 + echo "PASS: IPv6 flow offloaded for ns1/ns2" 500 + else 501 + echo "FAIL: IPv6 flow offload for ns1/ns2:" 1>&2 507 502 ip netns exec "$nsr1" nft list ruleset 508 503 ret=1 509 504 fi ··· 541 520 EOF 542 521 543 522 check_dscp "dscp_none" "0" 544 - if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 ""; then 523 + if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 0 4 10.0.2.99 12345; then 545 524 echo "FAIL: flow offload for ns1/ns2 with dscp update and no pmtu discovery" 1>&2 546 525 exit 0 547 526 fi ··· 567 546 ip netns exec "$nsr1" nft reset counters table inet filter >/dev/null 568 547 ip netns exec "$ns2" nft reset counters table inet filter >/dev/null 569 548 570 - if ! test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 ""; then 549 + if ! 
test_tcp_forwarding_set_dscp "$ns1" "$ns2" 1 4 10.0.2.99 12345; then 571 550 echo "FAIL: flow offload for ns1/ns2 with dscp update and pmtu discovery" 1>&2 572 551 exit 0 573 552 fi ··· 578 557 echo "FAIL: flow offload for ns1/ns2 with NAT and pmtu discovery" 1>&2 579 558 ip netns exec "$nsr1" nft list ruleset 580 559 fi 560 + 561 + # IPIP tunnel test: 562 + # Add IPIP tunnel interfaces and check flowtable acceleration. 563 + test_ipip() { 564 + if ! ip -net "$nsr1" link add name tun0 type ipip \ 565 + local 192.168.10.1 remote 192.168.10.2 >/dev/null;then 566 + echo "SKIP: could not add ipip tunnel" 567 + [ "$ret" -eq 0 ] && ret=$ksft_skip 568 + return 569 + fi 570 + ip -net "$nsr1" link set tun0 up 571 + ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0 572 + ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null 573 + 574 + ip -net "$nsr2" link add name tun0 type ipip local 192.168.10.2 remote 192.168.10.1 575 + ip -net "$nsr2" link set tun0 up 576 + ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0 577 + ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null 578 + 579 + ip -net "$nsr1" route change default via 192.168.100.2 580 + ip -net "$nsr2" route change default via 192.168.100.1 581 + ip -net "$ns2" route add default via 10.0.2.1 582 + 583 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0 accept' 584 + ip netns exec "$nsr1" nft -a insert rule inet filter forward \ 585 + 'meta oif "veth0" tcp sport 12345 ct mark set 1 flow add @f1 counter name routed_repl accept' 586 + 587 + if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel"; then 588 + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel" 1>&2 589 + ip netns exec "$nsr1" nft list ruleset 590 + ret=1 591 + fi 592 + 593 + # Create vlan tagged devices for IPIP traffic. 
594 + ip -net "$nsr1" link add link veth1 name veth1.10 type vlan id 10 595 + ip -net "$nsr1" link set veth1.10 up 596 + ip -net "$nsr1" addr add 192.168.20.1/24 dev veth1.10 597 + ip netns exec "$nsr1" sysctl net.ipv4.conf.veth1/10.forwarding=1 > /dev/null 598 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif veth1.10 accept' 599 + ip -net "$nsr1" link add name tun1 type ipip local 192.168.20.1 remote 192.168.20.2 600 + ip -net "$nsr1" link set tun1 up 601 + ip -net "$nsr1" addr add 192.168.200.1/24 dev tun1 602 + ip -net "$nsr1" route change default via 192.168.200.2 603 + ip netns exec "$nsr1" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null 604 + ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun1 accept' 605 + 606 + ip -net "$nsr2" link add link veth0 name veth0.10 type vlan id 10 607 + ip -net "$nsr2" link set veth0.10 up 608 + ip -net "$nsr2" addr add 192.168.20.2/24 dev veth0.10 609 + ip netns exec "$nsr2" sysctl net.ipv4.conf.veth0/10.forwarding=1 > /dev/null 610 + ip -net "$nsr2" link add name tun1 type ipip local 192.168.20.2 remote 192.168.20.1 611 + ip -net "$nsr2" link set tun1 up 612 + ip -net "$nsr2" addr add 192.168.200.2/24 dev tun1 613 + ip -net "$nsr2" route change default via 192.168.200.1 614 + ip netns exec "$nsr2" sysctl net.ipv4.conf.tun1.forwarding=1 > /dev/null 615 + 616 + if ! test_tcp_forwarding_nat "$ns1" "$ns2" 1 "IPIP tunnel over vlan"; then 617 + echo "FAIL: flow offload for ns1/ns2 with IPIP tunnel over vlan" 1>&2 618 + ip netns exec "$nsr1" nft list ruleset 619 + ret=1 620 + fi 621 + 622 + # Restore the previous configuration 623 + ip -net "$nsr1" route change default via 192.168.10.2 624 + ip -net "$nsr2" route change default via 192.168.10.1 625 + ip -net "$ns2" route del default via 10.0.2.1 626 + } 581 627 582 628 # Another test: 583 629 # Add bridge interface br0 to Router1, with NAT enabled. 
··· 731 643 ip -net "$nsr1" link set up dev veth0 732 644 } 733 645 646 + test_ipip 647 + 734 648 test_bridge 735 649 736 650 KEY_SHA="0x"$(ps -af | sha1sum | cut -d " " -f 1) ··· 773 683 ip -net "$ns2" route add default via 10.0.2.1 774 684 ip -net "$ns2" route add default via dead:2::1 775 685 776 - if test_tcp_forwarding "$ns1" "$ns2" 1; then 686 + if test_tcp_forwarding "$ns1" "$ns2" 1 4 10.0.2.99 12345; then 777 687 check_counters "ipsec tunnel mode for ns1/ns2" 778 688 else 779 689 echo "FAIL: ipsec tunnel mode for ns1/ns2" 690 + ip netns exec "$nsr1" nft list ruleset 1>&2 691 + ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 692 + fi 693 + 694 + if test_tcp_forwarding "$ns1" "$ns2" 1 6 "[dead:2::99]" 12345; then 695 + check_counters "IPv6 ipsec tunnel mode for ns1/ns2" 696 + else 697 + echo "FAIL: IPv6 ipsec tunnel mode for ns1/ns2" 780 698 ip netns exec "$nsr1" nft list ruleset 1>&2 781 699 ip netns exec "$nsr1" cat /proc/net/xfrm_stat 1>&2 782 700 fi