Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2024-10-14

The following pull-request contains BPF updates for your *net-next* tree.

We've added 21 non-merge commits during the last 18 day(s) which contain
a total of 21 files changed, 1185 insertions(+), 127 deletions(-).

The main changes are:

1) Put xsk sockets on a struct diet and add various cleanups. Overall, this helps
to bump performance by 12% for some workloads, from Maciej Fijalkowski.

2) Extend BPF selftests to increase coverage of XDP features in combination
with BPF cpumap, from Alexis Lothoré (eBPF Foundation).

3) Extend netkit with an option to delegate skb->{mark,priority} scrubbing to
its BPF program, from Daniel Borkmann.

4) Make the bpf_get_netns_cookie() helper available also to tc(x) BPF programs,
from Mahe Tardy.

5) Extend BPF selftests covering a BPF program setting socket options per MPTCP
subflow, from Geliang Tang and Nicolas Rybowski.

bpf-next-for-netdev

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next: (21 commits)
xsk: Use xsk_buff_pool directly for cq functions
xsk: Wrap duplicated code to function
xsk: Carry a copy of xdp_zc_max_segs within xsk_buff_pool
xsk: Get rid of xdp_buff_xsk::orig_addr
xsk: s/free_list_node/list_node/
xsk: Get rid of xdp_buff_xsk::xskb_list_node
selftests/bpf: check program redirect in xdp_cpumap_attach
selftests/bpf: make xdp_cpumap_attach keep redirect prog attached
selftests/bpf: fix bpf_map_redirect call for cpu map test
selftests/bpf: add tcx netns cookie tests
bpf: add get_netns_cookie helper to tc programs
selftests/bpf: add missing header include for htons
selftests/bpf: Extend netkit tests to validate skb meta data
tools: Sync if_link.h uapi tooling header
netkit: Add netkit scrub support to rt_link.yaml
netkit: Simplify netkit mode over to use NLA_POLICY_MAX
netkit: Add option for scrubbing skb meta data
bpf: Remove unused macro
selftests/bpf: Add mptcp subflow subtest
selftests/bpf: Add getsockopt to inspect mptcp subflow
...
====================

Link: https://patch.msgid.link/20241014211110.16562-1-daniel@iogearbox.net
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+1185 -127
+15
Documentation/netlink/specs/rt_link.yaml
··· 920 920 - name: l2 921 921 - name: l3 922 922 923 + - 924 + name: netkit-scrub 925 + type: enum 926 + entries: 927 + - name: none 928 + - name: default 929 + 923 930 attribute-sets: 924 931 - 925 932 name: link-attrs ··· 2158 2151 name: mode 2159 2152 type: u32 2160 2153 enum: netkit-mode 2154 + - 2155 + name: scrub 2156 + type: u32 2157 + enum: netkit-scrub 2158 + - 2159 + name: peer-scrub 2160 + type: u32 2161 + enum: netkit-scrub 2161 2162 2162 2163 sub-messages: 2163 2164 -
+1 -1
MAINTAINERS
··· 16300 16300 F: include/trace/events/mptcp.h 16301 16301 F: include/uapi/linux/mptcp*.h 16302 16302 F: net/mptcp/ 16303 - F: tools/testing/selftests/bpf/*/*mptcp*.c 16303 + F: tools/testing/selftests/bpf/*/*mptcp*.[ch] 16304 16304 F: tools/testing/selftests/net/mptcp/ 16305 16305 16306 16306 NETWORKING [TCP]
+57 -34
drivers/net/netkit.c
··· 20 20 struct net_device __rcu *peer; 21 21 struct bpf_mprog_entry __rcu *active; 22 22 enum netkit_action policy; 23 + enum netkit_scrub scrub; 23 24 struct bpf_mprog_bundle bundle; 24 25 25 26 /* Needed in slow-path */ ··· 51 50 return ret; 52 51 } 53 52 54 - static void netkit_prep_forward(struct sk_buff *skb, bool xnet) 53 + static void netkit_xnet(struct sk_buff *skb) 55 54 { 56 - skb_scrub_packet(skb, xnet); 57 55 skb->priority = 0; 56 + skb->mark = 0; 57 + } 58 + 59 + static void netkit_prep_forward(struct sk_buff *skb, 60 + bool xnet, bool xnet_scrub) 61 + { 62 + skb_scrub_packet(skb, false); 58 63 nf_skip_egress(skb, true); 59 64 skb_reset_mac_header(skb); 65 + if (!xnet) 66 + return; 67 + ipvs_reset(skb); 68 + skb_clear_tstamp(skb); 69 + if (xnet_scrub) 70 + netkit_xnet(skb); 60 71 } 61 72 62 73 static struct netkit *netkit_priv(const struct net_device *dev) ··· 93 80 !pskb_may_pull(skb, ETH_HLEN) || 94 81 skb_orphan_frags(skb, GFP_ATOMIC))) 95 82 goto drop; 96 - netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer))); 83 + netkit_prep_forward(skb, !net_eq(dev_net(dev), dev_net(peer)), 84 + nk->scrub); 97 85 eth_skb_pkt_type(skb, peer); 98 86 skb->dev = peer; 99 87 entry = rcu_dereference(nk->active); ··· 311 297 } 312 298 } 313 299 314 - static int netkit_check_mode(int mode, struct nlattr *tb, 315 - struct netlink_ext_ack *extack) 316 - { 317 - switch (mode) { 318 - case NETKIT_L2: 319 - case NETKIT_L3: 320 - return 0; 321 - default: 322 - NL_SET_ERR_MSG_ATTR(extack, tb, 323 - "Provided device mode can only be L2 or L3"); 324 - return -EINVAL; 325 - } 326 - } 327 - 328 300 static int netkit_validate(struct nlattr *tb[], struct nlattr *data[], 329 301 struct netlink_ext_ack *extack) 330 302 { ··· 332 332 struct netlink_ext_ack *extack) 333 333 { 334 334 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb, *attr; 335 - enum netkit_action default_prim = NETKIT_PASS; 336 - enum netkit_action default_peer = NETKIT_PASS; 335 + enum netkit_action 
policy_prim = NETKIT_PASS; 336 + enum netkit_action policy_peer = NETKIT_PASS; 337 + enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; 338 + enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; 337 339 enum netkit_mode mode = NETKIT_L3; 338 340 unsigned char ifname_assign_type; 339 341 struct ifinfomsg *ifmp = NULL; ··· 346 344 int err; 347 345 348 346 if (data) { 349 - if (data[IFLA_NETKIT_MODE]) { 350 - attr = data[IFLA_NETKIT_MODE]; 351 - mode = nla_get_u32(attr); 352 - err = netkit_check_mode(mode, attr, extack); 353 - if (err < 0) 354 - return err; 355 - } 347 + if (data[IFLA_NETKIT_MODE]) 348 + mode = nla_get_u32(data[IFLA_NETKIT_MODE]); 356 349 if (data[IFLA_NETKIT_PEER_INFO]) { 357 350 attr = data[IFLA_NETKIT_PEER_INFO]; 358 351 ifmp = nla_data(attr); ··· 359 362 return err; 360 363 tbp = peer_tb; 361 364 } 365 + if (data[IFLA_NETKIT_SCRUB]) 366 + scrub_prim = nla_get_u32(data[IFLA_NETKIT_SCRUB]); 367 + if (data[IFLA_NETKIT_PEER_SCRUB]) 368 + scrub_peer = nla_get_u32(data[IFLA_NETKIT_PEER_SCRUB]); 362 369 if (data[IFLA_NETKIT_POLICY]) { 363 370 attr = data[IFLA_NETKIT_POLICY]; 364 - default_prim = nla_get_u32(attr); 365 - err = netkit_check_policy(default_prim, attr, extack); 371 + policy_prim = nla_get_u32(attr); 372 + err = netkit_check_policy(policy_prim, attr, extack); 366 373 if (err < 0) 367 374 return err; 368 375 } 369 376 if (data[IFLA_NETKIT_PEER_POLICY]) { 370 377 attr = data[IFLA_NETKIT_PEER_POLICY]; 371 - default_peer = nla_get_u32(attr); 372 - err = netkit_check_policy(default_peer, attr, extack); 378 + policy_peer = nla_get_u32(attr); 379 + err = netkit_check_policy(policy_peer, attr, extack); 373 380 if (err < 0) 374 381 return err; 375 382 } ··· 410 409 411 410 nk = netkit_priv(peer); 412 411 nk->primary = false; 413 - nk->policy = default_peer; 412 + nk->policy = policy_peer; 413 + nk->scrub = scrub_peer; 414 414 nk->mode = mode; 415 415 bpf_mprog_bundle_init(&nk->bundle); 416 416 ··· 436 434 437 435 nk = netkit_priv(dev); 438 436 
nk->primary = true; 439 - nk->policy = default_prim; 437 + nk->policy = policy_prim; 438 + nk->scrub = scrub_prim; 440 439 nk->mode = mode; 441 440 bpf_mprog_bundle_init(&nk->bundle); 442 441 ··· 877 874 return -EACCES; 878 875 } 879 876 877 + if (data[IFLA_NETKIT_SCRUB]) { 878 + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_SCRUB], 879 + "netkit scrubbing cannot be changed after device creation"); 880 + return -EACCES; 881 + } 882 + 883 + if (data[IFLA_NETKIT_PEER_SCRUB]) { 884 + NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_SCRUB], 885 + "netkit scrubbing cannot be changed after device creation"); 886 + return -EACCES; 887 + } 888 + 880 889 if (data[IFLA_NETKIT_PEER_INFO]) { 881 890 NL_SET_ERR_MSG_ATTR(extack, data[IFLA_NETKIT_PEER_INFO], 882 891 "netkit peer info cannot be changed after device creation"); ··· 923 908 { 924 909 return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ 925 910 nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_POLICY */ 926 - nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 911 + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_SCRUB */ 912 + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PEER_SCRUB */ 927 913 nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_MODE */ 914 + nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 928 915 0; 929 916 } 930 917 ··· 941 924 return -EMSGSIZE; 942 925 if (nla_put_u32(skb, IFLA_NETKIT_MODE, nk->mode)) 943 926 return -EMSGSIZE; 927 + if (nla_put_u32(skb, IFLA_NETKIT_SCRUB, nk->scrub)) 928 + return -EMSGSIZE; 944 929 945 930 if (peer) { 946 931 nk = netkit_priv(peer); 947 932 if (nla_put_u32(skb, IFLA_NETKIT_PEER_POLICY, nk->policy)) 933 + return -EMSGSIZE; 934 + if (nla_put_u32(skb, IFLA_NETKIT_PEER_SCRUB, nk->scrub)) 948 935 return -EMSGSIZE; 949 936 } 950 937 ··· 957 936 958 937 static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = { 959 938 [IFLA_NETKIT_PEER_INFO] = { .len = sizeof(struct ifinfomsg) }, 939 + [IFLA_NETKIT_MODE] = NLA_POLICY_MAX(NLA_U32, NETKIT_L3), 960 940 
[IFLA_NETKIT_POLICY] = { .type = NLA_U32 }, 961 - [IFLA_NETKIT_MODE] = { .type = NLA_U32 }, 962 941 [IFLA_NETKIT_PEER_POLICY] = { .type = NLA_U32 }, 942 + [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 943 + [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 963 944 [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, 964 945 .reject_message = "Primary attribute is read-only" }, 965 946 };
+7 -7
include/net/xdp_sock_drv.h
··· 126 126 if (likely(!xdp_buff_has_frags(xdp))) 127 127 goto out; 128 128 129 - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { 130 - list_del(&pos->xskb_list_node); 129 + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { 130 + list_del(&pos->list_node); 131 131 xp_free(pos); 132 132 } 133 133 ··· 140 140 { 141 141 struct xdp_buff_xsk *frag = container_of(xdp, struct xdp_buff_xsk, xdp); 142 142 143 - list_add_tail(&frag->xskb_list_node, &frag->pool->xskb_list); 143 + list_add_tail(&frag->list_node, &frag->pool->xskb_list); 144 144 } 145 145 146 146 static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) ··· 150 150 struct xdp_buff_xsk *frag; 151 151 152 152 frag = list_first_entry_or_null(&xskb->pool->xskb_list, 153 - struct xdp_buff_xsk, xskb_list_node); 153 + struct xdp_buff_xsk, list_node); 154 154 if (frag) { 155 - list_del(&frag->xskb_list_node); 155 + list_del(&frag->list_node); 156 156 ret = &frag->xdp; 157 157 } 158 158 ··· 163 163 { 164 164 struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); 165 165 166 - list_del(&xskb->xskb_list_node); 166 + list_del(&xskb->list_node); 167 167 } 168 168 169 169 static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) ··· 172 172 struct xdp_buff_xsk *frag; 173 173 174 174 frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, 175 - xskb_list_node); 175 + list_node); 176 176 return &frag->xdp; 177 177 } 178 178
+13 -10
include/net/xsk_buff_pool.h
··· 28 28 dma_addr_t dma; 29 29 dma_addr_t frame_dma; 30 30 struct xsk_buff_pool *pool; 31 - u64 orig_addr; 32 - struct list_head free_list_node; 33 - struct list_head xskb_list_node; 31 + struct list_head list_node; 34 32 }; 35 33 36 34 #define XSK_CHECK_PRIV_TYPE(t) BUILD_BUG_ON(sizeof(t) > offsetofend(struct xdp_buff_xsk, cb)) ··· 76 78 u32 chunk_size; 77 79 u32 chunk_shift; 78 80 u32 frame_len; 81 + u32 xdp_zc_max_segs; 79 82 u8 tx_metadata_len; /* inherited from umem */ 80 83 u8 cached_need_wakeup; 81 84 bool uses_need_wakeup; ··· 119 120 static inline void xp_init_xskb_addr(struct xdp_buff_xsk *xskb, struct xsk_buff_pool *pool, 120 121 u64 addr) 121 122 { 122 - xskb->orig_addr = addr; 123 123 xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; 124 124 } 125 125 ··· 220 222 xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; 221 223 } 222 224 223 - static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb) 225 + static inline u64 xp_get_handle(struct xdp_buff_xsk *xskb, 226 + struct xsk_buff_pool *pool) 224 227 { 225 - u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; 228 + u64 orig_addr = xskb->xdp.data - pool->addrs; 229 + u64 offset; 226 230 227 - offset += xskb->pool->headroom; 228 - if (!xskb->pool->unaligned) 229 - return xskb->orig_addr + offset; 230 - return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); 231 + if (!pool->unaligned) 232 + return orig_addr; 233 + 234 + offset = xskb->xdp.data - xskb->xdp.data_hard_start; 235 + orig_addr -= offset; 236 + offset += pool->headroom; 237 + return orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); 231 238 } 232 239 233 240 static inline bool xp_tx_metadata_enabled(const struct xsk_buff_pool *pool)
+15
include/uapi/linux/if_link.h
··· 1293 1293 NETKIT_L3, 1294 1294 }; 1295 1295 1296 + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to 1297 + * the BPF program if attached. This also means the latter can 1298 + * consume the two fields if they were populated earlier. 1299 + * 1300 + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before 1301 + * invoking the attached BPF program when the peer device resides 1302 + * in a different network namespace. This is the default behavior. 1303 + */ 1304 + enum netkit_scrub { 1305 + NETKIT_SCRUB_NONE, 1306 + NETKIT_SCRUB_DEFAULT, 1307 + }; 1308 + 1296 1309 enum { 1297 1310 IFLA_NETKIT_UNSPEC, 1298 1311 IFLA_NETKIT_PEER_INFO, ··· 1313 1300 IFLA_NETKIT_POLICY, 1314 1301 IFLA_NETKIT_PEER_POLICY, 1315 1302 IFLA_NETKIT_MODE, 1303 + IFLA_NETKIT_SCRUB, 1304 + IFLA_NETKIT_PEER_SCRUB, 1316 1305 __IFLA_NETKIT_MAX, 1317 1306 }; 1318 1307 #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
+13 -4
net/core/filter.c
··· 5138 5138 return net->net_cookie; 5139 5139 } 5140 5140 5141 + BPF_CALL_1(bpf_get_netns_cookie, struct sk_buff *, skb) 5142 + { 5143 + return __bpf_get_netns_cookie(skb && skb->sk ? skb->sk : NULL); 5144 + } 5145 + 5146 + static const struct bpf_func_proto bpf_get_netns_cookie_proto = { 5147 + .func = bpf_get_netns_cookie, 5148 + .ret_type = RET_INTEGER, 5149 + .arg1_type = ARG_PTR_TO_CTX_OR_NULL, 5150 + }; 5151 + 5141 5152 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx) 5142 5153 { 5143 5154 return __bpf_get_netns_cookie(ctx); ··· 8216 8205 return &bpf_skb_under_cgroup_proto; 8217 8206 case BPF_FUNC_get_socket_cookie: 8218 8207 return &bpf_get_socket_cookie_proto; 8208 + case BPF_FUNC_get_netns_cookie: 8209 + return &bpf_get_netns_cookie_proto; 8219 8210 case BPF_FUNC_get_socket_uid: 8220 8211 return &bpf_get_socket_uid_proto; 8221 8212 case BPF_FUNC_fib_lookup: ··· 10249 10236 S, NS, F, NF, SIZE, OFF); \ 10250 10237 } \ 10251 10238 } while (0) 10252 - 10253 - #define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ 10254 - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ 10255 - S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) 10256 10239 10257 10240 static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, 10258 10241 const struct bpf_insn *si,
+19 -19
net/xdp/xsk.c
··· 141 141 u64 addr; 142 142 int err; 143 143 144 - addr = xp_get_handle(xskb); 144 + addr = xp_get_handle(xskb, xskb->pool); 145 145 err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); 146 146 if (err) { 147 147 xs->rx_queue_full++; ··· 171 171 return 0; 172 172 173 173 xskb_list = &xskb->pool->xskb_list; 174 - list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { 174 + list_for_each_entry_safe(pos, tmp, xskb_list, list_node) { 175 175 if (list_is_singular(xskb_list)) 176 176 contd = 0; 177 177 len = pos->xdp.data_end - pos->xdp.data; 178 178 err = __xsk_rcv_zc(xs, pos, len, contd); 179 179 if (err) 180 180 goto err; 181 - list_del(&pos->xskb_list_node); 181 + list_del(&pos->list_node); 182 182 } 183 183 184 184 return 0; ··· 527 527 return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); 528 528 } 529 529 530 - static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) 530 + static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) 531 531 { 532 532 unsigned long flags; 533 533 int ret; 534 534 535 - spin_lock_irqsave(&xs->pool->cq_lock, flags); 536 - ret = xskq_prod_reserve_addr(xs->pool->cq, addr); 537 - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 535 + spin_lock_irqsave(&pool->cq_lock, flags); 536 + ret = xskq_prod_reserve_addr(pool->cq, addr); 537 + spin_unlock_irqrestore(&pool->cq_lock, flags); 538 538 539 539 return ret; 540 540 } 541 541 542 - static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) 542 + static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) 543 543 { 544 544 unsigned long flags; 545 545 546 - spin_lock_irqsave(&xs->pool->cq_lock, flags); 547 - xskq_prod_submit_n(xs->pool->cq, n); 548 - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 546 + spin_lock_irqsave(&pool->cq_lock, flags); 547 + xskq_prod_submit_n(pool->cq, n); 548 + spin_unlock_irqrestore(&pool->cq_lock, flags); 549 549 } 550 550 551 - static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) 551 + 
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) 552 552 { 553 553 unsigned long flags; 554 554 555 - spin_lock_irqsave(&xs->pool->cq_lock, flags); 556 - xskq_prod_cancel_n(xs->pool->cq, n); 557 - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); 555 + spin_lock_irqsave(&pool->cq_lock, flags); 556 + xskq_prod_cancel_n(pool->cq, n); 557 + spin_unlock_irqrestore(&pool->cq_lock, flags); 558 558 } 559 559 560 560 static u32 xsk_get_num_desc(struct sk_buff *skb) ··· 571 571 *compl->tx_timestamp = ktime_get_tai_fast_ns(); 572 572 } 573 573 574 - xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); 574 + xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); 575 575 sock_wfree(skb); 576 576 } 577 577 ··· 587 587 struct xdp_sock *xs = xdp_sk(skb->sk); 588 588 589 589 skb->destructor = sock_wfree; 590 - xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); 590 + xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); 591 591 /* Free skb without triggering the perf drop trace */ 592 592 consume_skb(skb); 593 593 xs->skb = NULL; ··· 765 765 xskq_cons_release(xs->tx); 766 766 } else { 767 767 /* Let application retry */ 768 - xsk_cq_cancel_locked(xs, 1); 768 + xsk_cq_cancel_locked(xs->pool, 1); 769 769 } 770 770 771 771 return ERR_PTR(err); ··· 802 802 * if there is space in it. This avoids having to implement 803 803 * any buffering in the Tx path. 804 804 */ 805 - if (xsk_cq_reserve_addr_locked(xs, desc.addr)) 805 + if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr)) 806 806 goto out; 807 807 808 808 skb = xsk_build_skb(xs, &desc);
+29 -25
net/xdp/xsk_buff_pool.c
··· 101 101 xskb = &pool->heads[i]; 102 102 xskb->pool = pool; 103 103 xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; 104 - INIT_LIST_HEAD(&xskb->free_list_node); 105 - INIT_LIST_HEAD(&xskb->xskb_list_node); 104 + INIT_LIST_HEAD(&xskb->list_node); 106 105 if (pool->unaligned) 107 106 pool->free_heads[i] = xskb; 108 107 else ··· 229 230 goto err_unreg_xsk; 230 231 } 231 232 pool->umem->zc = true; 233 + pool->xdp_zc_max_segs = netdev->xdp_zc_max_segs; 232 234 return 0; 233 235 234 236 err_unreg_xsk: ··· 417 417 418 418 for (i = 0; i < pool->heads_cnt; i++) { 419 419 struct xdp_buff_xsk *xskb = &pool->heads[i]; 420 + u64 orig_addr; 420 421 421 - xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, xskb->orig_addr); 422 + orig_addr = xskb->xdp.data_hard_start - pool->addrs - pool->headroom; 423 + xp_init_xskb_dma(xskb, pool, dma_map->dma_pages, orig_addr); 422 424 } 423 425 } 424 426 ··· 503 501 return *addr < pool->addrs_cnt; 504 502 } 505 503 504 + static struct xdp_buff_xsk *xp_get_xskb(struct xsk_buff_pool *pool, u64 addr) 505 + { 506 + struct xdp_buff_xsk *xskb; 507 + 508 + if (pool->unaligned) { 509 + xskb = pool->free_heads[--pool->free_heads_cnt]; 510 + xp_init_xskb_addr(xskb, pool, addr); 511 + if (pool->dma_pages) 512 + xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); 513 + } else { 514 + xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; 515 + } 516 + 517 + return xskb; 518 + } 519 + 506 520 static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) 507 521 { 508 522 struct xdp_buff_xsk *xskb; ··· 544 526 break; 545 527 } 546 528 547 - if (pool->unaligned) { 548 - xskb = pool->free_heads[--pool->free_heads_cnt]; 549 - xp_init_xskb_addr(xskb, pool, addr); 550 - if (pool->dma_pages) 551 - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); 552 - } else { 553 - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; 554 - } 529 + xskb = xp_get_xskb(pool, addr); 555 530 556 531 xskq_cons_release(pool->fq); 557 532 return xskb; ··· 561 
550 } else { 562 551 pool->free_list_cnt--; 563 552 xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, 564 - free_list_node); 565 - list_del_init(&xskb->free_list_node); 553 + list_node); 554 + list_del_init(&xskb->list_node); 566 555 } 567 556 568 557 xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; ··· 602 591 continue; 603 592 } 604 593 605 - if (pool->unaligned) { 606 - xskb = pool->free_heads[--pool->free_heads_cnt]; 607 - xp_init_xskb_addr(xskb, pool, addr); 608 - if (pool->dma_pages) 609 - xp_init_xskb_dma(xskb, pool, pool->dma_pages, addr); 610 - } else { 611 - xskb = &pool->heads[xp_aligned_extract_idx(pool, addr)]; 612 - } 594 + xskb = xp_get_xskb(pool, addr); 613 595 614 596 *xdp = &xskb->xdp; 615 597 xdp++; ··· 621 617 622 618 i = nb_entries; 623 619 while (i--) { 624 - xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, free_list_node); 625 - list_del_init(&xskb->free_list_node); 620 + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, list_node); 621 + list_del_init(&xskb->list_node); 626 622 627 623 *xdp = &xskb->xdp; 628 624 xdp++; ··· 692 688 693 689 void xp_free(struct xdp_buff_xsk *xskb) 694 690 { 695 - if (!list_empty(&xskb->free_list_node)) 691 + if (!list_empty(&xskb->list_node)) 696 692 return; 697 693 698 694 xskb->pool->free_list_cnt++; 699 - list_add(&xskb->free_list_node, &xskb->pool->free_list); 695 + list_add(&xskb->list_node, &xskb->pool->free_list); 700 696 } 701 697 EXPORT_SYMBOL(xp_free); 702 698
+1 -1
net/xdp/xsk_queue.h
··· 260 260 nr_frags = 0; 261 261 } else { 262 262 nr_frags++; 263 - if (nr_frags == pool->netdev->xdp_zc_max_segs) { 263 + if (nr_frags == pool->xdp_zc_max_segs) { 264 264 nr_frags = 0; 265 265 break; 266 266 }
+552 -1
tools/include/uapi/linux/if_link.h
··· 462 462 463 463 /* Bridge section */ 464 464 465 + /** 466 + * DOC: Bridge enum definition 467 + * 468 + * Please *note* that the timer values in the following section are expected 469 + * in clock_t format, which is seconds multiplied by USER_HZ (generally 470 + * defined as 100). 471 + * 472 + * @IFLA_BR_FORWARD_DELAY 473 + * The bridge forwarding delay is the time spent in LISTENING state 474 + * (before moving to LEARNING) and in LEARNING state (before moving 475 + * to FORWARDING). Only relevant if STP is enabled. 476 + * 477 + * The valid values are between (2 * USER_HZ) and (30 * USER_HZ). 478 + * The default value is (15 * USER_HZ). 479 + * 480 + * @IFLA_BR_HELLO_TIME 481 + * The time between hello packets sent by the bridge, when it is a root 482 + * bridge or a designated bridge. Only relevant if STP is enabled. 483 + * 484 + * The valid values are between (1 * USER_HZ) and (10 * USER_HZ). 485 + * The default value is (2 * USER_HZ). 486 + * 487 + * @IFLA_BR_MAX_AGE 488 + * The hello packet timeout is the time until another bridge in the 489 + * spanning tree is assumed to be dead, after reception of its last hello 490 + * message. Only relevant if STP is enabled. 491 + * 492 + * The valid values are between (6 * USER_HZ) and (40 * USER_HZ). 493 + * The default value is (20 * USER_HZ). 494 + * 495 + * @IFLA_BR_AGEING_TIME 496 + * Configure the bridge's FDB entries aging time. It is the time a MAC 497 + * address will be kept in the FDB after a packet has been received from 498 + * that address. After this time has passed, entries are cleaned up. 499 + * Allow values outside the 802.1 standard specification for special cases: 500 + * 501 + * * 0 - entry never ages (all permanent) 502 + * * 1 - entry disappears (no persistence) 503 + * 504 + * The default value is (300 * USER_HZ). 505 + * 506 + * @IFLA_BR_STP_STATE 507 + * Turn spanning tree protocol on (*IFLA_BR_STP_STATE* > 0) or off 508 + * (*IFLA_BR_STP_STATE* == 0) for this bridge. 
509 + * 510 + * The default value is 0 (disabled). 511 + * 512 + * @IFLA_BR_PRIORITY 513 + * Set this bridge's spanning tree priority, used during STP root bridge 514 + * election. 515 + * 516 + * The valid values are between 0 and 65535. 517 + * 518 + * @IFLA_BR_VLAN_FILTERING 519 + * Turn VLAN filtering on (*IFLA_BR_VLAN_FILTERING* > 0) or off 520 + * (*IFLA_BR_VLAN_FILTERING* == 0). When disabled, the bridge will not 521 + * consider the VLAN tag when handling packets. 522 + * 523 + * The default value is 0 (disabled). 524 + * 525 + * @IFLA_BR_VLAN_PROTOCOL 526 + * Set the protocol used for VLAN filtering. 527 + * 528 + * The valid values are 0x8100(802.1Q) or 0x88A8(802.1AD). The default value 529 + * is 0x8100(802.1Q). 530 + * 531 + * @IFLA_BR_GROUP_FWD_MASK 532 + * The group forwarding mask. This is the bitmask that is applied to 533 + * decide whether to forward incoming frames destined to link-local 534 + * addresses (of the form 01:80:C2:00:00:0X). 535 + * 536 + * The default value is 0, which means the bridge does not forward any 537 + * link-local frames coming on this port. 538 + * 539 + * @IFLA_BR_ROOT_ID 540 + * The bridge root id, read only. 541 + * 542 + * @IFLA_BR_BRIDGE_ID 543 + * The bridge id, read only. 544 + * 545 + * @IFLA_BR_ROOT_PORT 546 + * The bridge root port, read only. 547 + * 548 + * @IFLA_BR_ROOT_PATH_COST 549 + * The bridge root path cost, read only. 550 + * 551 + * @IFLA_BR_TOPOLOGY_CHANGE 552 + * The bridge topology change, read only. 553 + * 554 + * @IFLA_BR_TOPOLOGY_CHANGE_DETECTED 555 + * The bridge topology change detected, read only. 556 + * 557 + * @IFLA_BR_HELLO_TIMER 558 + * The bridge hello timer, read only. 559 + * 560 + * @IFLA_BR_TCN_TIMER 561 + * The bridge tcn timer, read only. 562 + * 563 + * @IFLA_BR_TOPOLOGY_CHANGE_TIMER 564 + * The bridge topology change timer, read only. 565 + * 566 + * @IFLA_BR_GC_TIMER 567 + * The bridge gc timer, read only. 
568 + * 569 + * @IFLA_BR_GROUP_ADDR 570 + * Set the MAC address of the multicast group this bridge uses for STP. 571 + * The address must be a link-local address in standard Ethernet MAC address 572 + * format. It is an address of the form 01:80:C2:00:00:0X, with X in [0, 4..f]. 573 + * 574 + * The default value is 0. 575 + * 576 + * @IFLA_BR_FDB_FLUSH 577 + * Flush bridge's fdb dynamic entries. 578 + * 579 + * @IFLA_BR_MCAST_ROUTER 580 + * Set bridge's multicast router if IGMP snooping is enabled. 581 + * The valid values are: 582 + * 583 + * * 0 - disabled. 584 + * * 1 - automatic (queried). 585 + * * 2 - permanently enabled. 586 + * 587 + * The default value is 1. 588 + * 589 + * @IFLA_BR_MCAST_SNOOPING 590 + * Turn multicast snooping on (*IFLA_BR_MCAST_SNOOPING* > 0) or off 591 + * (*IFLA_BR_MCAST_SNOOPING* == 0). 592 + * 593 + * The default value is 1. 594 + * 595 + * @IFLA_BR_MCAST_QUERY_USE_IFADDR 596 + * If enabled use the bridge's own IP address as source address for IGMP 597 + * queries (*IFLA_BR_MCAST_QUERY_USE_IFADDR* > 0) or the default of 0.0.0.0 598 + * (*IFLA_BR_MCAST_QUERY_USE_IFADDR* == 0). 599 + * 600 + * The default value is 0 (disabled). 601 + * 602 + * @IFLA_BR_MCAST_QUERIER 603 + * Enable (*IFLA_BR_MCAST_QUERIER* > 0) or disable 604 + * (*IFLA_BR_MCAST_QUERIER* == 0) IGMP querier, i.e. sending of multicast 605 + * queries by the bridge. 606 + * 607 + * The default value is 0 (disabled). 608 + * 609 + * @IFLA_BR_MCAST_HASH_ELASTICITY 610 + * Set multicast database hash elasticity. It is the maximum chain length in 611 + * the multicast hash table. This attribute is *deprecated* and the value 612 + * is always 16. 613 + * 614 + * @IFLA_BR_MCAST_HASH_MAX 615 + * Set maximum size of the multicast hash table. 616 + * 617 + * The default value is 4096, the value must be a power of 2. 
618 + * 619 + * @IFLA_BR_MCAST_LAST_MEMBER_CNT 620 + * The Last Member Query Count is the number of Group-Specific Queries 621 + * sent before the router assumes there are no local members. The Last 622 + * Member Query Count is also the number of Group-and-Source-Specific 623 + * Queries sent before the router assumes there are no listeners for a 624 + * particular source. 625 + * 626 + * The default value is 2. 627 + * 628 + * @IFLA_BR_MCAST_STARTUP_QUERY_CNT 629 + * The Startup Query Count is the number of Queries sent out on startup, 630 + * separated by the Startup Query Interval. 631 + * 632 + * The default value is 2. 633 + * 634 + * @IFLA_BR_MCAST_LAST_MEMBER_INTVL 635 + * The Last Member Query Interval is the Max Response Time inserted into 636 + * Group-Specific Queries sent in response to Leave Group messages, and 637 + * is also the amount of time between Group-Specific Query messages. 638 + * 639 + * The default value is (1 * USER_HZ). 640 + * 641 + * @IFLA_BR_MCAST_MEMBERSHIP_INTVL 642 + * The interval after which the bridge will leave a group, if no membership 643 + * reports for this group are received. 644 + * 645 + * The default value is (260 * USER_HZ). 646 + * 647 + * @IFLA_BR_MCAST_QUERIER_INTVL 648 + * The interval between queries sent by other routers. If no queries are 649 + * seen after this delay has passed, the bridge will start to send its own 650 + * queries (as if *IFLA_BR_MCAST_QUERIER* was enabled). 651 + * 652 + * The default value is (255 * USER_HZ). 653 + * 654 + * @IFLA_BR_MCAST_QUERY_INTVL 655 + * The Query Interval is the interval between General Queries sent by 656 + * the Querier. 657 + * 658 + * The default value is (125 * USER_HZ). The minimum value is (1 * USER_HZ). 659 + * 660 + * @IFLA_BR_MCAST_QUERY_RESPONSE_INTVL 661 + * The Max Response Time used to calculate the Max Resp Code inserted 662 + * into the periodic General Queries. 663 + * 664 + * The default value is (10 * USER_HZ). 
665 + * 666 + * @IFLA_BR_MCAST_STARTUP_QUERY_INTVL 667 + * The interval between queries in the startup phase. 668 + * 669 + * The default value is (125 * USER_HZ) / 4. The minimum value is (1 * USER_HZ). 670 + * 671 + * @IFLA_BR_NF_CALL_IPTABLES 672 + * Enable (*NF_CALL_IPTABLES* > 0) or disable (*NF_CALL_IPTABLES* == 0) 673 + * iptables hooks on the bridge. 674 + * 675 + * The default value is 0 (disabled). 676 + * 677 + * @IFLA_BR_NF_CALL_IP6TABLES 678 + * Enable (*NF_CALL_IP6TABLES* > 0) or disable (*NF_CALL_IP6TABLES* == 0) 679 + * ip6tables hooks on the bridge. 680 + * 681 + * The default value is 0 (disabled). 682 + * 683 + * @IFLA_BR_NF_CALL_ARPTABLES 684 + * Enable (*NF_CALL_ARPTABLES* > 0) or disable (*NF_CALL_ARPTABLES* == 0) 685 + * arptables hooks on the bridge. 686 + * 687 + * The default value is 0 (disabled). 688 + * 689 + * @IFLA_BR_VLAN_DEFAULT_PVID 690 + * VLAN ID applied to untagged and priority-tagged incoming packets. 691 + * 692 + * The default value is 1. Setting to the special value 0 makes all ports of 693 + * this bridge not have a PVID by default, which means that they will 694 + * not accept VLAN-untagged traffic. 695 + * 696 + * @IFLA_BR_PAD 697 + * Bridge attribute padding type for netlink message. 698 + * 699 + * @IFLA_BR_VLAN_STATS_ENABLED 700 + * Enable (*IFLA_BR_VLAN_STATS_ENABLED* == 1) or disable 701 + * (*IFLA_BR_VLAN_STATS_ENABLED* == 0) per-VLAN stats accounting. 702 + * 703 + * The default value is 0 (disabled). 704 + * 705 + * @IFLA_BR_MCAST_STATS_ENABLED 706 + * Enable (*IFLA_BR_MCAST_STATS_ENABLED* > 0) or disable 707 + * (*IFLA_BR_MCAST_STATS_ENABLED* == 0) multicast (IGMP/MLD) stats 708 + * accounting. 709 + * 710 + * The default value is 0 (disabled). 711 + * 712 + * @IFLA_BR_MCAST_IGMP_VERSION 713 + * Set the IGMP version. 714 + * 715 + * The valid values are 2 and 3. The default value is 2. 716 + * 717 + * @IFLA_BR_MCAST_MLD_VERSION 718 + * Set the MLD version. 719 + * 720 + * The valid values are 1 and 2. 
The default value is 1. 721 + * 722 + * @IFLA_BR_VLAN_STATS_PER_PORT 723 + * Enable (*IFLA_BR_VLAN_STATS_PER_PORT* == 1) or disable 724 + * (*IFLA_BR_VLAN_STATS_PER_PORT* == 0) per-VLAN per-port stats accounting. 725 + * Can be changed only when there are no port VLANs configured. 726 + * 727 + * The default value is 0 (disabled). 728 + * 729 + * @IFLA_BR_MULTI_BOOLOPT 730 + * The multi_boolopt is used to control new boolean options to avoid adding 731 + * new netlink attributes. You can look at ``enum br_boolopt_id`` for those 732 + * options. 733 + * 734 + * @IFLA_BR_MCAST_QUERIER_STATE 735 + * Bridge mcast querier states, read only. 736 + * 737 + * @IFLA_BR_FDB_N_LEARNED 738 + * The number of dynamically learned FDB entries for the current bridge, 739 + * read only. 740 + * 741 + * @IFLA_BR_FDB_MAX_LEARNED 742 + * Set the number of max dynamically learned FDB entries for the current 743 + * bridge. 744 + */ 465 745 enum { 466 746 IFLA_BR_UNSPEC, 467 747 IFLA_BR_FORWARD_DELAY, ··· 791 511 IFLA_BR_VLAN_STATS_PER_PORT, 792 512 IFLA_BR_MULTI_BOOLOPT, 793 513 IFLA_BR_MCAST_QUERIER_STATE, 514 + IFLA_BR_FDB_N_LEARNED, 515 + IFLA_BR_FDB_MAX_LEARNED, 794 516 __IFLA_BR_MAX, 795 517 }; 796 518 ··· 803 521 __u8 addr[6]; /* ETH_ALEN */ 804 522 }; 805 523 524 + /** 525 + * DOC: Bridge mode enum definition 526 + * 527 + * @BRIDGE_MODE_HAIRPIN 528 + * Controls whether traffic may be sent back out of the port on which it 529 + * was received. This option is also called reflective relay mode, and is 530 + * used to support basic VEPA (Virtual Ethernet Port Aggregator) 531 + * capabilities. By default, this flag is turned off and the bridge will 532 + * not forward traffic back out of the receiving port. 533 + */ 806 534 enum { 807 535 BRIDGE_MODE_UNSPEC, 808 536 BRIDGE_MODE_HAIRPIN, 809 537 }; 810 538 539 + /** 540 + * DOC: Bridge port enum definition 541 + * 542 + * @IFLA_BRPORT_STATE 543 + * The operation state of the port. Here are the valid values. 
544 + * 545 + * * 0 - port is in STP *DISABLED* state. Make this port completely 546 + * inactive for STP. This is also called BPDU filter and could be used 547 + * to disable STP on an untrusted port, like a leaf virtual device. 548 + * The traffic forwarding is also stopped on this port. 549 + * * 1 - port is in STP *LISTENING* state. Only valid if STP is enabled 550 + * on the bridge. In this state the port listens for STP BPDUs and 551 + * drops all other traffic frames. 552 + * * 2 - port is in STP *LEARNING* state. Only valid if STP is enabled on 553 + * the bridge. In this state the port will accept traffic only for the 554 + * purpose of updating MAC address tables. 555 + * * 3 - port is in STP *FORWARDING* state. Port is fully active. 556 + * * 4 - port is in STP *BLOCKING* state. Only valid if STP is enabled on 557 + * the bridge. This state is used during the STP election process. 558 + * In this state, port will only process STP BPDUs. 559 + * 560 + * @IFLA_BRPORT_PRIORITY 561 + * The STP port priority. The valid values are between 0 and 255. 562 + * 563 + * @IFLA_BRPORT_COST 564 + * The STP path cost of the port. The valid values are between 1 and 65535. 565 + * 566 + * @IFLA_BRPORT_MODE 567 + * Set the bridge port mode. See *BRIDGE_MODE_HAIRPIN* for more details. 568 + * 569 + * @IFLA_BRPORT_GUARD 570 + * Controls whether STP BPDUs will be processed by the bridge port. By 571 + * default, the flag is turned off to allow BPDU processing. Turning this 572 + * flag on will disable the bridge port if a STP BPDU packet is received. 573 + * 574 + * If the bridge has Spanning Tree enabled, hostile devices on the network 575 + * may send BPDU on a port and cause network failure. Setting *guard on* 576 + * will detect and stop this by disabling the port. The port will be 577 + * restarted if the link is brought down, or removed and reattached. 578 + * 579 + * @IFLA_BRPORT_PROTECT 580 + * Controls whether a given port is allowed to become a root port or not. 
581 + * Only used when STP is enabled on the bridge. By default the flag is off. 582 + * 583 + * This feature is also called root port guard. If BPDU is received from a 584 + * leaf (edge) port, it should not be elected as root port. This could 585 + * be used if using STP on a bridge and the downstream bridges are not fully 586 + * trusted; this prevents a hostile guest from rerouting traffic. 587 + * 588 + * @IFLA_BRPORT_FAST_LEAVE 589 + * This flag allows the bridge to immediately stop multicast traffic 590 + * forwarding on a port that receives an IGMP Leave message. It is only used 591 + * when IGMP snooping is enabled on the bridge. By default the flag is off. 592 + * 593 + * @IFLA_BRPORT_LEARNING 594 + * Controls whether a given port will learn *source* MAC addresses from 595 + * received traffic or not. Also controls whether dynamic FDB entries 596 + * (which can also be added by software) will be refreshed by incoming 597 + * traffic. By default this flag is on. 598 + * 599 + * @IFLA_BRPORT_UNICAST_FLOOD 600 + * Controls whether unicast traffic for which there is no FDB entry will 601 + * be flooded towards this port. By default this flag is on. 602 + * 603 + * @IFLA_BRPORT_PROXYARP 604 + * Enable proxy ARP on this port. 605 + * 606 + * @IFLA_BRPORT_LEARNING_SYNC 607 + * Controls whether a given port will sync MAC addresses learned on device 608 + * port to bridge FDB. 609 + * 610 + * @IFLA_BRPORT_PROXYARP_WIFI 611 + * Enable proxy ARP on this port which meets extended requirements by 612 + * IEEE 802.11 and Hotspot 2.0 specifications. 
613 + * 614 + * @IFLA_BRPORT_ROOT_ID 615 + * 616 + * @IFLA_BRPORT_BRIDGE_ID 617 + * 618 + * @IFLA_BRPORT_DESIGNATED_PORT 619 + * 620 + * @IFLA_BRPORT_DESIGNATED_COST 621 + * 622 + * @IFLA_BRPORT_ID 623 + * 624 + * @IFLA_BRPORT_NO 625 + * 626 + * @IFLA_BRPORT_TOPOLOGY_CHANGE_ACK 627 + * 628 + * @IFLA_BRPORT_CONFIG_PENDING 629 + * 630 + * @IFLA_BRPORT_MESSAGE_AGE_TIMER 631 + * 632 + * @IFLA_BRPORT_FORWARD_DELAY_TIMER 633 + * 634 + * @IFLA_BRPORT_HOLD_TIMER 635 + * 636 + * @IFLA_BRPORT_FLUSH 637 + * Flush bridge ports' fdb dynamic entries. 638 + * 639 + * @IFLA_BRPORT_MULTICAST_ROUTER 640 + * Configure the port's multicast router presence. A port with 641 + * a multicast router will receive all multicast traffic. 642 + * The valid values are: 643 + * 644 + * * 0 disable multicast routers on this port 645 + * * 1 let the system detect the presence of routers (default) 646 + * * 2 permanently enable multicast traffic forwarding on this port 647 + * * 3 enable multicast routers temporarily on this port, not depending 648 + * on incoming queries. 649 + * 650 + * @IFLA_BRPORT_PAD 651 + * 652 + * @IFLA_BRPORT_MCAST_FLOOD 653 + * Controls whether a given port will flood multicast traffic for which 654 + * there is no MDB entry. By default this flag is on. 655 + * 656 + * @IFLA_BRPORT_MCAST_TO_UCAST 657 + * Controls whether a given port will replicate packets using unicast 658 + * instead of multicast. By default this flag is off. 659 + * 660 + * This is done by copying the packet per host and changing the multicast 661 + * destination MAC to a unicast one accordingly. 662 + * 663 + * *mcast_to_unicast* works on top of the multicast snooping feature of the 664 + * bridge. Which means unicast copies are only delivered to hosts which 665 + * are interested in unicast and signaled this via IGMP/MLD reports previously. 
666 + * 667 + * This feature is intended for interface types which have a more reliable 668 + * and/or efficient way to deliver unicast packets than broadcast ones 669 + * (e.g. WiFi). 670 + * 671 + * However, it should only be enabled on interfaces where no IGMPv2/MLDv1 672 + * report suppression takes place. The IGMP/MLD report suppression issue is 673 + * usually overcome by the network daemon (supplicant) enabling AP isolation 674 + * and by that separating all STAs. 675 + * 676 + * Delivery of STA-to-STA IP multicast is made possible again by enabling 677 + * and utilizing the bridge hairpin mode, which considers the incoming port 678 + * as a potential outgoing port, too (see *BRIDGE_MODE_HAIRPIN* option). 679 + * Hairpin mode is performed after multicast snooping, therefore leading 680 + * to reports only being delivered to STAs running a multicast router. 681 + * 682 + * @IFLA_BRPORT_VLAN_TUNNEL 683 + * Controls whether vlan to tunnel mapping is enabled on the port. 684 + * By default this flag is off. 685 + * 686 + * @IFLA_BRPORT_BCAST_FLOOD 687 + * Controls flooding of broadcast traffic on the given port. By default 688 + * this flag is on. 689 + * 690 + * @IFLA_BRPORT_GROUP_FWD_MASK 691 + * Set the group forward mask. This is a bitmask that is applied to 692 + * decide whether to forward incoming frames destined to link-local 693 + * addresses. The addresses are of the form 01:80:C2:00:00:0X (defaults 694 + * to 0, which means the bridge does not forward any link-local frames 695 + * coming on this port). 696 + * 697 + * @IFLA_BRPORT_NEIGH_SUPPRESS 698 + * Controls whether neighbor discovery (arp and nd) proxy and suppression 699 + * is enabled on the port. By default this flag is off. 700 + * 701 + * @IFLA_BRPORT_ISOLATED 702 + * Controls whether a given port will be isolated, which means it will be 703 + * able to communicate with non-isolated ports only. By default this 704 + * flag is off. 705 + * 706 + * @IFLA_BRPORT_BACKUP_PORT 707 + * Set a backup port. 
If the port loses carrier, all traffic will be 708 + * redirected to the configured backup port. Set the value to 0 to disable 709 + * it. 710 + * 711 + * @IFLA_BRPORT_MRP_RING_OPEN 712 + * 713 + * @IFLA_BRPORT_MRP_IN_OPEN 714 + * 715 + * @IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT 716 + * The per-port limit on the number of EHT hosts. The default value is 512. 717 + * Setting to 0 is not allowed. 718 + * 719 + * @IFLA_BRPORT_MCAST_EHT_HOSTS_CNT 720 + * The current number of tracked hosts, read only. 721 + * 722 + * @IFLA_BRPORT_LOCKED 723 + * Controls whether a port will be locked, meaning that hosts behind the 724 + * port will not be able to communicate through the port unless an FDB 725 + * entry with the unit's MAC address is in the FDB. The common use case is 726 + * that hosts are allowed access through authentication with the IEEE 802.1X 727 + * protocol or based on whitelists. By default this flag is off. 728 + * 729 + * Please note that secure 802.1X deployments should always use the 730 + * *BR_BOOLOPT_NO_LL_LEARN* flag, to not permit the bridge to populate its 731 + * FDB based on link-local (EAPOL) traffic received on the port. 732 + * 733 + * @IFLA_BRPORT_MAB 734 + * Controls whether a port will use MAC Authentication Bypass (MAB), a 735 + * technique through which select MAC addresses may be allowed on a locked 736 + * port, without using 802.1X authentication. Packets with an unknown source 737 + * MAC address generate a "locked" FDB entry on the incoming bridge port. 738 + * The common use case is for user space to react to these bridge FDB 739 + * notifications and optionally replace the locked FDB entry with a normal 740 + * one, allowing traffic to pass for whitelisted MAC addresses. 741 + * 742 + * Setting this flag also requires *IFLA_BRPORT_LOCKED* and 743 + * *IFLA_BRPORT_LEARNING*. 
*IFLA_BRPORT_LOCKED* ensures that unauthorized 744 + * data packets are dropped, and *IFLA_BRPORT_LEARNING* allows the dynamic 745 + * FDB entries installed by user space (as replacements for the locked FDB 746 + * entries) to be refreshed and/or aged out. 747 + * 748 + * @IFLA_BRPORT_MCAST_N_GROUPS 749 + * 750 + * @IFLA_BRPORT_MCAST_MAX_GROUPS 751 + * Sets the maximum number of MDB entries that can be registered for a 752 + * given port. Attempts to register more MDB entries at the port than this 753 + * limit allows will be rejected, whether they are done through netlink 754 + * (e.g. the bridge tool), or IGMP or MLD membership reports. Setting a 755 + * limit of 0 disables the limit. The default value is 0. 756 + * 757 + * @IFLA_BRPORT_NEIGH_VLAN_SUPPRESS 758 + * Controls whether neighbor discovery (arp and nd) proxy and suppression is 759 + * enabled for a given port. By default this flag is off. 760 + * 761 + * Note that this option only takes effect when *IFLA_BRPORT_NEIGH_SUPPRESS* 762 + * is enabled for a given port. 763 + * 764 + * @IFLA_BRPORT_BACKUP_NHID 765 + * The FDB nexthop object ID to attach to packets being redirected to a 766 + * backup port that has VLAN tunnel mapping enabled (via the 767 + * *IFLA_BRPORT_VLAN_TUNNEL* option). Setting a value of 0 (default) has 768 + * the effect of not attaching any ID. 769 + */ 811 770 enum { 812 771 IFLA_BRPORT_UNSPEC, 813 772 IFLA_BRPORT_STATE, /* Spanning tree state */ ··· 1293 770 NETKIT_L3, 1294 771 }; 1295 772 773 + /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to 774 + * the BPF program if attached. This also means the latter can 775 + * consume the two fields if they were populated earlier. 776 + * 777 + * NETKIT_SCRUB_DEFAULT zeroes skb->{mark,priority} fields before 778 + * invoking the attached BPF program when the peer device resides 779 + * in a different network namespace. This is the default behavior. 
780 + */ 781 + enum netkit_scrub { 782 + NETKIT_SCRUB_NONE, 783 + NETKIT_SCRUB_DEFAULT, 784 + }; 785 + 1296 786 enum { 1297 787 IFLA_NETKIT_UNSPEC, 1298 788 IFLA_NETKIT_PEER_INFO, ··· 1313 777 IFLA_NETKIT_POLICY, 1314 778 IFLA_NETKIT_PEER_POLICY, 1315 779 IFLA_NETKIT_MODE, 780 + IFLA_NETKIT_SCRUB, 781 + IFLA_NETKIT_PEER_SCRUB, 1316 782 __IFLA_NETKIT_MAX, 1317 783 }; 1318 784 #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) ··· 1393 855 IFLA_VXLAN_DF, 1394 856 IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ 1395 857 IFLA_VXLAN_LOCALBYPASS, 858 + IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ 1396 859 __IFLA_VXLAN_MAX 1397 860 }; 1398 861 #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) ··· 1409 870 VXLAN_DF_INHERIT, 1410 871 __VXLAN_DF_END, 1411 872 VXLAN_DF_MAX = __VXLAN_DF_END - 1, 873 + }; 874 + 875 + enum ifla_vxlan_label_policy { 876 + VXLAN_LABEL_FIXED = 0, 877 + VXLAN_LABEL_INHERIT = 1, 878 + __VXLAN_LABEL_END, 879 + VXLAN_LABEL_MAX = __VXLAN_LABEL_END - 1, 1412 880 }; 1413 881 1414 882 /* GENEVE section */ ··· 1482 936 IFLA_GTP_ROLE, 1483 937 IFLA_GTP_CREATE_SOCKETS, 1484 938 IFLA_GTP_RESTART_COUNT, 939 + IFLA_GTP_LOCAL, 940 + IFLA_GTP_LOCAL6, 1485 941 __IFLA_GTP_MAX, 1486 942 }; 1487 943 #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) ··· 1789 1241 IFLA_HSR_PROTOCOL, /* Indicate different protocol than 1790 1242 * HSR. For example PRP. 1791 1243 */ 1244 + IFLA_HSR_INTERLINK, /* HSR interlink network device */ 1792 1245 __IFLA_HSR_MAX, 1793 1246 }; 1794 1247 ··· 1967 1418 1968 1419 enum { 1969 1420 IFLA_DSA_UNSPEC, 1970 - IFLA_DSA_MASTER, 1421 + IFLA_DSA_CONDUIT, 1422 + /* Deprecated, use IFLA_DSA_CONDUIT instead */ 1423 + IFLA_DSA_MASTER = IFLA_DSA_CONDUIT, 1971 1424 __IFLA_DSA_MAX, 1972 1425 }; 1973 1426
+1
tools/testing/selftests/bpf/network_helpers.h
··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 #ifndef __NETWORK_HELPERS_H 3 3 #define __NETWORK_HELPERS_H 4 + #include <arpa/inet.h> 4 5 #include <sys/socket.h> 5 6 #include <sys/types.h> 6 7 #include <linux/types.h>
+121
tools/testing/selftests/bpf/prog_tests/mptcp.c
··· 5 5 #include <linux/const.h> 6 6 #include <netinet/in.h> 7 7 #include <test_progs.h> 8 + #include <unistd.h> 8 9 #include "cgroup_helpers.h" 9 10 #include "network_helpers.h" 10 11 #include "mptcp_sock.skel.h" 11 12 #include "mptcpify.skel.h" 13 + #include "mptcp_subflow.skel.h" 12 14 13 15 #define NS_TEST "mptcp_ns" 16 + #define ADDR_1 "10.0.1.1" 17 + #define ADDR_2 "10.0.1.2" 18 + #define PORT_1 10001 14 19 15 20 #ifndef IPPROTO_MPTCP 16 21 #define IPPROTO_MPTCP 262 ··· 340 335 close(cgroup_fd); 341 336 } 342 337 338 + static int endpoint_init(char *flags) 339 + { 340 + SYS(fail, "ip -net %s link add veth1 type veth peer name veth2", NS_TEST); 341 + SYS(fail, "ip -net %s addr add %s/24 dev veth1", NS_TEST, ADDR_1); 342 + SYS(fail, "ip -net %s link set dev veth1 up", NS_TEST); 343 + SYS(fail, "ip -net %s addr add %s/24 dev veth2", NS_TEST, ADDR_2); 344 + SYS(fail, "ip -net %s link set dev veth2 up", NS_TEST); 345 + if (SYS_NOFAIL("ip -net %s mptcp endpoint add %s %s", NS_TEST, ADDR_2, flags)) { 346 + printf("'ip mptcp' not supported, skip this test.\n"); 347 + test__skip(); 348 + goto fail; 349 + } 350 + 351 + return 0; 352 + fail: 353 + return -1; 354 + } 355 + 356 + static void wait_for_new_subflows(int fd) 357 + { 358 + socklen_t len; 359 + u8 subflows; 360 + int err, i; 361 + 362 + len = sizeof(subflows); 363 + /* Wait max 5 sec for new subflows to be created */ 364 + for (i = 0; i < 50; i++) { 365 + err = getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &subflows, &len); 366 + if (!err && subflows > 0) 367 + break; 368 + 369 + usleep(100000); /* 0.1s */ 370 + } 371 + } 372 + 373 + static void run_subflow(void) 374 + { 375 + int server_fd, client_fd, err; 376 + char new[TCP_CA_NAME_MAX]; 377 + char cc[TCP_CA_NAME_MAX]; 378 + unsigned int mark; 379 + socklen_t len; 380 + 381 + server_fd = start_mptcp_server(AF_INET, ADDR_1, PORT_1, 0); 382 + if (!ASSERT_OK_FD(server_fd, "start_mptcp_server")) 383 + return; 384 + 385 + client_fd = connect_to_fd(server_fd, 0); 386 + if 
(!ASSERT_OK_FD(client_fd, "connect_to_fd")) 387 + goto close_server; 388 + 389 + send_byte(client_fd); 390 + wait_for_new_subflows(client_fd); 391 + 392 + len = sizeof(mark); 393 + err = getsockopt(client_fd, SOL_SOCKET, SO_MARK, &mark, &len); 394 + if (ASSERT_OK(err, "getsockopt(client_fd, SO_MARK)")) 395 + ASSERT_EQ(mark, 0, "mark"); 396 + 397 + len = sizeof(new); 398 + err = getsockopt(client_fd, SOL_TCP, TCP_CONGESTION, new, &len); 399 + if (ASSERT_OK(err, "getsockopt(client_fd, TCP_CONGESTION)")) { 400 + get_msk_ca_name(cc); 401 + ASSERT_STREQ(new, cc, "cc"); 402 + } 403 + 404 + close(client_fd); 405 + close_server: 406 + close(server_fd); 407 + } 408 + 409 + static void test_subflow(void) 410 + { 411 + struct mptcp_subflow *skel; 412 + struct nstoken *nstoken; 413 + int cgroup_fd; 414 + 415 + cgroup_fd = test__join_cgroup("/mptcp_subflow"); 416 + if (!ASSERT_OK_FD(cgroup_fd, "join_cgroup: mptcp_subflow")) 417 + return; 418 + 419 + skel = mptcp_subflow__open_and_load(); 420 + if (!ASSERT_OK_PTR(skel, "skel_open_load: mptcp_subflow")) 421 + goto close_cgroup; 422 + 423 + skel->bss->pid = getpid(); 424 + 425 + skel->links.mptcp_subflow = 426 + bpf_program__attach_cgroup(skel->progs.mptcp_subflow, cgroup_fd); 427 + if (!ASSERT_OK_PTR(skel->links.mptcp_subflow, "attach mptcp_subflow")) 428 + goto skel_destroy; 429 + 430 + skel->links._getsockopt_subflow = 431 + bpf_program__attach_cgroup(skel->progs._getsockopt_subflow, cgroup_fd); 432 + if (!ASSERT_OK_PTR(skel->links._getsockopt_subflow, "attach _getsockopt_subflow")) 433 + goto skel_destroy; 434 + 435 + nstoken = create_netns(); 436 + if (!ASSERT_OK_PTR(nstoken, "create_netns: mptcp_subflow")) 437 + goto skel_destroy; 438 + 439 + if (endpoint_init("subflow") < 0) 440 + goto close_netns; 441 + 442 + run_subflow(); 443 + 444 + close_netns: 445 + cleanup_netns(nstoken); 446 + skel_destroy: 447 + mptcp_subflow__destroy(skel); 448 + close_cgroup: 449 + close(cgroup_fd); 450 + } 451 + 343 452 void test_mptcp(void) 344 
453 { 345 454 if (test__start_subtest("base")) 346 455 test_base(); 347 456 if (test__start_subtest("mptcpify")) 348 457 test_mptcpify(); 458 + if (test__start_subtest("subflow")) 459 + test_subflow(); 349 460 }
+22 -7
tools/testing/selftests/bpf/prog_tests/netns_cookie.c
··· 8 8 #define SO_NETNS_COOKIE 71 9 9 #endif 10 10 11 + #define loopback 1 12 + 11 13 static int duration; 12 14 13 15 void test_netns_cookie(void) 14 16 { 17 + LIBBPF_OPTS(bpf_prog_attach_opts, opta); 18 + LIBBPF_OPTS(bpf_prog_detach_opts, optd); 15 19 int server_fd = -1, client_fd = -1, cgroup_fd = -1; 16 - int err, val, ret, map, verdict; 20 + int err, val, ret, map, verdict, tc_fd; 17 21 struct netns_cookie_prog *skel; 18 22 uint64_t cookie_expected_value; 19 23 socklen_t vallen = sizeof(cookie_expected_value); ··· 42 38 if (!ASSERT_OK(err, "prog_attach")) 43 39 goto done; 44 40 41 + tc_fd = bpf_program__fd(skel->progs.get_netns_cookie_tcx); 42 + err = bpf_prog_attach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &opta); 43 + if (!ASSERT_OK(err, "prog_attach")) 44 + goto done; 45 + 45 46 server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); 46 47 if (CHECK(server_fd < 0, "start_server", "errno %d\n", errno)) 47 - goto done; 48 + goto cleanup_tc; 48 49 49 50 client_fd = connect_to_fd(server_fd, 0); 50 51 if (CHECK(client_fd < 0, "connect_to_fd", "errno %d\n", errno)) 51 - goto done; 52 + goto cleanup_tc; 52 53 53 54 ret = send(client_fd, send_msg, sizeof(send_msg), 0); 54 55 if (CHECK(ret != sizeof(send_msg), "send(msg)", "ret:%d\n", ret)) 55 - goto done; 56 + goto cleanup_tc; 56 57 57 58 err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sockops_netns_cookies), 58 59 &client_fd, &val); 59 60 if (!ASSERT_OK(err, "map_lookup(sockops_netns_cookies)")) 60 - goto done; 61 + goto cleanup_tc; 61 62 62 63 err = getsockopt(client_fd, SOL_SOCKET, SO_NETNS_COOKIE, 63 64 &cookie_expected_value, &vallen); 64 65 if (!ASSERT_OK(err, "getsockopt")) 65 - goto done; 66 + goto cleanup_tc; 66 67 67 68 ASSERT_EQ(val, cookie_expected_value, "cookie_value"); 68 69 69 70 err = bpf_map_lookup_elem(bpf_map__fd(skel->maps.sk_msg_netns_cookies), 70 71 &client_fd, &val); 71 72 if (!ASSERT_OK(err, "map_lookup(sk_msg_netns_cookies)")) 72 - goto done; 73 + goto cleanup_tc; 73 74 74 75 
ASSERT_EQ(val, cookie_expected_value, "cookie_value"); 76 + ASSERT_EQ(skel->bss->tcx_init_netns_cookie, cookie_expected_value, "cookie_value"); 77 + ASSERT_EQ(skel->bss->tcx_netns_cookie, cookie_expected_value, "cookie_value"); 78 + 79 + cleanup_tc: 80 + err = bpf_prog_detach_opts(tc_fd, loopback, BPF_TCX_INGRESS, &optd); 81 + ASSERT_OK(err, "prog_detach"); 75 82 76 83 done: 77 84 if (server_fd != -1)
+85 -9
tools/testing/selftests/bpf/prog_tests/tc_netkit.c
··· 14 14 #include "netlink_helpers.h" 15 15 #include "tc_helpers.h" 16 16 17 - #define ICMP_ECHO 8 17 + #define MARK 42 18 + #define PRIO 0xeb9f 19 + #define ICMP_ECHO 8 18 20 19 21 struct icmphdr { 20 22 __u8 type; ··· 35 33 }; 36 34 37 35 static int create_netkit(int mode, int policy, int peer_policy, int *ifindex, 38 - bool same_netns) 36 + bool same_netns, int scrub, int peer_scrub) 39 37 { 40 38 struct rtnl_handle rth = { .fd = -1 }; 41 39 struct iplink_req req = {}; ··· 60 58 data = addattr_nest(&req.n, sizeof(req), IFLA_INFO_DATA); 61 59 addattr32(&req.n, sizeof(req), IFLA_NETKIT_POLICY, policy); 62 60 addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_POLICY, peer_policy); 61 + addattr32(&req.n, sizeof(req), IFLA_NETKIT_SCRUB, scrub); 62 + addattr32(&req.n, sizeof(req), IFLA_NETKIT_PEER_SCRUB, peer_scrub); 63 63 addattr32(&req.n, sizeof(req), IFLA_NETKIT_MODE, mode); 64 64 addattr_nest_end(&req.n, data); 65 65 addattr_nest_end(&req.n, linkinfo); ··· 122 118 123 119 static int __send_icmp(__u32 dest) 124 120 { 121 + int sock, ret, mark = MARK, prio = PRIO; 125 122 struct sockaddr_in addr; 126 123 struct icmphdr icmp; 127 - int sock, ret; 128 124 129 125 ret = write_sysctl("/proc/sys/net/ipv4/ping_group_range", "0 0"); 130 126 if (!ASSERT_OK(ret, "write_sysctl(net.ipv4.ping_group_range)")) ··· 137 133 ret = setsockopt(sock, SOL_SOCKET, SO_BINDTODEVICE, 138 134 netkit_name, strlen(netkit_name) + 1); 139 135 if (!ASSERT_OK(ret, "setsockopt(SO_BINDTODEVICE)")) 136 + goto out; 137 + 138 + ret = setsockopt(sock, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); 139 + if (!ASSERT_OK(ret, "setsockopt(SO_MARK)")) 140 + goto out; 141 + 142 + ret = setsockopt(sock, SOL_SOCKET, SO_PRIORITY, 143 + &prio, sizeof(prio)); 144 + if (!ASSERT_OK(ret, "setsockopt(SO_PRIORITY)")) 140 145 goto out; 141 146 142 147 memset(&addr, 0, sizeof(addr)); ··· 184 171 int err, ifindex; 185 172 186 173 err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, 187 - &ifindex, false); 174 + &ifindex, 
false, NETKIT_SCRUB_DEFAULT, 175 + NETKIT_SCRUB_DEFAULT); 188 176 if (err) 189 177 return; 190 178 ··· 299 285 int err, ifindex; 300 286 301 287 err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, 302 - &ifindex, false); 288 + &ifindex, false, NETKIT_SCRUB_DEFAULT, 289 + NETKIT_SCRUB_DEFAULT); 303 290 if (err) 304 291 return; 305 292 ··· 428 413 int err, ifindex; 429 414 430 415 err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, 431 - &ifindex, false); 416 + &ifindex, false, NETKIT_SCRUB_DEFAULT, 417 + NETKIT_SCRUB_DEFAULT); 432 418 if (err) 433 419 return; 434 420 ··· 543 527 int err, ifindex, ifindex2; 544 528 545 529 err = create_netkit(NETKIT_L3, NETKIT_PASS, NETKIT_PASS, 546 - &ifindex, true); 530 + &ifindex, true, NETKIT_SCRUB_DEFAULT, 531 + NETKIT_SCRUB_DEFAULT); 547 532 if (err) 548 533 return; 549 534 ··· 655 638 int err, ifindex; 656 639 657 640 err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, 658 - &ifindex, false); 641 + &ifindex, false, NETKIT_SCRUB_DEFAULT, 642 + NETKIT_SCRUB_DEFAULT); 659 643 if (err) 660 644 return; 661 645 ··· 733 715 struct bpf_link *link; 734 716 735 717 err = create_netkit(mode, NETKIT_PASS, NETKIT_PASS, 736 - &ifindex, true); 718 + &ifindex, true, NETKIT_SCRUB_DEFAULT, 719 + NETKIT_SCRUB_DEFAULT); 737 720 if (err) 738 721 return; 739 722 ··· 797 778 { 798 779 serial_test_tc_netkit_pkt_type_mode(NETKIT_L2); 799 780 serial_test_tc_netkit_pkt_type_mode(NETKIT_L3); 781 + } 782 + 783 + static void serial_test_tc_netkit_scrub_type(int scrub) 784 + { 785 + LIBBPF_OPTS(bpf_netkit_opts, optl); 786 + struct test_tc_link *skel; 787 + struct bpf_link *link; 788 + int err, ifindex; 789 + 790 + err = create_netkit(NETKIT_L2, NETKIT_PASS, NETKIT_PASS, 791 + &ifindex, false, scrub, scrub); 792 + if (err) 793 + return; 794 + 795 + skel = test_tc_link__open(); 796 + if (!ASSERT_OK_PTR(skel, "skel_open")) 797 + goto cleanup; 798 + 799 + ASSERT_EQ(bpf_program__set_expected_attach_type(skel->progs.tc8, 800 + BPF_NETKIT_PRIMARY), 0, 
"tc8_attach_type"); 801 + 802 + err = test_tc_link__load(skel); 803 + if (!ASSERT_OK(err, "skel_load")) 804 + goto cleanup; 805 + 806 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); 807 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); 808 + 809 + ASSERT_EQ(skel->bss->seen_tc8, false, "seen_tc8"); 810 + 811 + link = bpf_program__attach_netkit(skel->progs.tc8, ifindex, &optl); 812 + if (!ASSERT_OK_PTR(link, "link_attach")) 813 + goto cleanup; 814 + 815 + skel->links.tc8 = link; 816 + 817 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 1); 818 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); 819 + 820 + tc_skel_reset_all_seen(skel); 821 + ASSERT_EQ(send_icmp(), 0, "icmp_pkt"); 822 + 823 + ASSERT_EQ(skel->bss->seen_tc8, true, "seen_tc8"); 824 + ASSERT_EQ(skel->bss->mark, scrub == NETKIT_SCRUB_NONE ? MARK : 0, "mark"); 825 + ASSERT_EQ(skel->bss->prio, scrub == NETKIT_SCRUB_NONE ? PRIO : 0, "prio"); 826 + cleanup: 827 + test_tc_link__destroy(skel); 828 + 829 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PRIMARY, 0); 830 + assert_mprog_count_ifindex(ifindex, BPF_NETKIT_PEER, 0); 831 + destroy_netkit(); 832 + } 833 + 834 + void serial_test_tc_netkit_scrub(void) 835 + { 836 + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_DEFAULT); 837 + serial_test_tc_netkit_scrub_type(NETKIT_SCRUB_NONE); 800 838 }
+36 -8
tools/testing/selftests/bpf/prog_tests/xdp_cpumap_attach.c
··· 2 2 #include <uapi/linux/bpf.h> 3 3 #include <linux/if_link.h> 4 4 #include <test_progs.h> 5 + #include <network_helpers.h> 5 6 6 7 #include "test_xdp_with_cpumap_frags_helpers.skel.h" 7 8 #include "test_xdp_with_cpumap_helpers.skel.h" 8 9 9 10 #define IFINDEX_LO 1 11 + #define TEST_NS "cpu_attach_ns" 10 12 11 13 static void test_xdp_with_cpumap_helpers(void) 12 14 { 13 - struct test_xdp_with_cpumap_helpers *skel; 15 + struct test_xdp_with_cpumap_helpers *skel = NULL; 14 16 struct bpf_prog_info info = {}; 15 17 __u32 len = sizeof(info); 16 18 struct bpf_cpumap_val val = { 17 19 .qsize = 192, 18 20 }; 19 - int err, prog_fd, map_fd; 21 + int err, prog_fd, prog_redir_fd, map_fd; 22 + struct nstoken *nstoken = NULL; 20 23 __u32 idx = 0; 24 + 25 + SYS(out_close, "ip netns add %s", TEST_NS); 26 + nstoken = open_netns(TEST_NS); 27 + if (!ASSERT_OK_PTR(nstoken, "open_netns")) 28 + goto out_close; 29 + SYS(out_close, "ip link set dev lo up"); 21 30 22 31 skel = test_xdp_with_cpumap_helpers__open_and_load(); 23 32 if (!ASSERT_OK_PTR(skel, "test_xdp_with_cpumap_helpers__open_and_load")) 24 33 return; 25 34 26 - prog_fd = bpf_program__fd(skel->progs.xdp_redir_prog); 27 - err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL); 35 + prog_redir_fd = bpf_program__fd(skel->progs.xdp_redir_prog); 36 + err = bpf_xdp_attach(IFINDEX_LO, prog_redir_fd, XDP_FLAGS_SKB_MODE, NULL); 28 37 if (!ASSERT_OK(err, "Generic attach of program with 8-byte CPUMAP")) 29 38 goto out_close; 30 - 31 - err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL); 32 - ASSERT_OK(err, "XDP program detach"); 33 39 34 40 prog_fd = bpf_program__fd(skel->progs.xdp_dummy_cm); 35 41 map_fd = bpf_map__fd(skel->maps.cpu_map); ··· 50 44 err = bpf_map_lookup_elem(map_fd, &idx, &val); 51 45 ASSERT_OK(err, "Read cpumap entry"); 52 46 ASSERT_EQ(info.id, val.bpf_prog.id, "Match program id to cpumap entry prog_id"); 47 + 48 + /* send a packet to trigger any potential bugs in there */ 49 + char data[10] = 
{}; 50 + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, 51 + .data_in = &data, 52 + .data_size_in = 10, 53 + .flags = BPF_F_TEST_XDP_LIVE_FRAMES, 54 + .repeat = 1, 55 + ); 56 + err = bpf_prog_test_run_opts(prog_redir_fd, &opts); 57 + ASSERT_OK(err, "XDP test run"); 58 + 59 + /* wait for the packets to be flushed, then check that redirect has been 60 + * performed 61 + */ 62 + kern_sync_rcu(); 63 + ASSERT_NEQ(skel->bss->redirect_count, 0, "redirected packets"); 64 + 65 + err = bpf_xdp_detach(IFINDEX_LO, XDP_FLAGS_SKB_MODE, NULL); 66 + ASSERT_OK(err, "XDP program detach"); 53 67 54 68 /* can not attach BPF_XDP_CPUMAP program to a device */ 55 69 err = bpf_xdp_attach(IFINDEX_LO, prog_fd, XDP_FLAGS_SKB_MODE, NULL); ··· 91 65 ASSERT_NEQ(err, 0, "Add BPF_XDP program with frags to cpumap entry"); 92 66 93 67 out_close: 68 + close_netns(nstoken); 69 + SYS_NOFAIL("ip netns del %s", TEST_NS); 94 70 test_xdp_with_cpumap_helpers__destroy(skel); 95 71 } 96 72 ··· 139 111 test_xdp_with_cpumap_frags_helpers__destroy(skel); 140 112 } 141 113 142 - void serial_test_xdp_cpumap_attach(void) 114 + void test_xdp_cpumap_attach(void) 143 115 { 144 116 if (test__start_subtest("CPUMAP with programs in entries")) 145 117 test_xdp_with_cpumap_helpers();
+42
tools/testing/selftests/bpf/progs/mptcp_bpf.h
··· 1 + /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ 2 + #ifndef __MPTCP_BPF_H__ 3 + #define __MPTCP_BPF_H__ 4 + 5 + #include "bpf_experimental.h" 6 + 7 + /* list helpers from include/linux/list.h */ 8 + static inline int list_is_head(const struct list_head *list, 9 + const struct list_head *head) 10 + { 11 + return list == head; 12 + } 13 + 14 + #define list_entry(ptr, type, member) \ 15 + container_of(ptr, type, member) 16 + 17 + #define list_first_entry(ptr, type, member) \ 18 + list_entry((ptr)->next, type, member) 19 + 20 + #define list_next_entry(pos, member) \ 21 + list_entry((pos)->member.next, typeof(*(pos)), member) 22 + 23 + #define list_entry_is_head(pos, head, member) \ 24 + list_is_head(&pos->member, (head)) 25 + 26 + /* small difference: 'can_loop' has been added in the conditions */ 27 + #define list_for_each_entry(pos, head, member) \ 28 + for (pos = list_first_entry(head, typeof(*pos), member); \ 29 + !list_entry_is_head(pos, head, member) && can_loop; \ 30 + pos = list_next_entry(pos, member)) 31 + 32 + /* mptcp helpers from protocol.h */ 33 + #define mptcp_for_each_subflow(__msk, __subflow) \ 34 + list_for_each_entry(__subflow, &((__msk)->conn_list), node) 35 + 36 + static __always_inline struct sock * 37 + mptcp_subflow_tcp_sock(const struct mptcp_subflow_context *subflow) 38 + { 39 + return subflow->tcp_sock; 40 + } 41 + 42 + #endif
+128
tools/testing/selftests/bpf/progs/mptcp_subflow.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright (c) 2020, Tessares SA. */ 3 + /* Copyright (c) 2024, Kylin Software */ 4 + 5 + /* vmlinux.h, bpf_helpers.h and other 'define' */ 6 + #include "bpf_tracing_net.h" 7 + #include "mptcp_bpf.h" 8 + 9 + char _license[] SEC("license") = "GPL"; 10 + 11 + char cc[TCP_CA_NAME_MAX] = "reno"; 12 + int pid; 13 + 14 + /* Associate a subflow counter to each token */ 15 + struct { 16 + __uint(type, BPF_MAP_TYPE_HASH); 17 + __uint(key_size, sizeof(__u32)); 18 + __uint(value_size, sizeof(__u32)); 19 + __uint(max_entries, 100); 20 + } mptcp_sf SEC(".maps"); 21 + 22 + SEC("sockops") 23 + int mptcp_subflow(struct bpf_sock_ops *skops) 24 + { 25 + __u32 init = 1, key, mark, *cnt; 26 + struct mptcp_sock *msk; 27 + struct bpf_sock *sk; 28 + int err; 29 + 30 + if (skops->op != BPF_SOCK_OPS_TCP_CONNECT_CB) 31 + return 1; 32 + 33 + sk = skops->sk; 34 + if (!sk) 35 + return 1; 36 + 37 + msk = bpf_skc_to_mptcp_sock(sk); 38 + if (!msk) 39 + return 1; 40 + 41 + key = msk->token; 42 + cnt = bpf_map_lookup_elem(&mptcp_sf, &key); 43 + if (cnt) { 44 + /* A new subflow is added to an existing MPTCP connection */ 45 + __sync_fetch_and_add(cnt, 1); 46 + mark = *cnt; 47 + } else { 48 + /* A new MPTCP connection is just initiated and this is its primary subflow */ 49 + bpf_map_update_elem(&mptcp_sf, &key, &init, BPF_ANY); 50 + mark = init; 51 + } 52 + 53 + /* Set the mark of the subflow's socket based on appearance order */ 54 + err = bpf_setsockopt(skops, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)); 55 + if (err < 0) 56 + return 1; 57 + if (mark == 2) 58 + err = bpf_setsockopt(skops, SOL_TCP, TCP_CONGESTION, cc, TCP_CA_NAME_MAX); 59 + 60 + return 1; 61 + } 62 + 63 + static int _check_getsockopt_subflow_mark(struct mptcp_sock *msk, struct bpf_sockopt *ctx) 64 + { 65 + struct mptcp_subflow_context *subflow; 66 + int i = 0; 67 + 68 + mptcp_for_each_subflow(msk, subflow) { 69 + struct sock *ssk; 70 + 71 + ssk = 
mptcp_subflow_tcp_sock(bpf_core_cast(subflow, 72 + struct mptcp_subflow_context)); 73 + 74 + if (ssk->sk_mark != ++i) { 75 + ctx->retval = -2; 76 + break; 77 + } 78 + } 79 + 80 + return 1; 81 + } 82 + 83 + static int _check_getsockopt_subflow_cc(struct mptcp_sock *msk, struct bpf_sockopt *ctx) 84 + { 85 + struct mptcp_subflow_context *subflow; 86 + 87 + mptcp_for_each_subflow(msk, subflow) { 88 + struct inet_connection_sock *icsk; 89 + struct sock *ssk; 90 + 91 + ssk = mptcp_subflow_tcp_sock(bpf_core_cast(subflow, 92 + struct mptcp_subflow_context)); 93 + icsk = bpf_core_cast(ssk, struct inet_connection_sock); 94 + 95 + if (ssk->sk_mark == 2 && 96 + __builtin_memcmp(icsk->icsk_ca_ops->name, cc, TCP_CA_NAME_MAX)) { 97 + ctx->retval = -2; 98 + break; 99 + } 100 + } 101 + 102 + return 1; 103 + } 104 + 105 + SEC("cgroup/getsockopt") 106 + int _getsockopt_subflow(struct bpf_sockopt *ctx) 107 + { 108 + struct bpf_sock *sk = ctx->sk; 109 + struct mptcp_sock *msk; 110 + 111 + if (bpf_get_current_pid_tgid() >> 32 != pid) 112 + return 1; 113 + 114 + if (!sk || sk->protocol != IPPROTO_MPTCP || 115 + (!(ctx->level == SOL_SOCKET && ctx->optname == SO_MARK) && 116 + !(ctx->level == SOL_TCP && ctx->optname == TCP_CONGESTION))) 117 + return 1; 118 + 119 + msk = bpf_core_cast(sk, struct mptcp_sock); 120 + if (msk->pm.subflows != 1) { 121 + ctx->retval = -1; 122 + return 1; 123 + } 124 + 125 + if (ctx->optname == SO_MARK) 126 + return _check_getsockopt_subflow_mark(msk, ctx); 127 + return _check_getsockopt_subflow_cc(msk, ctx); 128 + }
+12
tools/testing/selftests/bpf/progs/test_tc_link.c
··· 18 18 bool seen_tc5; 19 19 bool seen_tc6; 20 20 bool seen_tc7; 21 + bool seen_tc8; 21 22 22 23 bool set_type; 23 24 24 25 bool seen_eth; 25 26 bool seen_host; 26 27 bool seen_mcast; 28 + 29 + int mark, prio; 27 30 28 31 SEC("tc/ingress") 29 32 int tc1(struct __sk_buff *skb) ··· 101 98 } 102 99 out: 103 100 seen_tc7 = true; 101 + return TCX_PASS; 102 + } 103 + 104 + SEC("tc/egress") 105 + int tc8(struct __sk_buff *skb) 106 + { 107 + seen_tc8 = true; 108 + mark = skb->mark; 109 + prio = skb->priority; 104 110 return TCX_PASS; 105 111 }
+6 -1
tools/testing/selftests/bpf/progs/test_xdp_with_cpumap_helpers.c
··· 12 12 __uint(max_entries, 4); 13 13 } cpu_map SEC(".maps"); 14 14 15 + __u32 redirect_count = 0; 16 + 15 17 SEC("xdp") 16 18 int xdp_redir_prog(struct xdp_md *ctx) 17 19 { 18 - return bpf_redirect_map(&cpu_map, 1, 0); 20 + return bpf_redirect_map(&cpu_map, 0, 0); 19 21 } 20 22 21 23 SEC("xdp") ··· 29 27 SEC("xdp/cpumap") 30 28 int xdp_dummy_cm(struct xdp_md *ctx) 31 29 { 30 + if (bpf_get_smp_processor_id() == 0) 31 + redirect_count++; 32 + 32 33 if (ctx->ingress_ifindex == IFINDEX_LO) 33 34 return XDP_DROP; 34 35