Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'netkit-support-for-io_uring-zero-copy-and-af_xdp'

Daniel Borkmann says:

====================
netkit: Support for io_uring zero-copy and AF_XDP

Containers use virtual netdevs to route traffic from a physical netdev
in the host namespace. They do not have access to the physical netdev
in the host and thus can't use memory providers or AF_XDP that require
reconfiguring/restarting queues in the physical netdev.

This patchset adds the concept of queue leasing to virtual netdevs, which
allows containers to use memory providers and AF_XDP at native speed.
Leased queues are bound to a real queue in a physical netdev and act
as a proxy.

Memory providers and AF_XDP operations take an ifindex and queue id,
so containers would pass in an ifindex for a virtual netdev and a queue
id of a leased queue, which then gets proxied to the underlying real
queue.

We have implemented support for this concept in netkit and tested the
latter against Nvidia ConnectX-6 (mlx5) as well as Broadcom BCM957504
(bnxt_en) 100G NICs. For more details see the individual patches.
====================

Link: https://patch.msgid.link/20260402231031.447597-1-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+2560 -167
+46
Documentation/netlink/specs/netdev.yaml
··· 339 339 doc: XSK information for this queue, if any. 340 340 type: nest 341 341 nested-attributes: xsk-info 342 + - 343 + name: lease 344 + doc: | 345 + A queue from a virtual device can have a lease which refers to 346 + another queue from a physical device. This is useful for memory 347 + providers and AF_XDP operations which take an ifindex and queue id 348 + to allow applications to bind against virtual devices in containers. 349 + type: nest 350 + nested-attributes: lease 342 351 - 343 352 name: qstats 344 353 doc: | ··· 547 538 - 548 539 name: type 549 540 - 541 + name: lease 542 + attributes: 543 + - 544 + name: ifindex 545 + doc: The netdev ifindex to lease the queue from. 546 + type: u32 547 + checks: 548 + min: 1 549 + - 550 + name: queue 551 + doc: The netdev queue to lease from. 552 + type: nest 553 + nested-attributes: queue-id 554 + - 555 + name: netns-id 556 + doc: The network namespace id of the netdev. 557 + type: s32 558 + checks: 559 + min: 0 560 + - 550 561 name: dmabuf 551 562 attributes: 552 563 - ··· 715 686 - dmabuf 716 687 - io-uring 717 688 - xsk 689 + - lease 718 690 dump: 719 691 request: 720 692 attributes: ··· 825 795 - ifindex 826 796 - fd 827 797 reply: 798 + attributes: 799 + - id 800 + - 801 + name: queue-create 802 + doc: | 803 + Create a new queue for the given netdevice. Whether this operation 804 + is supported depends on the device and the driver. 805 + attribute-set: queue 806 + flags: [admin-perm] 807 + do: 808 + request: 809 + attributes: 810 + - ifindex 811 + - type 812 + - lease 813 + reply: &queue-create-op 828 814 attributes: 829 815 - id 830 816
+11
Documentation/netlink/specs/rt-link.yaml
··· 826 826 - name: none 827 827 - name: default 828 828 - 829 + name: netkit-pairing 830 + type: enum 831 + enum-name: netkit-pairing 832 + entries: 833 + - name: pair 834 + - name: single 835 + - 829 836 name: ovpn-mode 830 837 enum-name: ovpn-mode 831 838 name-prefix: ovpn-mode ··· 2306 2299 - 2307 2300 name: tailroom 2308 2301 type: u16 2302 + - 2303 + name: pairing 2304 + type: u32 2305 + enum: netkit-pairing 2309 2306 - 2310 2307 name: linkinfo-ovpn-attrs 2311 2308 name-prefix: ifla-ovpn-
+6
Documentation/networking/netdevices.rst
··· 329 329 to drivers which have ops called under the instance lock as "ops locked". 330 330 See also the documentation of the ``lock`` member of struct net_device. 331 331 332 + There is also a case of taking two per-netdev locks in sequence when netdev 333 + queues are leased, that is, the netdev-scope lock is taken for both the 334 + virtual and the physical device. To prevent deadlocks, the virtual device's 335 + lock must always be acquired before the physical device's (see 336 + ``netdev_nl_queue_create_doit``). 337 + 332 338 In the future, there will be an option for individual 333 339 drivers to opt out of using ``rtnl_lock`` and instead perform their control 334 340 operations directly under the netdev instance lock.
+349 -63
drivers/net/netkit.c
··· 9 9 #include <linux/bpf_mprog.h> 10 10 #include <linux/indirect_call_wrapper.h> 11 11 12 + #include <net/netdev_lock.h> 13 + #include <net/netdev_queues.h> 14 + #include <net/netdev_rx_queue.h> 15 + #include <net/xdp_sock_drv.h> 12 16 #include <net/netkit.h> 13 17 #include <net/dst.h> 14 18 #include <net/tcx.h> 15 19 16 - #define DRV_NAME "netkit" 20 + #define NETKIT_DRV_NAME "netkit" 21 + 22 + #define NETKIT_NUM_RX_QUEUES_MAX 1024 23 + #define NETKIT_NUM_TX_QUEUES_MAX 1 24 + 25 + #define NETKIT_NUM_RX_QUEUES_REAL 1 26 + #define NETKIT_NUM_TX_QUEUES_REAL 1 17 27 18 28 struct netkit { 19 29 __cacheline_group_begin(netkit_fastpath); ··· 36 26 37 27 __cacheline_group_begin(netkit_slowpath); 38 28 enum netkit_mode mode; 29 + enum netkit_pairing pair; 39 30 bool primary; 40 31 u32 headroom; 41 32 __cacheline_group_end(netkit_slowpath); ··· 46 35 struct bpf_link link; 47 36 struct net_device *dev; 48 37 }; 38 + 39 + static struct rtnl_link_ops netkit_link_ops; 49 40 50 41 static __always_inline int 51 42 netkit_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, ··· 148 135 struct netkit *nk = netkit_priv(dev); 149 136 struct net_device *peer = rtnl_dereference(nk->peer); 150 137 138 + if (nk->pair == NETKIT_DEVICE_SINGLE) { 139 + netif_carrier_on(dev); 140 + return 0; 141 + } 151 142 if (!peer) 152 143 return -ENOTCONN; 153 144 if (peer->flags & IFF_UP) { ··· 211 194 212 195 rcu_read_lock(); 213 196 peer = rcu_dereference(nk->peer); 214 - if (unlikely(!peer)) 215 - goto out; 197 + if (!peer) { 198 + nk->headroom = headroom; 199 + dev->needed_headroom = headroom; 200 + } else { 201 + nk2 = netkit_priv(peer); 202 + nk->headroom = headroom; 203 + headroom = max(nk->headroom, nk2->headroom); 216 204 217 - nk2 = netkit_priv(peer); 218 - nk->headroom = headroom; 219 - headroom = max(nk->headroom, nk2->headroom); 220 - 221 - peer->needed_headroom = headroom; 222 - dev->needed_headroom = headroom; 223 - out: 205 + peer->needed_headroom = headroom; 206 + 
dev->needed_headroom = headroom; 207 + } 224 208 rcu_read_unlock(); 225 209 } 226 210 ··· 237 219 stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped); 238 220 } 239 221 222 + static bool netkit_xsk_supported_at_phys(const struct net_device *dev) 223 + { 224 + if (!dev->netdev_ops->ndo_bpf || 225 + !dev->netdev_ops->ndo_xdp_xmit || 226 + !dev->netdev_ops->ndo_xsk_wakeup) 227 + return false; 228 + return true; 229 + } 230 + 231 + static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp) 232 + { 233 + struct netkit *nk = netkit_priv(dev); 234 + struct netdev_bpf xdp_lower; 235 + struct netdev_rx_queue *rxq; 236 + struct net_device *phys; 237 + bool create = false; 238 + int ret = -EBUSY; 239 + 240 + switch (xdp->command) { 241 + case XDP_SETUP_XSK_POOL: 242 + if (nk->pair == NETKIT_DEVICE_PAIR) 243 + return -EOPNOTSUPP; 244 + if (xdp->xsk.queue_id >= dev->real_num_rx_queues) 245 + return -EINVAL; 246 + 247 + rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id); 248 + if (!rxq->lease) 249 + return -EOPNOTSUPP; 250 + 251 + phys = rxq->lease->dev; 252 + if (!netkit_xsk_supported_at_phys(phys)) 253 + return -EOPNOTSUPP; 254 + 255 + create = xdp->xsk.pool; 256 + memcpy(&xdp_lower, xdp, sizeof(xdp_lower)); 257 + xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->lease); 258 + break; 259 + case XDP_SETUP_PROG: 260 + return -EOPNOTSUPP; 261 + default: 262 + return -EINVAL; 263 + } 264 + 265 + netdev_lock(phys); 266 + if (create && 267 + (phys->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) { 268 + ret = -EOPNOTSUPP; 269 + goto out; 270 + } 271 + if (!create || !dev_get_min_mp_channel_count(phys)) 272 + ret = phys->netdev_ops->ndo_bpf(phys, &xdp_lower); 273 + out: 274 + netdev_unlock(phys); 275 + return ret; 276 + } 277 + 278 + static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags) 279 + { 280 + struct netdev_rx_queue *rxq, *rxq_lease; 281 + struct net_device *phys; 282 + 283 + if (queue_id >= dev->real_num_rx_queues) 284 + 
return -EINVAL; 285 + 286 + rxq = __netif_get_rx_queue(dev, queue_id); 287 + rxq_lease = READ_ONCE(rxq->lease); 288 + if (unlikely(!rxq_lease)) 289 + return -EOPNOTSUPP; 290 + 291 + /* netkit_xsk already validated full xsk support, hence it's 292 + * fine to call into ndo_xsk_wakeup right away given this 293 + * was a prerequisite to get here in the first place. The 294 + * phys xsk support cannot change without tearing down the 295 + * device (which clears the lease first). 296 + */ 297 + phys = rxq_lease->dev; 298 + return phys->netdev_ops->ndo_xsk_wakeup(phys, 299 + get_netdev_rx_queue_index(rxq_lease), flags); 300 + } 301 + 302 + static int netkit_init(struct net_device *dev) 303 + { 304 + netdev_lockdep_set_classes(dev); 305 + return 0; 306 + } 307 + 240 308 static void netkit_uninit(struct net_device *dev); 241 309 242 310 static const struct net_device_ops netkit_netdev_ops = { 311 + .ndo_init = netkit_init, 243 312 .ndo_open = netkit_open, 244 313 .ndo_stop = netkit_close, 245 314 .ndo_start_xmit = netkit_xmit, ··· 337 232 .ndo_get_peer_dev = netkit_peer_dev, 338 233 .ndo_get_stats64 = netkit_get_stats, 339 234 .ndo_uninit = netkit_uninit, 235 + .ndo_bpf = netkit_xsk, 236 + .ndo_xsk_wakeup = netkit_xsk_wakeup, 340 237 .ndo_features_check = passthru_features_check, 341 238 }; 342 239 343 240 static void netkit_get_drvinfo(struct net_device *dev, 344 241 struct ethtool_drvinfo *info) 345 242 { 346 - strscpy(info->driver, DRV_NAME, sizeof(info->driver)); 243 + strscpy(info->driver, NETKIT_DRV_NAME, sizeof(info->driver)); 347 244 } 348 245 349 246 static const struct ethtool_ops netkit_ethtool_ops = { 350 247 .get_drvinfo = netkit_get_drvinfo, 351 248 }; 249 + 250 + static int netkit_queue_create(struct net_device *dev, 251 + struct netlink_ext_ack *extack) 252 + { 253 + struct netkit *nk = netkit_priv(dev); 254 + u32 rxq_count_old, rxq_count_new; 255 + int err; 256 + 257 + rxq_count_old = dev->real_num_rx_queues; 258 + rxq_count_new = rxq_count_old + 1; 259 + 
260 + /* In paired mode, only the non-primary (peer) device can 261 + * create leased queues since the primary is the management 262 + * side. In single device mode, leasing is always allowed. 263 + */ 264 + if (nk->pair == NETKIT_DEVICE_PAIR && nk->primary) { 265 + NL_SET_ERR_MSG(extack, 266 + "netkit can only lease against the peer device"); 267 + return -EOPNOTSUPP; 268 + } 269 + 270 + err = netif_set_real_num_rx_queues(dev, rxq_count_new); 271 + if (err) { 272 + if (rxq_count_new > dev->num_rx_queues) 273 + NL_SET_ERR_MSG(extack, 274 + "netkit maximum queue limit reached"); 275 + else 276 + NL_SET_ERR_MSG_FMT(extack, 277 + "netkit cannot create more queues err=%d", err); 278 + return err; 279 + } 280 + 281 + return rxq_count_old; 282 + } 283 + 284 + static const struct netdev_queue_mgmt_ops netkit_queue_mgmt_ops = { 285 + .ndo_queue_create = netkit_queue_create, 286 + }; 287 + 288 + static struct net_device *netkit_alloc(struct nlattr *tb[], 289 + const char *ifname, 290 + unsigned char name_assign_type, 291 + unsigned int num_tx_queues, 292 + unsigned int num_rx_queues) 293 + { 294 + const struct rtnl_link_ops *ops = &netkit_link_ops; 295 + struct net_device *dev; 296 + 297 + if (num_tx_queues > NETKIT_NUM_TX_QUEUES_MAX || 298 + num_rx_queues > NETKIT_NUM_RX_QUEUES_MAX) 299 + return ERR_PTR(-EOPNOTSUPP); 300 + 301 + dev = alloc_netdev_mqs(ops->priv_size, ifname, 302 + name_assign_type, ops->setup, 303 + num_tx_queues, num_rx_queues); 304 + if (dev) { 305 + dev->real_num_tx_queues = NETKIT_NUM_TX_QUEUES_REAL; 306 + dev->real_num_rx_queues = NETKIT_NUM_RX_QUEUES_REAL; 307 + } 308 + return dev; 309 + } 310 + 311 + static void netkit_queue_unlease(struct net_device *dev) 312 + { 313 + struct netdev_rx_queue *rxq, *rxq_lease; 314 + struct net_device *dev_lease; 315 + int i; 316 + 317 + if (dev->real_num_rx_queues == 1) 318 + return; 319 + 320 + netdev_lock(dev); 321 + for (i = 1; i < dev->real_num_rx_queues; i++) { 322 + rxq = __netif_get_rx_queue(dev, i); 323 + 
rxq_lease = rxq->lease; 324 + dev_lease = rxq_lease->dev; 325 + 326 + netdev_lock(dev_lease); 327 + netdev_rx_queue_unlease(rxq, rxq_lease); 328 + netdev_unlock(dev_lease); 329 + } 330 + netdev_unlock(dev); 331 + } 352 332 353 333 static void netkit_setup(struct net_device *dev) 354 334 { ··· 465 275 dev->priv_flags |= IFF_DISABLE_NETPOLL; 466 276 dev->lltx = true; 467 277 468 - dev->ethtool_ops = &netkit_ethtool_ops; 469 - dev->netdev_ops = &netkit_netdev_ops; 278 + dev->netdev_ops = &netkit_netdev_ops; 279 + dev->ethtool_ops = &netkit_ethtool_ops; 280 + dev->queue_mgmt_ops = &netkit_queue_mgmt_ops; 470 281 471 282 dev->features |= netkit_features; 472 283 dev->hw_features = netkit_features; ··· 516 325 return 0; 517 326 } 518 327 519 - static struct rtnl_link_ops netkit_link_ops; 520 - 521 328 static int netkit_new_link(struct net_device *dev, 522 329 struct rtnl_newlink_params *params, 523 330 struct netlink_ext_ack *extack) ··· 524 335 enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT; 525 336 enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT; 526 337 struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr; 338 + enum netkit_pairing pair = NETKIT_DEVICE_PAIR; 527 339 enum netkit_action policy_prim = NETKIT_PASS; 528 340 enum netkit_action policy_peer = NETKIT_PASS; 341 + bool seen_peer = false, seen_scrub = false; 529 342 struct nlattr **data = params->data; 530 343 enum netkit_mode mode = NETKIT_L3; 531 344 unsigned char ifname_assign_type; 532 345 struct nlattr **tb = params->tb; 533 346 u16 headroom = 0, tailroom = 0; 534 347 struct ifinfomsg *ifmp = NULL; 535 - struct net_device *peer; 348 + struct net_device *peer = NULL; 536 349 char ifname[IFNAMSIZ]; 537 350 struct netkit *nk; 538 351 int err; ··· 571 380 headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]); 572 381 if (data[IFLA_NETKIT_TAILROOM]) 573 382 tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]); 383 + if (data[IFLA_NETKIT_PAIRING]) 384 + pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]); 385 + 386 + 
seen_scrub = data[IFLA_NETKIT_SCRUB]; 387 + seen_peer = data[IFLA_NETKIT_PEER_INFO] || 388 + data[IFLA_NETKIT_PEER_SCRUB] || 389 + data[IFLA_NETKIT_PEER_POLICY]; 574 390 } 575 391 576 392 if (ifmp && tbp[IFLA_IFNAME]) { ··· 590 392 if (mode != NETKIT_L2 && 591 393 (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS])) 592 394 return -EOPNOTSUPP; 395 + if (pair == NETKIT_DEVICE_SINGLE && 396 + (tb != tbp || seen_peer || seen_scrub || 397 + policy_prim != NETKIT_PASS)) 398 + return -EOPNOTSUPP; 593 399 594 - peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, 595 - &netkit_link_ops, tbp, extack); 596 - if (IS_ERR(peer)) 597 - return PTR_ERR(peer); 400 + if (pair == NETKIT_DEVICE_PAIR) { 401 + peer = rtnl_create_link(peer_net, ifname, ifname_assign_type, 402 + &netkit_link_ops, tbp, extack); 403 + if (IS_ERR(peer)) 404 + return PTR_ERR(peer); 598 405 599 - netif_inherit_tso_max(peer, dev); 600 - if (headroom) { 601 - peer->needed_headroom = headroom; 602 - dev->needed_headroom = headroom; 406 + netif_inherit_tso_max(peer, dev); 407 + if (headroom) 408 + peer->needed_headroom = headroom; 409 + if (tailroom) 410 + peer->needed_tailroom = tailroom; 411 + if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) 412 + eth_hw_addr_random(peer); 413 + if (ifmp && dev->ifindex) 414 + peer->ifindex = ifmp->ifi_index; 415 + 416 + nk = netkit_priv(peer); 417 + nk->primary = false; 418 + nk->policy = policy_peer; 419 + nk->scrub = scrub_peer; 420 + nk->mode = mode; 421 + nk->pair = pair; 422 + nk->headroom = headroom; 423 + bpf_mprog_bundle_init(&nk->bundle); 424 + 425 + err = register_netdevice(peer); 426 + if (err < 0) 427 + goto err_register_peer; 428 + netif_carrier_off(peer); 429 + if (mode == NETKIT_L2) 430 + dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); 431 + 432 + err = rtnl_configure_link(peer, NULL, 0, NULL); 433 + if (err < 0) 434 + goto err_configure_peer; 603 435 } 604 - if (tailroom) { 605 - peer->needed_tailroom = tailroom; 606 - dev->needed_tailroom = 
tailroom; 607 - } 608 - 609 - if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS])) 610 - eth_hw_addr_random(peer); 611 - if (ifmp && dev->ifindex) 612 - peer->ifindex = ifmp->ifi_index; 613 - 614 - nk = netkit_priv(peer); 615 - nk->primary = false; 616 - nk->policy = policy_peer; 617 - nk->scrub = scrub_peer; 618 - nk->mode = mode; 619 - nk->headroom = headroom; 620 - bpf_mprog_bundle_init(&nk->bundle); 621 - 622 - err = register_netdevice(peer); 623 - if (err < 0) 624 - goto err_register_peer; 625 - netif_carrier_off(peer); 626 - if (mode == NETKIT_L2) 627 - dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL); 628 - 629 - err = rtnl_configure_link(peer, NULL, 0, NULL); 630 - if (err < 0) 631 - goto err_configure_peer; 632 436 633 437 if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS]) 634 438 eth_hw_addr_random(dev); ··· 638 438 nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); 639 439 else 640 440 strscpy(dev->name, "nk%d", IFNAMSIZ); 441 + if (headroom) 442 + dev->needed_headroom = headroom; 443 + if (tailroom) 444 + dev->needed_tailroom = tailroom; 641 445 642 446 nk = netkit_priv(dev); 643 447 nk->primary = true; 644 448 nk->policy = policy_prim; 645 449 nk->scrub = scrub_prim; 646 450 nk->mode = mode; 451 + nk->pair = pair; 647 452 nk->headroom = headroom; 648 453 bpf_mprog_bundle_init(&nk->bundle); 454 + 455 + if (pair == NETKIT_DEVICE_SINGLE) 456 + xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK); 649 457 650 458 err = register_netdevice(dev); 651 459 if (err < 0) ··· 663 455 dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL); 664 456 665 457 rcu_assign_pointer(netkit_priv(dev)->peer, peer); 666 - rcu_assign_pointer(netkit_priv(peer)->peer, dev); 458 + if (peer) 459 + rcu_assign_pointer(netkit_priv(peer)->peer, dev); 667 460 return 0; 668 461 err_configure_peer: 669 - unregister_netdevice(peer); 462 + if (peer) 463 + unregister_netdevice(peer); 670 464 return err; 671 465 err_register_peer: 672 466 free_netdev(peer); ··· 728 518 nk = netkit_priv(dev); 729 519 
if (!nk->primary) 730 520 return ERR_PTR(-EACCES); 521 + if (nk->pair == NETKIT_DEVICE_SINGLE) 522 + return ERR_PTR(-EOPNOTSUPP); 731 523 if (which == BPF_NETKIT_PEER) { 732 524 dev = rcu_dereference_rtnl(nk->peer); 733 525 if (!dev) ··· 1056 844 static void netkit_uninit(struct net_device *dev) 1057 845 { 1058 846 netkit_release_all(dev); 847 + netkit_queue_unlease(dev); 1059 848 } 1060 849 1061 850 static void netkit_del_link(struct net_device *dev, struct list_head *head) ··· 1069 856 if (peer) { 1070 857 nk = netkit_priv(peer); 1071 858 RCU_INIT_POINTER(nk->peer, NULL); 1072 - unregister_netdevice_queue(peer, head); 859 + /* Guard against the peer already being in an unregister 860 + * list (e.g. same-namespace teardown where the peer is 861 + * in the caller's dev_kill_list). list_move_tail() on an 862 + * already-queued device would otherwise corrupt that 863 + * list's iteration. This situation can occur via netkit 864 + * notifier, hence guard against this scenario. 865 + */ 866 + if (!unregister_netdevice_queued(peer)) 867 + unregister_netdevice_queue(peer, head); 1073 868 } 1074 869 } 1075 870 ··· 1100 879 { IFLA_NETKIT_PEER_INFO, "peer info" }, 1101 880 { IFLA_NETKIT_HEADROOM, "headroom" }, 1102 881 { IFLA_NETKIT_TAILROOM, "tailroom" }, 882 + { IFLA_NETKIT_PAIRING, "pairing" }, 1103 883 }; 1104 884 1105 885 if (!nk->primary) { ··· 1120 898 } 1121 899 1122 900 if (data[IFLA_NETKIT_POLICY]) { 901 + err = -EOPNOTSUPP; 1123 902 attr = data[IFLA_NETKIT_POLICY]; 1124 903 policy = nla_get_u32(attr); 1125 - err = netkit_check_policy(policy, attr, extack); 904 + if (nk->pair == NETKIT_DEVICE_PAIR) 905 + err = netkit_check_policy(policy, attr, extack); 1126 906 if (err) 1127 907 return err; 1128 908 WRITE_ONCE(nk->policy, policy); ··· 1145 921 return 0; 1146 922 } 1147 923 924 + static void netkit_check_lease_unregister(struct net_device *dev) 925 + { 926 + LIST_HEAD(list_kill); 927 + u32 q_idx; 928 + 929 + if (READ_ONCE(dev->reg_state) != NETREG_UNREGISTERING || 
930 + !dev->dev.parent) 931 + return; 932 + 933 + netdev_lock_ops(dev); 934 + for (q_idx = 0; q_idx < dev->real_num_rx_queues; q_idx++) { 935 + struct net_device *tmp = dev; 936 + struct netdev_rx_queue *rxq; 937 + u32 tmp_q_idx = q_idx; 938 + 939 + rxq = __netif_get_rx_queue_lease(&tmp, &tmp_q_idx, 940 + NETIF_PHYS_TO_VIRT); 941 + if (rxq && tmp != dev && 942 + tmp->netdev_ops == &netkit_netdev_ops) { 943 + /* A single phys device can have multiple queues leased 944 + * to one netkit device. We can only queue that netkit 945 + * device once to the list_kill. Queues of that phys 946 + * device can be leased with different individual netkit 947 + * devices, hence we batch via list_kill. 948 + */ 949 + if (unregister_netdevice_queued(tmp)) 950 + continue; 951 + netkit_del_link(tmp, &list_kill); 952 + } 953 + } 954 + netdev_unlock_ops(dev); 955 + unregister_netdevice_many(&list_kill); 956 + } 957 + 958 + static int netkit_notifier(struct notifier_block *this, 959 + unsigned long event, void *ptr) 960 + { 961 + struct net_device *dev = netdev_notifier_info_to_dev(ptr); 962 + 963 + if (event == NETDEV_UNREGISTER) 964 + netkit_check_lease_unregister(dev); 965 + return NOTIFY_DONE; 966 + } 967 + 1148 968 static size_t netkit_get_size(const struct net_device *dev) 1149 969 { 1150 970 return nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_POLICY */ ··· 1199 931 nla_total_size(sizeof(u8)) + /* IFLA_NETKIT_PRIMARY */ 1200 932 nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */ 1201 933 nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */ 934 + nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */ 1202 935 0; 1203 936 } 1204 937 ··· 1219 950 if (nla_put_u16(skb, IFLA_NETKIT_HEADROOM, dev->needed_headroom)) 1220 951 return -EMSGSIZE; 1221 952 if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom)) 953 + return -EMSGSIZE; 954 + if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair)) 1222 955 return -EMSGSIZE; 1223 956 1224 957 if (peer) { ··· 1243 972 
[IFLA_NETKIT_TAILROOM] = { .type = NLA_U16 }, 1244 973 [IFLA_NETKIT_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 1245 974 [IFLA_NETKIT_PEER_SCRUB] = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT), 975 + [IFLA_NETKIT_PAIRING] = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE), 1246 976 [IFLA_NETKIT_PRIMARY] = { .type = NLA_REJECT, 1247 977 .reject_message = "Primary attribute is read-only" }, 1248 978 }; 1249 979 1250 980 static struct rtnl_link_ops netkit_link_ops = { 1251 - .kind = DRV_NAME, 981 + .kind = NETKIT_DRV_NAME, 1252 982 .priv_size = sizeof(struct netkit), 983 + .alloc = netkit_alloc, 1253 984 .setup = netkit_setup, 1254 985 .newlink = netkit_new_link, 1255 986 .dellink = netkit_del_link, ··· 1265 992 .maxtype = IFLA_NETKIT_MAX, 1266 993 }; 1267 994 1268 - static __init int netkit_init(void) 995 + static struct notifier_block netkit_netdev_notifier = { 996 + .notifier_call = netkit_notifier, 997 + }; 998 + 999 + static __init int netkit_mod_init(void) 1269 1000 { 1001 + int ret; 1002 + 1270 1003 BUILD_BUG_ON((int)NETKIT_NEXT != (int)TCX_NEXT || 1271 1004 (int)NETKIT_PASS != (int)TCX_PASS || 1272 1005 (int)NETKIT_DROP != (int)TCX_DROP || 1273 1006 (int)NETKIT_REDIRECT != (int)TCX_REDIRECT); 1274 1007 1275 - return rtnl_link_register(&netkit_link_ops); 1008 + ret = rtnl_link_register(&netkit_link_ops); 1009 + if (ret) 1010 + return ret; 1011 + ret = register_netdevice_notifier(&netkit_netdev_notifier); 1012 + if (ret) 1013 + rtnl_link_unregister(&netkit_link_ops); 1014 + return ret; 1276 1015 } 1277 1016 1278 - static __exit void netkit_exit(void) 1017 + static __exit void netkit_mod_exit(void) 1279 1018 { 1019 + unregister_netdevice_notifier(&netkit_netdev_notifier); 1280 1020 rtnl_link_unregister(&netkit_link_ops); 1281 1021 } 1282 1022 1283 - module_init(netkit_init); 1284 - module_exit(netkit_exit); 1023 + module_init(netkit_mod_init); 1024 + module_exit(netkit_mod_exit); 1285 1025 1286 1026 MODULE_DESCRIPTION("BPF-programmable network device"); 
1287 1027 MODULE_AUTHOR("Daniel Borkmann <daniel@iogearbox.net>"); 1288 1028 MODULE_AUTHOR("Nikolay Aleksandrov <razor@blackwall.org>"); 1289 1029 MODULE_LICENSE("GPL"); 1290 - MODULE_ALIAS_RTNL_LINK(DRV_NAME); 1030 + MODULE_ALIAS_RTNL_LINK(NETKIT_DRV_NAME);
+10 -1
include/linux/netdevice.h
··· 2561 2561 * Also protects some fields in: 2562 2562 * struct napi_struct, struct netdev_queue, struct netdev_rx_queue 2563 2563 * 2564 - * Ordering: take after rtnl_lock. 2564 + * Ordering: 2565 + * 2566 + * - take after rtnl_lock 2567 + * 2568 + * - for the case of netdev queue leasing, the netdev-scope lock is 2569 + * taken for both the virtual and the physical device; to prevent 2570 + * deadlocks, the virtual device's lock must always be acquired 2571 + * before the physical device's (see netdev_nl_queue_create_doit) 2565 2572 */ 2566 2573 struct mutex lock; 2567 2574 ··· 3420 3413 int register_netdevice(struct net_device *dev); 3421 3414 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head); 3422 3415 void unregister_netdevice_many(struct list_head *head); 3416 + bool unregister_netdevice_queued(const struct net_device *dev); 3417 + 3423 3418 static inline void unregister_netdevice(struct net_device *dev) 3424 3419 { 3425 3420 unregister_netdevice_queue(dev, NULL);
+19 -4
include/net/netdev_queues.h
··· 150 150 * When NIC-wide config is changed the callback will 151 151 * be invoked for all queues. 152 152 * 153 + * @ndo_queue_create: Create a new RX queue on a virtual device that will 154 + * be paired with a physical device's queue via leasing. 155 + * Return the new queue id on success, negative error 156 + * on failure. 157 + * 153 158 * @supported_params: Bitmask of supported parameters, see QCFG_*. 154 159 * 155 160 * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while ··· 183 178 struct netlink_ext_ack *extack); 184 179 struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, 185 180 int idx); 181 + int (*ndo_queue_create)(struct net_device *dev, 182 + struct netlink_ext_ack *extack); 186 183 187 184 unsigned int supported_params; 188 185 }; ··· 192 185 void netdev_queue_config(struct net_device *dev, int rxq, 193 186 struct netdev_queue_config *qcfg); 194 187 195 - bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx); 188 + bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx); 196 189 197 190 /** 198 191 * DOC: Lockless queue stopping / waking helpers. ··· 380 373 get_desc, start_thrs); \ 381 374 }) 382 375 383 - struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); 384 - 385 - #endif 376 + struct device *netdev_queue_get_dma_dev(struct net_device *dev, 377 + unsigned int idx, 378 + enum netdev_queue_type type); 379 + bool netdev_can_create_queue(const struct net_device *dev, 380 + struct netlink_ext_ack *extack); 381 + bool netdev_can_lease_queue(const struct net_device *dev, 382 + struct netlink_ext_ack *extack); 383 + bool netdev_queue_busy(struct net_device *dev, unsigned int idx, 384 + enum netdev_queue_type type, 385 + struct netlink_ext_ack *extack); 386 + #endif /* _LINUX_NET_QUEUES_H */
+27 -2
include/net/netdev_rx_queue.h
··· 31 31 struct napi_struct *napi; 32 32 struct netdev_queue_config qcfg; 33 33 struct pp_memory_provider_params mp_params; 34 + 35 + /* If a queue is leased, then the lease pointer is always 36 + * valid. From the physical device it points to the virtual 37 + * queue, and from the virtual device it points to the 38 + * physical queue. 39 + */ 40 + struct netdev_rx_queue *lease; 41 + netdevice_tracker lease_tracker; 34 42 } ____cacheline_aligned_in_smp; 35 43 36 44 /* ··· 67 59 return index; 68 60 } 69 61 70 - int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); 62 + enum netif_lease_dir { 63 + NETIF_VIRT_TO_PHYS, 64 + NETIF_PHYS_TO_VIRT, 65 + }; 71 66 72 - #endif 67 + struct netdev_rx_queue * 68 + __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq, 69 + enum netif_lease_dir dir); 70 + 71 + struct netdev_rx_queue * 72 + netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq); 73 + void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, 74 + struct net_device *dev); 75 + 76 + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq); 77 + void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, 78 + struct netdev_rx_queue *rxq_src); 79 + void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, 80 + struct netdev_rx_queue *rxq_src); 81 + #endif /* _LINUX_NETDEV_RX_QUEUE_H */
+2 -6
include/net/page_pool/memory_provider.h
··· 23 23 void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov); 24 24 void net_mp_niov_clear_page_pool(struct net_iov *niov); 25 25 26 - int net_mp_open_rxq(struct net_device *dev, unsigned ifq_idx, 27 - struct pp_memory_provider_params *p); 28 - int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 26 + int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 29 27 const struct pp_memory_provider_params *p, 30 28 struct netlink_ext_ack *extack); 31 - void net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, 32 - struct pp_memory_provider_params *old_p); 33 - void __net_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 29 + void netif_mp_close_rxq(struct net_device *dev, unsigned int rxq_idx, 34 30 const struct pp_memory_provider_params *old_p); 35 31 36 32 /**
+6
include/uapi/linux/if_link.h
··· 1296 1296 NETKIT_L3, 1297 1297 }; 1298 1298 1299 + enum netkit_pairing { 1300 + NETKIT_DEVICE_PAIR, 1301 + NETKIT_DEVICE_SINGLE, 1302 + }; 1303 + 1299 1304 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to 1300 1305 * the BPF program if attached. This also means the latter can 1301 1306 * consume the two fields if they were populated earlier. ··· 1325 1320 IFLA_NETKIT_PEER_SCRUB, 1326 1321 IFLA_NETKIT_HEADROOM, 1327 1322 IFLA_NETKIT_TAILROOM, 1323 + IFLA_NETKIT_PAIRING, 1328 1324 __IFLA_NETKIT_MAX, 1329 1325 }; 1330 1326 #define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1)
+11
include/uapi/linux/netdev.h
··· 160 160 NETDEV_A_QUEUE_DMABUF, 161 161 NETDEV_A_QUEUE_IO_URING, 162 162 NETDEV_A_QUEUE_XSK, 163 + NETDEV_A_QUEUE_LEASE, 163 164 164 165 __NETDEV_A_QUEUE_MAX, 165 166 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 204 203 }; 205 204 206 205 enum { 206 + NETDEV_A_LEASE_IFINDEX = 1, 207 + NETDEV_A_LEASE_QUEUE, 208 + NETDEV_A_LEASE_NETNS_ID, 209 + 210 + __NETDEV_A_LEASE_MAX, 211 + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) 212 + }; 213 + 214 + enum { 207 215 NETDEV_A_DMABUF_IFINDEX = 1, 208 216 NETDEV_A_DMABUF_QUEUES, 209 217 NETDEV_A_DMABUF_FD, ··· 238 228 NETDEV_CMD_BIND_RX, 239 229 NETDEV_CMD_NAPI_SET, 240 230 NETDEV_CMD_BIND_TX, 231 + NETDEV_CMD_QUEUE_CREATE, 241 232 242 233 __NETDEV_CMD_MAX, 243 234 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+8 -4
io_uring/zcrx.c
··· 552 552 } 553 553 554 554 if (netdev) { 555 - if (ifq->if_rxq != -1) 556 - net_mp_close_rxq(netdev, ifq->if_rxq, &p); 555 + if (ifq->if_rxq != -1) { 556 + netdev_lock(netdev); 557 + netif_mp_close_rxq(netdev, ifq->if_rxq, &p); 558 + netdev_unlock(netdev); 559 + } 557 560 netdev_put(netdev, &netdev_tracker); 558 561 } 559 562 ifq->if_rxq = -1; ··· 829 826 } 830 827 netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL); 831 828 832 - ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq); 829 + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq, 830 + NETDEV_QUEUE_TYPE_RX); 833 831 if (!ifq->dev) { 834 832 ret = -EOPNOTSUPP; 835 833 goto netdev_put_unlock; ··· 845 841 mp_param.rx_page_size = 1U << ifq->niov_shift; 846 842 mp_param.mp_ops = &io_uring_pp_zc_ops; 847 843 mp_param.mp_priv = ifq; 848 - ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); 844 + ret = netif_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL); 849 845 if (ret) 850 846 goto netdev_put_unlock; 851 847 netdev_unlock(ifq->netdev);
+15 -3
net/core/dev.c
··· 1122 1122 } 1123 1123 1124 1124 struct net_device * 1125 + netdev_put_lock(struct net_device *dev, struct net *net, 1126 + netdevice_tracker *tracker) 1127 + { 1128 + netdev_tracker_free(dev, tracker); 1129 + return __netdev_put_lock(dev, net); 1130 + } 1131 + 1132 + struct net_device * 1125 1133 netdev_xa_find_lock(struct net *net, struct net_device *dev, 1126 1134 unsigned long *index) 1127 1135 { ··· 12350 12342 12351 12343 for (i = 0; i < dev->real_num_rx_queues; i++) { 12352 12344 struct netdev_rx_queue *rxq = &dev->_rx[i]; 12353 - struct pp_memory_provider_params *p = &rxq->mp_params; 12354 12345 12355 - if (p->mp_ops && p->mp_ops->uninstall) 12356 - p->mp_ops->uninstall(rxq->mp_params.mp_priv, rxq); 12346 + __netif_mp_uninstall_rxq(rxq, &rxq->mp_params); 12357 12347 } 12358 12348 } 12359 12349 ··· 12382 12376 if (lockdep_depth(current) > limit) 12383 12377 netif_close_many_and_unlock(close_head); 12384 12378 #endif 12379 + } 12380 + 12381 + bool unregister_netdevice_queued(const struct net_device *dev) 12382 + { 12383 + ASSERT_RTNL(); 12384 + return !list_empty(&dev->unreg_list); 12385 12385 } 12386 12386 12387 12387 void unregister_netdevice_many_notify(struct list_head *head,
+12
net/core/dev.h
··· 12 12 struct netlink_ext_ack; 13 13 struct netdev_queue_config; 14 14 struct cpumask; 15 + struct pp_memory_provider_params; 15 16 16 17 /* Random bits of netdevice that don't need to be exposed */ 17 18 #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ ··· 32 31 struct net_device *dev_get_by_napi_id(unsigned int napi_id); 33 32 34 33 struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); 34 + struct net_device *netdev_put_lock(struct net_device *dev, struct net *net, 35 + netdevice_tracker *tracker); 35 36 struct net_device * 36 37 netdev_xa_find_lock(struct net *net, struct net_device *dev, 37 38 unsigned long *index); ··· 98 95 int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, 99 96 struct netdev_queue_config *qcfg, 100 97 struct netlink_ext_ack *extack); 98 + 99 + bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx); 100 + bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx); 101 + 102 + void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq, 103 + const struct pp_memory_provider_params *p); 104 + 105 + void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq, 106 + struct netdev_rx_queue *virt_rxq); 101 107 102 108 /* netdev management, shared between various uAPI entry points */ 103 109 struct netdev_name_node {
+3 -3
net/core/devmem.c
··· 145 145 146 146 rxq_idx = get_netdev_rx_queue_index(rxq); 147 147 148 - __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); 148 + netif_mp_close_rxq(binding->dev, rxq_idx, &mp_params); 149 149 } 150 150 151 151 percpu_ref_kill(&binding->ref); ··· 163 163 u32 xa_idx; 164 164 int err; 165 165 166 - err = __net_mp_open_rxq(dev, rxq_idx, &mp_params, extack); 166 + err = netif_mp_open_rxq(dev, rxq_idx, &mp_params, extack); 167 167 if (err) 168 168 return err; 169 169 ··· 176 176 return 0; 177 177 178 178 err_close_rxq: 179 - __net_mp_close_rxq(dev, rxq_idx, &mp_params); 179 + netif_mp_close_rxq(dev, rxq_idx, &mp_params); 180 180 return err; 181 181 } 182 182
+20
net/core/netdev-genl-gen.c
··· 28 28 }; 29 29 30 30 /* Common nested types */ 31 + const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1] = { 32 + [NETDEV_A_LEASE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 33 + [NETDEV_A_LEASE_QUEUE] = NLA_POLICY_NESTED(netdev_queue_id_nl_policy), 34 + [NETDEV_A_LEASE_NETNS_ID] = NLA_POLICY_MIN(NLA_S32, 0), 35 + }; 36 + 31 37 const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1] = { 32 38 [NETDEV_A_PAGE_POOL_ID] = NLA_POLICY_FULL_RANGE(NLA_UINT, &netdev_a_page_pool_id_range), 33 39 [NETDEV_A_PAGE_POOL_IFINDEX] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_page_pool_ifindex_range), ··· 111 105 static const struct nla_policy netdev_bind_tx_nl_policy[NETDEV_A_DMABUF_FD + 1] = { 112 106 [NETDEV_A_DMABUF_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 113 107 [NETDEV_A_DMABUF_FD] = { .type = NLA_U32, }, 108 + }; 109 + 110 + /* NETDEV_CMD_QUEUE_CREATE - do */ 111 + static const struct nla_policy netdev_queue_create_nl_policy[NETDEV_A_QUEUE_LEASE + 1] = { 112 + [NETDEV_A_QUEUE_IFINDEX] = NLA_POLICY_MIN(NLA_U32, 1), 113 + [NETDEV_A_QUEUE_TYPE] = NLA_POLICY_MAX(NLA_U32, 1), 114 + [NETDEV_A_QUEUE_LEASE] = NLA_POLICY_NESTED(netdev_lease_nl_policy), 114 115 }; 115 116 116 117 /* Ops table for netdev */ ··· 217 204 .policy = netdev_bind_tx_nl_policy, 218 205 .maxattr = NETDEV_A_DMABUF_FD, 219 206 .flags = GENL_CMD_CAP_DO, 207 + }, 208 + { 209 + .cmd = NETDEV_CMD_QUEUE_CREATE, 210 + .doit = netdev_nl_queue_create_doit, 211 + .policy = netdev_queue_create_nl_policy, 212 + .maxattr = NETDEV_A_QUEUE_LEASE, 213 + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, 220 214 }, 221 215 }; 222 216
+2
net/core/netdev-genl-gen.h
··· 14 14 #include <net/netdev_netlink.h> 15 15 16 16 /* Common nested types */ 17 + extern const struct nla_policy netdev_lease_nl_policy[NETDEV_A_LEASE_NETNS_ID + 1]; 17 18 extern const struct nla_policy netdev_page_pool_info_nl_policy[NETDEV_A_PAGE_POOL_IFINDEX + 1]; 18 19 extern const struct nla_policy netdev_queue_id_nl_policy[NETDEV_A_QUEUE_TYPE + 1]; 19 20 ··· 37 36 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info); 38 37 int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info); 39 38 int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info); 39 + int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info); 40 40 41 41 enum { 42 42 NETDEV_NLGRP_MGMT,
+232 -6
net/core/netdev-genl.c
··· 387 387 } 388 388 389 389 static int 390 + netdev_nl_queue_fill_lease(struct sk_buff *rsp, struct net_device *netdev, 391 + u32 q_idx, u32 q_type) 392 + { 393 + struct net_device *orig_netdev = netdev; 394 + struct nlattr *nest_lease, *nest_queue; 395 + struct netdev_rx_queue *rxq; 396 + struct net *net, *peer_net; 397 + 398 + rxq = __netif_get_rx_queue_lease(&netdev, &q_idx, 399 + NETIF_PHYS_TO_VIRT); 400 + if (!rxq || orig_netdev == netdev) 401 + return 0; 402 + 403 + nest_lease = nla_nest_start(rsp, NETDEV_A_QUEUE_LEASE); 404 + if (!nest_lease) 405 + goto nla_put_failure; 406 + 407 + nest_queue = nla_nest_start(rsp, NETDEV_A_LEASE_QUEUE); 408 + if (!nest_queue) 409 + goto nla_put_failure; 410 + if (nla_put_u32(rsp, NETDEV_A_QUEUE_ID, q_idx)) 411 + goto nla_put_failure; 412 + if (nla_put_u32(rsp, NETDEV_A_QUEUE_TYPE, q_type)) 413 + goto nla_put_failure; 414 + nla_nest_end(rsp, nest_queue); 415 + 416 + if (nla_put_u32(rsp, NETDEV_A_LEASE_IFINDEX, 417 + READ_ONCE(netdev->ifindex))) 418 + goto nla_put_failure; 419 + 420 + rcu_read_lock(); 421 + peer_net = dev_net_rcu(netdev); 422 + net = dev_net_rcu(orig_netdev); 423 + if (!net_eq(net, peer_net)) { 424 + s32 id = peernet2id_alloc(net, peer_net, GFP_ATOMIC); 425 + 426 + if (nla_put_s32(rsp, NETDEV_A_LEASE_NETNS_ID, id)) 427 + goto nla_put_failure_unlock; 428 + } 429 + rcu_read_unlock(); 430 + nla_nest_end(rsp, nest_lease); 431 + return 0; 432 + 433 + nla_put_failure_unlock: 434 + rcu_read_unlock(); 435 + nla_put_failure: 436 + return -ENOMEM; 437 + } 438 + 439 + static int 390 440 netdev_nl_queue_fill_one(struct sk_buff *rsp, struct net_device *netdev, 391 441 u32 q_idx, u32 q_type, const struct genl_info *info) 392 442 { 393 443 struct pp_memory_provider_params *params; 394 - struct netdev_rx_queue *rxq; 444 + struct net_device *orig_netdev = netdev; 445 + struct netdev_rx_queue *rxq, *rxq_lease; 395 446 struct netdev_queue *txq; 396 447 void *hdr; 397 448 ··· 460 409 rxq = __netif_get_rx_queue(netdev, q_idx); 
461 410 if (nla_put_napi_id(rsp, rxq->napi)) 462 411 goto nla_put_failure; 412 + if (netdev_nl_queue_fill_lease(rsp, netdev, q_idx, q_type)) 413 + goto nla_put_failure; 463 414 415 + rxq_lease = netif_get_rx_queue_lease_locked(&netdev, &q_idx); 416 + if (rxq_lease) 417 + rxq = rxq_lease; 464 418 params = &rxq->mp_params; 465 419 if (params->mp_ops && 466 420 params->mp_ops->nl_fill(params->mp_priv, rsp, rxq)) 467 - goto nla_put_failure; 421 + goto nla_put_failure_lease; 468 422 #ifdef CONFIG_XDP_SOCKETS 469 423 if (rxq->pool) 470 424 if (nla_put_empty_nest(rsp, NETDEV_A_QUEUE_XSK)) 471 - goto nla_put_failure; 425 + goto nla_put_failure_lease; 472 426 #endif 473 - 427 + netif_put_rx_queue_lease_locked(orig_netdev, netdev); 474 428 break; 475 429 case NETDEV_QUEUE_TYPE_TX: 476 430 txq = netdev_get_tx_queue(netdev, q_idx); ··· 493 437 494 438 return 0; 495 439 440 + nla_put_failure_lease: 441 + netif_put_rx_queue_lease_locked(orig_netdev, netdev); 496 442 nla_put_failure: 497 443 genlmsg_cancel(rsp, hdr); 498 444 return -EMSGSIZE; ··· 976 918 for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { 977 919 struct device *rxq_dma_dev; 978 920 979 - rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); 921 + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx, 922 + NETDEV_QUEUE_TYPE_RX); 980 923 if (dma_dev && rxq_dma_dev != dma_dev) { 981 924 NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", 982 925 rxq_idx, prev_rxq_idx); ··· 1154 1095 goto err_unlock_netdev; 1155 1096 } 1156 1097 1157 - dma_dev = netdev_queue_get_dma_dev(netdev, 0); 1098 + dma_dev = netdev_queue_get_dma_dev(netdev, 0, NETDEV_QUEUE_TYPE_TX); 1158 1099 binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, 1159 1100 dmabuf_fd, priv, info->extack); 1160 1101 if (IS_ERR(binding)) { ··· 1174 1115 netdev_unlock(netdev); 1175 1116 err_unlock_sock: 1176 1117 mutex_unlock(&priv->lock); 1118 + err_genlmsg_free: 1119 + nlmsg_free(rsp); 1120 
+ return err; 1121 + } 1122 + 1123 + int netdev_nl_queue_create_doit(struct sk_buff *skb, struct genl_info *info) 1124 + { 1125 + const int qmaxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; 1126 + const int lmaxtype = ARRAY_SIZE(netdev_lease_nl_policy) - 1; 1127 + int err, ifindex, ifindex_lease, queue_id, queue_id_lease; 1128 + struct nlattr *qtb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; 1129 + struct nlattr *ltb[ARRAY_SIZE(netdev_lease_nl_policy)]; 1130 + struct netdev_rx_queue *rxq, *rxq_lease; 1131 + struct net_device *dev, *dev_lease; 1132 + netdevice_tracker dev_tracker; 1133 + s32 netns_lease = -1; 1134 + struct nlattr *nest; 1135 + struct sk_buff *rsp; 1136 + struct net *net; 1137 + void *hdr; 1138 + 1139 + if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_IFINDEX) || 1140 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_TYPE) || 1141 + GENL_REQ_ATTR_CHECK(info, NETDEV_A_QUEUE_LEASE)) 1142 + return -EINVAL; 1143 + if (nla_get_u32(info->attrs[NETDEV_A_QUEUE_TYPE]) != 1144 + NETDEV_QUEUE_TYPE_RX) { 1145 + NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_QUEUE_TYPE]); 1146 + return -EINVAL; 1147 + } 1148 + 1149 + ifindex = nla_get_u32(info->attrs[NETDEV_A_QUEUE_IFINDEX]); 1150 + 1151 + nest = info->attrs[NETDEV_A_QUEUE_LEASE]; 1152 + err = nla_parse_nested(ltb, lmaxtype, nest, 1153 + netdev_lease_nl_policy, info->extack); 1154 + if (err < 0) 1155 + return err; 1156 + if (NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_IFINDEX) || 1157 + NL_REQ_ATTR_CHECK(info->extack, nest, ltb, NETDEV_A_LEASE_QUEUE)) 1158 + return -EINVAL; 1159 + if (ltb[NETDEV_A_LEASE_NETNS_ID]) { 1160 + if (!capable(CAP_NET_ADMIN)) 1161 + return -EPERM; 1162 + netns_lease = nla_get_s32(ltb[NETDEV_A_LEASE_NETNS_ID]); 1163 + } 1164 + 1165 + ifindex_lease = nla_get_u32(ltb[NETDEV_A_LEASE_IFINDEX]); 1166 + 1167 + nest = ltb[NETDEV_A_LEASE_QUEUE]; 1168 + err = nla_parse_nested(qtb, qmaxtype, nest, 1169 + netdev_queue_id_nl_policy, info->extack); 1170 + if (err < 0) 1171 + return err; 1172 + 
if (NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_ID) || 1173 + NL_REQ_ATTR_CHECK(info->extack, nest, qtb, NETDEV_A_QUEUE_TYPE)) 1174 + return -EINVAL; 1175 + if (nla_get_u32(qtb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { 1176 + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_TYPE]); 1177 + return -EINVAL; 1178 + } 1179 + 1180 + queue_id_lease = nla_get_u32(qtb[NETDEV_A_QUEUE_ID]); 1181 + 1182 + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); 1183 + if (!rsp) 1184 + return -ENOMEM; 1185 + 1186 + hdr = genlmsg_iput(rsp, info); 1187 + if (!hdr) { 1188 + err = -EMSGSIZE; 1189 + goto err_genlmsg_free; 1190 + } 1191 + 1192 + /* Locking order is always from the virtual to the physical device 1193 + * since this is also the same order when applications open the 1194 + * memory provider later on. 1195 + */ 1196 + dev = netdev_get_by_index_lock(genl_info_net(info), ifindex); 1197 + if (!dev) { 1198 + err = -ENODEV; 1199 + goto err_genlmsg_free; 1200 + } 1201 + if (!netdev_can_create_queue(dev, info->extack)) { 1202 + err = -EINVAL; 1203 + goto err_unlock_dev; 1204 + } 1205 + 1206 + net = genl_info_net(info); 1207 + if (netns_lease >= 0) { 1208 + net = get_net_ns_by_id(net, netns_lease); 1209 + if (!net) { 1210 + err = -ENONET; 1211 + goto err_unlock_dev; 1212 + } 1213 + } 1214 + 1215 + dev_lease = netdev_get_by_index(net, ifindex_lease, &dev_tracker, 1216 + GFP_KERNEL); 1217 + if (!dev_lease) { 1218 + err = -ENODEV; 1219 + goto err_put_netns; 1220 + } 1221 + if (!netdev_can_lease_queue(dev_lease, info->extack)) { 1222 + netdev_put(dev_lease, &dev_tracker); 1223 + err = -EINVAL; 1224 + goto err_put_netns; 1225 + } 1226 + 1227 + dev_lease = netdev_put_lock(dev_lease, net, &dev_tracker); 1228 + if (!dev_lease) { 1229 + err = -ENODEV; 1230 + goto err_put_netns; 1231 + } 1232 + if (queue_id_lease >= dev_lease->real_num_rx_queues) { 1233 + err = -ERANGE; 1234 + NL_SET_BAD_ATTR(info->extack, qtb[NETDEV_A_QUEUE_ID]); 1235 + goto err_unlock_dev_lease; 1236 + 
} 1237 + if (netdev_queue_busy(dev_lease, queue_id_lease, NETDEV_QUEUE_TYPE_RX, 1238 + info->extack)) { 1239 + err = -EBUSY; 1240 + goto err_unlock_dev_lease; 1241 + } 1242 + 1243 + rxq_lease = __netif_get_rx_queue(dev_lease, queue_id_lease); 1244 + rxq = __netif_get_rx_queue(dev, dev->real_num_rx_queues - 1); 1245 + 1246 + /* Leasing queues from different physical devices is currently 1247 + * not supported. Capabilities such as XDP features and DMA 1248 + * device may differ between physical devices, and computing 1249 + * a correct intersection for the virtual device is not yet 1250 + * implemented. 1251 + */ 1252 + if (rxq->lease && rxq->lease->dev != dev_lease) { 1253 + err = -EOPNOTSUPP; 1254 + NL_SET_ERR_MSG(info->extack, 1255 + "Leasing queues from different devices not supported"); 1256 + goto err_unlock_dev_lease; 1257 + } 1258 + 1259 + queue_id = dev->queue_mgmt_ops->ndo_queue_create(dev, info->extack); 1260 + if (queue_id < 0) { 1261 + err = queue_id; 1262 + goto err_unlock_dev_lease; 1263 + } 1264 + rxq = __netif_get_rx_queue(dev, queue_id); 1265 + 1266 + netdev_rx_queue_lease(rxq, rxq_lease); 1267 + 1268 + nla_put_u32(rsp, NETDEV_A_QUEUE_ID, queue_id); 1269 + genlmsg_end(rsp, hdr); 1270 + 1271 + netdev_unlock(dev_lease); 1272 + netdev_unlock(dev); 1273 + if (netns_lease >= 0) 1274 + put_net(net); 1275 + 1276 + return genlmsg_reply(rsp, info); 1277 + 1278 + err_unlock_dev_lease: 1279 + netdev_unlock(dev_lease); 1280 + err_put_netns: 1281 + if (netns_lease >= 0) 1282 + put_net(net); 1283 + err_unlock_dev: 1284 + netdev_unlock(dev); 1177 1285 err_genlmsg_free: 1178 1286 nlmsg_free(rsp); 1179 1287 return err;
+96 -11
net/core/netdev_queues.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-or-later 2 2 3 3 #include <net/netdev_queues.h> 4 + #include <net/netdev_rx_queue.h> 5 + #include <net/xdp_sock_drv.h> 4 6 5 - /** 6 - * netdev_queue_get_dma_dev() - get dma device for zero-copy operations 7 - * @dev: net_device 8 - * @idx: queue index 9 - * 10 - * Get dma device for zero-copy operations to be used for this queue. 11 - * When such device is not available or valid, the function will return NULL. 12 - * 13 - * Return: Device or NULL on error 14 - */ 15 - struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) 7 + #include "dev.h" 8 + 9 + static struct device * 10 + __netdev_queue_get_dma_dev(struct net_device *dev, unsigned int idx) 16 11 { 17 12 const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; 18 13 struct device *dma_dev; ··· 20 25 return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; 21 26 } 22 27 28 + /** 29 + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations 30 + * @dev: net_device 31 + * @idx: queue index 32 + * @type: queue type (RX or TX) 33 + * 34 + * Get dma device for zero-copy operations to be used for this queue. If 35 + * the queue is an RX queue leased from a physical queue, we retrieve the 36 + * physical queue's dma device. When the dma device is not available or 37 + * valid, the function will return NULL. 38 + * 39 + * Return: Device or NULL on error 40 + */ 41 + struct device *netdev_queue_get_dma_dev(struct net_device *dev, 42 + unsigned int idx, 43 + enum netdev_queue_type type) 44 + { 45 + struct net_device *orig_dev = dev; 46 + struct device *dma_dev; 47 + 48 + /* Only RX side supports queue leasing today. 
*/ 49 + if (type != NETDEV_QUEUE_TYPE_RX || !netif_rxq_is_leased(dev, idx)) 50 + return __netdev_queue_get_dma_dev(dev, idx); 51 + 52 + if (!netif_get_rx_queue_lease_locked(&dev, &idx)) 53 + return NULL; 54 + 55 + dma_dev = __netdev_queue_get_dma_dev(dev, idx); 56 + netif_put_rx_queue_lease_locked(orig_dev, dev); 57 + return dma_dev; 58 + } 59 + 60 + bool netdev_can_create_queue(const struct net_device *dev, 61 + struct netlink_ext_ack *extack) 62 + { 63 + if (dev->dev.parent) { 64 + NL_SET_ERR_MSG(extack, "Device is not a virtual device"); 65 + return false; 66 + } 67 + if (!dev->queue_mgmt_ops || 68 + !dev->queue_mgmt_ops->ndo_queue_create) { 69 + NL_SET_ERR_MSG(extack, "Device does not support queue creation"); 70 + return false; 71 + } 72 + if (dev->real_num_rx_queues < 1 || 73 + dev->real_num_tx_queues < 1) { 74 + NL_SET_ERR_MSG(extack, "Device must have at least one real queue"); 75 + return false; 76 + } 77 + return true; 78 + } 79 + 80 + bool netdev_can_lease_queue(const struct net_device *dev, 81 + struct netlink_ext_ack *extack) 82 + { 83 + if (!dev->dev.parent) { 84 + NL_SET_ERR_MSG(extack, "Lease device is a virtual device"); 85 + return false; 86 + } 87 + if (!netif_device_present(dev)) { 88 + NL_SET_ERR_MSG(extack, "Lease device has been removed from the system"); 89 + return false; 90 + } 91 + if (!dev->queue_mgmt_ops) { 92 + NL_SET_ERR_MSG(extack, "Lease device does not support queue management operations"); 93 + return false; 94 + } 95 + return true; 96 + } 97 + 98 + bool netdev_queue_busy(struct net_device *dev, unsigned int idx, 99 + enum netdev_queue_type type, 100 + struct netlink_ext_ack *extack) 101 + { 102 + if (xsk_get_pool_from_qid(dev, idx)) { 103 + NL_SET_ERR_MSG(extack, "Device queue in use by AF_XDP"); 104 + return true; 105 + } 106 + if (type == NETDEV_QUEUE_TYPE_TX) 107 + return false; 108 + if (netif_rxq_is_leased(dev, idx)) { 109 + NL_SET_ERR_MSG(extack, "Device queue in use due to queue leasing"); 110 + return true; 111 + } 112 + 
if (netif_rxq_has_mp(dev, idx)) { 113 + NL_SET_ERR_MSG(extack, "Device queue in use by memory provider"); 114 + return true; 115 + } 116 + return false; 117 + }
+174 -28
net/core/netdev_rx_queue.c
··· 10 10 #include "dev.h" 11 11 #include "page_pool_priv.h" 12 12 13 - /* See also page_pool_is_unreadable() */ 14 - bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) 13 + void netdev_rx_queue_lease(struct netdev_rx_queue *rxq_dst, 14 + struct netdev_rx_queue *rxq_src) 15 15 { 16 - struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); 16 + netdev_assert_locked(rxq_src->dev); 17 + netdev_assert_locked(rxq_dst->dev); 17 18 18 - return !!rxq->mp_params.mp_ops; 19 + netdev_hold(rxq_src->dev, &rxq_src->lease_tracker, GFP_KERNEL); 20 + 21 + WRITE_ONCE(rxq_src->lease, rxq_dst); 22 + WRITE_ONCE(rxq_dst->lease, rxq_src); 23 + } 24 + 25 + void netdev_rx_queue_unlease(struct netdev_rx_queue *rxq_dst, 26 + struct netdev_rx_queue *rxq_src) 27 + { 28 + netdev_assert_locked(rxq_dst->dev); 29 + netdev_assert_locked(rxq_src->dev); 30 + 31 + netif_rxq_cleanup_unlease(rxq_src, rxq_dst); 32 + 33 + WRITE_ONCE(rxq_src->lease, NULL); 34 + WRITE_ONCE(rxq_dst->lease, NULL); 35 + 36 + netdev_put(rxq_src->dev, &rxq_src->lease_tracker); 37 + } 38 + 39 + bool netif_rxq_is_leased(struct net_device *dev, unsigned int rxq_idx) 40 + { 41 + if (rxq_idx < dev->real_num_rx_queues) 42 + return READ_ONCE(__netif_get_rx_queue(dev, rxq_idx)->lease); 43 + return false; 44 + } 45 + 46 + /* Virtual devices eligible for leasing have no dev->dev.parent, while 47 + * physical devices always have one. Use this to enforce the correct 48 + * lease traversal direction. 
49 + */ 50 + static bool netif_lease_dir_ok(const struct net_device *dev, 51 + enum netif_lease_dir dir) 52 + { 53 + if (dir == NETIF_VIRT_TO_PHYS && !dev->dev.parent) 54 + return true; 55 + if (dir == NETIF_PHYS_TO_VIRT && dev->dev.parent) 56 + return true; 57 + return false; 58 + } 59 + 60 + struct netdev_rx_queue * 61 + __netif_get_rx_queue_lease(struct net_device **dev, unsigned int *rxq_idx, 62 + enum netif_lease_dir dir) 63 + { 64 + struct net_device *orig_dev = *dev; 65 + struct netdev_rx_queue *rxq = __netif_get_rx_queue(orig_dev, *rxq_idx); 66 + 67 + if (rxq->lease) { 68 + if (!netif_lease_dir_ok(orig_dev, dir)) 69 + return NULL; 70 + rxq = rxq->lease; 71 + *rxq_idx = get_netdev_rx_queue_index(rxq); 72 + *dev = rxq->dev; 73 + } 74 + return rxq; 75 + } 76 + 77 + struct netdev_rx_queue * 78 + netif_get_rx_queue_lease_locked(struct net_device **dev, unsigned int *rxq_idx) 79 + { 80 + struct net_device *orig_dev = *dev; 81 + struct netdev_rx_queue *rxq; 82 + 83 + /* Locking order is always from the virtual to the physical device 84 + * see netdev_nl_queue_create_doit(). 
85 + */ 86 + netdev_ops_assert_locked(orig_dev); 87 + rxq = __netif_get_rx_queue_lease(dev, rxq_idx, NETIF_VIRT_TO_PHYS); 88 + if (rxq && orig_dev != *dev) 89 + netdev_lock(*dev); 90 + return rxq; 91 + } 92 + 93 + void netif_put_rx_queue_lease_locked(struct net_device *orig_dev, 94 + struct net_device *dev) 95 + { 96 + if (orig_dev != dev) 97 + netdev_unlock(dev); 98 + } 99 + 100 + /* See also page_pool_is_unreadable() */ 101 + bool netif_rxq_has_unreadable_mp(struct net_device *dev, unsigned int rxq_idx) 102 + { 103 + if (rxq_idx < dev->real_num_rx_queues) 104 + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_ops; 105 + return false; 19 106 } 20 107 EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); 108 + 109 + bool netif_rxq_has_mp(struct net_device *dev, unsigned int rxq_idx) 110 + { 111 + if (rxq_idx < dev->real_num_rx_queues) 112 + return __netif_get_rx_queue(dev, rxq_idx)->mp_params.mp_priv; 113 + return false; 114 + } 21 115 22 116 static int netdev_rx_queue_reconfig(struct net_device *dev, 23 117 unsigned int rxq_idx, ··· 202 108 } 203 109 EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); 204 110 205 - int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 206 - const struct pp_memory_provider_params *p, 207 - struct netlink_ext_ack *extack) 111 + static int __netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 112 + const struct pp_memory_provider_params *p, 113 + struct netlink_ext_ack *extack) 208 114 { 209 115 const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; 210 116 struct netdev_queue_config qcfg[2]; ··· 213 119 214 120 if (!qops) 215 121 return -EOPNOTSUPP; 216 - 217 - if (rxq_idx >= dev->real_num_rx_queues) { 218 - NL_SET_ERR_MSG(extack, "rx queue index out of range"); 219 - return -ERANGE; 220 - } 221 - rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); 222 122 223 123 if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { 224 124 NL_SET_ERR_MSG(extack, "tcp-data-split is 
disabled"); ··· 260 172 return ret; 261 173 } 262 174 263 - int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 264 - struct pp_memory_provider_params *p) 175 + int netif_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, 176 + const struct pp_memory_provider_params *p, 177 + struct netlink_ext_ack *extack) 265 178 { 179 + struct net_device *orig_dev = dev; 266 180 int ret; 267 181 268 - netdev_lock(dev); 269 - ret = __net_mp_open_rxq(dev, rxq_idx, p, NULL); 270 - netdev_unlock(dev); 182 + if (!netdev_need_ops_lock(dev)) 183 + return -EOPNOTSUPP; 184 + 185 + if (rxq_idx >= dev->real_num_rx_queues) { 186 + NL_SET_ERR_MSG(extack, "rx queue index out of range"); 187 + return -ERANGE; 188 + } 189 + rxq_idx = array_index_nospec(rxq_idx, dev->real_num_rx_queues); 190 + 191 + if (!netif_rxq_is_leased(dev, rxq_idx)) 192 + return __netif_mp_open_rxq(dev, rxq_idx, p, extack); 193 + 194 + if (!netif_get_rx_queue_lease_locked(&dev, &rxq_idx)) { 195 + NL_SET_ERR_MSG(extack, "rx queue leased to a virtual netdev"); 196 + return -EBUSY; 197 + } 198 + if (!dev->dev.parent) { 199 + NL_SET_ERR_MSG(extack, "rx queue belongs to a virtual netdev"); 200 + ret = -EOPNOTSUPP; 201 + goto out; 202 + } 203 + 204 + ret = __netif_mp_open_rxq(dev, rxq_idx, p, extack); 205 + out: 206 + netif_put_rx_queue_lease_locked(orig_dev, dev); 271 207 return ret; 272 208 } 273 209 274 - void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, 275 - const struct pp_memory_provider_params *old_p) 210 + static void __netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, 211 + const struct pp_memory_provider_params *old_p) 276 212 { 277 213 struct netdev_queue_config qcfg[2]; 278 214 struct netdev_rx_queue *rxq; 279 215 int err; 280 - 281 - if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) 282 - return; 283 216 284 217 rxq = __netif_get_rx_queue(dev, ifq_idx); 285 218 ··· 323 214 WARN_ON(err && err != -ENETDOWN); 324 215 } 325 216 326 - void 
net_mp_close_rxq(struct net_device *dev, unsigned ifq_idx, 327 - struct pp_memory_provider_params *old_p) 217 + void netif_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, 218 + const struct pp_memory_provider_params *old_p) 328 219 { 329 - netdev_lock(dev); 330 - __net_mp_close_rxq(dev, ifq_idx, old_p); 331 - netdev_unlock(dev); 220 + struct net_device *orig_dev = dev; 221 + 222 + if (WARN_ON_ONCE(ifq_idx >= dev->real_num_rx_queues)) 223 + return; 224 + if (!netif_rxq_is_leased(dev, ifq_idx)) 225 + return __netif_mp_close_rxq(dev, ifq_idx, old_p); 226 + 227 + if (WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &ifq_idx))) 228 + return; 229 + 230 + __netif_mp_close_rxq(dev, ifq_idx, old_p); 231 + netif_put_rx_queue_lease_locked(orig_dev, dev); 232 + } 233 + 234 + void __netif_mp_uninstall_rxq(struct netdev_rx_queue *rxq, 235 + const struct pp_memory_provider_params *p) 236 + { 237 + if (p->mp_ops && p->mp_ops->uninstall) 238 + p->mp_ops->uninstall(p->mp_priv, rxq); 239 + } 240 + 241 + /* Clean up memory provider state when a queue lease is torn down. If 242 + * a memory provider was installed on the physical queue via the lease, 243 + * close it now. The memory provider is a property of the queue itself, 244 + * and it was _guaranteed_ to be installed on the physical queue via 245 + * the lease redirection. The extra __netif_mp_close_rxq is needed 246 + * since the physical queue can outlive the virtual queue in the lease 247 + * case, so it needs to be reconfigured to clear the memory provider. 248 + */ 249 + void netif_rxq_cleanup_unlease(struct netdev_rx_queue *phys_rxq, 250 + struct netdev_rx_queue *virt_rxq) 251 + { 252 + struct pp_memory_provider_params *p = &phys_rxq->mp_params; 253 + unsigned int ifq_idx = get_netdev_rx_queue_index(phys_rxq); 254 + 255 + if (!p->mp_ops) 256 + return; 257 + 258 + __netif_mp_uninstall_rxq(virt_rxq, p); 259 + __netif_mp_close_rxq(phys_rxq->dev, ifq_idx, p); 332 260 }
+17 -11
net/ethtool/channels.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 3 - #include <net/xdp_sock_drv.h> 3 + #include <net/netdev_queues.h> 4 4 5 5 #include "common.h" 6 6 #include "netlink.h" ··· 109 109 static int 110 110 ethnl_set_channels(struct ethnl_req_info *req_info, struct genl_info *info) 111 111 { 112 - unsigned int from_channel, old_total, i; 112 + unsigned int old_combined, old_rx, old_tx, i; 113 113 bool mod = false, mod_combined = false; 114 114 struct net_device *dev = req_info->dev; 115 115 struct ethtool_channels channels = {}; ··· 118 118 int ret; 119 119 120 120 dev->ethtool_ops->get_channels(dev, &channels); 121 - old_total = channels.combined_count + 122 - max(channels.rx_count, channels.tx_count); 121 + old_combined = channels.combined_count; 122 + old_rx = channels.rx_count; 123 + old_tx = channels.tx_count; 123 124 124 125 ethnl_update_u32(&channels.rx_count, tb[ETHTOOL_A_CHANNELS_RX_COUNT], 125 126 &mod); ··· 170 169 if (ret) 171 170 return ret; 172 171 173 - /* Disabling channels, query zero-copy AF_XDP sockets */ 174 - from_channel = channels.combined_count + 175 - min(channels.rx_count, channels.tx_count); 176 - for (i = from_channel; i < old_total; i++) 177 - if (xsk_get_pool_from_qid(dev, i)) { 178 - GENL_SET_ERR_MSG(info, "requested channel counts are too low for existing zerocopy AF_XDP sockets"); 172 + /* ensure channels are not busy at the moment */ 173 + for (i = channels.combined_count + channels.rx_count; 174 + i < old_combined + old_rx; i++) { 175 + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX, 176 + info->extack)) 179 177 return -EINVAL; 180 - } 178 + } 179 + for (i = channels.combined_count + channels.tx_count; 180 + i < old_combined + old_tx; i++) { 181 + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX, 182 + info->extack)) 183 + return -EINVAL; 184 + } 181 185 182 186 ret = dev->ethtool_ops->set_channels(dev, &channels); 183 187 return ret < 0 ? ret : 1;
+12 -9
net/ethtool/ioctl.c
··· 27 27 #include <linux/net.h> 28 28 #include <linux/pm_runtime.h> 29 29 #include <linux/utsname.h> 30 + #include <linux/ethtool_netlink.h> 30 31 #include <net/devlink.h> 31 32 #include <net/ipv6.h> 32 - #include <net/xdp_sock_drv.h> 33 33 #include <net/flow_offload.h> 34 34 #include <net/netdev_lock.h> 35 - #include <linux/ethtool_netlink.h> 35 + #include <net/netdev_queues.h> 36 36 37 37 #include "common.h" 38 38 ··· 2250 2250 void __user *useraddr) 2251 2251 { 2252 2252 struct ethtool_channels channels, curr = { .cmd = ETHTOOL_GCHANNELS }; 2253 - u16 from_channel, to_channel; 2254 2253 unsigned int i; 2255 2254 int ret; 2256 2255 ··· 2283 2284 if (ret) 2284 2285 return ret; 2285 2286 2286 - /* Disabling channels, query zero-copy AF_XDP sockets */ 2287 - from_channel = channels.combined_count + 2288 - min(channels.rx_count, channels.tx_count); 2289 - to_channel = curr.combined_count + max(curr.rx_count, curr.tx_count); 2290 - for (i = from_channel; i < to_channel; i++) 2291 - if (xsk_get_pool_from_qid(dev, i)) 2287 + /* Disabling channels, query busy queues (AF_XDP, queue leasing) */ 2288 + for (i = channels.combined_count + channels.rx_count; 2289 + i < curr.combined_count + curr.rx_count; i++) { 2290 + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_RX, NULL)) 2292 2291 return -EINVAL; 2292 + } 2293 + for (i = channels.combined_count + channels.tx_count; 2294 + i < curr.combined_count + curr.tx_count; i++) { 2295 + if (netdev_queue_busy(dev, i, NETDEV_QUEUE_TYPE_TX, NULL)) 2296 + return -EINVAL; 2297 + } 2293 2298 2294 2299 ret = dev->ethtool_ops->set_channels(dev, &channels); 2295 2300 if (!ret)
+61 -14
net/xdp/xsk.c
··· 23 23 #include <linux/netdevice.h> 24 24 #include <linux/rculist.h> 25 25 #include <linux/vmalloc.h> 26 + 27 + #include <net/netdev_queues.h> 26 28 #include <net/xdp_sock_drv.h> 27 29 #include <net/busy_poll.h> 28 30 #include <net/netdev_lock.h> ··· 119 117 120 118 void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) 121 119 { 122 - if (queue_id < dev->num_rx_queues) 123 - dev->_rx[queue_id].pool = NULL; 124 - if (queue_id < dev->num_tx_queues) 125 - dev->_tx[queue_id].pool = NULL; 120 + struct net_device *orig_dev = dev; 121 + unsigned int id = queue_id; 122 + 123 + if (id < dev->real_num_rx_queues) 124 + WARN_ON_ONCE(!netif_get_rx_queue_lease_locked(&dev, &id)); 125 + 126 + if (id < dev->num_rx_queues) 127 + dev->_rx[id].pool = NULL; 128 + if (id < dev->num_tx_queues) 129 + dev->_tx[id].pool = NULL; 130 + 131 + netif_put_rx_queue_lease_locked(orig_dev, dev); 126 132 } 127 133 128 134 /* The buffer pool is stored both in the _rx struct and the _tx struct as we do ··· 140 130 int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, 141 131 u16 queue_id) 142 132 { 143 - if (queue_id >= max_t(unsigned int, 144 - dev->real_num_rx_queues, 145 - dev->real_num_tx_queues)) 133 + struct net_device *orig_dev = dev; 134 + unsigned int id = queue_id; 135 + int ret = 0; 136 + 137 + if (id >= max(dev->real_num_rx_queues, 138 + dev->real_num_tx_queues)) 146 139 return -EINVAL; 147 140 148 - if (queue_id < dev->real_num_rx_queues) 149 - dev->_rx[queue_id].pool = pool; 150 - if (queue_id < dev->real_num_tx_queues) 151 - dev->_tx[queue_id].pool = pool; 141 + if (id < dev->real_num_rx_queues) { 142 + if (!netif_get_rx_queue_lease_locked(&dev, &id)) 143 + return -EBUSY; 144 + if (xsk_get_pool_from_qid(dev, id)) { 145 + ret = -EBUSY; 146 + goto out; 147 + } 148 + } 152 149 153 - return 0; 150 + if (id < dev->real_num_rx_queues) 151 + dev->_rx[id].pool = pool; 152 + if (id < dev->real_num_tx_queues) 153 + dev->_tx[id].pool = pool; 154 + out: 155 + 
netif_put_rx_queue_lease_locked(orig_dev, dev); 156 + return ret; 154 157 } 155 158 156 159 static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, ··· 353 330 return false; 354 331 } 355 332 333 + static bool xsk_dev_queue_valid(const struct xdp_sock *xs, 334 + const struct xdp_rxq_info *info) 335 + { 336 + struct net_device *dev = xs->dev; 337 + u32 queue_index = xs->queue_id; 338 + struct netdev_rx_queue *rxq; 339 + 340 + if (info->dev == dev && 341 + info->queue_index == queue_index) 342 + return true; 343 + 344 + if (queue_index < dev->real_num_rx_queues) { 345 + rxq = READ_ONCE(__netif_get_rx_queue(dev, queue_index)->lease); 346 + if (!rxq) 347 + return false; 348 + 349 + dev = rxq->dev; 350 + queue_index = get_netdev_rx_queue_index(rxq); 351 + 352 + return info->dev == dev && 353 + info->queue_index == queue_index; 354 + } 355 + return false; 356 + } 357 + 356 358 static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) 357 359 { 358 360 if (!xsk_is_bound(xs)) 359 361 return -ENXIO; 360 - 361 - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) 362 + if (!xsk_dev_queue_valid(xs, xdp->rxq)) 362 363 return -EINVAL; 363 364 364 365 if (len > __xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) {
+11
tools/include/uapi/linux/netdev.h
··· 160 160 NETDEV_A_QUEUE_DMABUF, 161 161 NETDEV_A_QUEUE_IO_URING, 162 162 NETDEV_A_QUEUE_XSK, 163 + NETDEV_A_QUEUE_LEASE, 163 164 164 165 __NETDEV_A_QUEUE_MAX, 165 166 NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) ··· 204 203 }; 205 204 206 205 enum { 206 + NETDEV_A_LEASE_IFINDEX = 1, 207 + NETDEV_A_LEASE_QUEUE, 208 + NETDEV_A_LEASE_NETNS_ID, 209 + 210 + __NETDEV_A_LEASE_MAX, 211 + NETDEV_A_LEASE_MAX = (__NETDEV_A_LEASE_MAX - 1) 212 + }; 213 + 214 + enum { 207 215 NETDEV_A_DMABUF_IFINDEX = 1, 208 216 NETDEV_A_DMABUF_QUEUES, 209 217 NETDEV_A_DMABUF_FD, ··· 238 228 NETDEV_CMD_BIND_RX, 239 229 NETDEV_CMD_NAPI_SET, 240 230 NETDEV_CMD_BIND_TX, 231 + NETDEV_CMD_QUEUE_CREATE, 241 232 242 233 __NETDEV_CMD_MAX, 243 234 NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1)
+1
tools/testing/selftests/drivers/net/hw/Makefile
··· 35 35 loopback.sh \ 36 36 nic_timestamp.py \ 37 37 nk_netns.py \ 38 + nk_qlease.py \ 38 39 pp_alloc_fail.py \ 39 40 rss_api.py \ 40 41 rss_ctx.py \
+2 -2
tools/testing/selftests/drivers/net/hw/lib/py/__init__.py
··· 20 20 # Import one by one to avoid pylint false positives 21 21 from net.lib.py import NetNS, NetNSEnter, NetdevSimDev 22 22 from net.lib.py import EthtoolFamily, NetdevFamily, NetshaperFamily, \ 23 - NlError, RtnlFamily, DevlinkFamily, PSPFamily 23 + NlError, RtnlFamily, DevlinkFamily, PSPFamily, Netlink 24 24 from net.lib.py import CmdExitFailure 25 25 from net.lib.py import bkg, cmd, bpftool, bpftrace, defer, ethtool, \ 26 26 fd_read_timeout, ip, rand_port, rand_ports, wait_port_listen, \ ··· 36 36 37 37 __all__ = ["NetNS", "NetNSEnter", "NetdevSimDev", 38 38 "EthtoolFamily", "NetdevFamily", "NetshaperFamily", 39 - "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", 39 + "NlError", "RtnlFamily", "DevlinkFamily", "PSPFamily", "Netlink", 40 40 "CmdExitFailure", 41 41 "bkg", "cmd", "bpftool", "bpftrace", "defer", "ethtool", 42 42 "fd_read_timeout", "ip", "rand_port", "rand_ports",
+1407
tools/testing/selftests/drivers/net/hw/nk_qlease.py
··· 1 + #!/usr/bin/env python3 2 + # SPDX-License-Identifier: GPL-2.0 3 + 4 + import errno 5 + import re 6 + import time 7 + import threading 8 + from os import path 9 + from lib.py import ( 10 + ksft_run, 11 + ksft_exit, 12 + ksft_eq, 13 + ksft_ne, 14 + ksft_in, 15 + ksft_not_in, 16 + ksft_raises, 17 + ) 18 + from lib.py import ( 19 + NetDrvContEnv, 20 + NetNS, 21 + NetNSEnter, 22 + EthtoolFamily, 23 + NetdevFamily, 24 + RtnlFamily, 25 + NetdevSimDev, 26 + ) 27 + from lib.py import ( 28 + NlError, 29 + Netlink, 30 + bkg, 31 + cmd, 32 + defer, 33 + ethtool, 34 + ip, 35 + rand_port, 36 + wait_port_listen, 37 + ) 38 + from lib.py import KsftSkipEx, CmdExitFailure 39 + 40 + 41 + def set_flow_rule(cfg): 42 + output = ethtool( 43 + f"-N {cfg.ifname} flow-type tcp6 dst-port {cfg.port} action {cfg.src_queue}" 44 + ).stdout 45 + values = re.search(r"ID (\d+)", output).group(1) 46 + return int(values) 47 + 48 + 49 + def create_netkit(rxqueues): 50 + all_links = ip("-d link show", json=True) 51 + old_idxs = { 52 + link["ifindex"] 53 + for link in all_links 54 + if link.get("linkinfo", {}).get("info_kind") == "netkit" 55 + } 56 + 57 + rtnl = RtnlFamily() 58 + rtnl.newlink( 59 + { 60 + "linkinfo": { 61 + "kind": "netkit", 62 + "data": { 63 + "mode": "l2", 64 + "policy": "forward", 65 + "peer-policy": "forward", 66 + }, 67 + }, 68 + "num-rx-queues": rxqueues, 69 + }, 70 + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], 71 + ) 72 + 73 + all_links = ip("-d link show", json=True) 74 + nk_links = [ 75 + link 76 + for link in all_links 77 + if link.get("linkinfo", {}).get("info_kind") == "netkit" 78 + and link["ifindex"] not in old_idxs 79 + ] 80 + nk_links.sort(key=lambda x: x["ifindex"]) 81 + return ( 82 + nk_links[1]["ifname"], 83 + nk_links[1]["ifindex"], 84 + nk_links[0]["ifname"], 85 + nk_links[0]["ifindex"], 86 + ) 87 + 88 + 89 + def create_netkit_single(rxqueues): 90 + rtnl = RtnlFamily() 91 + rtnl.newlink( 92 + { 93 + "linkinfo": { 94 + "kind": "netkit", 95 + "data": { 96 
+ "mode": "l2", 97 + "pairing": "single", 98 + }, 99 + }, 100 + "num-rx-queues": rxqueues, 101 + }, 102 + flags=[Netlink.NLM_F_CREATE, Netlink.NLM_F_EXCL], 103 + ) 104 + 105 + all_links = ip("-d link show", json=True) 106 + nk_links = [ 107 + link 108 + for link in all_links 109 + if link.get("linkinfo", {}).get("info_kind") == "netkit" 110 + and "UP" not in link.get("flags", []) 111 + ] 112 + return nk_links[0]["ifname"], nk_links[0]["ifindex"] 113 + 114 + 115 + def test_remove_phys(netns) -> None: 116 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 117 + defer(nsimdev.remove) 118 + nsim = nsimdev.nsims[0] 119 + ip(f"link set dev {nsim.ifname} up") 120 + 121 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 122 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 123 + 124 + ip(f"link set dev {nk_guest} netns {netns.name}") 125 + ip(f"link set dev {nk_host} up") 126 + ip(f"link set dev {nk_guest} up", ns=netns) 127 + 128 + src_queue = 1 129 + with NetNSEnter(str(netns)): 130 + netdevnl = NetdevFamily() 131 + result = netdevnl.queue_create( 132 + { 133 + "ifindex": nk_guest_idx, 134 + "type": "rx", 135 + "lease": { 136 + "ifindex": nsim.ifindex, 137 + "queue": {"id": src_queue, "type": "rx"}, 138 + "netns-id": 0, 139 + }, 140 + } 141 + ) 142 + nk_queue_id = result["id"] 143 + 144 + netdevnl = NetdevFamily() 145 + queue_info = netdevnl.queue_get( 146 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 147 + ) 148 + ksft_in("lease", queue_info) 149 + ksft_eq(queue_info["lease"]["ifindex"], nk_guest_idx) 150 + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) 151 + 152 + nsimdev.remove() 153 + time.sleep(0.1) 154 + ret = cmd(f"ip link show dev {nk_host}", fail=False) 155 + ksft_ne(ret.ret, 0) 156 + 157 + 158 + def test_double_lease(netns) -> None: 159 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 160 + defer(nsimdev.remove) 161 + nsim = nsimdev.nsims[0] 162 + ip(f"link set dev {nsim.ifname} up") 163 + 164 + nk_host, _, 
nk_guest, nk_guest_idx = create_netkit(rxqueues=3) 165 + defer(cmd, f"ip link del dev {nk_host}") 166 + 167 + ip(f"link set dev {nk_guest} netns {netns.name}") 168 + ip(f"link set dev {nk_host} up") 169 + ip(f"link set dev {nk_guest} up", ns=netns) 170 + 171 + src_queue = 1 172 + with NetNSEnter(str(netns)): 173 + netdevnl = NetdevFamily() 174 + result = netdevnl.queue_create( 175 + { 176 + "ifindex": nk_guest_idx, 177 + "type": "rx", 178 + "lease": { 179 + "ifindex": nsim.ifindex, 180 + "queue": {"id": src_queue, "type": "rx"}, 181 + "netns-id": 0, 182 + }, 183 + } 184 + ) 185 + ksft_eq(result["id"], 1) 186 + 187 + with ksft_raises(NlError) as e: 188 + netdevnl.queue_create( 189 + { 190 + "ifindex": nk_guest_idx, 191 + "type": "rx", 192 + "lease": { 193 + "ifindex": nsim.ifindex, 194 + "queue": {"id": src_queue, "type": "rx"}, 195 + "netns-id": 0, 196 + }, 197 + } 198 + ) 199 + ksft_eq(e.exception.nl_msg.error, -errno.EBUSY) 200 + 201 + 202 + def test_virtual_lessor(netns) -> None: 203 + nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2) 204 + defer(cmd, f"ip link del dev {nk_host_a}") 205 + ip(f"link set dev {nk_host_a} up") 206 + ip(f"link set dev {nk_guest_a} up") 207 + 208 + nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2) 209 + defer(cmd, f"ip link del dev {nk_host_b}") 210 + 211 + ip(f"link set dev {nk_guest_b} netns {netns.name}") 212 + ip(f"link set dev {nk_host_b} up") 213 + ip(f"link set dev {nk_guest_b} up", ns=netns) 214 + 215 + with NetNSEnter(str(netns)): 216 + netdevnl = NetdevFamily() 217 + with ksft_raises(NlError) as e: 218 + netdevnl.queue_create( 219 + { 220 + "ifindex": nk_guest_b_idx, 221 + "type": "rx", 222 + "lease": { 223 + "ifindex": nk_guest_a_idx, 224 + "queue": {"id": 0, "type": "rx"}, 225 + "netns-id": 0, 226 + }, 227 + } 228 + ) 229 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 230 + 231 + 232 + def test_phys_lessee(_netns) -> None: 233 + nsimdev_a = NetdevSimDev(port_count=1, queue_count=2) 
234 + defer(nsimdev_a.remove) 235 + nsim_a = nsimdev_a.nsims[0] 236 + ip(f"link set dev {nsim_a.ifname} up") 237 + 238 + nsimdev_b = NetdevSimDev(port_count=1, queue_count=2) 239 + defer(nsimdev_b.remove) 240 + nsim_b = nsimdev_b.nsims[0] 241 + ip(f"link set dev {nsim_b.ifname} up") 242 + 243 + netdevnl = NetdevFamily() 244 + with ksft_raises(NlError) as e: 245 + netdevnl.queue_create( 246 + { 247 + "ifindex": nsim_a.ifindex, 248 + "type": "rx", 249 + "lease": { 250 + "ifindex": nsim_b.ifindex, 251 + "queue": {"id": 0, "type": "rx"}, 252 + }, 253 + } 254 + ) 255 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 256 + 257 + 258 + def test_different_lessors(netns) -> None: 259 + nsimdev_a = NetdevSimDev(port_count=1, queue_count=2) 260 + defer(nsimdev_a.remove) 261 + nsim_a = nsimdev_a.nsims[0] 262 + ip(f"link set dev {nsim_a.ifname} up") 263 + 264 + nsimdev_b = NetdevSimDev(port_count=1, queue_count=2) 265 + defer(nsimdev_b.remove) 266 + nsim_b = nsimdev_b.nsims[0] 267 + ip(f"link set dev {nsim_b.ifname} up") 268 + 269 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=3) 270 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 271 + 272 + ip(f"link set dev {nk_guest} netns {netns.name}") 273 + ip(f"link set dev {nk_host} up") 274 + ip(f"link set dev {nk_guest} up", ns=netns) 275 + 276 + with NetNSEnter(str(netns)): 277 + netdevnl = NetdevFamily() 278 + netdevnl.queue_create( 279 + { 280 + "ifindex": nk_guest_idx, 281 + "type": "rx", 282 + "lease": { 283 + "ifindex": nsim_a.ifindex, 284 + "queue": {"id": 1, "type": "rx"}, 285 + "netns-id": 0, 286 + }, 287 + } 288 + ) 289 + 290 + with ksft_raises(NlError) as e: 291 + netdevnl.queue_create( 292 + { 293 + "ifindex": nk_guest_idx, 294 + "type": "rx", 295 + "lease": { 296 + "ifindex": nsim_b.ifindex, 297 + "queue": {"id": 1, "type": "rx"}, 298 + "netns-id": 0, 299 + }, 300 + } 301 + ) 302 + ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP) 303 + 304 + 305 + def test_queue_out_of_range(netns) -> None: 306 
+ nsimdev = NetdevSimDev(port_count=1, queue_count=2) 307 + defer(nsimdev.remove) 308 + nsim = nsimdev.nsims[0] 309 + ip(f"link set dev {nsim.ifname} up") 310 + 311 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 312 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 313 + 314 + ip(f"link set dev {nk_guest} netns {netns.name}") 315 + ip(f"link set dev {nk_host} up") 316 + ip(f"link set dev {nk_guest} up", ns=netns) 317 + 318 + with NetNSEnter(str(netns)): 319 + netdevnl = NetdevFamily() 320 + with ksft_raises(NlError) as e: 321 + netdevnl.queue_create( 322 + { 323 + "ifindex": nk_guest_idx, 324 + "type": "rx", 325 + "lease": { 326 + "ifindex": nsim.ifindex, 327 + "queue": {"id": 2, "type": "rx"}, 328 + "netns-id": 0, 329 + }, 330 + } 331 + ) 332 + ksft_eq(e.exception.nl_msg.error, -errno.ERANGE) 333 + 334 + 335 + def test_resize_leased(netns) -> None: 336 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 337 + defer(nsimdev.remove) 338 + nsim = nsimdev.nsims[0] 339 + ip(f"link set dev {nsim.ifname} up") 340 + 341 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 342 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 343 + 344 + ip(f"link set dev {nk_guest} netns {netns.name}") 345 + ip(f"link set dev {nk_host} up") 346 + ip(f"link set dev {nk_guest} up", ns=netns) 347 + 348 + with NetNSEnter(str(netns)): 349 + netdevnl = NetdevFamily() 350 + netdevnl.queue_create( 351 + { 352 + "ifindex": nk_guest_idx, 353 + "type": "rx", 354 + "lease": { 355 + "ifindex": nsim.ifindex, 356 + "queue": {"id": 1, "type": "rx"}, 357 + "netns-id": 0, 358 + }, 359 + } 360 + ) 361 + 362 + ethnl = EthtoolFamily() 363 + with ksft_raises(NlError) as e: 364 + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) 365 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 366 + 367 + 368 + def test_self_lease(_netns) -> None: 369 + nk_host, _, _, nk_guest_idx = create_netkit(rxqueues=2) 370 + defer(cmd, f"ip link del dev 
{nk_host}", fail=False) 371 + 372 + netdevnl = NetdevFamily() 373 + with ksft_raises(NlError) as e: 374 + netdevnl.queue_create( 375 + { 376 + "ifindex": nk_guest_idx, 377 + "type": "rx", 378 + "lease": { 379 + "ifindex": nk_guest_idx, 380 + "queue": {"id": 0, "type": "rx"}, 381 + }, 382 + } 383 + ) 384 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 385 + 386 + 387 + def test_veth_queue_create(netns) -> None: 388 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 389 + defer(nsimdev.remove) 390 + nsim = nsimdev.nsims[0] 391 + ip(f"link set dev {nsim.ifname} up") 392 + 393 + ip("link add veth0 type veth peer name veth1") 394 + defer(cmd, "ip link del dev veth0", fail=False) 395 + 396 + all_links = ip("-d link show", json=True) 397 + veth_peer = [ 398 + link 399 + for link in all_links 400 + if link.get("ifname") == "veth1" 401 + ] 402 + veth_peer_idx = veth_peer[0]["ifindex"] 403 + 404 + ip(f"link set dev veth1 netns {netns.name}") 405 + ip("link set dev veth0 up") 406 + ip("link set dev veth1 up", ns=netns) 407 + 408 + with NetNSEnter(str(netns)): 409 + netdevnl = NetdevFamily() 410 + with ksft_raises(NlError) as e: 411 + netdevnl.queue_create( 412 + { 413 + "ifindex": veth_peer_idx, 414 + "type": "rx", 415 + "lease": { 416 + "ifindex": nsim.ifindex, 417 + "queue": {"id": 1, "type": "rx"}, 418 + "netns-id": 0, 419 + }, 420 + } 421 + ) 422 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 423 + 424 + 425 + def test_create_tx_type(netns) -> None: 426 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 427 + defer(nsimdev.remove) 428 + nsim = nsimdev.nsims[0] 429 + ip(f"link set dev {nsim.ifname} up") 430 + 431 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 432 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 433 + 434 + ip(f"link set dev {nk_guest} netns {netns.name}") 435 + ip(f"link set dev {nk_host} up") 436 + ip(f"link set dev {nk_guest} up", ns=netns) 437 + 438 + with NetNSEnter(str(netns)): 439 + netdevnl = NetdevFamily() 440 + 
with ksft_raises(NlError) as e: 441 + netdevnl.queue_create( 442 + { 443 + "ifindex": nk_guest_idx, 444 + "type": "tx", 445 + "lease": { 446 + "ifindex": nsim.ifindex, 447 + "queue": {"id": 1, "type": "rx"}, 448 + "netns-id": 0, 449 + }, 450 + } 451 + ) 452 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 453 + 454 + 455 + def test_create_primary(_netns) -> None: 456 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 457 + defer(nsimdev.remove) 458 + nsim = nsimdev.nsims[0] 459 + ip(f"link set dev {nsim.ifname} up") 460 + 461 + nk_host, nk_host_idx, _, _ = create_netkit(rxqueues=2) 462 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 463 + 464 + ip(f"link set dev {nk_host} up") 465 + 466 + netdevnl = NetdevFamily() 467 + with ksft_raises(NlError) as e: 468 + netdevnl.queue_create( 469 + { 470 + "ifindex": nk_host_idx, 471 + "type": "rx", 472 + "lease": { 473 + "ifindex": nsim.ifindex, 474 + "queue": {"id": 1, "type": "rx"}, 475 + }, 476 + } 477 + ) 478 + ksft_eq(e.exception.nl_msg.error, -errno.EOPNOTSUPP) 479 + 480 + 481 + def test_create_limit(netns) -> None: 482 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 483 + defer(nsimdev.remove) 484 + nsim = nsimdev.nsims[0] 485 + ip(f"link set dev {nsim.ifname} up") 486 + 487 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=1) 488 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 489 + 490 + ip(f"link set dev {nk_guest} netns {netns.name}") 491 + ip(f"link set dev {nk_host} up") 492 + ip(f"link set dev {nk_guest} up", ns=netns) 493 + 494 + with NetNSEnter(str(netns)): 495 + netdevnl = NetdevFamily() 496 + with ksft_raises(NlError) as e: 497 + netdevnl.queue_create( 498 + { 499 + "ifindex": nk_guest_idx, 500 + "type": "rx", 501 + "lease": { 502 + "ifindex": nsim.ifindex, 503 + "queue": {"id": 1, "type": "rx"}, 504 + "netns-id": 0, 505 + }, 506 + } 507 + ) 508 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 509 + 510 + 511 + def test_link_flap_phys(netns) -> None: 512 + nsimdev = 
NetdevSimDev(port_count=1, queue_count=2) 513 + defer(nsimdev.remove) 514 + nsim = nsimdev.nsims[0] 515 + ip(f"link set dev {nsim.ifname} up") 516 + 517 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 518 + defer(cmd, f"ip link del dev {nk_host}") 519 + 520 + ip(f"link set dev {nk_guest} netns {netns.name}") 521 + ip(f"link set dev {nk_host} up") 522 + ip(f"link set dev {nk_guest} up", ns=netns) 523 + 524 + src_queue = 1 525 + with NetNSEnter(str(netns)): 526 + netdevnl = NetdevFamily() 527 + result = netdevnl.queue_create( 528 + { 529 + "ifindex": nk_guest_idx, 530 + "type": "rx", 531 + "lease": { 532 + "ifindex": nsim.ifindex, 533 + "queue": {"id": src_queue, "type": "rx"}, 534 + "netns-id": 0, 535 + }, 536 + } 537 + ) 538 + nk_queue_id = result["id"] 539 + 540 + netdevnl = NetdevFamily() 541 + queue_info = netdevnl.queue_get( 542 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 543 + ) 544 + ksft_in("lease", queue_info) 545 + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) 546 + 547 + # Link flap the physical device 548 + ip(f"link set dev {nsim.ifname} down") 549 + ip(f"link set dev {nsim.ifname} up") 550 + 551 + # Verify lease survives the flap 552 + queue_info = netdevnl.queue_get( 553 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 554 + ) 555 + ksft_in("lease", queue_info) 556 + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) 557 + 558 + 559 + def test_queue_get_virtual(netns) -> None: 560 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 561 + defer(nsimdev.remove) 562 + nsim = nsimdev.nsims[0] 563 + ip(f"link set dev {nsim.ifname} up") 564 + 565 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 566 + defer(cmd, f"ip link del dev {nk_host}") 567 + 568 + ip(f"link set dev {nk_guest} netns {netns.name}") 569 + ip(f"link set dev {nk_host} up") 570 + ip(f"link set dev {nk_guest} up", ns=netns) 571 + 572 + src_queue = 1 573 + with NetNSEnter(str(netns)): 574 + netdevnl = NetdevFamily() 
575 + result = netdevnl.queue_create( 576 + { 577 + "ifindex": nk_guest_idx, 578 + "type": "rx", 579 + "lease": { 580 + "ifindex": nsim.ifindex, 581 + "queue": {"id": src_queue, "type": "rx"}, 582 + "netns-id": 0, 583 + }, 584 + } 585 + ) 586 + nk_queue_id = result["id"] 587 + 588 + # queue-get on virtual device's leased queue should not show lease 589 + # info (lease info is only shown from the physical device's side) 590 + queue_info = netdevnl.queue_get( 591 + {"ifindex": nk_guest_idx, "id": nk_queue_id, "type": "rx"} 592 + ) 593 + ksft_eq(queue_info["id"], nk_queue_id) 594 + ksft_eq(queue_info["ifindex"], nk_guest_idx) 595 + ksft_not_in("lease", queue_info) 596 + 597 + # Default queue (not leased) also has no lease info 598 + queue_info = netdevnl.queue_get( 599 + {"ifindex": nk_guest_idx, "id": 0, "type": "rx"} 600 + ) 601 + ksft_not_in("lease", queue_info) 602 + 603 + 604 + def test_remove_virt_first(netns) -> None: 605 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 606 + defer(nsimdev.remove) 607 + nsim = nsimdev.nsims[0] 608 + ip(f"link set dev {nsim.ifname} up") 609 + 610 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 611 + 612 + ip(f"link set dev {nk_guest} netns {netns.name}") 613 + ip(f"link set dev {nk_host} up") 614 + ip(f"link set dev {nk_guest} up", ns=netns) 615 + 616 + src_queue = 1 617 + with NetNSEnter(str(netns)): 618 + netdevnl = NetdevFamily() 619 + result = netdevnl.queue_create( 620 + { 621 + "ifindex": nk_guest_idx, 622 + "type": "rx", 623 + "lease": { 624 + "ifindex": nsim.ifindex, 625 + "queue": {"id": src_queue, "type": "rx"}, 626 + "netns-id": 0, 627 + }, 628 + } 629 + ) 630 + ksft_eq(result["id"], 1) 631 + 632 + netdevnl = NetdevFamily() 633 + queue_info = netdevnl.queue_get( 634 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 635 + ) 636 + ksft_in("lease", queue_info) 637 + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) 638 + 639 + # Delete netkit (virtual device removed first, physical 
stays) 640 + cmd(f"ip link del dev {nk_host}") 641 + 642 + # Verify lease is cleaned up on physical device 643 + queue_info = netdevnl.queue_get( 644 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 645 + ) 646 + ksft_not_in("lease", queue_info) 647 + 648 + 649 + def test_multiple_leases(netns) -> None: 650 + nsimdev = NetdevSimDev(port_count=1, queue_count=3) 651 + defer(nsimdev.remove) 652 + nsim = nsimdev.nsims[0] 653 + ip(f"link set dev {nsim.ifname} up") 654 + 655 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=4) 656 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 657 + 658 + ip(f"link set dev {nk_guest} netns {netns.name}") 659 + ip(f"link set dev {nk_host} up") 660 + ip(f"link set dev {nk_guest} up", ns=netns) 661 + 662 + with NetNSEnter(str(netns)): 663 + netdevnl = NetdevFamily() 664 + r1 = netdevnl.queue_create( 665 + { 666 + "ifindex": nk_guest_idx, 667 + "type": "rx", 668 + "lease": { 669 + "ifindex": nsim.ifindex, 670 + "queue": {"id": 1, "type": "rx"}, 671 + "netns-id": 0, 672 + }, 673 + } 674 + ) 675 + r2 = netdevnl.queue_create( 676 + { 677 + "ifindex": nk_guest_idx, 678 + "type": "rx", 679 + "lease": { 680 + "ifindex": nsim.ifindex, 681 + "queue": {"id": 2, "type": "rx"}, 682 + "netns-id": 0, 683 + }, 684 + } 685 + ) 686 + 687 + ksft_eq(r1["id"], 1) 688 + ksft_eq(r2["id"], 2) 689 + 690 + # Verify both leases visible on physical device 691 + netdevnl = NetdevFamily() 692 + q1 = netdevnl.queue_get( 693 + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} 694 + ) 695 + q2 = netdevnl.queue_get( 696 + {"ifindex": nsim.ifindex, "id": 2, "type": "rx"} 697 + ) 698 + ksft_in("lease", q1) 699 + ksft_in("lease", q2) 700 + ksft_eq(q1["lease"]["ifindex"], nk_guest_idx) 701 + ksft_eq(q2["lease"]["ifindex"], nk_guest_idx) 702 + ksft_eq(q1["lease"]["queue"]["id"], r1["id"]) 703 + ksft_eq(q2["lease"]["queue"]["id"], r2["id"]) 704 + 705 + 706 + def test_lease_queue_tx_type(netns) -> None: 707 + nsimdev = NetdevSimDev(port_count=1, 
queue_count=2) 708 + defer(nsimdev.remove) 709 + nsim = nsimdev.nsims[0] 710 + ip(f"link set dev {nsim.ifname} up") 711 + 712 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 713 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 714 + 715 + ip(f"link set dev {nk_guest} netns {netns.name}") 716 + ip(f"link set dev {nk_host} up") 717 + ip(f"link set dev {nk_guest} up", ns=netns) 718 + 719 + with NetNSEnter(str(netns)): 720 + netdevnl = NetdevFamily() 721 + with ksft_raises(NlError) as e: 722 + netdevnl.queue_create( 723 + { 724 + "ifindex": nk_guest_idx, 725 + "type": "rx", 726 + "lease": { 727 + "ifindex": nsim.ifindex, 728 + "queue": {"id": 1, "type": "tx"}, 729 + "netns-id": 0, 730 + }, 731 + } 732 + ) 733 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 734 + 735 + 736 + def test_invalid_netns(netns) -> None: 737 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 738 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 739 + 740 + ip(f"link set dev {nk_guest} netns {netns.name}") 741 + ip(f"link set dev {nk_host} up") 742 + ip(f"link set dev {nk_guest} up", ns=netns) 743 + 744 + with NetNSEnter(str(netns)): 745 + netdevnl = NetdevFamily() 746 + with ksft_raises(NlError) as e: 747 + netdevnl.queue_create( 748 + { 749 + "ifindex": nk_guest_idx, 750 + "type": "rx", 751 + "lease": { 752 + "ifindex": 1, 753 + "queue": {"id": 0, "type": "rx"}, 754 + "netns-id": 999, 755 + }, 756 + } 757 + ) 758 + ksft_eq(e.exception.nl_msg.error, -errno.ENONET) 759 + 760 + 761 + def test_invalid_phys_ifindex(netns) -> None: 762 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 763 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 764 + 765 + ip(f"link set dev {nk_guest} netns {netns.name}") 766 + ip(f"link set dev {nk_host} up") 767 + ip(f"link set dev {nk_guest} up", ns=netns) 768 + 769 + with NetNSEnter(str(netns)): 770 + netdevnl = NetdevFamily() 771 + with ksft_raises(NlError) as e: 772 + netdevnl.queue_create( 773 + { 774 
+ "ifindex": nk_guest_idx, 775 + "type": "rx", 776 + "lease": { 777 + "ifindex": 99999, 778 + "queue": {"id": 0, "type": "rx"}, 779 + "netns-id": 0, 780 + }, 781 + } 782 + ) 783 + ksft_eq(e.exception.nl_msg.error, -errno.ENODEV) 784 + 785 + 786 + def test_multi_netkit_remove_phys(netns) -> None: 787 + nsimdev = NetdevSimDev(port_count=1, queue_count=3) 788 + defer(nsimdev.remove) 789 + nsim = nsimdev.nsims[0] 790 + ip(f"link set dev {nsim.ifname} up") 791 + 792 + # Create two netkit pairs, each leasing a different physical queue 793 + nk_host_a, _, nk_guest_a, nk_guest_a_idx = create_netkit(rxqueues=2) 794 + defer(cmd, f"ip link del dev {nk_host_a}", fail=False) 795 + 796 + nk_host_b, _, nk_guest_b, nk_guest_b_idx = create_netkit(rxqueues=2) 797 + defer(cmd, f"ip link del dev {nk_host_b}", fail=False) 798 + 799 + ip(f"link set dev {nk_guest_a} netns {netns.name}") 800 + ip(f"link set dev {nk_host_a} up") 801 + ip(f"link set dev {nk_guest_a} up", ns=netns) 802 + 803 + ip(f"link set dev {nk_guest_b} netns {netns.name}") 804 + ip(f"link set dev {nk_host_b} up") 805 + ip(f"link set dev {nk_guest_b} up", ns=netns) 806 + 807 + with NetNSEnter(str(netns)): 808 + netdevnl = NetdevFamily() 809 + netdevnl.queue_create( 810 + { 811 + "ifindex": nk_guest_a_idx, 812 + "type": "rx", 813 + "lease": { 814 + "ifindex": nsim.ifindex, 815 + "queue": {"id": 1, "type": "rx"}, 816 + "netns-id": 0, 817 + }, 818 + } 819 + ) 820 + netdevnl.queue_create( 821 + { 822 + "ifindex": nk_guest_b_idx, 823 + "type": "rx", 824 + "lease": { 825 + "ifindex": nsim.ifindex, 826 + "queue": {"id": 2, "type": "rx"}, 827 + "netns-id": 0, 828 + }, 829 + } 830 + ) 831 + 832 + # Removing the physical device should take down both netkit pairs 833 + nsimdev.remove() 834 + time.sleep(0.1) 835 + ret = cmd(f"ip link show dev {nk_host_a}", fail=False) 836 + ksft_ne(ret.ret, 0) 837 + ret = cmd(f"ip link show dev {nk_host_b}", fail=False) 838 + ksft_ne(ret.ret, 0) 839 + 840 + 841 + def test_single_remove_phys(_netns) 
-> None: 842 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 843 + defer(nsimdev.remove) 844 + nsim = nsimdev.nsims[0] 845 + ip(f"link set dev {nsim.ifname} up") 846 + 847 + nk_name, nk_idx = create_netkit_single(rxqueues=2) 848 + defer(cmd, f"ip link del dev {nk_name}", fail=False) 849 + 850 + ip(f"link set dev {nk_name} up") 851 + 852 + netdevnl = NetdevFamily() 853 + netdevnl.queue_create( 854 + { 855 + "ifindex": nk_idx, 856 + "type": "rx", 857 + "lease": { 858 + "ifindex": nsim.ifindex, 859 + "queue": {"id": 1, "type": "rx"}, 860 + }, 861 + } 862 + ) 863 + 864 + # Removing the physical device should take down the single netkit device 865 + nsimdev.remove() 866 + time.sleep(0.1) 867 + ret = cmd(f"ip link show dev {nk_name}", fail=False) 868 + ksft_ne(ret.ret, 0) 869 + 870 + 871 + def test_link_flap_virt(netns) -> None: 872 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 873 + defer(nsimdev.remove) 874 + nsim = nsimdev.nsims[0] 875 + ip(f"link set dev {nsim.ifname} up") 876 + 877 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 878 + defer(cmd, f"ip link del dev {nk_host}") 879 + 880 + ip(f"link set dev {nk_guest} netns {netns.name}") 881 + ip(f"link set dev {nk_host} up") 882 + ip(f"link set dev {nk_guest} up", ns=netns) 883 + 884 + src_queue = 1 885 + with NetNSEnter(str(netns)): 886 + netdevnl = NetdevFamily() 887 + result = netdevnl.queue_create( 888 + { 889 + "ifindex": nk_guest_idx, 890 + "type": "rx", 891 + "lease": { 892 + "ifindex": nsim.ifindex, 893 + "queue": {"id": src_queue, "type": "rx"}, 894 + "netns-id": 0, 895 + }, 896 + } 897 + ) 898 + nk_queue_id = result["id"] 899 + 900 + netdevnl = NetdevFamily() 901 + queue_info = netdevnl.queue_get( 902 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 903 + ) 904 + ksft_in("lease", queue_info) 905 + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) 906 + 907 + # Link flap the virtual (netkit) device 908 + ip(f"link set dev {nk_guest} down", ns=netns) 909 + 
ip(f"link set dev {nk_guest} up", ns=netns) 910 + 911 + # Verify lease survives the virtual device flap 912 + queue_info = netdevnl.queue_get( 913 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 914 + ) 915 + ksft_in("lease", queue_info) 916 + ksft_eq(queue_info["lease"]["queue"]["id"], nk_queue_id) 917 + 918 + 919 + def test_phys_queue_no_lease(netns) -> None: 920 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 921 + defer(nsimdev.remove) 922 + nsim = nsimdev.nsims[0] 923 + ip(f"link set dev {nsim.ifname} up") 924 + 925 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 926 + defer(cmd, f"ip link del dev {nk_host}") 927 + 928 + ip(f"link set dev {nk_guest} netns {netns.name}") 929 + ip(f"link set dev {nk_host} up") 930 + ip(f"link set dev {nk_guest} up", ns=netns) 931 + 932 + with NetNSEnter(str(netns)): 933 + netdevnl = NetdevFamily() 934 + netdevnl.queue_create( 935 + { 936 + "ifindex": nk_guest_idx, 937 + "type": "rx", 938 + "lease": { 939 + "ifindex": nsim.ifindex, 940 + "queue": {"id": 1, "type": "rx"}, 941 + "netns-id": 0, 942 + }, 943 + } 944 + ) 945 + 946 + # Physical queue 0 (not leased) should have no lease info 947 + netdevnl = NetdevFamily() 948 + queue_info = netdevnl.queue_get( 949 + {"ifindex": nsim.ifindex, "id": 0, "type": "rx"} 950 + ) 951 + ksft_not_in("lease", queue_info) 952 + 953 + # Physical queue 1 (leased) should have lease info 954 + queue_info = netdevnl.queue_get( 955 + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} 956 + ) 957 + ksft_in("lease", queue_info) 958 + 959 + 960 + def test_same_ns_lease(_netns) -> None: 961 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 962 + defer(nsimdev.remove) 963 + nsim = nsimdev.nsims[0] 964 + ip(f"link set dev {nsim.ifname} up") 965 + 966 + nk_name, nk_idx = create_netkit_single(rxqueues=2) 967 + defer(cmd, f"ip link del dev {nk_name}", fail=False) 968 + 969 + ip(f"link set dev {nk_name} up") 970 + 971 + netdevnl = NetdevFamily() 972 + result = 
netdevnl.queue_create( 973 + { 974 + "ifindex": nk_idx, 975 + "type": "rx", 976 + "lease": { 977 + "ifindex": nsim.ifindex, 978 + "queue": {"id": 1, "type": "rx"}, 979 + }, 980 + } 981 + ) 982 + ksft_eq(result["id"], 1) 983 + 984 + # Same namespace: lease info should NOT have netns-id 985 + queue_info = netdevnl.queue_get( 986 + {"ifindex": nsim.ifindex, "id": 1, "type": "rx"} 987 + ) 988 + ksft_in("lease", queue_info) 989 + ksft_eq(queue_info["lease"]["ifindex"], nk_idx) 990 + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) 991 + ksft_not_in("netns-id", queue_info["lease"]) 992 + 993 + 994 + def test_resize_after_unlease(netns) -> None: 995 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 996 + defer(nsimdev.remove) 997 + nsim = nsimdev.nsims[0] 998 + ip(f"link set dev {nsim.ifname} up") 999 + 1000 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 1001 + 1002 + ip(f"link set dev {nk_guest} netns {netns.name}") 1003 + ip(f"link set dev {nk_host} up") 1004 + ip(f"link set dev {nk_guest} up", ns=netns) 1005 + 1006 + with NetNSEnter(str(netns)): 1007 + netdevnl = NetdevFamily() 1008 + netdevnl.queue_create( 1009 + { 1010 + "ifindex": nk_guest_idx, 1011 + "type": "rx", 1012 + "lease": { 1013 + "ifindex": nsim.ifindex, 1014 + "queue": {"id": 1, "type": "rx"}, 1015 + "netns-id": 0, 1016 + }, 1017 + } 1018 + ) 1019 + 1020 + # Resize should fail while lease is active 1021 + ethnl = EthtoolFamily() 1022 + with ksft_raises(NlError) as e: 1023 + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) 1024 + ksft_eq(e.exception.nl_msg.error, -errno.EINVAL) 1025 + 1026 + # Delete netkit, clearing the lease 1027 + cmd(f"ip link del dev {nk_host}") 1028 + 1029 + # Resize should now succeed 1030 + ethnl.channels_set({"header": {"dev-index": nsim.ifindex}, "combined-count": 1}) 1031 + 1032 + 1033 + def test_lease_queue_zero(netns) -> None: 1034 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 1035 + defer(nsimdev.remove) 
1036 + nsim = nsimdev.nsims[0] 1037 + ip(f"link set dev {nsim.ifname} up") 1038 + 1039 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 1040 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 1041 + 1042 + ip(f"link set dev {nk_guest} netns {netns.name}") 1043 + ip(f"link set dev {nk_host} up") 1044 + ip(f"link set dev {nk_guest} up", ns=netns) 1045 + 1046 + with NetNSEnter(str(netns)): 1047 + netdevnl = NetdevFamily() 1048 + result = netdevnl.queue_create( 1049 + { 1050 + "ifindex": nk_guest_idx, 1051 + "type": "rx", 1052 + "lease": { 1053 + "ifindex": nsim.ifindex, 1054 + "queue": {"id": 0, "type": "rx"}, 1055 + "netns-id": 0, 1056 + }, 1057 + } 1058 + ) 1059 + ksft_eq(result["id"], 1) 1060 + 1061 + netdevnl = NetdevFamily() 1062 + queue_info = netdevnl.queue_get( 1063 + {"ifindex": nsim.ifindex, "id": 0, "type": "rx"} 1064 + ) 1065 + ksft_in("lease", queue_info) 1066 + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) 1067 + 1068 + 1069 + def test_release_and_reuse(netns) -> None: 1070 + nsimdev = NetdevSimDev(port_count=1, queue_count=2) 1071 + defer(nsimdev.remove) 1072 + nsim = nsimdev.nsims[0] 1073 + ip(f"link set dev {nsim.ifname} up") 1074 + 1075 + src_queue = 1 1076 + 1077 + # First lease 1078 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 1079 + 1080 + ip(f"link set dev {nk_guest} netns {netns.name}") 1081 + ip(f"link set dev {nk_host} up") 1082 + ip(f"link set dev {nk_guest} up", ns=netns) 1083 + 1084 + with NetNSEnter(str(netns)): 1085 + netdevnl = NetdevFamily() 1086 + netdevnl.queue_create( 1087 + { 1088 + "ifindex": nk_guest_idx, 1089 + "type": "rx", 1090 + "lease": { 1091 + "ifindex": nsim.ifindex, 1092 + "queue": {"id": src_queue, "type": "rx"}, 1093 + "netns-id": 0, 1094 + }, 1095 + } 1096 + ) 1097 + 1098 + netdevnl = NetdevFamily() 1099 + queue_info = netdevnl.queue_get( 1100 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 1101 + ) 1102 + ksft_in("lease", queue_info) 1103 + 1104 + # Delete 
netkit, freeing the lease 1105 + cmd(f"ip link del dev {nk_host}") 1106 + 1107 + queue_info = netdevnl.queue_get( 1108 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 1109 + ) 1110 + ksft_not_in("lease", queue_info) 1111 + 1112 + # Re-create netkit and lease the same physical queue again 1113 + nk_host, _, nk_guest, nk_guest_idx = create_netkit(rxqueues=2) 1114 + defer(cmd, f"ip link del dev {nk_host}", fail=False) 1115 + 1116 + ip(f"link set dev {nk_guest} netns {netns.name}") 1117 + ip(f"link set dev {nk_host} up") 1118 + ip(f"link set dev {nk_guest} up", ns=netns) 1119 + 1120 + with NetNSEnter(str(netns)): 1121 + netdevnl = NetdevFamily() 1122 + result = netdevnl.queue_create( 1123 + { 1124 + "ifindex": nk_guest_idx, 1125 + "type": "rx", 1126 + "lease": { 1127 + "ifindex": nsim.ifindex, 1128 + "queue": {"id": src_queue, "type": "rx"}, 1129 + "netns-id": 0, 1130 + }, 1131 + } 1132 + ) 1133 + ksft_eq(result["id"], 1) 1134 + 1135 + netdevnl = NetdevFamily() 1136 + queue_info = netdevnl.queue_get( 1137 + {"ifindex": nsim.ifindex, "id": src_queue, "type": "rx"} 1138 + ) 1139 + ksft_in("lease", queue_info) 1140 + ksft_eq(queue_info["lease"]["queue"]["id"], result["id"]) 1141 + 1142 + 1143 + def test_iou_zcrx(cfg) -> None: 1144 + cfg.require_ipver("6") 1145 + ethnl = EthtoolFamily() 1146 + 1147 + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) 1148 + rx_rings = rings["rx"] 1149 + hds_thresh = rings.get("hds-thresh", 0) 1150 + 1151 + ethnl.rings_set( 1152 + { 1153 + "header": {"dev-index": cfg.ifindex}, 1154 + "tcp-data-split": "enabled", 1155 + "hds-thresh": 0, 1156 + "rx": 64, 1157 + } 1158 + ) 1159 + defer( 1160 + ethnl.rings_set, 1161 + { 1162 + "header": {"dev-index": cfg.ifindex}, 1163 + "tcp-data-split": "unknown", 1164 + "hds-thresh": hds_thresh, 1165 + "rx": rx_rings, 1166 + }, 1167 + ) 1168 + 1169 + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") 1170 + defer(ethtool, f"-X {cfg.ifname} default") 1171 + 1172 + flow_rule_id = 
set_flow_rule(cfg) 1173 + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") 1174 + 1175 + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" 1176 + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.nk_guest_ipv6} -p {cfg.port} -l 12840" 1177 + with bkg(rx_cmd, exit_wait=True): 1178 + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) 1179 + cmd(tx_cmd, host=cfg.remote) 1180 + 1181 + 1182 + def test_attrs(cfg) -> None: 1183 + cfg.require_ipver("6") 1184 + netdevnl = NetdevFamily() 1185 + queue_info = netdevnl.queue_get( 1186 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1187 + ) 1188 + 1189 + ksft_eq(queue_info["id"], cfg.src_queue) 1190 + ksft_eq(queue_info["type"], "rx") 1191 + ksft_eq(queue_info["ifindex"], cfg.ifindex) 1192 + 1193 + ksft_in("lease", queue_info) 1194 + lease = queue_info["lease"] 1195 + ksft_eq(lease["ifindex"], cfg.nk_guest_ifindex) 1196 + ksft_eq(lease["queue"]["id"], cfg.nk_queue) 1197 + ksft_eq(lease["queue"]["type"], "rx") 1198 + ksft_in("netns-id", lease) 1199 + 1200 + 1201 + def test_attach_xdp_with_mp(cfg) -> None: 1202 + cfg.require_ipver("6") 1203 + ethnl = EthtoolFamily() 1204 + 1205 + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) 1206 + rx_rings = rings["rx"] 1207 + hds_thresh = rings.get("hds-thresh", 0) 1208 + 1209 + ethnl.rings_set( 1210 + { 1211 + "header": {"dev-index": cfg.ifindex}, 1212 + "tcp-data-split": "enabled", 1213 + "hds-thresh": 0, 1214 + "rx": 64, 1215 + } 1216 + ) 1217 + defer( 1218 + ethnl.rings_set, 1219 + { 1220 + "header": {"dev-index": cfg.ifindex}, 1221 + "tcp-data-split": "unknown", 1222 + "hds-thresh": hds_thresh, 1223 + "rx": rx_rings, 1224 + }, 1225 + ) 1226 + 1227 + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") 1228 + defer(ethtool, f"-X {cfg.ifname} default") 1229 + 1230 + netdevnl = NetdevFamily() 1231 + 1232 + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i 
{cfg._nk_guest_ifname} -q {cfg.nk_queue}" 1233 + with bkg(rx_cmd): 1234 + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) 1235 + 1236 + time.sleep(0.1) 1237 + queue_info = netdevnl.queue_get( 1238 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1239 + ) 1240 + ksft_in("io-uring", queue_info) 1241 + 1242 + prog = cfg.net_lib_dir / "xdp_dummy.bpf.o" 1243 + with ksft_raises(CmdExitFailure): 1244 + ip(f"link set dev {cfg.ifname} xdp obj {prog} sec xdp.frags") 1245 + 1246 + time.sleep(0.1) 1247 + queue_info = netdevnl.queue_get( 1248 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1249 + ) 1250 + ksft_not_in("io-uring", queue_info) 1251 + 1252 + 1253 + def test_destroy(cfg) -> None: 1254 + cfg.require_ipver("6") 1255 + ethnl = EthtoolFamily() 1256 + 1257 + rings = ethnl.rings_get({"header": {"dev-index": cfg.ifindex}}) 1258 + rx_rings = rings["rx"] 1259 + hds_thresh = rings.get("hds-thresh", 0) 1260 + 1261 + ethnl.rings_set( 1262 + { 1263 + "header": {"dev-index": cfg.ifindex}, 1264 + "tcp-data-split": "enabled", 1265 + "hds-thresh": 0, 1266 + "rx": 64, 1267 + } 1268 + ) 1269 + defer( 1270 + ethnl.rings_set, 1271 + { 1272 + "header": {"dev-index": cfg.ifindex}, 1273 + "tcp-data-split": "unknown", 1274 + "hds-thresh": hds_thresh, 1275 + "rx": rx_rings, 1276 + }, 1277 + ) 1278 + 1279 + ethtool(f"-X {cfg.ifname} equal {cfg.src_queue}") 1280 + defer(ethtool, f"-X {cfg.ifname} default") 1281 + 1282 + rx_cmd = f"ip netns exec {cfg.netns.name} {cfg.bin_local} -s -p {cfg.port} -i {cfg._nk_guest_ifname} -q {cfg.nk_queue}" 1283 + rx_proc = cmd(rx_cmd, background=True) 1284 + wait_port_listen(cfg.port, proto="tcp", ns=cfg.netns) 1285 + 1286 + netdevnl = NetdevFamily() 1287 + queue_info = netdevnl.queue_get( 1288 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1289 + ) 1290 + ksft_in("io-uring", queue_info) 1291 + 1292 + # ip link del will wait for all refs to drop first, but iou-zcrx is holding 1293 + # onto a ref. 
Terminate iou-zcrx async via a thread after a delay. 1294 + kill_timer = threading.Timer(1, rx_proc.proc.terminate) 1295 + kill_timer.start() 1296 + 1297 + ip(f"link del dev {cfg._nk_host_ifname}") 1298 + kill_timer.join() 1299 + cfg._nk_host_ifname = None 1300 + cfg._nk_guest_ifname = None 1301 + 1302 + queue_info = netdevnl.queue_get( 1303 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1304 + ) 1305 + ksft_not_in("io-uring", queue_info) 1306 + 1307 + cmd(f"tc filter del dev {cfg.ifname} ingress pref {cfg._bpf_prog_pref}") 1308 + cfg._tc_attached = False 1309 + 1310 + flow_rule_id = set_flow_rule(cfg) 1311 + defer(ethtool, f"-N {cfg.ifname} delete {flow_rule_id}") 1312 + 1313 + rx_cmd = f"{cfg.bin_local} -s -p {cfg.port} -i {cfg.ifname} -q {cfg.src_queue}" 1314 + tx_cmd = f"{cfg.bin_remote} -c -h {cfg.addr_v['6']} -p {cfg.port} -l 12840" 1315 + with bkg(rx_cmd, exit_wait=True): 1316 + wait_port_listen(cfg.port, proto="tcp") 1317 + cmd(tx_cmd, host=cfg.remote) 1318 + # Short delay since iou cleanup is async and takes a bit of time. 
1319 + time.sleep(0.1) 1320 + queue_info = netdevnl.queue_get( 1321 + {"ifindex": cfg.ifindex, "id": cfg.src_queue, "type": "rx"} 1322 + ) 1323 + ksft_not_in("io-uring", queue_info) 1324 + 1325 + 1326 + def main() -> None: 1327 + netns = NetNS() 1328 + cmd("ip netns attach init 1") 1329 + ip("netns set init 0", ns=netns) 1330 + ip("link set lo up", ns=netns) 1331 + 1332 + ksft_run( 1333 + [ 1334 + test_remove_phys, 1335 + test_double_lease, 1336 + test_virtual_lessor, 1337 + test_phys_lessee, 1338 + test_different_lessors, 1339 + test_queue_out_of_range, 1340 + test_resize_leased, 1341 + test_self_lease, 1342 + test_create_tx_type, 1343 + test_create_primary, 1344 + test_create_limit, 1345 + test_link_flap_phys, 1346 + test_queue_get_virtual, 1347 + test_remove_virt_first, 1348 + test_multiple_leases, 1349 + test_lease_queue_tx_type, 1350 + test_invalid_netns, 1351 + test_invalid_phys_ifindex, 1352 + test_multi_netkit_remove_phys, 1353 + test_single_remove_phys, 1354 + test_link_flap_virt, 1355 + test_phys_queue_no_lease, 1356 + test_same_ns_lease, 1357 + test_resize_after_unlease, 1358 + test_lease_queue_zero, 1359 + test_release_and_reuse, 1360 + test_veth_queue_create, 1361 + ], 1362 + args=(netns,), 1363 + ) 1364 + 1365 + cmd("ip netns del init", fail=False) 1366 + del netns 1367 + 1368 + with NetDrvContEnv(__file__, rxqueues=2) as cfg: 1369 + cfg.bin_local = path.abspath( 1370 + path.dirname(__file__) + "/../../../drivers/net/hw/iou-zcrx" 1371 + ) 1372 + cfg.bin_remote = cfg.remote.deploy(cfg.bin_local) 1373 + cfg.port = rand_port() 1374 + 1375 + ethnl = EthtoolFamily() 1376 + channels = ethnl.channels_get({"header": {"dev-index": cfg.ifindex}}) 1377 + channels = channels["combined-count"] 1378 + if channels < 2: 1379 + raise KsftSkipEx("Test requires NETIF with at least 2 combined channels") 1380 + 1381 + cfg.src_queue = channels - 1 1382 + 1383 + with NetNSEnter(str(cfg.netns)): 1384 + netdevnl = NetdevFamily() 1385 + bind_result = netdevnl.queue_create( 
1386 + { 1387 + "ifindex": cfg.nk_guest_ifindex, 1388 + "type": "rx", 1389 + "lease": { 1390 + "ifindex": cfg.ifindex, 1391 + "queue": {"id": cfg.src_queue, "type": "rx"}, 1392 + "netns-id": 0, 1393 + }, 1394 + } 1395 + ) 1396 + cfg.nk_queue = bind_result["id"] 1397 + 1398 + # test_destroy must be last because it destroys the netkit devices 1399 + ksft_run( 1400 + [test_iou_zcrx, test_attrs, test_attach_xdp_with_mp, test_destroy], 1401 + args=(cfg,), 1402 + ) 1403 + ksft_exit() 1404 + 1405 + 1406 + if __name__ == "__main__": 1407 + main()