Merge branch 'net-improve-core-queue-api-handling-while-device-is-down'

Jakub Kicinski says:

====================
net: improve core queue API handling while device is down

The core netdev_rx_queue_restart() doesn't currently take into account
that the device may be down. The current and proposed queue API
implementations deal with this by rejecting queue API calls while
the device is down. We can do better: in theory we can still allow
devmem binding when the device is down - we shouldn't stop and start
the queues, just try to allocate the memory. The reason we allocate
the memory is that memory provider binding checks if any compatible
page pool has been created (page_pool_check_memory_provider()).

Alternatively, we could reject installing a memory provider (MP) while the
device is down, but the MP assignment survives ifdown (so presumably the MP
doesn't cease to exist while down), and in general we allow configuration
while down.

Previously I thought we needed this as a fix, but gve rejects page pool
calls while down, and so did Saeed in the patches he posted. So this
series just makes the core act more sensibly, but in practice it should
be a no-op for now.

v1: https://lore.kernel.org/20250205190131.564456-1-kuba@kernel.org
====================

Link: https://patch.msgid.link/20250206225638.1387810-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Jakub Kicinski 1 year ago acdefab0 6a0ca73e

+59 -29

6 changed files

expand all

drivers

net

netdevsim

netdev.c

include

net

netdev_queues.h

net

core

dev.h

netdev_rx_queue.c

page_pool.c

tools

testing

selftests

net

nl_netdev.py

+4 -6

drivers/net/netdevsim/netdev.c

··· 645 645 if (ns->rq_reset_mode > 3) 646 646 return -EINVAL; 647 647 648 - if (ns->rq_reset_mode == 1) 648 + if (ns->rq_reset_mode == 1) { 649 + if (!netif_running(ns->netdev)) 650 + return -ENETDOWN; 649 651 return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi); 652 + } 650 653 651 654 qmem->rq = nsim_queue_alloc(); 652 655 if (!qmem->rq) ··· 757 754 return -EINVAL; 758 755 759 756 rtnl_lock(); 760 - if (!netif_running(ns->netdev)) { 761 - ret = -ENETDOWN; 762 - goto exit_unlock; 763 - } 764 - 765 757 if (queue >= ns->netdev->real_num_rx_queues) { 766 758 ret = -EINVAL; 767 759 goto exit_unlock;

include/net/netdev_queues.h

··· 117 117 * 118 118 * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped 119 119 * queue's memory is written at the specified address. 120 + * 121 + * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while 122 + * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only 123 + * be called for an interface which is open. 120 124 */ 121 125 struct netdev_queue_mgmt_ops { 122 126 size_t ndo_queue_mem_size;

+12

net/core/dev.h

··· 299 299 static inline void xdp_do_check_flushed(struct napi_struct *napi) { } 300 300 #endif 301 301 302 + /* Best effort check that NAPI is not idle (can't be scheduled to run) */ 303 + static inline void napi_assert_will_not_race(const struct napi_struct *napi) 304 + { 305 + /* uninitialized instance, can't race */ 306 + if (!napi->poll_list.next) 307 + return; 308 + 309 + /* SCHED bit is set on disabled instances */ 310 + WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); 311 + WARN_ON(READ_ONCE(napi->list_owner) != -1); 312 + } 313 + 302 314 void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); 303 315 304 316 #define XMIT_RECURSION_LIMIT 8

+20 -17

net/core/netdev_rx_queue.c

··· 10 10 int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) 11 11 { 12 12 struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); 13 + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; 13 14 void *new_mem, *old_mem; 14 15 int err; 15 16 16 - if (!dev->queue_mgmt_ops || !dev->queue_mgmt_ops->ndo_queue_stop || 17 - !dev->queue_mgmt_ops->ndo_queue_mem_free || 18 - !dev->queue_mgmt_ops->ndo_queue_mem_alloc || 19 - !dev->queue_mgmt_ops->ndo_queue_start) 17 + if (!qops || !qops->ndo_queue_stop || !qops->ndo_queue_mem_free || 18 + !qops->ndo_queue_mem_alloc || !qops->ndo_queue_start) 20 19 return -EOPNOTSUPP; 21 20 22 21 ASSERT_RTNL(); 23 22 24 - new_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); 23 + new_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); 25 24 if (!new_mem) 26 25 return -ENOMEM; 27 26 28 - old_mem = kvzalloc(dev->queue_mgmt_ops->ndo_queue_mem_size, GFP_KERNEL); 27 + old_mem = kvzalloc(qops->ndo_queue_mem_size, GFP_KERNEL); 29 28 if (!old_mem) { 30 29 err = -ENOMEM; 31 30 goto err_free_new_mem; 32 31 } 33 32 34 - err = dev->queue_mgmt_ops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); 33 + err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); 35 34 if (err) 36 35 goto err_free_old_mem; 37 36 ··· 38 39 if (err) 39 40 goto err_free_new_queue_mem; 40 41 41 - err = dev->queue_mgmt_ops->ndo_queue_stop(dev, old_mem, rxq_idx); 42 - if (err) 43 - goto err_free_new_queue_mem; 42 + if (netif_running(dev)) { 43 + err = qops->ndo_queue_stop(dev, old_mem, rxq_idx); 44 + if (err) 45 + goto err_free_new_queue_mem; 44 46 45 - err = dev->queue_mgmt_ops->ndo_queue_start(dev, new_mem, rxq_idx); 46 - if (err) 47 - goto err_start_queue; 47 + err = qops->ndo_queue_start(dev, new_mem, rxq_idx); 48 + if (err) 49 + goto err_start_queue; 50 + } else { 51 + swap(new_mem, old_mem); 52 + } 48 53 49 - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); 54 + qops->ndo_queue_mem_free(dev, old_mem); 50 55 51 56 
kvfree(old_mem); 52 57 kvfree(new_mem); ··· 65 62 * WARN if we fail to recover the old rx queue, and at least free 66 63 * old_mem so we don't also leak that. 67 64 */ 68 - if (dev->queue_mgmt_ops->ndo_queue_start(dev, old_mem, rxq_idx)) { 65 + if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { 69 66 WARN(1, 70 67 "Failed to restart old queue in error path. RX queue %d may be unhealthy.", 71 68 rxq_idx); 72 - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, old_mem); 69 + qops->ndo_queue_mem_free(dev, old_mem); 73 70 } 74 71 75 72 err_free_new_queue_mem: 76 - dev->queue_mgmt_ops->ndo_queue_mem_free(dev, new_mem); 73 + qops->ndo_queue_mem_free(dev, new_mem); 77 74 78 75 err_free_old_mem: 79 76 kvfree(old_mem);

+2 -5

net/core/page_pool.c

··· 26 26 27 27 #include <trace/events/page_pool.h> 28 28 29 + #include "dev.h" 29 30 #include "mp_dmabuf_devmem.h" 30 31 #include "netmem_priv.h" 31 32 #include "page_pool_priv.h" ··· 1148 1147 if (!pool->p.napi) 1149 1148 return; 1150 1149 1151 - /* To avoid races with recycling and additional barriers make sure 1152 - * pool and NAPI are unlinked when NAPI is disabled. 1153 - */ 1154 - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state)); 1155 - WARN_ON(READ_ONCE(pool->p.napi->list_owner) != -1); 1150 + napi_assert_will_not_race(pool->p.napi); 1156 1151 1157 1152 mutex_lock(&page_pools_lock); 1158 1153 WRITE_ONCE(pool->p.napi, NULL);

+17 -1

tools/testing/selftests/net/nl_netdev.py

··· 35 35 comment=f"queue count after reset queue {q} mode {i}") 36 36 37 37 38 + def nsim_rxq_reset_down(nf) -> None: 39 + """ 40 + Test that the queue API supports resetting a queue 41 + while the interface is down. We should convert this 42 + test to testing real HW once more devices support 43 + queue API. 44 + """ 45 + with NetdevSimDev(queue_count=4) as nsimdev: 46 + nsim = nsimdev.nsims[0] 47 + 48 + ip(f"link set dev {nsim.ifname} down") 49 + for i in [0, 2, 3]: 50 + nsim.dfs_write("queue_reset", f"1 {i}") 51 + 52 + 38 53 def page_pool_check(nf) -> None: 39 54 with NetdevSimDev() as nsimdev: 40 55 nsim = nsimdev.nsims[0] ··· 121 106 122 107 def main() -> None: 123 108 nf = NetdevFamily() 124 - ksft_run([empty_check, lo_check, page_pool_check, napi_list_check], 109 + ksft_run([empty_check, lo_check, page_pool_check, napi_list_check, 110 + nsim_rxq_reset_down], 125 111 args=(nf, )) 126 112 ksft_exit() 127 113

Configure Feed

Configure Feed