Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: move aRFS rmap management and CPU affinity to core

A common task for most drivers is to remember the user-set CPU affinity
of its IRQs. On each netdev reset, the driver should re-assign the user's
settings to the IRQs. Unify this task across all drivers by moving the CPU
affinity to napi->config.

However, to move the CPU affinity to core, we also need to move aRFS
rmap management since aRFS uses its own IRQ notifiers.

For the aRFS, add a new netdev flag "rx_cpu_rmap_auto". Drivers supporting
aRFS should set the flag via netif_enable_cpu_rmap() and core will allocate
and manage the aRFS rmaps. Freeing the rmap is also done by core when the
netdev is freed. For better IRQ affinity management, move the IRQ rmap
notifier inside the napi_struct and add new notify.notify and
notify.release functions: netif_napi_irq_notify() and
netif_napi_affinity_release().

Now we have the aRFS rmap management in core, add CPU affinity mask to
napi_config. To delegate the CPU affinity management to the core, drivers
must:
1 - set the new netdev flag "irq_affinity_auto":
netif_set_affinity_auto(netdev)
2 - create the napi with persistent config:
netif_napi_add_config()
3 - bind an IRQ to the napi instance: netif_napi_set_irq()

The core will then make sure to re-assign the stored affinity to the
napi's IRQ.

The default IRQ mask is set to one CPU per queue, spread starting from the closest NUMA node.

Signed-off-by: Ahmed Zaki <ahmed.zaki@intel.com>
Link: https://patch.msgid.link/20250224232228.990783-2-ahmed.zaki@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Ahmed Zaki and committed by
Jakub Kicinski
bd7c0060 28d68d39

+195 -7
+4 -2
Documentation/networking/scaling.rst
··· 434 434 is maintained by the NIC driver. This is an auto-generated reverse map of 435 435 the IRQ affinity table shown by /proc/interrupts. Drivers can use 436 436 functions in the cpu_rmap (“CPU affinity reverse map”) kernel library 437 - to populate the map. For each CPU, the corresponding queue in the map is 438 - set to be one whose processing CPU is closest in cache locality. 437 + to populate the map. Alternatively, drivers can delegate the cpu_rmap 438 + management to the Kernel by calling netif_enable_cpu_rmap(). For each CPU, 439 + the corresponding queue in the map is set to be one whose processing CPU is 440 + closest in cache locality. 439 441 440 442 441 443 Accelerated RFS Configuration
+1
include/linux/cpu_rmap.h
··· 32 32 #define CPU_RMAP_DIST_INF 0xffff 33 33 34 34 extern struct cpu_rmap *alloc_cpu_rmap(unsigned int size, gfp_t flags); 35 + extern void cpu_rmap_get(struct cpu_rmap *rmap); 35 36 extern int cpu_rmap_put(struct cpu_rmap *rmap); 36 37 37 38 extern int cpu_rmap_add(struct cpu_rmap *rmap, void *obj);
+20 -4
include/linux/netdevice.h
··· 352 352 u64 gro_flush_timeout; 353 353 u64 irq_suspend_timeout; 354 354 u32 defer_hard_irqs; 355 + cpumask_t affinity_mask; 355 356 unsigned int napi_id; 356 357 }; 357 358 ··· 395 394 struct list_head dev_list; 396 395 struct hlist_node napi_hash_node; 397 396 int irq; 397 + struct irq_affinity_notify notify; 398 + int napi_rmap_idx; 398 399 int index; 399 400 struct napi_config *config; 400 401 }; ··· 412 409 NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ 413 410 NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ 414 411 NAPI_STATE_SCHED_THREADED, /* Napi is currently scheduled in threaded mode */ 412 + NAPI_STATE_HAS_NOTIFIER, /* Napi has an IRQ notifier */ 415 413 }; 416 414 417 415 enum { ··· 426 422 NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), 427 423 NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), 428 424 NAPIF_STATE_SCHED_THREADED = BIT(NAPI_STATE_SCHED_THREADED), 425 + NAPIF_STATE_HAS_NOTIFIER = BIT(NAPI_STATE_HAS_NOTIFIER), 429 426 }; 430 427 431 428 enum gro_result { ··· 1994 1989 * 1995 1990 * @threaded: napi threaded mode is enabled 1996 1991 * 1992 + * @irq_affinity_auto: driver wants the core to store and re-assign the IRQ 1993 + * affinity. Set by netif_enable_irq_affinity(), then 1994 + * the driver must create a persistent napi by 1995 + * netif_napi_add_config() and finally bind the napi to 1996 + * IRQ (via netif_napi_set_irq()). 1997 + * 1998 + * @rx_cpu_rmap_auto: driver wants the core to manage the ARFS rmap. 1999 + * Set by calling netif_enable_cpu_rmap(). 
2000 + * 1997 2001 * @see_all_hwtstamp_requests: device wants to see calls to 1998 2002 * ndo_hwtstamp_set() for all timestamp requests 1999 2003 * regardless of source, even if those aren't ··· 2410 2396 struct lock_class_key *qdisc_tx_busylock; 2411 2397 bool proto_down; 2412 2398 bool threaded; 2399 + bool irq_affinity_auto; 2400 + bool rx_cpu_rmap_auto; 2413 2401 2414 2402 /* priv_flags_slow, ungrouped to save space */ 2415 2403 unsigned long see_all_hwtstamp_requests:1; ··· 2740 2724 netdev_assert_locked(dev); 2741 2725 } 2742 2726 2743 - static inline void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) 2744 - { 2745 - napi->irq = irq; 2746 - } 2727 + void netif_napi_set_irq_locked(struct napi_struct *napi, int irq); 2747 2728 2748 2729 static inline void netif_napi_set_irq(struct napi_struct *napi, int irq) 2749 2730 { ··· 2877 2864 __netif_napi_del(napi); 2878 2865 synchronize_net(); 2879 2866 } 2867 + 2868 + int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs); 2869 + void netif_set_affinity_auto(struct net_device *dev); 2880 2870 2881 2871 struct packet_type { 2882 2872 __be16 type; /* This is really htons(ether_type). */
+1 -1
lib/cpu_rmap.c
··· 73 73 * cpu_rmap_get - internal helper to get new ref on a cpu_rmap 74 74 * @rmap: reverse-map allocated with alloc_cpu_rmap() 75 75 */ 76 - static inline void cpu_rmap_get(struct cpu_rmap *rmap) 76 + void cpu_rmap_get(struct cpu_rmap *rmap) 77 77 { 78 78 kref_get(&rmap->refcount); 79 79 }
+169
net/core/dev.c
··· 6943 6943 } 6944 6944 EXPORT_SYMBOL(netif_queue_set_napi); 6945 6945 6946 + static void 6947 + netif_napi_irq_notify(struct irq_affinity_notify *notify, 6948 + const cpumask_t *mask) 6949 + { 6950 + struct napi_struct *napi = 6951 + container_of(notify, struct napi_struct, notify); 6952 + #ifdef CONFIG_RFS_ACCEL 6953 + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; 6954 + int err; 6955 + #endif 6956 + 6957 + if (napi->config && napi->dev->irq_affinity_auto) 6958 + cpumask_copy(&napi->config->affinity_mask, mask); 6959 + 6960 + #ifdef CONFIG_RFS_ACCEL 6961 + if (napi->dev->rx_cpu_rmap_auto) { 6962 + err = cpu_rmap_update(rmap, napi->napi_rmap_idx, mask); 6963 + if (err) 6964 + netdev_warn(napi->dev, "RMAP update failed (%d)\n", 6965 + err); 6966 + } 6967 + #endif 6968 + } 6969 + 6970 + #ifdef CONFIG_RFS_ACCEL 6971 + static void netif_napi_affinity_release(struct kref *ref) 6972 + { 6973 + struct napi_struct *napi = 6974 + container_of(ref, struct napi_struct, notify.kref); 6975 + struct cpu_rmap *rmap = napi->dev->rx_cpu_rmap; 6976 + 6977 + netdev_assert_locked(napi->dev); 6978 + WARN_ON(test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, 6979 + &napi->state)); 6980 + 6981 + if (!napi->dev->rx_cpu_rmap_auto) 6982 + return; 6983 + rmap->obj[napi->napi_rmap_idx] = NULL; 6984 + napi->napi_rmap_idx = -1; 6985 + cpu_rmap_put(rmap); 6986 + } 6987 + 6988 + int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) 6989 + { 6990 + if (dev->rx_cpu_rmap_auto) 6991 + return 0; 6992 + 6993 + dev->rx_cpu_rmap = alloc_irq_cpu_rmap(num_irqs); 6994 + if (!dev->rx_cpu_rmap) 6995 + return -ENOMEM; 6996 + 6997 + dev->rx_cpu_rmap_auto = true; 6998 + return 0; 6999 + } 7000 + EXPORT_SYMBOL(netif_enable_cpu_rmap); 7001 + 7002 + static void netif_del_cpu_rmap(struct net_device *dev) 7003 + { 7004 + struct cpu_rmap *rmap = dev->rx_cpu_rmap; 7005 + 7006 + if (!dev->rx_cpu_rmap_auto) 7007 + return; 7008 + 7009 + /* Free the rmap */ 7010 + cpu_rmap_put(rmap); 7011 + 
dev->rx_cpu_rmap = NULL; 7012 + dev->rx_cpu_rmap_auto = false; 7013 + } 7014 + 7015 + #else 7016 + static void netif_napi_affinity_release(struct kref *ref) 7017 + { 7018 + } 7019 + 7020 + int netif_enable_cpu_rmap(struct net_device *dev, unsigned int num_irqs) 7021 + { 7022 + return 0; 7023 + } 7024 + EXPORT_SYMBOL(netif_enable_cpu_rmap); 7025 + 7026 + static void netif_del_cpu_rmap(struct net_device *dev) 7027 + { 7028 + } 7029 + #endif 7030 + 7031 + void netif_set_affinity_auto(struct net_device *dev) 7032 + { 7033 + unsigned int i, maxqs, numa; 7034 + 7035 + maxqs = max(dev->num_tx_queues, dev->num_rx_queues); 7036 + numa = dev_to_node(&dev->dev); 7037 + 7038 + for (i = 0; i < maxqs; i++) 7039 + cpumask_set_cpu(cpumask_local_spread(i, numa), 7040 + &dev->napi_config[i].affinity_mask); 7041 + 7042 + dev->irq_affinity_auto = true; 7043 + } 7044 + EXPORT_SYMBOL(netif_set_affinity_auto); 7045 + 7046 + void netif_napi_set_irq_locked(struct napi_struct *napi, int irq) 7047 + { 7048 + int rc; 7049 + 7050 + netdev_assert_locked_or_invisible(napi->dev); 7051 + 7052 + if (napi->irq == irq) 7053 + return; 7054 + 7055 + /* Remove existing resources */ 7056 + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) 7057 + irq_set_affinity_notifier(napi->irq, NULL); 7058 + 7059 + napi->irq = irq; 7060 + if (irq < 0 || 7061 + (!napi->dev->rx_cpu_rmap_auto && !napi->dev->irq_affinity_auto)) 7062 + return; 7063 + 7064 + /* Abort for buggy drivers */ 7065 + if (napi->dev->irq_affinity_auto && WARN_ON_ONCE(!napi->config)) 7066 + return; 7067 + 7068 + #ifdef CONFIG_RFS_ACCEL 7069 + if (napi->dev->rx_cpu_rmap_auto) { 7070 + rc = cpu_rmap_add(napi->dev->rx_cpu_rmap, napi); 7071 + if (rc < 0) 7072 + return; 7073 + 7074 + cpu_rmap_get(napi->dev->rx_cpu_rmap); 7075 + napi->napi_rmap_idx = rc; 7076 + } 7077 + #endif 7078 + 7079 + /* Use core IRQ notifier */ 7080 + napi->notify.notify = netif_napi_irq_notify; 7081 + napi->notify.release = netif_napi_affinity_release; 7082 + rc = 
irq_set_affinity_notifier(irq, &napi->notify); 7083 + if (rc) { 7084 + netdev_warn(napi->dev, "Unable to set IRQ notifier (%d)\n", 7085 + rc); 7086 + goto put_rmap; 7087 + } 7088 + 7089 + set_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state); 7090 + return; 7091 + 7092 + put_rmap: 7093 + #ifdef CONFIG_RFS_ACCEL 7094 + if (napi->dev->rx_cpu_rmap_auto) { 7095 + cpu_rmap_put(napi->dev->rx_cpu_rmap); 7096 + napi->dev->rx_cpu_rmap->obj[napi->napi_rmap_idx] = NULL; 7097 + napi->napi_rmap_idx = -1; 7098 + } 7099 + #endif 7100 + napi->notify.notify = NULL; 7101 + napi->notify.release = NULL; 7102 + } 7103 + EXPORT_SYMBOL(netif_napi_set_irq_locked); 7104 + 6946 7105 static void napi_restore_config(struct napi_struct *n) 6947 7106 { 6948 7107 n->defer_hard_irqs = n->config->defer_hard_irqs; 6949 7108 n->gro_flush_timeout = n->config->gro_flush_timeout; 6950 7109 n->irq_suspend_timeout = n->config->irq_suspend_timeout; 7110 + 7111 + if (n->dev->irq_affinity_auto && 7112 + test_bit(NAPI_STATE_HAS_NOTIFIER, &n->state)) 7113 + irq_set_affinity(n->irq, &n->config->affinity_mask); 7114 + 6951 7115 /* a NAPI ID might be stored in the config, if so use it. if not, use 6952 7116 * napi_hash_add to generate one for us. 6953 7117 */ ··· 7331 7167 7332 7168 /* Make sure NAPI is disabled (or was never enabled). */ 7333 7169 WARN_ON(!test_bit(NAPI_STATE_SCHED, &napi->state)); 7170 + 7171 + if (test_and_clear_bit(NAPI_STATE_HAS_NOTIFIER, &napi->state)) 7172 + irq_set_affinity_notifier(napi->irq, NULL); 7334 7173 7335 7174 if (napi->config) { 7336 7175 napi->index = -1; ··· 11886 11719 dev_addr_flush(dev); 11887 11720 11888 11721 netdev_napi_exit(dev); 11722 + 11723 + netif_del_cpu_rmap(dev); 11889 11724 11890 11725 ref_tracker_dir_exit(&dev->refcnt_tracker); 11891 11726 #ifdef CONFIG_PCPU_DEV_REFCNT