Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

net: introduce per netns packet chains

Currently network taps unbound to any interface are linked in the
global ptype_all list, affecting the performance in all the network
namespaces.

Add per netns ptype chains, so that in the mentioned case only
the netns owning the packet socket(s) is affected.

While at it, drop the global ptype_all list: no in-kernel user
registers a tap on "any" type without specifying either the target
device or the target namespace (and IMHO doing that would not make
any sense).

Note that this adds a conditional in the fast path (to check for
the per netns ptype_specific list) and increases the dataset size by
a cacheline (owing to the per netns lists).

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/ae405f98875ee87f8150c460ad162de7e466f8a7.1742494826.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Paolo Abeni and committed by
Jakub Kicinski
c353e898 29abdf66

+78 -22
+11 -1
include/linux/netdevice.h
··· 4278 4278 return 0; 4279 4279 } 4280 4280 4281 - bool dev_nit_active(struct net_device *dev); 4281 + bool dev_nit_active_rcu(const struct net_device *dev); 4282 + static inline bool dev_nit_active(const struct net_device *dev) 4283 + { 4284 + bool ret; 4285 + 4286 + rcu_read_lock(); 4287 + ret = dev_nit_active_rcu(dev); 4288 + rcu_read_unlock(); 4289 + return ret; 4290 + } 4291 + 4282 4292 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); 4283 4293 4284 4294 static inline void __dev_put(struct net_device *dev)
-1
include/net/hotdata.h
··· 23 23 struct net_offload udpv6_offload; 24 24 #endif 25 25 struct list_head offload_base; 26 - struct list_head ptype_all; 27 26 struct kmem_cache *skbuff_cache; 28 27 struct kmem_cache *skbuff_fclone_cache; 29 28 struct kmem_cache *skb_small_head_cache;
+3
include/net/net_namespace.h
··· 83 83 struct llist_node defer_free_list; 84 84 struct llist_node cleanup_list; /* namespaces on death row */ 85 85 86 + struct list_head ptype_all; 87 + struct list_head ptype_specific; 88 + 86 89 #ifdef CONFIG_KEYS 87 90 struct key_tag *key_domain; /* Key domain of operation tag */ 88 91 #endif
+41 -12
net/core/dev.c
··· 572 572 573 573 static inline struct list_head *ptype_head(const struct packet_type *pt) 574 574 { 575 - if (pt->type == htons(ETH_P_ALL)) 576 - return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all; 577 - else 578 - return pt->dev ? &pt->dev->ptype_specific : 575 + if (pt->type == htons(ETH_P_ALL)) { 576 + if (!pt->af_packet_net && !pt->dev) 577 + return NULL; 578 + 579 + return pt->dev ? &pt->dev->ptype_all : 580 + &pt->af_packet_net->ptype_all; 581 + } 582 + 583 + if (pt->dev) 584 + return &pt->dev->ptype_specific; 585 + 586 + return pt->af_packet_net ? &pt->af_packet_net->ptype_specific : 579 587 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; 580 588 } 581 589 ··· 603 595 void dev_add_pack(struct packet_type *pt) 604 596 { 605 597 struct list_head *head = ptype_head(pt); 598 + 599 + if (WARN_ON_ONCE(!head)) 600 + return; 606 601 607 602 spin_lock(&ptype_lock); 608 603 list_add_rcu(&pt->list, head); ··· 630 619 { 631 620 struct list_head *head = ptype_head(pt); 632 621 struct packet_type *pt1; 622 + 623 + if (!head) 624 + return; 633 625 634 626 spin_lock(&ptype_lock); 635 627 ··· 2455 2441 } 2456 2442 2457 2443 /** 2458 - * dev_nit_active - return true if any network interface taps are in use 2444 + * dev_nit_active_rcu - return true if any network interface taps are in use 2445 + * 2446 + * The caller must hold the RCU lock 2459 2447 * 2460 2448 * @dev: network device to check for the presence of taps 2461 2449 */ 2462 - bool dev_nit_active(struct net_device *dev) 2450 + bool dev_nit_active_rcu(const struct net_device *dev) 2463 2451 { 2464 - return !list_empty(&net_hotdata.ptype_all) || 2452 + /* Callers may hold either RCU or RCU BH lock */ 2453 + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); 2454 + 2455 + return !list_empty(&dev_net(dev)->ptype_all) || 2465 2456 !list_empty(&dev->ptype_all); 2466 2457 } 2467 - EXPORT_SYMBOL_GPL(dev_nit_active); 2458 + EXPORT_SYMBOL_GPL(dev_nit_active_rcu); 2468 2459 2469 2460 /* 2470 2461 * 
Support routine. Sends outgoing frames to any network ··· 2478 2459 2479 2460 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) 2480 2461 { 2481 - struct list_head *ptype_list = &net_hotdata.ptype_all; 2482 2462 struct packet_type *ptype, *pt_prev = NULL; 2463 + struct list_head *ptype_list; 2483 2464 struct sk_buff *skb2 = NULL; 2484 2465 2485 2466 rcu_read_lock(); 2467 + ptype_list = &dev_net_rcu(dev)->ptype_all; 2486 2468 again: 2487 2469 list_for_each_entry_rcu(ptype, ptype_list, list) { 2488 2470 if (READ_ONCE(ptype->ignore_outgoing)) ··· 2527 2507 pt_prev = ptype; 2528 2508 } 2529 2509 2530 - if (ptype_list == &net_hotdata.ptype_all) { 2510 + if (ptype_list != &dev->ptype_all) { 2531 2511 ptype_list = &dev->ptype_all; 2532 2512 goto again; 2533 2513 } ··· 3772 3752 unsigned int len; 3773 3753 int rc; 3774 3754 3775 - if (dev_nit_active(dev)) 3755 + if (dev_nit_active_rcu(dev)) 3776 3756 dev_queue_xmit_nit(skb, dev); 3777 3757 3778 3758 len = skb->len; ··· 5716 5696 if (pfmemalloc) 5717 5697 goto skip_taps; 5718 5698 5719 - list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) { 5699 + list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, 5700 + list) { 5720 5701 if (pt_prev) 5721 5702 ret = deliver_skb(skb, pt_prev, orig_dev); 5722 5703 pt_prev = ptype; ··· 5829 5808 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 5830 5809 &ptype_base[ntohs(type) & 5831 5810 PTYPE_HASH_MASK]); 5811 + 5812 + /* orig_dev and skb->dev could belong to different netns; 5813 + * Even in such case we need to traverse only the list 5814 + * coming from skb->dev, as the ptype owner (packet socket) 5815 + * will use dev_net(skb->dev) to do namespace filtering. 5816 + */ 5817 + deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type, 5818 + &dev_net_rcu(skb->dev)->ptype_specific); 5832 5819 } 5833 5820 5834 5821 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
-1
net/core/hotdata.c
··· 7 7 8 8 struct net_hotdata net_hotdata __cacheline_aligned = { 9 9 .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base), 10 - .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all), 11 10 .gro_normal_batch = 8, 12 11 13 12 .netdev_budget = 300,
+21 -7
net/core/net-procfs.c
··· 185 185 } 186 186 } 187 187 188 - list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) { 188 + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) { 189 + if (i == pos) 190 + return pt; 191 + ++i; 192 + } 193 + 194 + list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) { 189 195 if (i == pos) 190 196 return pt; 191 197 ++i; ··· 216 210 217 211 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos) 218 212 { 213 + struct net *net = seq_file_net(seq); 219 214 struct net_device *dev; 220 215 struct packet_type *pt; 221 216 struct list_head *nxt; ··· 239 232 goto found; 240 233 } 241 234 } 242 - 243 - nxt = net_hotdata.ptype_all.next; 244 - goto ptype_all; 235 + nxt = net->ptype_all.next; 236 + goto net_ptype_all; 245 237 } 246 238 247 - if (pt->type == htons(ETH_P_ALL)) { 248 - ptype_all: 249 - if (nxt != &net_hotdata.ptype_all) 239 + if (pt->af_packet_net) { 240 + net_ptype_all: 241 + if (nxt != &net->ptype_all && nxt != &net->ptype_specific) 250 242 goto found; 243 + 244 + if (nxt == &net->ptype_all) { 245 + /* continue with ->ptype_specific if it's not empty */ 246 + nxt = net->ptype_specific.next; 247 + if (nxt != &net->ptype_specific) 248 + goto found; 249 + } 250 + 251 251 hash = 0; 252 252 nxt = ptype_base[0].next; 253 253 } else
+2
net/core/net_namespace.c
··· 340 340 lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); 341 341 #endif 342 342 343 + INIT_LIST_HEAD(&net->ptype_all); 344 + INIT_LIST_HEAD(&net->ptype_specific); 343 345 preinit_net_sysctl(net); 344 346 } 345 347