Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ipv4-namespacify-ipv4-address-hash-table'

Kuniyuki Iwashima says:

====================
ipv4: Namespacify IPv4 address hash table.

This is preparation for the per-net RTNL conversion of RTM_(NEW|DEL|SET)ADDR.

Currently, each IPv4 address is linked to the global hash table, and
this needs to be protected by another global lock or namespacified to
support per-net RTNL.

Adding a global lock would cause a deadlock between the rtnetlink path and GC:

  rtnetlink                      check_lifetime
  |- rtnl_net_lock(net)          |- acquire the global lock
  |- acquire the global lock     |- check ifa's netns
  `- put ifa into hash table     `- rtnl_net_lock(net)

so we need to namespacify the hash table.

The IPv6 one is already namespacified, so let's follow that.

v2: https://lore.kernel.org/netdev/20241004195958.64396-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20241001024837.96425-1-kuniyu@amazon.com/
====================

Link: https://patch.msgid.link/20241008172906.1326-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+42 -31
+1 -1
include/linux/inetdevice.h
··· 141 141 ARP_EVICT_NOCARRIER) 142 142 143 143 struct in_ifaddr { 144 - struct hlist_node hash; 144 + struct hlist_node addr_lst; 145 145 struct in_ifaddr __rcu *ifa_next; 146 146 struct in_device *ifa_dev; 147 147 struct rcu_head rcu_head;
+2
include/net/netns/ipv4.h
··· 270 270 271 271 atomic_t rt_genid; 272 272 siphash_key_t ip_id_key; 273 + struct hlist_head *inet_addr_lst; 274 + struct delayed_work addr_chk_work; 273 275 }; 274 276 #endif
+39 -30
net/ipv4/devinet.c
··· 119 119 #define IN4_ADDR_HSIZE_SHIFT 8 120 120 #define IN4_ADDR_HSIZE (1U << IN4_ADDR_HSIZE_SHIFT) 121 121 122 - static struct hlist_head inet_addr_lst[IN4_ADDR_HSIZE]; 123 - 124 122 static u32 inet_addr_hash(const struct net *net, __be32 addr) 125 123 { 126 124 u32 val = (__force u32) addr ^ net_hash_mix(net); ··· 131 133 u32 hash = inet_addr_hash(net, ifa->ifa_local); 132 134 133 135 ASSERT_RTNL(); 134 - hlist_add_head_rcu(&ifa->hash, &inet_addr_lst[hash]); 136 + hlist_add_head_rcu(&ifa->addr_lst, &net->ipv4.inet_addr_lst[hash]); 135 137 } 136 138 137 139 static void inet_hash_remove(struct in_ifaddr *ifa) 138 140 { 139 141 ASSERT_RTNL(); 140 - hlist_del_init_rcu(&ifa->hash); 142 + hlist_del_init_rcu(&ifa->addr_lst); 141 143 } 142 144 143 145 /** ··· 184 186 u32 hash = inet_addr_hash(net, addr); 185 187 struct in_ifaddr *ifa; 186 188 187 - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[hash], hash) 188 - if (ifa->ifa_local == addr && 189 - net_eq(dev_net(ifa->ifa_dev->dev), net)) 189 + hlist_for_each_entry_rcu(ifa, &net->ipv4.inet_addr_lst[hash], addr_lst) 190 + if (ifa->ifa_local == addr) 190 191 return ifa; 191 192 192 193 return NULL; ··· 224 227 in_dev_hold(in_dev); 225 228 ifa->ifa_dev = in_dev; 226 229 227 - INIT_HLIST_NODE(&ifa->hash); 230 + INIT_HLIST_NODE(&ifa->addr_lst); 228 231 229 232 return ifa; 230 233 } ··· 481 484 __inet_del_ifa(in_dev, ifap, destroy, NULL, 0); 482 485 } 483 486 484 - static void check_lifetime(struct work_struct *work); 485 - 486 - static DECLARE_DELAYED_WORK(check_lifetime_work, check_lifetime); 487 - 488 487 static int __inet_insert_ifa(struct in_ifaddr *ifa, struct nlmsghdr *nlh, 489 488 u32 portid, struct netlink_ext_ack *extack) 490 489 { 491 490 struct in_ifaddr __rcu **last_primary, **ifap; 492 491 struct in_device *in_dev = ifa->ifa_dev; 492 + struct net *net = dev_net(in_dev->dev); 493 493 struct in_validator_info ivi; 494 494 struct in_ifaddr *ifa1; 495 495 int ret; ··· 555 561 556 562 
inet_hash_insert(dev_net(in_dev->dev), ifa); 557 563 558 - cancel_delayed_work(&check_lifetime_work); 559 - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); 564 + cancel_delayed_work(&net->ipv4.addr_chk_work); 565 + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 0); 560 566 561 567 /* Send message first, then call notifier. 562 568 Notifier will trigger FIB update, so that ··· 702 708 unsigned long now, next, next_sec, next_sched; 703 709 struct in_ifaddr *ifa; 704 710 struct hlist_node *n; 711 + struct net *net; 705 712 int i; 706 713 714 + net = container_of(to_delayed_work(work), struct net, ipv4.addr_chk_work); 707 715 now = jiffies; 708 716 next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); 709 717 710 718 for (i = 0; i < IN4_ADDR_HSIZE; i++) { 719 + struct hlist_head *head = &net->ipv4.inet_addr_lst[i]; 711 720 bool change_needed = false; 712 721 713 722 rcu_read_lock(); 714 - hlist_for_each_entry_rcu(ifa, &inet_addr_lst[i], hash) { 723 + hlist_for_each_entry_rcu(ifa, head, addr_lst) { 715 724 unsigned long age, tstamp; 716 725 u32 preferred_lft; 717 726 u32 valid_lft; ··· 752 755 if (!change_needed) 753 756 continue; 754 757 rtnl_lock(); 755 - hlist_for_each_entry_safe(ifa, n, &inet_addr_lst[i], hash) { 758 + hlist_for_each_entry_safe(ifa, n, head, addr_lst) { 756 759 unsigned long age; 757 760 758 761 if (ifa->ifa_flags & IFA_F_PERMANENT) ··· 801 804 if (time_before(next_sched, now + ADDRCONF_TIMER_FUZZ_MAX)) 802 805 next_sched = now + ADDRCONF_TIMER_FUZZ_MAX; 803 806 804 - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 805 - next_sched - now); 807 + queue_delayed_work(system_power_efficient_wq, &net->ipv4.addr_chk_work, 808 + next_sched - now); 806 809 } 807 810 808 811 static void set_ifa_lifetime(struct in_ifaddr *ifa, __u32 valid_lft, ··· 999 1002 ifa->ifa_proto = new_proto; 1000 1003 1001 1004 set_ifa_lifetime(ifa, valid_lft, prefered_lft); 1002 - 
cancel_delayed_work(&check_lifetime_work); 1005 + cancel_delayed_work(&net->ipv4.addr_chk_work); 1003 1006 queue_delayed_work(system_power_efficient_wq, 1004 - &check_lifetime_work, 0); 1007 + &net->ipv4.addr_chk_work, 0); 1005 1008 rtmsg_ifa(RTM_NEWADDR, ifa, nlh, NETLINK_CB(skb).portid); 1006 1009 } 1007 1010 return 0; ··· 2660 2663 2661 2664 static __net_init int devinet_init_net(struct net *net) 2662 2665 { 2663 - int err; 2664 - struct ipv4_devconf *all, *dflt; 2665 2666 #ifdef CONFIG_SYSCTL 2666 - struct ctl_table *tbl; 2667 2667 struct ctl_table_header *forw_hdr; 2668 + struct ctl_table *tbl; 2668 2669 #endif 2670 + struct ipv4_devconf *all, *dflt; 2671 + int err; 2672 + int i; 2669 2673 2670 2674 err = -ENOMEM; 2675 + net->ipv4.inet_addr_lst = kmalloc_array(IN4_ADDR_HSIZE, 2676 + sizeof(struct hlist_head), 2677 + GFP_KERNEL); 2678 + if (!net->ipv4.inet_addr_lst) 2679 + goto err_alloc_hash; 2680 + 2671 2681 all = kmemdup(&ipv4_devconf, sizeof(ipv4_devconf), GFP_KERNEL); 2672 2682 if (!all) 2673 2683 goto err_alloc_all; ··· 2735 2731 net->ipv4.forw_hdr = forw_hdr; 2736 2732 #endif 2737 2733 2734 + for (i = 0; i < IN4_ADDR_HSIZE; i++) 2735 + INIT_HLIST_HEAD(&net->ipv4.inet_addr_lst[i]); 2736 + 2737 + INIT_DEFERRABLE_WORK(&net->ipv4.addr_chk_work, check_lifetime); 2738 + 2738 2739 net->ipv4.devconf_all = all; 2739 2740 net->ipv4.devconf_dflt = dflt; 2740 2741 return 0; ··· 2757 2748 err_alloc_dflt: 2758 2749 kfree(all); 2759 2750 err_alloc_all: 2751 + kfree(net->ipv4.inet_addr_lst); 2752 + err_alloc_hash: 2760 2753 return err; 2761 2754 } 2762 2755 ··· 2766 2755 { 2767 2756 #ifdef CONFIG_SYSCTL 2768 2757 const struct ctl_table *tbl; 2758 + #endif 2769 2759 2760 + cancel_delayed_work_sync(&net->ipv4.addr_chk_work); 2761 + 2762 + #ifdef CONFIG_SYSCTL 2770 2763 tbl = net->ipv4.forw_hdr->ctl_table_arg; 2771 2764 unregister_net_sysctl_table(net->ipv4.forw_hdr); 2772 2765 __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, ··· 2781 2766 #endif 2782 2767 
kfree(net->ipv4.devconf_dflt); 2783 2768 kfree(net->ipv4.devconf_all); 2769 + kfree(net->ipv4.inet_addr_lst); 2784 2770 } 2785 2771 2786 2772 static __net_initdata struct pernet_operations devinet_ops = { ··· 2799 2783 2800 2784 void __init devinet_init(void) 2801 2785 { 2802 - int i; 2803 - 2804 - for (i = 0; i < IN4_ADDR_HSIZE; i++) 2805 - INIT_HLIST_HEAD(&inet_addr_lst[i]); 2806 - 2807 2786 register_pernet_subsys(&devinet_ops); 2808 2787 register_netdevice_notifier(&ip_netdev_notifier); 2809 - 2810 - queue_delayed_work(system_power_efficient_wq, &check_lifetime_work, 0); 2811 2788 2812 2789 rtnl_af_register(&inet_af_ops); 2813 2790