Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ipmr-no-rtnl-for-rtnl_family_ipmr-rtnetlink'

Kuniyuki Iwashima says:

====================
ipmr: No RTNL for RTNL_FAMILY_IPMR rtnetlink.

This series removes RTNL from ipmr rtnetlink handlers.

After this series, a few RTNL users remain in net/ipv4/ipmr.c;
they will be converted to per-netns RTNL in another
series.

Patch 1 adds a selftest to exercise most? of the RTNL paths
in net/ipv4/ipmr.c

Patches 2 - 6 convert RTM_GETLINK / RTM_GETROUTE handlers
to RCU.

Patches 7 - 9 convert ->exit_batch() to ->exit_rtnl() to
save one RTNL in cleanup_net().

Patches 10 - 11 remove unnecessary RTNL during setup_net()
failure.

Patch 12 is a random cleanup.

Patches 13 - 15 drop RTNL for RTM_NEWROUTE and RTM_DELROUTE.
====================

Link: https://patch.msgid.link/20260228221800.1082070-1-kuniyu@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+636 -114
+4 -5
include/linux/mroute_base.h
··· 76 76 struct vif_device *vif, 77 77 struct net_device *vif_dev, 78 78 unsigned short vif_index, u32 tb_id, 79 - unsigned int *ipmr_seq) 79 + atomic_t *ipmr_seq) 80 80 { 81 81 struct vif_entry_notifier_info info = { 82 82 .info = { ··· 89 89 }; 90 90 91 91 ASSERT_RTNL(); 92 - (*ipmr_seq)++; 92 + atomic_inc(ipmr_seq); 93 93 return call_fib_notifiers(net, event_type, &info.info); 94 94 } 95 95 ··· 198 198 unsigned short family, 199 199 enum fib_event_type event_type, 200 200 struct mr_mfc *mfc, u32 tb_id, 201 - unsigned int *ipmr_seq) 201 + atomic_t *ipmr_seq) 202 202 { 203 203 struct mfc_entry_notifier_info info = { 204 204 .info = { ··· 208 208 .tb_id = tb_id 209 209 }; 210 210 211 - ASSERT_RTNL(); 212 - (*ipmr_seq)++; 211 + atomic_inc(ipmr_seq); 213 212 return call_fib_notifiers(net, event_type, &info.info); 214 213 } 215 214
+3 -3
include/net/netns/ipv4.h
··· 279 279 struct list_head mr_tables; 280 280 struct fib_rules_ops *mr_rules_ops; 281 281 #endif 282 + struct fib_notifier_ops *ipmr_notifier_ops; 283 + atomic_t ipmr_seq; 284 + struct mutex mfc_mutex; 282 285 #endif 283 286 #ifdef CONFIG_IP_ROUTE_MULTIPATH 284 287 struct sysctl_fib_multipath_hash_seed sysctl_fib_multipath_hash_seed; ··· 292 289 293 290 struct fib_notifier_ops *notifier_ops; 294 291 unsigned int fib_seq; /* writes protected by rtnl_mutex */ 295 - 296 - struct fib_notifier_ops *ipmr_notifier_ops; 297 - unsigned int ipmr_seq; /* protected by rtnl_mutex */ 298 292 299 293 atomic_t rt_genid; 300 294 siphash_key_t ip_id_key;
+1 -1
include/net/netns/ipv6.h
··· 118 118 struct seg6_pernet_data *seg6_data; 119 119 struct fib_notifier_ops *notifier_ops; 120 120 struct fib_notifier_ops *ip6mr_notifier_ops; 121 - unsigned int ipmr_seq; /* protected by rtnl_mutex */ 121 + atomic_t ipmr_seq; 122 122 struct { 123 123 struct hlist_head head; 124 124 spinlock_t lock;
+164 -101
net/ipv4/ipmr.c
··· 102 102 static struct kmem_cache *mrt_cachep __ro_after_init; 103 103 104 104 static struct mr_table *ipmr_new_table(struct net *net, u32 id); 105 - static void ipmr_free_table(struct mr_table *mrt); 105 + static void ipmr_free_table(struct mr_table *mrt, 106 + struct list_head *dev_kill_list); 106 107 107 108 static void ip_mr_forward(struct net *net, struct mr_table *mrt, 108 109 struct net_device *dev, struct sk_buff *skb, ··· 113 112 static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, 114 113 int cmd); 115 114 static void igmpmsg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); 116 - static void mroute_clean_tables(struct mr_table *mrt, int flags); 115 + static void mroute_clean_tables(struct mr_table *mrt, int flags, 116 + struct list_head *dev_kill_list); 117 117 static void ipmr_expire_process(struct timer_list *t); 118 118 119 119 #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES ··· 252 250 static int __net_init ipmr_rules_init(struct net *net) 253 251 { 254 252 struct fib_rules_ops *ops; 253 + LIST_HEAD(dev_kill_list); 255 254 struct mr_table *mrt; 256 255 int err; 257 256 ··· 276 273 return 0; 277 274 278 275 err2: 279 - rtnl_lock(); 280 - ipmr_free_table(mrt); 281 - rtnl_unlock(); 276 + ipmr_free_table(mrt, &dev_kill_list); 282 277 err1: 283 278 fib_rules_unregister(ops); 284 279 return err; ··· 284 283 285 284 static void __net_exit ipmr_rules_exit(struct net *net) 286 285 { 286 + fib_rules_unregister(net->ipv4.mr_rules_ops); 287 + } 288 + 289 + static void __net_exit ipmr_rules_exit_rtnl(struct net *net, 290 + struct list_head *dev_kill_list) 291 + { 287 292 struct mr_table *mrt, *next; 288 293 289 - ASSERT_RTNL(); 290 294 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 291 295 list_del(&mrt->list); 292 - ipmr_free_table(mrt); 296 + ipmr_free_table(mrt, dev_kill_list); 293 297 } 294 - fib_rules_unregister(net->ipv4.mr_rules_ops); 295 298 } 296 299 297 300 static int ipmr_rules_dump(struct net *net, 
struct notifier_block *nb, ··· 353 348 354 349 static void __net_exit ipmr_rules_exit(struct net *net) 355 350 { 356 - ASSERT_RTNL(); 357 - ipmr_free_table(net->ipv4.mrt); 351 + } 352 + 353 + static void __net_exit ipmr_rules_exit_rtnl(struct net *net, 354 + struct list_head *dev_kill_list) 355 + { 356 + ipmr_free_table(net->ipv4.mrt, dev_kill_list); 357 + 358 358 net->ipv4.mrt = NULL; 359 359 } 360 360 ··· 434 424 ipmr_expire_process, ipmr_new_table_set); 435 425 } 436 426 437 - static void ipmr_free_table(struct mr_table *mrt) 427 + static void ipmr_free_table(struct mr_table *mrt, struct list_head *dev_kill_list) 438 428 { 439 429 struct net *net = read_pnet(&mrt->net); 430 + LIST_HEAD(ipmr_dev_kill_list); 440 431 441 432 WARN_ON_ONCE(!mr_can_free_table(net)); 442 433 443 434 timer_shutdown_sync(&mrt->ipmr_expire_timer); 444 435 mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | 445 - MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); 436 + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC, 437 + &ipmr_dev_kill_list); 446 438 rhltable_destroy(&mrt->mfc_hash); 447 439 kfree(mrt); 440 + 441 + WARN_ON_ONCE(!net_initialized(net) && !list_empty(&ipmr_dev_kill_list)); 442 + list_splice(&ipmr_dev_kill_list, dev_kill_list); 448 443 } 449 444 450 445 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ ··· 1211 1196 struct net *net = read_pnet(&mrt->net); 1212 1197 struct mfc_cache *c; 1213 1198 1214 - /* The entries are added/deleted only under RTNL */ 1215 1199 rcu_read_lock(); 1216 1200 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, 1217 1201 mfc->mfcc_mcastgrp.s_addr, parent); ··· 1237 1223 if (mfc->mfcc_parent >= MAXVIFS) 1238 1224 return -ENFILE; 1239 1225 1240 - /* The entries are added/deleted only under RTNL */ 1241 1226 rcu_read_lock(); 1242 1227 c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr, 1243 1228 mfc->mfcc_mcastgrp.s_addr, parent); ··· 1306 1293 } 1307 1294 1308 1295 /* Close the multicast socket, and clear the vif tables 
etc */ 1309 - static void mroute_clean_tables(struct mr_table *mrt, int flags) 1296 + static void mroute_clean_tables(struct mr_table *mrt, int flags, 1297 + struct list_head *dev_kill_list) 1310 1298 { 1311 1299 struct net *net = read_pnet(&mrt->net); 1312 - struct mr_mfc *c, *tmp; 1313 1300 struct mfc_cache *cache; 1314 - LIST_HEAD(list); 1301 + struct mr_mfc *c, *tmp; 1315 1302 int i; 1316 1303 1317 1304 /* Shut down all active vif entries */ ··· 1321 1308 !(flags & MRT_FLUSH_VIFS_STATIC)) || 1322 1309 (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) 1323 1310 continue; 1324 - vif_delete(mrt, i, 0, &list); 1311 + vif_delete(mrt, i, 0, dev_kill_list); 1325 1312 } 1326 - unregister_netdevice_many(&list); 1327 1313 } 1328 1314 1329 1315 /* Wipe the cache */ 1330 1316 if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { 1317 + mutex_lock(&net->ipv4.mfc_mutex); 1318 + 1331 1319 list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { 1332 1320 if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || 1333 1321 (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) ··· 1341 1327 mroute_netlink_event(mrt, cache, RTM_DELROUTE); 1342 1328 mr_cache_put(c); 1343 1329 } 1330 + 1331 + mutex_unlock(&net->ipv4.mfc_mutex); 1344 1332 } 1345 1333 1346 1334 if (flags & MRT_FLUSH_MFC) { ··· 1365 1349 static void mrtsock_destruct(struct sock *sk) 1366 1350 { 1367 1351 struct net *net = sock_net(sk); 1352 + LIST_HEAD(dev_kill_list); 1368 1353 struct mr_table *mrt; 1369 1354 1370 1355 rtnl_lock(); 1356 + 1371 1357 ipmr_for_each_table(mrt, net) { 1372 1358 if (sk == rtnl_dereference(mrt->mroute_sk)) { 1373 1359 IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; ··· 1378 1360 NETCONFA_IFINDEX_ALL, 1379 1361 net->ipv4.devconf_all); 1380 1362 RCU_INIT_POINTER(mrt->mroute_sk, NULL); 1381 - mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); 1363 + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC, 1364 + &dev_kill_list); 1382 1365 } 1383 
1366 } 1367 + 1368 + unregister_netdevice_many(&dev_kill_list); 1369 + 1384 1370 rtnl_unlock(); 1385 1371 } 1386 1372 ··· 1500 1478 } 1501 1479 if (parent == 0) 1502 1480 parent = mfc.mfcc_parent; 1481 + 1482 + mutex_lock(&net->ipv4.mfc_mutex); 1483 + 1503 1484 if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY) 1504 1485 ret = ipmr_mfc_delete(mrt, &mfc, parent); 1505 1486 else 1506 1487 ret = ipmr_mfc_add(net, mrt, &mfc, 1507 1488 sk == rtnl_dereference(mrt->mroute_sk), 1508 1489 parent); 1490 + 1491 + mutex_unlock(&net->ipv4.mfc_mutex); 1509 1492 break; 1510 - case MRT_FLUSH: 1493 + case MRT_FLUSH: { 1494 + LIST_HEAD(dev_kill_list); 1495 + 1511 1496 if (optlen != sizeof(val)) { 1512 1497 ret = -EINVAL; 1513 1498 break; ··· 1523 1494 ret = -EFAULT; 1524 1495 break; 1525 1496 } 1526 - mroute_clean_tables(mrt, val); 1497 + 1498 + mroute_clean_tables(mrt, val, &dev_kill_list); 1499 + unregister_netdevice_many(&dev_kill_list); 1527 1500 break; 1501 + } 1528 1502 /* Control PIM assert. 
*/ 1529 1503 case MRT_ASSERT: 1530 1504 if (optlen != sizeof(val)) { ··· 1538 1506 ret = -EFAULT; 1539 1507 break; 1540 1508 } 1541 - mrt->mroute_do_assert = val; 1509 + WRITE_ONCE(mrt->mroute_do_assert, val); 1542 1510 break; 1543 1511 case MRT_PIM: 1544 1512 if (!ipmr_pimsm_enabled()) { ··· 1557 1525 do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE); 1558 1526 val = !!val; 1559 1527 if (val != mrt->mroute_do_pim) { 1560 - mrt->mroute_do_pim = val; 1561 - mrt->mroute_do_assert = val; 1562 - mrt->mroute_do_wrvifwhole = do_wrvifwhole; 1528 + WRITE_ONCE(mrt->mroute_do_pim, val); 1529 + WRITE_ONCE(mrt->mroute_do_assert, val); 1530 + WRITE_ONCE(mrt->mroute_do_wrvifwhole, do_wrvifwhole); 1563 1531 } 1564 1532 break; 1565 1533 case MRT_TABLE: ··· 1642 1610 case MRT_PIM: 1643 1611 if (!ipmr_pimsm_enabled()) 1644 1612 return -ENOPROTOOPT; 1645 - val = mrt->mroute_do_pim; 1613 + val = READ_ONCE(mrt->mroute_do_pim); 1646 1614 break; 1647 1615 case MRT_ASSERT: 1648 - val = mrt->mroute_do_assert; 1616 + val = READ_ONCE(mrt->mroute_do_assert); 1649 1617 break; 1650 1618 default: 1651 1619 return -ENOPROTOOPT; ··· 2069 2037 2070 2038 atomic_long_inc(&c->_c.mfc_un.res.wrong_if); 2071 2039 2072 - if (true_vifi >= 0 && mrt->mroute_do_assert && 2040 + if (true_vifi >= 0 && READ_ONCE(mrt->mroute_do_assert) && 2073 2041 /* pimsm uses asserts, when switching from RPT to SPT, 2074 2042 * so that we cannot check that packet arrived on an oif. 2075 2043 * It is bad, but otherwise we would need to move pretty 2076 2044 * large chunk of pimd to kernel. Ough... 
--ANK 2077 2045 */ 2078 - (mrt->mroute_do_pim || 2046 + (READ_ONCE(mrt->mroute_do_pim) || 2079 2047 c->_c.mfc_un.res.ttls[true_vifi] < 255) && 2080 2048 time_after(jiffies, 2081 2049 c->_c.mfc_un.res.last_assert + 2082 2050 MFC_ASSERT_THRESH)) { 2083 2051 c->_c.mfc_un.res.last_assert = jiffies; 2084 2052 ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF); 2085 - if (mrt->mroute_do_wrvifwhole) 2053 + if (READ_ONCE(mrt->mroute_do_wrvifwhole)) 2086 2054 ipmr_cache_report(mrt, skb, true_vifi, 2087 2055 IGMPMSG_WRVIFWHOLE); 2088 2056 } ··· 2390 2358 mrt = ipmr_rt_fib_lookup(net, skb); 2391 2359 if (IS_ERR(mrt)) 2392 2360 goto drop; 2393 - if (!mrt->mroute_do_pim || 2361 + if (!READ_ONCE(mrt->mroute_do_pim) || 2394 2362 pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 2395 2363 goto drop; 2396 2364 ··· 2542 2510 cmd, flags); 2543 2511 } 2544 2512 2545 - static size_t mroute_msgsize(bool unresolved, int maxvif) 2513 + static size_t mroute_msgsize(bool unresolved) 2546 2514 { 2547 2515 size_t len = 2548 2516 NLMSG_ALIGN(sizeof(struct rtmsg)) ··· 2555 2523 len = len 2556 2524 + nla_total_size(4) /* RTA_IIF */ 2557 2525 + nla_total_size(0) /* RTA_MULTIPATH */ 2558 - + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) 2526 + + MAXVIFS * NLA_ALIGN(sizeof(struct rtnexthop)) 2559 2527 /* RTA_MFC_STATS */ 2560 2528 + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) 2561 2529 ; ··· 2570 2538 struct sk_buff *skb; 2571 2539 int err = -ENOBUFS; 2572 2540 2573 - skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS, 2574 - mrt->maxvif), 2541 + skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS), 2575 2542 GFP_ATOMIC); 2576 2543 if (!skb) 2577 2544 goto errout; ··· 2712 2681 { 2713 2682 struct net *net = sock_net(in_skb->sk); 2714 2683 struct nlattr *tb[RTA_MAX + 1]; 2715 - struct sk_buff *skb = NULL; 2716 2684 struct mfc_cache *cache; 2717 2685 struct mr_table *mrt; 2686 + struct sk_buff *skb; 2718 2687 __be32 src, grp; 2719 2688 u32 tableid; 2720 2689 
int err; ··· 2727 2696 grp = nla_get_in_addr_default(tb[RTA_DST], 0); 2728 2697 tableid = nla_get_u32_default(tb[RTA_TABLE], 0); 2729 2698 2699 + skb = nlmsg_new(mroute_msgsize(false), GFP_KERNEL); 2700 + if (!skb) { 2701 + err = -ENOBUFS; 2702 + goto errout; 2703 + } 2704 + 2705 + rcu_read_lock(); 2706 + 2730 2707 mrt = __ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT); 2731 2708 if (!mrt) { 2732 2709 err = -ENOENT; 2733 - goto errout_free; 2710 + goto errout_unlock; 2734 2711 } 2735 2712 2736 - /* entries are added/deleted only under RTNL */ 2737 - rcu_read_lock(); 2738 2713 cache = ipmr_cache_find(mrt, src, grp); 2739 - rcu_read_unlock(); 2740 2714 if (!cache) { 2741 2715 err = -ENOENT; 2742 - goto errout_free; 2743 - } 2744 - 2745 - skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL); 2746 - if (!skb) { 2747 - err = -ENOBUFS; 2748 - goto errout_free; 2716 + goto errout_unlock; 2749 2717 } 2750 2718 2751 2719 err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, 2752 2720 nlh->nlmsg_seq, cache, 2753 2721 RTM_NEWROUTE, 0); 2754 2722 if (err < 0) 2755 - goto errout_free; 2723 + goto errout_unlock; 2724 + 2725 + rcu_read_unlock(); 2756 2726 2757 2727 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 2758 - 2759 2728 errout: 2760 2729 return err; 2761 2730 2762 - errout_free: 2731 + errout_unlock: 2732 + rcu_read_unlock(); 2763 2733 kfree_skb(skb); 2764 2734 goto errout; 2765 2735 } ··· 2768 2736 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) 2769 2737 { 2770 2738 struct fib_dump_filter filter = { 2771 - .rtnl_held = true, 2739 + .rtnl_held = false, 2772 2740 }; 2773 2741 int err; 2742 + 2743 + rcu_read_lock(); 2774 2744 2775 2745 if (cb->strict_check) { 2776 2746 err = ip_valid_fib_dump_req(sock_net(skb->sk), cb->nlh, 2777 2747 &filter, cb); 2778 2748 if (err < 0) 2779 - return err; 2749 + goto out; 2780 2750 } 2781 2751 2782 2752 if (filter.table_id) { ··· 2786 2752 2787 2753 mrt = 
__ipmr_get_table(sock_net(skb->sk), filter.table_id); 2788 2754 if (!mrt) { 2789 - if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) 2790 - return skb->len; 2755 + if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IPMR) { 2756 + err = skb->len; 2757 + goto out; 2758 + } 2791 2759 2792 2760 NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); 2793 - return -ENOENT; 2761 + err = -ENOENT; 2762 + goto out; 2794 2763 } 2764 + 2795 2765 err = mr_table_dump(mrt, skb, cb, _ipmr_fill_mroute, 2796 2766 &mfc_unres_lock, &filter); 2797 - return skb->len ? : err; 2767 + err = skb->len ? : err; 2768 + goto out; 2798 2769 } 2799 2770 2800 - return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, 2801 - _ipmr_fill_mroute, &mfc_unres_lock, &filter); 2771 + err = mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter, 2772 + _ipmr_fill_mroute, &mfc_unres_lock, &filter); 2773 + out: 2774 + rcu_read_unlock(); 2775 + 2776 + return err; 2802 2777 } 2803 2778 2804 2779 static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = { ··· 2851 2808 { 2852 2809 struct net_device *dev = NULL; 2853 2810 u32 tblid = RT_TABLE_DEFAULT; 2811 + int ret, rem, iif = 0; 2854 2812 struct mr_table *mrt; 2855 2813 struct nlattr *attr; 2856 2814 struct rtmsg *rtm; 2857 - int ret, rem; 2858 2815 2859 2816 ret = nlmsg_validate_deprecated(nlh, sizeof(*rtm), RTA_MAX, 2860 2817 rtm_ipmr_policy, extack); ··· 2881 2838 mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr); 2882 2839 break; 2883 2840 case RTA_IIF: 2884 - dev = __dev_get_by_index(net, nla_get_u32(attr)); 2885 - if (!dev) { 2886 - ret = -ENODEV; 2887 - goto out; 2888 - } 2841 + iif = nla_get_u32(attr); 2889 2842 break; 2890 2843 case RTA_MULTIPATH: 2891 2844 if (ipmr_nla_get_ttls(attr, mfcc) < 0) { ··· 2897 2858 break; 2898 2859 } 2899 2860 } 2861 + 2862 + rcu_read_lock(); 2863 + 2900 2864 mrt = __ipmr_get_table(net, tblid); 2901 2865 if (!mrt) { 2902 2866 ret = -ENOENT; 2903 - goto out; 2867 + goto unlock; 2904 2868 } 2869 + 2870 + if (iif) { 2871 + dev = 
dev_get_by_index_rcu(net, iif); 2872 + if (!dev) { 2873 + ret = -ENODEV; 2874 + goto unlock; 2875 + } 2876 + 2877 + mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); 2878 + } 2879 + 2905 2880 *mrtret = mrt; 2906 2881 *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0; 2907 - if (dev) 2908 - mfcc->mfcc_parent = ipmr_find_vif(mrt, dev); 2909 2882 2883 + unlock: 2884 + rcu_read_unlock(); 2910 2885 out: 2911 2886 return ret; 2912 2887 } ··· 2930 2877 struct netlink_ext_ack *extack) 2931 2878 { 2932 2879 struct net *net = sock_net(skb->sk); 2933 - int ret, mrtsock, parent; 2934 - struct mr_table *tbl; 2880 + int ret, mrtsock = 0, parent; 2881 + struct mr_table *tbl = NULL; 2935 2882 struct mfcctl mfcc; 2936 2883 2937 - mrtsock = 0; 2938 - tbl = NULL; 2939 2884 ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); 2940 2885 if (ret < 0) 2941 2886 return ret; 2942 2887 2943 2888 parent = ret ? mfcc.mfcc_parent : -1; 2889 + 2890 + mutex_lock(&net->ipv4.mfc_mutex); 2891 + 2944 2892 if (nlh->nlmsg_type == RTM_NEWROUTE) 2945 - return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); 2893 + ret = ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent); 2946 2894 else 2947 - return ipmr_mfc_delete(tbl, &mfcc, parent); 2895 + ret = ipmr_mfc_delete(tbl, &mfcc, parent); 2896 + 2897 + mutex_unlock(&net->ipv4.mfc_mutex); 2898 + 2899 + return ret; 2948 2900 } 2949 2901 2950 2902 static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb) ··· 2959 2901 if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) || 2960 2902 nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) || 2961 2903 nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM, 2962 - mrt->mroute_reg_vif_num) || 2904 + READ_ONCE(mrt->mroute_reg_vif_num)) || 2963 2905 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT, 2964 - mrt->mroute_do_assert) || 2965 - nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) || 2906 + READ_ONCE(mrt->mroute_do_assert)) || 2907 + nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, 2908 + 
READ_ONCE(mrt->mroute_do_pim)) || 2966 2909 nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE, 2967 - mrt->mroute_do_wrvifwhole)) 2910 + READ_ONCE(mrt->mroute_do_wrvifwhole))) 2968 2911 return false; 2969 2912 2970 2913 return true; ··· 2978 2919 struct vif_device *vif; 2979 2920 2980 2921 vif = &mrt->vif_table[vifid]; 2981 - vif_dev = rtnl_dereference(vif->dev); 2922 + vif_dev = vif_dev_read(vif); 2982 2923 /* if the VIF doesn't exist just continue */ 2983 2924 if (!vif_dev) 2984 2925 return true; ··· 2987 2928 if (!vif_nest) 2988 2929 return false; 2989 2930 2990 - if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif_dev->ifindex) || 2931 + if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, READ_ONCE(vif_dev->ifindex)) || 2991 2932 nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) || 2992 2933 nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) || 2993 - nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in, 2934 + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, READ_ONCE(vif->bytes_in), 2994 2935 IPMRA_VIFA_PAD) || 2995 - nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out, 2936 + nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, READ_ONCE(vif->bytes_out), 2996 2937 IPMRA_VIFA_PAD) || 2997 - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in, 2938 + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, READ_ONCE(vif->pkt_in), 2998 2939 IPMRA_VIFA_PAD) || 2999 - nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out, 2940 + nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, READ_ONCE(vif->pkt_out), 3000 2941 IPMRA_VIFA_PAD) || 3001 2942 nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) || 3002 2943 nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) { ··· 3051 2992 s_t = cb->args[0]; 3052 2993 s_e = cb->args[1]; 3053 2994 2995 + rcu_read_lock(); 2996 + 3054 2997 ipmr_for_each_table(mrt, net) { 3055 2998 struct nlattr *vifs, *af; 3056 2999 struct ifinfomsg *hdr; ··· 3087 3026 nlmsg_end(skb, nlh); 3088 3027 goto out; 3089 3028 } 3090 - for (i = 0; i < mrt->maxvif; i++) { 3029 + for (i 
= 0; i < READ_ONCE(mrt->maxvif); i++) { 3091 3030 if (e < s_e) 3092 3031 goto skip_entry; 3093 3032 if (!ipmr_fill_vif(mrt, i, skb)) { ··· 3109 3048 } 3110 3049 3111 3050 out: 3051 + rcu_read_unlock(); 3052 + 3112 3053 cb->args[1] = e; 3113 3054 cb->args[0] = t; 3114 3055 ··· 3248 3185 3249 3186 static unsigned int ipmr_seq_read(const struct net *net) 3250 3187 { 3251 - return READ_ONCE(net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); 3188 + return atomic_read(&net->ipv4.ipmr_seq) + ipmr_rules_seq_read(net); 3252 3189 } 3253 3190 3254 3191 static int ipmr_dump(struct net *net, struct notifier_block *nb, ··· 3269 3206 { 3270 3207 struct fib_notifier_ops *ops; 3271 3208 3272 - net->ipv4.ipmr_seq = 0; 3209 + atomic_set(&net->ipv4.ipmr_seq, 0); 3273 3210 3274 3211 ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net); 3275 3212 if (IS_ERR(ops)) ··· 3288 3225 /* Setup for IP multicast routing */ 3289 3226 static int __net_init ipmr_net_init(struct net *net) 3290 3227 { 3228 + LIST_HEAD(dev_kill_list); 3291 3229 int err; 3230 + 3231 + mutex_init(&net->ipv4.mfc_mutex); 3292 3232 3293 3233 err = ipmr_notifier_init(net); 3294 3234 if (err) ··· 3316 3250 proc_cache_fail: 3317 3251 remove_proc_entry("ip_mr_vif", net->proc_net); 3318 3252 proc_vif_fail: 3319 - rtnl_lock(); 3253 + ipmr_rules_exit_rtnl(net, &dev_kill_list); 3320 3254 ipmr_rules_exit(net); 3321 - rtnl_unlock(); 3322 3255 #endif 3323 3256 ipmr_rules_fail: 3324 3257 ipmr_notifier_exit(net); ··· 3331 3266 remove_proc_entry("ip_mr_cache", net->proc_net); 3332 3267 remove_proc_entry("ip_mr_vif", net->proc_net); 3333 3268 #endif 3269 + ipmr_rules_exit(net); 3334 3270 ipmr_notifier_exit(net); 3335 3271 } 3336 3272 3337 - static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) 3273 + static void __net_exit ipmr_net_exit_rtnl(struct net *net, 3274 + struct list_head *dev_kill_list) 3338 3275 { 3339 - struct net *net; 3340 - 3341 - rtnl_lock(); 3342 - list_for_each_entry(net, net_list, exit_list) 
3343 - ipmr_rules_exit(net); 3344 - rtnl_unlock(); 3276 + ipmr_rules_exit_rtnl(net, dev_kill_list); 3345 3277 } 3346 3278 3347 3279 static struct pernet_operations ipmr_net_ops = { 3348 3280 .init = ipmr_net_init, 3349 3281 .exit = ipmr_net_exit, 3350 - .exit_batch = ipmr_net_exit_batch, 3282 + .exit_rtnl = ipmr_net_exit_rtnl, 3351 3283 }; 3352 3284 3353 3285 static const struct rtnl_msg_handler ipmr_rtnl_msg_handlers[] __initconst = { 3354 3286 {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETLINK, 3355 - .dumpit = ipmr_rtm_dumplink}, 3287 + .dumpit = ipmr_rtm_dumplink, .flags = RTNL_FLAG_DUMP_UNLOCKED}, 3356 3288 {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_NEWROUTE, 3357 - .doit = ipmr_rtm_route}, 3289 + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, 3358 3290 {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_DELROUTE, 3359 - .doit = ipmr_rtm_route}, 3291 + .doit = ipmr_rtm_route, .flags = RTNL_FLAG_DOIT_UNLOCKED}, 3360 3292 {.protocol = RTNL_FAMILY_IPMR, .msgtype = RTM_GETROUTE, 3361 - .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute}, 3293 + .doit = ipmr_rtm_getroute, .dumpit = ipmr_rtm_dumproute, 3294 + .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, 3362 3295 }; 3363 3296 3364 3297 int __init ip_mr_init(void)
+2 -2
net/ipv4/ipmr_base.c
··· 223 223 224 224 rcu_read_lock(); 225 225 vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); 226 - if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { 226 + if (vif_dev && nla_put_u32(skb, RTA_IIF, READ_ONCE(vif_dev->ifindex)) < 0) { 227 227 rcu_read_unlock(); 228 228 return -EMSGSIZE; 229 229 } ··· 252 252 253 253 nhp->rtnh_flags = 0; 254 254 nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; 255 - nhp->rtnh_ifindex = vif_dev->ifindex; 255 + nhp->rtnh_ifindex = READ_ONCE(vif_dev->ifindex); 256 256 nhp->rtnh_len = sizeof(*nhp); 257 257 } 258 258 }
+2 -2
net/ipv6/ip6mr.c
··· 1280 1280 1281 1281 static unsigned int ip6mr_seq_read(const struct net *net) 1282 1282 { 1283 - return READ_ONCE(net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); 1283 + return atomic_read(&net->ipv6.ipmr_seq) + ip6mr_rules_seq_read(net); 1284 1284 } 1285 1285 1286 1286 static int ip6mr_dump(struct net *net, struct notifier_block *nb, ··· 1305 1305 { 1306 1306 struct fib_notifier_ops *ops; 1307 1307 1308 - net->ipv6.ipmr_seq = 0; 1308 + atomic_set(&net->ipv6.ipmr_seq, 0); 1309 1309 1310 1310 ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); 1311 1311 if (IS_ERR(ops))
+1
tools/testing/selftests/net/forwarding/.gitignore
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 forwarding.config 3 + ipmr
+4
tools/testing/selftests/net/forwarding/Makefile
··· 133 133 tc_common.sh \ 134 134 # end of TEST_FILES 135 135 136 + TEST_GEN_PROGS := \ 137 + ipmr 138 + # end of TEST_GEN_PROGS 139 + 136 140 TEST_INCLUDES := \ 137 141 $(wildcard ../lib/sh/*.sh) \ 138 142 ../lib.sh \
+455
tools/testing/selftests/net/forwarding/ipmr.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* Copyright 2026 Google LLC */ 3 + 4 + #include <linux/if.h> 5 + #include <linux/mroute.h> 6 + #include <linux/netlink.h> 7 + #include <linux/rtnetlink.h> 8 + #include <linux/socket.h> 9 + #include <sched.h> 10 + #include <sys/ioctl.h> 11 + #include <sys/socket.h> 12 + 13 + #include "kselftest_harness.h" 14 + 15 + FIXTURE(ipmr) 16 + { 17 + int netlink_sk; 18 + int raw_sk; 19 + int veth_ifindex; 20 + }; 21 + 22 + FIXTURE_VARIANT(ipmr) 23 + { 24 + int family; 25 + int protocol; 26 + int level; 27 + int opts[MRT_MAX - MRT_BASE + 1]; 28 + }; 29 + 30 + FIXTURE_VARIANT_ADD(ipmr, ipv4) 31 + { 32 + .family = AF_INET, 33 + .protocol = IPPROTO_IGMP, 34 + .level = IPPROTO_IP, 35 + .opts = { 36 + MRT_INIT, 37 + MRT_DONE, 38 + MRT_ADD_VIF, 39 + MRT_DEL_VIF, 40 + MRT_ADD_MFC, 41 + MRT_DEL_MFC, 42 + MRT_VERSION, 43 + MRT_ASSERT, 44 + MRT_PIM, 45 + MRT_TABLE, 46 + MRT_ADD_MFC_PROXY, 47 + MRT_DEL_MFC_PROXY, 48 + MRT_FLUSH, 49 + }, 50 + }; 51 + 52 + struct mfc_attr { 53 + int table; 54 + __u32 origin; 55 + __u32 group; 56 + int ifindex; 57 + bool proxy; 58 + }; 59 + 60 + static struct rtattr *nl_add_rtattr(struct nlmsghdr *nlmsg, struct rtattr *rta, 61 + int type, const void *data, int len) 62 + { 63 + int unused = 0; 64 + 65 + rta->rta_type = type; 66 + rta->rta_len = RTA_LENGTH(len); 67 + memcpy(RTA_DATA(rta), data, len); 68 + 69 + nlmsg->nlmsg_len += NLMSG_ALIGN(rta->rta_len); 70 + 71 + return RTA_NEXT(rta, unused); 72 + } 73 + 74 + static int nl_sendmsg_mfc(struct __test_metadata *_metadata, FIXTURE_DATA(ipmr) *self, 75 + __u16 nlmsg_type, struct mfc_attr *mfc_attr) 76 + { 77 + struct { 78 + struct nlmsghdr nlmsg; 79 + struct rtmsg rtm; 80 + char buf[4096]; 81 + } req = { 82 + .nlmsg = { 83 + .nlmsg_len = NLMSG_LENGTH(sizeof(req.rtm)), 84 + /* ipmr does not care about NLM_F_CREATE and NLM_F_EXCL ... 
*/ 85 + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK, 86 + .nlmsg_type = nlmsg_type, 87 + }, 88 + .rtm = { 89 + /* hard requirements in rtm_to_ipmr_mfcc() */ 90 + .rtm_family = RTNL_FAMILY_IPMR, 91 + .rtm_dst_len = 32, 92 + .rtm_type = RTN_MULTICAST, 93 + .rtm_scope = RT_SCOPE_UNIVERSE, 94 + .rtm_protocol = RTPROT_MROUTED, 95 + }, 96 + }; 97 + struct nlmsghdr *nlmsg = &req.nlmsg; 98 + struct nlmsgerr *errmsg; 99 + struct rtattr *rta; 100 + int err; 101 + 102 + rta = (struct rtattr *)&req.buf; 103 + rta = nl_add_rtattr(nlmsg, rta, RTA_TABLE, &mfc_attr->table, sizeof(mfc_attr->table)); 104 + rta = nl_add_rtattr(nlmsg, rta, RTA_SRC, &mfc_attr->origin, sizeof(mfc_attr->origin)); 105 + rta = nl_add_rtattr(nlmsg, rta, RTA_DST, &mfc_attr->group, sizeof(mfc_attr->group)); 106 + if (mfc_attr->ifindex) 107 + rta = nl_add_rtattr(nlmsg, rta, RTA_IIF, &mfc_attr->ifindex, sizeof(mfc_attr->ifindex)); 108 + if (mfc_attr->proxy) 109 + rta = nl_add_rtattr(nlmsg, rta, RTA_PREFSRC, NULL, 0); 110 + 111 + err = send(self->netlink_sk, &req, req.nlmsg.nlmsg_len, 0); 112 + ASSERT_EQ(err, req.nlmsg.nlmsg_len); 113 + 114 + memset(&req, 0, sizeof(req)); 115 + 116 + err = recv(self->netlink_sk, &req, sizeof(req), 0); 117 + ASSERT_TRUE(NLMSG_OK(nlmsg, err)); 118 + ASSERT_EQ(NLMSG_ERROR, nlmsg->nlmsg_type); 119 + 120 + errmsg = (struct nlmsgerr *)NLMSG_DATA(nlmsg); 121 + return errmsg->error; 122 + } 123 + 124 + /* Per-test setup: enter a fresh network namespace, open a NETLINK_ROUTE socket and a raw socket of the variant's family and protocol, create the veth0 and veth1 pair with the ip tool, and store the ifindex of veth0 in self->veth_ifindex. */ FIXTURE_SETUP(ipmr) 125 + { 126 + struct ifreq ifr = { 127 + .ifr_name = "veth0", 128 + }; 129 + int err; 130 + 131 + err = unshare(CLONE_NEWNET); 132 + ASSERT_EQ(0, err); 133 + 134 + self->netlink_sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); 135 + ASSERT_LE(0, self->netlink_sk); 136 + 137 + self->raw_sk = socket(variant->family, SOCK_RAW, variant->protocol); 138 + /* NOTE(review): this check is stricter than the ASSERT_LE used for netlink_sk and would reject fd 0 - confirm whether ASSERT_LE was intended. */ ASSERT_LT(0, self->raw_sk); 139 + 140 + err = system("ip link add veth0 type veth peer veth1"); 141 + ASSERT_EQ(0, err); 142 + 143 + /* Look up the ifindex of the interface named in ifr.ifr_name (veth0). */ err = ioctl(self->raw_sk, SIOCGIFINDEX, &ifr); 144 + ASSERT_EQ(0, err); 145 + 146 + 
self->veth_ifindex = ifr.ifr_ifindex; 147 + } 148 + 149 + /* Per-test teardown: close the two sockets opened in setup. Nothing else is undone explicitly here. */ FIXTURE_TEARDOWN(ipmr) 150 + { 151 + close(self->raw_sk); 152 + close(self->netlink_sk); 153 + } 154 + 155 + /* MRT_INIT followed by MRT_DONE on the raw socket must both succeed. */ TEST_F(ipmr, mrt_init) 156 + { 157 + int err, val = 0; /* any value is ok, but size must be int for MRT_INIT. */ 158 + 159 + err = setsockopt(self->raw_sk, 160 + variant->level, variant->opts[MRT_INIT - MRT_BASE], 161 + &val, sizeof(val)); 162 + ASSERT_EQ(0, err); 163 + 164 + err = setsockopt(self->raw_sk, 165 + variant->level, variant->opts[MRT_DONE - MRT_BASE], 166 + &val, sizeof(val)); 167 + ASSERT_EQ(0, err); 168 + } 169 + 170 + /* Add VIF 0 with VIFF_REGISTER, check that a pimreg entry is listed in /proc/net/ip_mr_vif, then delete the VIF again. */ TEST_F(ipmr, mrt_add_vif_register) 171 + { 172 + struct vifctl vif = { 173 + .vifc_vifi = 0, 174 + .vifc_flags = VIFF_REGISTER, 175 + }; 176 + int err; 177 + 178 + err = setsockopt(self->raw_sk, 179 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 180 + &vif, sizeof(vif)); 181 + ASSERT_EQ(0, err); 182 + 183 + err = system("cat /proc/net/ip_mr_vif | grep -q pimreg"); 184 + ASSERT_EQ(0, err); 185 + 186 + err = setsockopt(self->raw_sk, 187 + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], 188 + &vif, sizeof(vif)); 189 + ASSERT_EQ(0, err); 190 + } 191 + 192 + /* Add VIF 0 bound to veth0 by ifindex, delete veth0, then verify that MRT_DEL_VIF for VIF 0 fails with EADDRNOTAVAIL. */ TEST_F(ipmr, mrt_del_vif_unreg) 193 + { 194 + struct vifctl vif = { 195 + .vifc_vifi = 0, 196 + .vifc_flags = VIFF_USE_IFINDEX, 197 + .vifc_lcl_ifindex = self->veth_ifindex, 198 + }; 199 + int err; 200 + 201 + err = setsockopt(self->raw_sk, 202 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 203 + &vif, sizeof(vif)); 204 + ASSERT_EQ(0, err); 205 + 206 + err = system("cat /proc/net/ip_mr_vif | grep -q veth0"); 207 + ASSERT_EQ(0, err); 208 + 209 + /* VIF is removed along with its device. */ 210 + err = system("ip link del veth0"); 211 + ASSERT_EQ(0, err); 212 + 213 + /* VIF 0 (vifc_vifi) has no device any more, so deleting it must fail with EADDRNOTAVAIL. 
*/ 214 + err = setsockopt(self->raw_sk, 215 + variant->level, variant->opts[MRT_DEL_VIF - MRT_BASE], 216 + &vif, sizeof(vif)); 217 + ASSERT_EQ(-1, err); 218 + ASSERT_EQ(EADDRNOTAVAIL, errno); 219 + } 220 + 221 + /* Add VIF 0 on veth0 and return without deleting anything, so that the VIF and the device are left for network namespace teardown. */ TEST_F(ipmr, mrt_del_vif_netns_dismantle) 222 + { 223 + struct vifctl vif = { 224 + .vifc_vifi = 0, 225 + .vifc_flags = VIFF_USE_IFINDEX, 226 + .vifc_lcl_ifindex = self->veth_ifindex, 227 + }; 228 + int err; 229 + 230 + err = setsockopt(self->raw_sk, 231 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 232 + &vif, sizeof(vif)); 233 + ASSERT_EQ(0, err); 234 + 235 + /* Let cleanup_net() remove veth0 and VIF. */ 236 + } 237 + 238 + /* Add an all-zero MFC entry with MRT_ADD_MFC while no VIF exists, check that it is listed in /proc/net/ip_mr_cache, then issue MRT_DEL_MFC for it. */ TEST_F(ipmr, mrt_add_mfc) 239 + { 240 + struct mfcctl mfc = {}; 241 + int err; 242 + 243 + /* MRT_ADD_MFC / MRT_ADD_MFC_PROXY do not need a vif to exist (unlike netlink). */ 244 + err = setsockopt(self->raw_sk, 245 + variant->level, variant->opts[MRT_ADD_MFC - MRT_BASE], 246 + &mfc, sizeof(mfc)); 247 + ASSERT_EQ(0, err); 248 + 249 + /* (0.0.0.0 -> 0.0.0.0) */ 250 + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 251 + ASSERT_EQ(0, err); 252 + 253 + err = setsockopt(self->raw_sk, 254 + variant->level, variant->opts[MRT_DEL_MFC - MRT_BASE], 255 + &mfc, sizeof(mfc)); /* NOTE(review): err from MRT_DEL_MFC is assigned but never asserted - confirm whether ASSERT_EQ(0, err) was intended here. */ 256 + } 257 + 258 + /* Same as mrt_add_mfc, but using the MRT_ADD_MFC_PROXY and MRT_DEL_MFC_PROXY options. */ TEST_F(ipmr, mrt_add_mfc_proxy) 259 + { 260 + struct mfcctl mfc = {}; 261 + int err; 262 + 263 + err = setsockopt(self->raw_sk, 264 + variant->level, variant->opts[MRT_ADD_MFC_PROXY - MRT_BASE], 265 + &mfc, sizeof(mfc)); 266 + ASSERT_EQ(0, err); 267 + 268 + /* (0.0.0.0 -> 0.0.0.0) */ err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 269 + ASSERT_EQ(0, err); 270 + 271 + err = setsockopt(self->raw_sk, 272 + variant->level, variant->opts[MRT_DEL_MFC_PROXY - MRT_BASE], 273 + &mfc, sizeof(mfc)); /* NOTE(review): err from MRT_DEL_MFC_PROXY is assigned but never asserted - confirm whether ASSERT_EQ(0, err) was intended here. */ 274 + } 275 + 276 + /* Add VIF 0 on veth0, then add an MFC entry with RTM_NEWROUTE (proxy is false, so no RTA_PREFSRC is sent), check /proc/net/ip_mr_cache, and delete the entry with RTM_DELROUTE. */ TEST_F(ipmr, mrt_add_mfc_netlink) 277 + { 278 + struct vifctl vif = { 279 + .vifc_vifi = 0, 280 + .vifc_flags = VIFF_USE_IFINDEX, 281 + .vifc_lcl_ifindex = self->veth_ifindex, 282 + }; 283 + struct mfc_attr mfc_attr 
= { 284 + .table = RT_TABLE_DEFAULT, 285 + .origin = 0, 286 + .group = 0, 287 + .ifindex = self->veth_ifindex, 288 + .proxy = false, 289 + }; 290 + int err; 291 + 292 + err = setsockopt(self->raw_sk, 293 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 294 + &vif, sizeof(vif)); 295 + ASSERT_EQ(0, err); 296 + 297 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 298 + ASSERT_EQ(0, err); 299 + 300 + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 301 + ASSERT_EQ(0, err); 302 + 303 + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); 304 + ASSERT_EQ(0, err); 305 + } 306 + 307 + /* Same as mrt_add_mfc_netlink, but with proxy set to true so that the request also carries an empty RTA_PREFSRC attribute. */ TEST_F(ipmr, mrt_add_mfc_netlink_proxy) 308 + { 309 + struct vifctl vif = { 310 + .vifc_vifi = 0, 311 + .vifc_flags = VIFF_USE_IFINDEX, 312 + .vifc_lcl_ifindex = self->veth_ifindex, 313 + }; 314 + struct mfc_attr mfc_attr = { 315 + .table = RT_TABLE_DEFAULT, 316 + .origin = 0, 317 + .group = 0, 318 + .ifindex = self->veth_ifindex, 319 + .proxy = true, 320 + }; 321 + int err; 322 + 323 + err = setsockopt(self->raw_sk, 324 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 325 + &vif, sizeof(vif)); 326 + ASSERT_EQ(0, err); 327 + 328 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 329 + ASSERT_EQ(0, err); 330 + 331 + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 332 + ASSERT_EQ(0, err); 333 + 334 + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); 335 + ASSERT_EQ(0, err); 336 + } 337 + 338 + /* With no VIF configured, RTM_NEWROUTE must fail with -ENFILE both when RTA_IIF is omitted and when it names veth0. */ TEST_F(ipmr, mrt_add_mfc_netlink_no_vif) 339 + { 340 + struct mfc_attr mfc_attr = { 341 + .table = RT_TABLE_DEFAULT, 342 + .origin = 0, 343 + .group = 0, 344 + .proxy = false, 345 + }; 346 + int err; 347 + 348 + /* Case 1: ifindex 0 means no RTA_IIF is sent; netlink always requires RTA_IIF of an existing vif. */ 349 + mfc_attr.ifindex = 0; 350 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 351 + ASSERT_EQ(-ENFILE, err); 352 + 353 + /* Case 2: RTA_IIF names veth0, but no vif has been created for it. 
*/ 354 + mfc_attr.ifindex = self->veth_ifindex; 355 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 356 + ASSERT_EQ(-ENFILE, err); 357 + } 358 + 359 + /* Create two VIFs and an MFC entry that uses veth0, delete veth0, and check that the MFC entry is still listed and that RTM_DELROUTE now fails with -ENODEV; the entry is left for network namespace teardown. */ TEST_F(ipmr, mrt_del_mfc_netlink_netns_dismantle) 360 + { 361 + struct vifctl vifs[2] = { 362 + { 363 + .vifc_vifi = 0, 364 + .vifc_flags = VIFF_USE_IFINDEX, 365 + .vifc_lcl_ifindex = self->veth_ifindex, 366 + }, 367 + { 368 + .vifc_vifi = 1, 369 + .vifc_flags = VIFF_REGISTER, 370 + } 371 + }; 372 + struct mfc_attr mfc_attr = { 373 + .table = RT_TABLE_DEFAULT, 374 + .origin = 0, 375 + .group = 0, 376 + .ifindex = self->veth_ifindex, 377 + .proxy = false, 378 + }; 379 + int i, err; 380 + 381 + for (i = 0; i < 2; i++) { 382 + /* Create 2 VIFs just to avoid -ENFILE later. */ 383 + err = setsockopt(self->raw_sk, 384 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 385 + &vifs[i], sizeof(vifs[i])); 386 + ASSERT_EQ(0, err); 387 + } 388 + 389 + /* Create a MFC for mrt->vif_table[0]. */ 390 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 391 + ASSERT_EQ(0, err); 392 + 393 + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 394 + ASSERT_EQ(0, err); 395 + 396 + /* Remove mrt->vif_table[0]. */ 397 + err = system("ip link del veth0"); 398 + ASSERT_EQ(0, err); 399 + 400 + /* MFC entry is NOT removed even if the tied VIF is removed... */ 401 + err = system("cat /proc/net/ip_mr_cache | grep -q '00000000 00000000' "); 402 + ASSERT_EQ(0, err); 403 + 404 + /* ... and netlink is not capable of removing such an entry 405 + * because netlink always requires a valid RTA_IIF ... :/ 406 + */ 407 + err = nl_sendmsg_mfc(_metadata, self, RTM_DELROUTE, &mfc_attr); 408 + ASSERT_EQ(-ENODEV, err); 409 + 410 + /* It can be removed by setsockopt(), but let cleanup_net() remove this time. 
*/ 411 + } 412 + 413 + /* Select table 92 with MRT_TABLE, add a VIF and a netlink MFC entry in that table, then flush VIFs and MFC entries, static ones included, with MRT_FLUSH. Only the setsockopt() return value is checked after the flush. */ TEST_F(ipmr, mrt_table_flush) 414 + { 415 + struct vifctl vif = { 416 + .vifc_vifi = 0, 417 + .vifc_flags = VIFF_USE_IFINDEX, 418 + .vifc_lcl_ifindex = self->veth_ifindex, 419 + }; 420 + struct mfc_attr mfc_attr = { 421 + .origin = 0, 422 + .group = 0, 423 + .ifindex = self->veth_ifindex, 424 + .proxy = false, 425 + }; 426 + int table_id = 92; 427 + int err, flags; 428 + 429 + /* Set an arbitrary table id rather than RT_TABLE_DEFAULT. 430 + * Note that /proc/net/ip_mr_{vif,cache} only supports RT_TABLE_DEFAULT. 431 + */ 432 + err = setsockopt(self->raw_sk, 433 + variant->level, variant->opts[MRT_TABLE - MRT_BASE], 434 + &table_id, sizeof(table_id)); 435 + ASSERT_EQ(0, err); 436 + 437 + err = setsockopt(self->raw_sk, 438 + variant->level, variant->opts[MRT_ADD_VIF - MRT_BASE], 439 + &vif, sizeof(vif)); 440 + ASSERT_EQ(0, err); 441 + 442 + /* Target the same non-default table for the netlink request. */ mfc_attr.table = table_id; 443 + err = nl_sendmsg_mfc(_metadata, self, RTM_NEWROUTE, &mfc_attr); 444 + ASSERT_EQ(0, err); 445 + 446 + /* Flush mrt->vif_table[] and all caches. */ 447 + flags = MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | 448 + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC; 449 + err = setsockopt(self->raw_sk, 450 + variant->level, variant->opts[MRT_FLUSH - MRT_BASE], 451 + &flags, sizeof(flags)); 452 + ASSERT_EQ(0, err); 453 + } 454 + 455 + TEST_HARNESS_MAIN