Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'ipv4-fib-convert-rtm_newroute-and-rtm_delroute-to-per-netns-rtnl'

Kuniyuki Iwashima says:

====================
ipv4: fib: Convert RTM_NEWROUTE and RTM_DELROUTE to per-netns RTNL.

Patch 1 is misc cleanup.
Patches 2 ~ 8 convert the two fib_info hash tables to per-netns.
Patches 9 ~ 12 convert rtnl_lock() to rtnl_net_lock().

v2: https://lore.kernel.org/20250226192556.21633-1-kuniyu@amazon.com
v1: https://lore.kernel.org/20250225182250.74650-1-kuniyu@amazon.com
====================

Link: https://patch.msgid.link/20250228042328.96624-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+159 -148
+2
include/net/ip_fib.h
··· 162 162 struct fib_nh fib_nh[] __counted_by(fib_nhs); 163 163 }; 164 164 165 + int __net_init fib4_semantics_init(struct net *net); 166 + void __net_exit fib4_semantics_exit(struct net *net); 165 167 166 168 #ifdef CONFIG_IP_MULTIPLE_TABLES 167 169 struct fib_rule;
+3
include/net/netns/ipv4.h
··· 111 111 #endif 112 112 struct hlist_head *fib_table_hash; 113 113 struct sock *fibnl; 114 + struct hlist_head *fib_info_hash; 115 + unsigned int fib_info_hash_bits; 116 + unsigned int fib_info_cnt; 114 117 115 118 struct sock *mc_autojoin_sk; 116 119
+54 -20
net/ipv4/fib_frontend.c
··· 553 553 const struct in_ifaddr *ifa; 554 554 struct in_device *in_dev; 555 555 556 - in_dev = __in_dev_get_rtnl(dev); 556 + in_dev = __in_dev_get_rtnl_net(dev); 557 557 if (!in_dev) 558 558 return -ENODEV; 559 559 560 560 *colon = ':'; 561 561 562 - rcu_read_lock(); 563 - in_dev_for_each_ifa_rcu(ifa, in_dev) { 562 + in_dev_for_each_ifa_rtnl_net(net, ifa, in_dev) { 564 563 if (strcmp(ifa->ifa_label, devname) == 0) 565 564 break; 566 565 } 567 - rcu_read_unlock(); 568 566 569 567 if (!ifa) 570 568 return -ENODEV; ··· 633 635 if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) 634 636 return -EPERM; 635 637 636 - rtnl_lock(); 638 + rtnl_net_lock(net); 637 639 err = rtentry_to_fib_config(net, cmd, rt, &cfg); 638 640 if (err == 0) { 639 641 struct fib_table *tb; ··· 657 659 /* allocated by rtentry_to_fib_config() */ 658 660 kfree(cfg.fc_mx); 659 661 } 660 - rtnl_unlock(); 662 + rtnl_net_unlock(net); 661 663 return err; 662 664 } 663 665 return -EINVAL; ··· 835 837 } 836 838 } 837 839 840 + if (cfg->fc_dst_len > 32) { 841 + NL_SET_ERR_MSG(extack, "Invalid prefix length"); 842 + err = -EINVAL; 843 + goto errout; 844 + } 845 + 846 + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { 847 + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); 848 + err = -EINVAL; 849 + goto errout; 850 + } 851 + 838 852 if (cfg->fc_nh_id) { 839 853 if (cfg->fc_oif || cfg->fc_gw_family || 840 854 cfg->fc_encap || cfg->fc_mp) { 841 855 NL_SET_ERR_MSG(extack, 842 856 "Nexthop specification and nexthop id are mutually exclusive"); 843 - return -EINVAL; 857 + err = -EINVAL; 858 + goto errout; 844 859 } 845 860 } 846 861 847 862 if (has_gw && has_via) { 848 863 NL_SET_ERR_MSG(extack, 849 864 "Nexthop configuration can not contain both GATEWAY and VIA"); 850 - return -EINVAL; 865 + err = -EINVAL; 866 + goto errout; 851 867 } 852 868 853 869 if (!cfg->fc_table) ··· 884 872 if (err < 0) 885 873 goto errout; 886 874 875 + rtnl_net_lock(net); 876 + 887 877 if 
(cfg.fc_nh_id && !nexthop_find_by_id(net, cfg.fc_nh_id)) { 888 878 NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); 889 879 err = -EINVAL; 890 - goto errout; 880 + goto unlock; 891 881 } 892 882 893 883 tb = fib_get_table(net, cfg.fc_table); 894 884 if (!tb) { 895 885 NL_SET_ERR_MSG(extack, "FIB table does not exist"); 896 886 err = -ESRCH; 897 - goto errout; 887 + goto unlock; 898 888 } 899 889 900 890 err = fib_table_delete(net, tb, &cfg, extack); 891 + unlock: 892 + rtnl_net_unlock(net); 901 893 errout: 902 894 return err; 903 895 } ··· 918 902 if (err < 0) 919 903 goto errout; 920 904 905 + rtnl_net_lock(net); 906 + 921 907 tb = fib_new_table(net, cfg.fc_table); 922 908 if (!tb) { 923 909 err = -ENOBUFS; 924 - goto errout; 910 + goto unlock; 925 911 } 926 912 927 913 err = fib_table_insert(net, tb, &cfg, extack); 928 914 if (!err && cfg.fc_type == RTN_LOCAL) 929 915 net->ipv4.fib_has_custom_local_routes = true; 916 + 917 + unlock: 918 + rtnl_net_unlock(net); 930 919 errout: 931 920 return err; 932 921 } ··· 1471 1450 fib_sync_up(dev, RTNH_F_DEAD); 1472 1451 #endif 1473 1452 atomic_inc(&net->ipv4.dev_addr_genid); 1474 - rt_cache_flush(dev_net(dev)); 1453 + rt_cache_flush(net); 1475 1454 break; 1476 1455 case NETDEV_DOWN: 1477 1456 fib_del_ifaddr(ifa, NULL); ··· 1482 1461 */ 1483 1462 fib_disable_ip(dev, event, true); 1484 1463 } else { 1485 - rt_cache_flush(dev_net(dev)); 1464 + rt_cache_flush(net); 1486 1465 } 1487 1466 break; 1488 1467 } ··· 1596 1575 { 1597 1576 int i; 1598 1577 1599 - ASSERT_RTNL(); 1578 + ASSERT_RTNL_NET(net); 1600 1579 #ifdef CONFIG_IP_MULTIPLE_TABLES 1601 1580 RCU_INIT_POINTER(net->ipv4.fib_main, NULL); 1602 1581 RCU_INIT_POINTER(net->ipv4.fib_default, NULL); ··· 1636 1615 error = ip_fib_net_init(net); 1637 1616 if (error < 0) 1638 1617 goto out; 1618 + 1619 + error = fib4_semantics_init(net); 1620 + if (error) 1621 + goto out_semantics; 1622 + 1639 1623 error = nl_fib_lookup_init(net); 1640 1624 if (error < 0) 1641 1625 goto 
out_nlfl; 1626 + 1642 1627 error = fib_proc_init(net); 1643 1628 if (error < 0) 1644 1629 goto out_proc; ··· 1654 1627 out_proc: 1655 1628 nl_fib_lookup_exit(net); 1656 1629 out_nlfl: 1657 - rtnl_lock(); 1630 + fib4_semantics_exit(net); 1631 + out_semantics: 1632 + rtnl_net_lock(net); 1658 1633 ip_fib_net_exit(net); 1659 - rtnl_unlock(); 1634 + rtnl_net_unlock(net); 1660 1635 goto out; 1661 1636 } 1662 1637 ··· 1673 1644 struct net *net; 1674 1645 1675 1646 rtnl_lock(); 1676 - list_for_each_entry(net, net_list, exit_list) 1647 + list_for_each_entry(net, net_list, exit_list) { 1648 + __rtnl_net_lock(net); 1677 1649 ip_fib_net_exit(net); 1678 - 1650 + __rtnl_net_unlock(net); 1651 + } 1679 1652 rtnl_unlock(); 1653 + 1654 + list_for_each_entry(net, net_list, exit_list) 1655 + fib4_semantics_exit(net); 1680 1656 } 1681 1657 1682 1658 static struct pernet_operations fib_net_ops = { ··· 1692 1658 1693 1659 static const struct rtnl_msg_handler fib_rtnl_msg_handlers[] __initconst = { 1694 1660 {.protocol = PF_INET, .msgtype = RTM_NEWROUTE, 1695 - .doit = inet_rtm_newroute}, 1661 + .doit = inet_rtm_newroute, .flags = RTNL_FLAG_DOIT_PERNET}, 1696 1662 {.protocol = PF_INET, .msgtype = RTM_DELROUTE, 1697 - .doit = inet_rtm_delroute}, 1663 + .doit = inet_rtm_delroute, .flags = RTNL_FLAG_DOIT_PERNET}, 1698 1664 {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .dumpit = inet_dump_fib, 1699 1665 .flags = RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE}, 1700 1666 };
+100 -106
net/ipv4/fib_semantics.c
··· 50 50 51 51 #include "fib_lookup.h" 52 52 53 - static struct hlist_head *fib_info_hash; 54 - static struct hlist_head *fib_info_laddrhash; 55 - static unsigned int fib_info_hash_size; 56 - static unsigned int fib_info_hash_bits; 57 - static unsigned int fib_info_cnt; 58 - 59 53 /* for_nexthops and change_nexthops only used when nexthop object 60 54 * is not set in a fib_info. The logic within can reference fib_nh. 61 55 */ ··· 252 258 ASSERT_RTNL(); 253 259 if (fi && refcount_dec_and_test(&fi->fib_treeref)) { 254 260 hlist_del(&fi->fib_hash); 255 - 256 - fib_info_cnt--; 261 + fi->fib_net->ipv4.fib_info_cnt--; 257 262 258 263 if (fi->fib_prefsrc) 259 264 hlist_del(&fi->fib_lhash); ··· 328 335 static unsigned int fib_info_hashfn_result(const struct net *net, 329 336 unsigned int val) 330 337 { 331 - return hash_32(val ^ net_hash_mix(net), fib_info_hash_bits); 338 + return hash_32(val ^ net_hash_mix(net), net->ipv4.fib_info_hash_bits); 332 339 } 333 340 334 - static inline unsigned int fib_info_hashfn(struct fib_info *fi) 341 + static struct hlist_head *fib_info_hash_bucket(struct fib_info *fi) 335 342 { 343 + struct net *net = fi->fib_net; 336 344 unsigned int val; 337 345 338 346 val = fib_info_hashfn_1(fi->fib_nhs, fi->fib_protocol, ··· 348 354 } endfor_nexthops(fi) 349 355 } 350 356 351 - return fib_info_hashfn_result(fi->fib_net, val); 357 + return &net->ipv4.fib_info_hash[fib_info_hashfn_result(net, val)]; 358 + } 359 + 360 + static struct hlist_head *fib_info_laddrhash_bucket(const struct net *net, 361 + __be32 val) 362 + { 363 + unsigned int hash_bits = net->ipv4.fib_info_hash_bits; 364 + u32 slot; 365 + 366 + slot = hash_32(net_hash_mix(net) ^ (__force u32)val, hash_bits); 367 + 368 + return &net->ipv4.fib_info_hash[(1 << hash_bits) + slot]; 369 + } 370 + 371 + static struct hlist_head *fib_info_hash_alloc(unsigned int hash_bits) 372 + { 373 + /* The second half is used for prefsrc */ 374 + return kvcalloc((1 << hash_bits) * 2, sizeof(struct hlist_head 
*), 375 + GFP_KERNEL); 376 + } 377 + 378 + static void fib_info_hash_free(struct hlist_head *head) 379 + { 380 + kvfree(head); 381 + } 382 + 383 + static void fib_info_hash_grow(struct net *net) 384 + { 385 + unsigned int old_size = 1 << net->ipv4.fib_info_hash_bits; 386 + struct hlist_head *new_info_hash, *old_info_hash; 387 + unsigned int i; 388 + 389 + if (net->ipv4.fib_info_cnt < old_size) 390 + return; 391 + 392 + new_info_hash = fib_info_hash_alloc(net->ipv4.fib_info_hash_bits + 1); 393 + if (!new_info_hash) 394 + return; 395 + 396 + old_info_hash = net->ipv4.fib_info_hash; 397 + net->ipv4.fib_info_hash = new_info_hash; 398 + net->ipv4.fib_info_hash_bits += 1; 399 + 400 + for (i = 0; i < old_size; i++) { 401 + struct hlist_head *head = &old_info_hash[i]; 402 + struct hlist_node *n; 403 + struct fib_info *fi; 404 + 405 + hlist_for_each_entry_safe(fi, n, head, fib_hash) 406 + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); 407 + } 408 + 409 + for (i = 0; i < old_size; i++) { 410 + struct hlist_head *lhead = &old_info_hash[old_size + i]; 411 + struct hlist_node *n; 412 + struct fib_info *fi; 413 + 414 + hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) 415 + hlist_add_head(&fi->fib_lhash, 416 + fib_info_laddrhash_bucket(fi->fib_net, 417 + fi->fib_prefsrc)); 418 + } 419 + 420 + fib_info_hash_free(old_info_hash); 352 421 } 353 422 354 423 /* no metrics, only nexthop id */ ··· 427 370 (__force u32)cfg->fc_prefsrc, 428 371 cfg->fc_priority); 429 372 hash = fib_info_hashfn_result(net, hash); 430 - head = &fib_info_hash[hash]; 373 + head = &net->ipv4.fib_info_hash[hash]; 431 374 432 375 hlist_for_each_entry(fi, head, fib_hash) { 433 - if (!net_eq(fi->fib_net, net)) 434 - continue; 435 376 if (!fi->nh || fi->nh->id != cfg->fc_nh_id) 436 377 continue; 378 + 437 379 if (cfg->fc_protocol == fi->fib_protocol && 438 380 cfg->fc_scope == fi->fib_scope && 439 381 cfg->fc_prefsrc == fi->fib_prefsrc && ··· 448 392 449 393 static struct fib_info *fib_find_info(struct 
fib_info *nfi) 450 394 { 451 - struct hlist_head *head; 395 + struct hlist_head *head = fib_info_hash_bucket(nfi); 452 396 struct fib_info *fi; 453 - unsigned int hash; 454 - 455 - hash = fib_info_hashfn(nfi); 456 - head = &fib_info_hash[hash]; 457 397 458 398 hlist_for_each_entry(fi, head, fib_hash) { 459 - if (!net_eq(fi->fib_net, nfi->fib_net)) 460 - continue; 461 399 if (fi->fib_nhs != nfi->fib_nhs) 462 400 continue; 401 + 463 402 if (nfi->fib_protocol == fi->fib_protocol && 464 403 nfi->fib_scope == fi->fib_scope && 465 404 nfi->fib_prefsrc == fi->fib_prefsrc && ··· 1290 1239 return err; 1291 1240 } 1292 1241 1293 - static struct hlist_head * 1294 - fib_info_laddrhash_bucket(const struct net *net, __be32 val) 1295 - { 1296 - u32 slot = hash_32(net_hash_mix(net) ^ (__force u32)val, 1297 - fib_info_hash_bits); 1298 - 1299 - return &fib_info_laddrhash[slot]; 1300 - } 1301 - 1302 - static void fib_info_hash_move(struct hlist_head *new_info_hash, 1303 - struct hlist_head *new_laddrhash, 1304 - unsigned int new_size) 1305 - { 1306 - struct hlist_head *old_info_hash, *old_laddrhash; 1307 - unsigned int old_size = fib_info_hash_size; 1308 - unsigned int i; 1309 - 1310 - ASSERT_RTNL(); 1311 - old_info_hash = fib_info_hash; 1312 - old_laddrhash = fib_info_laddrhash; 1313 - fib_info_hash_size = new_size; 1314 - fib_info_hash_bits = ilog2(new_size); 1315 - 1316 - for (i = 0; i < old_size; i++) { 1317 - struct hlist_head *head = &fib_info_hash[i]; 1318 - struct hlist_node *n; 1319 - struct fib_info *fi; 1320 - 1321 - hlist_for_each_entry_safe(fi, n, head, fib_hash) { 1322 - struct hlist_head *dest; 1323 - unsigned int new_hash; 1324 - 1325 - new_hash = fib_info_hashfn(fi); 1326 - dest = &new_info_hash[new_hash]; 1327 - hlist_add_head(&fi->fib_hash, dest); 1328 - } 1329 - } 1330 - fib_info_hash = new_info_hash; 1331 - 1332 - fib_info_laddrhash = new_laddrhash; 1333 - for (i = 0; i < old_size; i++) { 1334 - struct hlist_head *lhead = &old_laddrhash[i]; 1335 - struct 
hlist_node *n; 1336 - struct fib_info *fi; 1337 - 1338 - hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) { 1339 - struct hlist_head *ldest; 1340 - 1341 - ldest = fib_info_laddrhash_bucket(fi->fib_net, 1342 - fi->fib_prefsrc); 1343 - hlist_add_head(&fi->fib_lhash, ldest); 1344 - } 1345 - } 1346 - 1347 - kvfree(old_info_hash); 1348 - kvfree(old_laddrhash); 1349 - } 1350 - 1351 1242 __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, 1352 1243 unsigned char scope) 1353 1244 { ··· 1402 1409 } 1403 1410 #endif 1404 1411 1405 - err = -ENOBUFS; 1406 - 1407 - if (fib_info_cnt >= fib_info_hash_size) { 1408 - unsigned int new_size = fib_info_hash_size << 1; 1409 - struct hlist_head *new_info_hash; 1410 - struct hlist_head *new_laddrhash; 1411 - size_t bytes; 1412 - 1413 - if (!new_size) 1414 - new_size = 16; 1415 - bytes = (size_t)new_size * sizeof(struct hlist_head *); 1416 - new_info_hash = kvzalloc(bytes, GFP_KERNEL); 1417 - new_laddrhash = kvzalloc(bytes, GFP_KERNEL); 1418 - if (!new_info_hash || !new_laddrhash) { 1419 - kvfree(new_info_hash); 1420 - kvfree(new_laddrhash); 1421 - } else { 1422 - fib_info_hash_move(new_info_hash, new_laddrhash, new_size); 1423 - } 1424 - if (!fib_info_hash_size) 1425 - goto failure; 1426 - } 1412 + fib_info_hash_grow(net); 1427 1413 1428 1414 fi = kzalloc(struct_size(fi, fib_nh, nhs), GFP_KERNEL); 1429 - if (!fi) 1415 + if (!fi) { 1416 + err = -ENOBUFS; 1430 1417 goto failure; 1418 + } 1419 + 1431 1420 fi->fib_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len, extack); 1432 1421 if (IS_ERR(fi->fib_metrics)) { 1433 1422 err = PTR_ERR(fi->fib_metrics); ··· 1546 1571 refcount_set(&fi->fib_treeref, 1); 1547 1572 refcount_set(&fi->fib_clntref, 1); 1548 1573 1549 - fib_info_cnt++; 1550 - hlist_add_head(&fi->fib_hash, 1551 - &fib_info_hash[fib_info_hashfn(fi)]); 1574 + net->ipv4.fib_info_cnt++; 1575 + hlist_add_head(&fi->fib_hash, fib_info_hash_bucket(fi)); 1576 + 1552 1577 if (fi->fib_prefsrc) { 1553 1578 
struct hlist_head *head; 1554 1579 ··· 1830 1855 struct fib_info *fi; 1831 1856 int ret = 0; 1832 1857 1833 - if (!fib_info_laddrhash || local == 0) 1858 + if (!local) 1834 1859 return 0; 1835 1860 1836 1861 head = fib_info_laddrhash_bucket(net, local); ··· 2231 2256 else 2232 2257 fl4->saddr = inet_select_addr(l3mdev, 0, RT_SCOPE_LINK); 2233 2258 } 2259 + } 2260 + 2261 + int __net_init fib4_semantics_init(struct net *net) 2262 + { 2263 + unsigned int hash_bits = 4; 2264 + 2265 + net->ipv4.fib_info_hash = fib_info_hash_alloc(hash_bits); 2266 + if (!net->ipv4.fib_info_hash) 2267 + return -ENOMEM; 2268 + 2269 + net->ipv4.fib_info_hash_bits = hash_bits; 2270 + net->ipv4.fib_info_cnt = 0; 2271 + 2272 + return 0; 2273 + } 2274 + 2275 + void __net_exit fib4_semantics_exit(struct net *net) 2276 + { 2277 + fib_info_hash_free(net->ipv4.fib_info_hash); 2234 2278 }
-22
net/ipv4/fib_trie.c
··· 1187 1187 return 0; 1188 1188 } 1189 1189 1190 - static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) 1191 - { 1192 - if (plen > KEYLENGTH) { 1193 - NL_SET_ERR_MSG(extack, "Invalid prefix length"); 1194 - return false; 1195 - } 1196 - 1197 - if ((plen < KEYLENGTH) && (key << plen)) { 1198 - NL_SET_ERR_MSG(extack, 1199 - "Invalid prefix for given prefix length"); 1200 - return false; 1201 - } 1202 - 1203 - return true; 1204 - } 1205 - 1206 1190 static void fib_remove_alias(struct trie *t, struct key_vector *tp, 1207 1191 struct key_vector *l, struct fib_alias *old); 1208 1192 ··· 1206 1222 int err; 1207 1223 1208 1224 key = ntohl(cfg->fc_dst); 1209 - 1210 - if (!fib_valid_key_len(key, plen, extack)) 1211 - return -EINVAL; 1212 1225 1213 1226 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); 1214 1227 ··· 1697 1716 u32 key; 1698 1717 1699 1718 key = ntohl(cfg->fc_dst); 1700 - 1701 - if (!fib_valid_key_len(key, plen, extack)) 1702 - return -EINVAL; 1703 1719 1704 1720 l = fib_find_node(t, &tp, key); 1705 1721 if (!l)