Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/rxe: Support RDMA link creation and destruction per net namespace

After introducing dellink handling and per-net namespace management
for IPv4 and IPv6 sockets, extend rxe to create and destroy RDMA links
within each network namespace.

With this change, RDMA links can be instantiated both in init_net and
in other network namespaces. The lifetime of each RDMA link is now tied
to its network namespace, and the link is cleaned up when either the
namespace or the link itself is removed.

This ensures rxe behaves correctly in multi-namespace environments and
keeps socket and RDMA link resources consistent across namespace
creation and teardown.

Reviewed-by: David Ahern <dsahern@kernel.org>
Signed-off-by: Zhu Yanjun <yanjun.zhu@linux.dev>
Link: https://patch.msgid.link/20260313023058.13020-4-yanjun.zhu@linux.dev
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Zhu Yanjun and committed by Leon Romanovsky
f1327abd 13f2a53c

+146 -45
+33 -5
drivers/infiniband/sw/rxe/rxe.c
··· 8 8 #include <net/addrconf.h> 9 9 #include "rxe.h" 10 10 #include "rxe_loc.h" 11 + #include "rxe_net.h" 12 + #include "rxe_ns.h" 11 13 12 14 MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); 13 15 MODULE_DESCRIPTION("Soft RDMA transport"); ··· 202 200 port->mtu_cap = ib_mtu_enum_to_int(mtu); 203 201 } 204 202 203 + static struct rdma_link_ops rxe_link_ops; 204 + 205 205 /* called by ifc layer to create new rxe device. 206 206 * The caller should allocate memory for rxe by calling ib_alloc_device. 207 207 */ ··· 212 208 { 213 209 rxe_init(rxe, ndev); 214 210 rxe_set_mtu(rxe, mtu); 211 + rxe->ib_dev.link_ops = &rxe_link_ops; 215 212 216 213 return rxe_register_device(rxe, ibdev_name, ndev); 217 214 } ··· 236 231 goto err; 237 232 } 238 233 234 + err = rxe_net_init(ndev); 235 + if (err) 236 + return err; 237 + 239 238 err = rxe_net_add(ibdev_name, ndev); 240 239 if (err) { 241 240 rxe_err("failed to add %s\n", ndev->name); ··· 249 240 return err; 250 241 } 251 242 243 + static int rxe_dellink(struct ib_device *dev) 244 + { 245 + rxe_net_del(dev); 246 + 247 + return 0; 248 + } 249 + 252 250 static struct rdma_link_ops rxe_link_ops = { 253 251 .type = "rxe", 254 252 .newlink = rxe_newlink, 253 + .dellink = rxe_dellink, 255 254 }; 256 255 257 256 static int __init rxe_module_init(void) ··· 270 253 if (err) 271 254 return err; 272 255 273 - err = rxe_net_init(); 274 - if (err) { 275 - rxe_destroy_wq(); 276 - return err; 277 - } 256 + err = rxe_namespace_init(); 257 + if (err) 258 + goto err_destroy_wq; 259 + 260 + err = rxe_register_notifier(); 261 + if (err) 262 + goto err_namespace_exit; 278 263 279 264 rdma_link_register(&rxe_link_ops); 265 + 280 266 pr_info("loaded\n"); 281 267 return 0; 268 + 269 + err_namespace_exit: 270 + rxe_namespace_exit(); 271 + err_destroy_wq: 272 + rxe_destroy_wq(); 273 + return err; 282 274 } 283 275 284 276 static void __exit rxe_module_exit(void) ··· 296 270 ib_unregister_driver(RDMA_DRIVER_RXE); 297 271 
rxe_net_exit(); 298 272 rxe_destroy_wq(); 273 + 274 + rxe_namespace_exit(); 299 275 300 276 pr_info("unloaded\n"); 301 277 }
+110 -34
drivers/infiniband/sw/rxe/rxe_net.c
··· 17 17 #include "rxe.h" 18 18 #include "rxe_net.h" 19 19 #include "rxe_loc.h" 20 + #include "rxe_ns.h" 20 21 21 - static struct rxe_recv_sockets recv_sockets; 22 + #ifndef SK_REF_FOR_TUNNEL 23 + #define SK_REF_FOR_TUNNEL 2 24 + #endif 22 25 23 26 #ifdef CONFIG_DEBUG_LOCK_ALLOC 24 27 /* ··· 104 101 } 105 102 106 103 static struct dst_entry *rxe_find_route4(struct rxe_qp *qp, 104 + struct net *net, 107 105 struct net_device *ndev, 108 106 struct in_addr *saddr, 109 107 struct in_addr *daddr) 110 108 { 111 109 struct rtable *rt; 112 - struct flowi4 fl = { { 0 } }; 110 + struct flowi4 fl = {}; 113 111 114 - memset(&fl, 0, sizeof(fl)); 115 112 fl.flowi4_oif = ndev->ifindex; 116 113 memcpy(&fl.saddr, saddr, sizeof(*saddr)); 117 114 memcpy(&fl.daddr, daddr, sizeof(*daddr)); 118 115 fl.flowi4_proto = IPPROTO_UDP; 119 116 120 - rt = ip_route_output_key(&init_net, &fl); 117 + rt = ip_route_output_key(net, &fl); 121 118 if (IS_ERR(rt)) { 122 119 rxe_dbg_qp(qp, "no route to %pI4\n", &daddr->s_addr); 123 120 return NULL; ··· 128 125 129 126 #if IS_ENABLED(CONFIG_IPV6) 130 127 static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, 128 + struct net *net, 131 129 struct net_device *ndev, 132 130 struct in6_addr *saddr, 133 131 struct in6_addr *daddr) 134 132 { 135 133 struct dst_entry *ndst; 136 - struct flowi6 fl6 = { { 0 } }; 134 + struct flowi6 fl6 = {}; 137 135 138 - memset(&fl6, 0, sizeof(fl6)); 139 136 fl6.flowi6_oif = ndev->ifindex; 140 137 memcpy(&fl6.saddr, saddr, sizeof(*saddr)); 141 138 memcpy(&fl6.daddr, daddr, sizeof(*daddr)); 142 139 fl6.flowi6_proto = IPPROTO_UDP; 143 140 144 - ndst = ipv6_stub->ipv6_dst_lookup_flow(sock_net(recv_sockets.sk6->sk), 145 - recv_sockets.sk6->sk, &fl6, 141 + ndst = ipv6_stub->ipv6_dst_lookup_flow(net, 142 + rxe_ns_pernet_sk6(net), &fl6, 146 143 NULL); 147 144 if (IS_ERR(ndst)) { 148 145 rxe_dbg_qp(qp, "no route to %pI6\n", daddr); ··· 163 160 #else 164 161 165 162 static struct dst_entry *rxe_find_route6(struct rxe_qp *qp, 163 + 
struct net *net, 166 164 struct net_device *ndev, 167 165 struct in6_addr *saddr, 168 166 struct in6_addr *daddr) ··· 178 174 struct rxe_av *av) 179 175 { 180 176 struct dst_entry *dst = NULL; 177 + struct net *net; 181 178 182 179 if (qp_type(qp) == IB_QPT_RC) 183 180 dst = sk_dst_get(qp->sk->sk); ··· 187 182 if (dst) 188 183 dst_release(dst); 189 184 185 + net = dev_net(ndev); 186 + 190 187 if (av->network_type == RXE_NETWORK_TYPE_IPV4) { 191 188 struct in_addr *saddr; 192 189 struct in_addr *daddr; 193 190 194 191 saddr = &av->sgid_addr._sockaddr_in.sin_addr; 195 192 daddr = &av->dgid_addr._sockaddr_in.sin_addr; 196 - dst = rxe_find_route4(qp, ndev, saddr, daddr); 193 + dst = rxe_find_route4(qp, net, ndev, saddr, daddr); 197 194 } else if (av->network_type == RXE_NETWORK_TYPE_IPV6) { 198 195 struct in6_addr *saddr6; 199 196 struct in6_addr *daddr6; 200 197 201 198 saddr6 = &av->sgid_addr._sockaddr_in6.sin6_addr; 202 199 daddr6 = &av->dgid_addr._sockaddr_in6.sin6_addr; 203 - dst = rxe_find_route6(qp, ndev, saddr6, daddr6); 200 + dst = rxe_find_route6(qp, net, ndev, saddr6, daddr6); 204 201 #if IS_ENABLED(CONFIG_IPV6) 205 202 if (dst) 206 203 qp->dst_cookie = ··· 631 624 return 0; 632 625 } 633 626 627 + static void rxe_sock_put(struct sock *sk, 628 + void (*set_sk)(struct net *, struct sock *), 629 + struct net *net) 630 + { 631 + if (refcount_read(&sk->sk_refcnt) > SK_REF_FOR_TUNNEL) { 632 + __sock_put(sk); 633 + } else { 634 + rxe_release_udp_tunnel(sk->sk_socket); 635 + sk = NULL; 636 + set_sk(net, sk); 637 + } 638 + } 639 + 640 + void rxe_net_del(struct ib_device *dev) 641 + { 642 + struct rxe_dev *rxe = container_of(dev, struct rxe_dev, ib_dev); 643 + struct net_device *ndev; 644 + struct sock *sk; 645 + struct net *net; 646 + 647 + ndev = rxe_ib_device_get_netdev(&rxe->ib_dev); 648 + if (!ndev) 649 + return; 650 + 651 + net = dev_net(ndev); 652 + 653 + sk = rxe_ns_pernet_sk4(net); 654 + if (sk) 655 + rxe_sock_put(sk, rxe_ns_pernet_set_sk4, net); 656 + 657 + 
sk = rxe_ns_pernet_sk6(net); 658 + if (sk) 659 + rxe_sock_put(sk, rxe_ns_pernet_set_sk6, net); 660 + 661 + dev_put(ndev); 662 + } 663 + 634 664 static void rxe_port_event(struct rxe_dev *rxe, 635 665 enum ib_event_type event) 636 666 { ··· 724 680 switch (event) { 725 681 case NETDEV_UNREGISTER: 726 682 ib_unregister_device_queued(&rxe->ib_dev); 683 + rxe_net_del(&rxe->ib_dev); 727 684 break; 728 685 case NETDEV_CHANGEMTU: 729 686 rxe_dbg_dev(rxe, "%s changed mtu to %d\n", ndev->name, ndev->mtu); ··· 754 709 .notifier_call = rxe_notify, 755 710 }; 756 711 757 - static int rxe_net_ipv4_init(void) 712 + static int rxe_net_ipv4_init(struct net *net) 758 713 { 759 - recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net, 760 - htons(ROCE_V2_UDP_DPORT), false); 761 - if (IS_ERR(recv_sockets.sk4)) { 762 - recv_sockets.sk4 = NULL; 714 + struct sock *sk; 715 + struct socket *sock; 716 + 717 + sk = rxe_ns_pernet_sk4(net); 718 + if (sk) { 719 + sock_hold(sk); 720 + return 0; 721 + } 722 + 723 + sock = rxe_setup_udp_tunnel(net, htons(ROCE_V2_UDP_DPORT), false); 724 + if (IS_ERR(sock)) { 763 725 pr_err("Failed to create IPv4 UDP tunnel\n"); 764 726 return -1; 765 727 } 728 + rxe_ns_pernet_set_sk4(net, sock->sk); 766 729 767 730 return 0; 768 731 } 769 732 770 - static int rxe_net_ipv6_init(void) 733 + static int rxe_net_ipv6_init(struct net *net) 771 734 { 772 735 #if IS_ENABLED(CONFIG_IPV6) 736 + struct sock *sk; 737 + struct socket *sock; 773 738 774 - recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net, 775 - htons(ROCE_V2_UDP_DPORT), true); 776 - if (PTR_ERR(recv_sockets.sk6) == -EAFNOSUPPORT) { 777 - recv_sockets.sk6 = NULL; 739 + sk = rxe_ns_pernet_sk6(net); 740 + if (sk) { 741 + sock_hold(sk); 742 + return 0; 743 + } 744 + 745 + sock = rxe_setup_udp_tunnel(net, htons(ROCE_V2_UDP_DPORT), true); 746 + if (PTR_ERR(sock) == -EAFNOSUPPORT) { 778 747 pr_warn("IPv6 is not supported, can not create a UDPv6 socket\n"); 779 748 return 0; 780 749 } 781 750 782 - if 
(IS_ERR(recv_sockets.sk6)) { 783 - recv_sockets.sk6 = NULL; 751 + if (IS_ERR(sock)) { 784 752 pr_err("Failed to create IPv6 UDP tunnel\n"); 785 753 return -1; 786 754 } 755 + 756 + rxe_ns_pernet_set_sk6(net, sock->sk); 757 + 787 758 #endif 759 + return 0; 760 + } 761 + 762 + int rxe_register_notifier(void) 763 + { 764 + int err; 765 + 766 + err = register_netdevice_notifier(&rxe_net_notifier); 767 + if (err) { 768 + pr_err("Failed to register netdev notifier\n"); 769 + return -1; 770 + } 771 + 788 772 return 0; 789 773 } 790 774 791 775 void rxe_net_exit(void) 792 776 { 793 - rxe_release_udp_tunnel(recv_sockets.sk6); 794 - rxe_release_udp_tunnel(recv_sockets.sk4); 795 777 unregister_netdevice_notifier(&rxe_net_notifier); 796 778 } 797 779 798 - int rxe_net_init(void) 780 + int rxe_net_init(struct net_device *ndev) 799 781 { 782 + struct net *net; 783 + struct sock *sk; 800 784 int err; 801 785 802 - recv_sockets.sk6 = NULL; 786 + net = dev_net(ndev); 803 787 804 - err = rxe_net_ipv4_init(); 788 + err = rxe_net_ipv4_init(net); 805 789 if (err) 806 790 return err; 807 - err = rxe_net_ipv6_init(); 791 + 792 + err = rxe_net_ipv6_init(net); 808 793 if (err) 809 794 goto err_out; 810 - err = register_netdevice_notifier(&rxe_net_notifier); 811 - if (err) { 812 - pr_err("Failed to register netdev notifier\n"); 813 - goto err_out; 814 - } 795 + 815 796 return 0; 797 + 816 798 err_out: 817 - rxe_net_exit(); 799 + /* If ipv6 error, release ipv4 resource */ 800 + sk = rxe_ns_pernet_sk4(net); 801 + if (sk) 802 + rxe_sock_put(sk, rxe_ns_pernet_set_sk4, net); 803 + 818 804 return err; 819 805 }
+3 -6
drivers/infiniband/sw/rxe/rxe_net.h
··· 11 11 #include <net/if_inet6.h> 12 12 #include <linux/module.h> 13 13 14 - struct rxe_recv_sockets { 15 - struct socket *sk4; 16 - struct socket *sk6; 17 - }; 18 - 19 14 int rxe_net_add(const char *ibdev_name, struct net_device *ndev); 15 + void rxe_net_del(struct ib_device *dev); 20 16 21 - int rxe_net_init(void); 17 + int rxe_register_notifier(void); 18 + int rxe_net_init(struct net_device *ndev); 22 19 void rxe_net_exit(void); 23 20 24 21 #endif /* RXE_NET_H */