Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'mlxsw-add-vxlan-to-the-same-hardware-domain-as-physical-bridge-ports'

Petr Machata says:

====================
mlxsw: Add VXLAN to the same hardware domain as physical bridge ports

Amit Cohen writes:

Packets which are trapped to CPU for forwarding in software data path
are handled according to driver marking of skb->offload_{,l3}_fwd_mark.
Packets which are marked as L2-forwarded in hardware will not be flooded
by the bridge to bridge ports which are in the same hardware domain as the
ingress port.

Currently, mlxsw does not add VXLAN bridge ports to the same hardware
domain as physical bridge ports despite the fact that the device is able
to forward packets to and from VXLAN tunnels in hardware. In some
scenarios this can result in remote VTEPs receiving duplicate packets.

To prevent such packet duplication, add VXLAN bridge ports to the same
hardware domain as other bridge ports.

One complication is ARP suppression which requires the local VTEP to avoid
flooding ARP packets to remote VTEPs if the local VTEP is able to reply on
behalf of remote hosts. This is currently implemented by having the device
flood ARP packets in hardware and trapping them during VXLAN encapsulation,
but marking them with skb->offload_fwd_mark=1 so that the bridge will not
re-flood them to physical bridge ports.

The above scheme will break when VXLAN bridge ports are added to the same
hardware domain as physical bridge ports as ARP packets that cannot be
suppressed by the bridge will not be able to egress the VXLAN bridge ports
due to hardware domain filtering. This is solved by trapping ARP packets
when they enter the device and not marking them as being forwarded in
hardware.

Patch set overview:
Patch #1 sets hardware to trap ARP packets at layer 2
Patches #2-#4 are preparations for setting the hardware domain of VXLAN
Patch #5 sets hardware domain of VXLAN
Patch #6 extends the VXLAN flood test to verify that this set solves the
packet duplication
====================

Link: https://patch.msgid.link/cover.1742224300.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+83 -49
+4 -18
drivers/net/ethernet/mellanox/mlxsw/spectrum.c
··· 2409 2409 /* Multicast Router Traps */ 2410 2410 MLXSW_SP_RXL_MARK(ACL1, TRAP_TO_CPU, MULTICAST, false), 2411 2411 MLXSW_SP_RXL_L3_MARK(ACL2, TRAP_TO_CPU, MULTICAST, false), 2412 - /* NVE traps */ 2413 - MLXSW_SP_RXL_MARK(NVE_ENCAP_ARP, TRAP_TO_CPU, NEIGH_DISCOVERY, false), 2414 2412 }; 2415 2413 2416 2414 static const struct mlxsw_listener mlxsw_sp1_listener[] = { ··· 5230 5232 return 0; 5231 5233 if (!mlxsw_sp_bridge_vxlan_is_valid(upper_dev, extack)) 5232 5234 return -EOPNOTSUPP; 5233 - if (cu_info->linking) { 5234 - if (!netif_running(dev)) 5235 - return 0; 5236 - /* When the bridge is VLAN-aware, the VNI of the VxLAN 5237 - * device needs to be mapped to a VLAN, but at this 5238 - * point no VLANs are configured on the VxLAN device 5239 - */ 5240 - if (br_vlan_enabled(upper_dev)) 5241 - return 0; 5235 + if (!netif_running(dev)) 5236 + return 0; 5237 + if (cu_info->linking) 5242 5238 return mlxsw_sp_bridge_vxlan_join(mlxsw_sp, upper_dev, 5243 5239 dev, 0, extack); 5244 - } else { 5245 - /* VLANs were already flushed, which triggered the 5246 - * necessary cleanup 5247 - */ 5248 - if (br_vlan_enabled(upper_dev)) 5249 - return 0; 5240 + else 5250 5241 mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, dev); 5251 - } 5252 5242 break; 5253 5243 case NETDEV_PRE_UP: 5254 5244 upper_dev = netdev_master_upper_dev_get(dev);
+2 -2
drivers/net/ethernet/mellanox/mlxsw/spectrum.h
··· 661 661 const struct net_device *br_dev); 662 662 int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, 663 663 const struct net_device *br_dev, 664 - const struct net_device *vxlan_dev, u16 vid, 664 + struct net_device *vxlan_dev, u16 vid, 665 665 struct netlink_ext_ack *extack); 666 666 void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, 667 - const struct net_device *vxlan_dev); 667 + struct net_device *vxlan_dev); 668 668 extern struct notifier_block mlxsw_sp_switchdev_notifier; 669 669 670 670 /* spectrum.c */
+46 -20
drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c
··· 2929 2929 mlxsw_sp_bridge_port_put(mlxsw_sp->bridge, bridge_port); 2930 2930 } 2931 2931 2932 - int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, 2933 - const struct net_device *br_dev, 2934 - const struct net_device *vxlan_dev, u16 vid, 2935 - struct netlink_ext_ack *extack) 2936 - { 2937 - struct mlxsw_sp_bridge_device *bridge_device; 2938 - 2939 - bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); 2940 - if (WARN_ON(!bridge_device)) 2941 - return -EINVAL; 2942 - 2943 - return bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, 2944 - extack); 2945 - } 2946 - 2947 - void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, 2948 - const struct net_device *vxlan_dev) 2932 + static void __mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, 2933 + const struct net_device *vxlan_dev) 2949 2934 { 2950 2935 struct vxlan_dev *vxlan = netdev_priv(vxlan_dev); 2951 2936 struct mlxsw_sp_fid *fid; ··· 2946 2961 */ 2947 2962 mlxsw_sp_fid_put(fid); 2948 2963 mlxsw_sp_fid_put(fid); 2964 + } 2965 + 2966 + int mlxsw_sp_bridge_vxlan_join(struct mlxsw_sp *mlxsw_sp, 2967 + const struct net_device *br_dev, 2968 + struct net_device *vxlan_dev, u16 vid, 2969 + struct netlink_ext_ack *extack) 2970 + { 2971 + struct mlxsw_sp_bridge_device *bridge_device; 2972 + struct mlxsw_sp_port *mlxsw_sp_port; 2973 + int err; 2974 + 2975 + bridge_device = mlxsw_sp_bridge_device_find(mlxsw_sp->bridge, br_dev); 2976 + if (WARN_ON(!bridge_device)) 2977 + return -EINVAL; 2978 + 2979 + mlxsw_sp_port = mlxsw_sp_port_dev_lower_find(bridge_device->dev); 2980 + if (!mlxsw_sp_port) 2981 + return -EINVAL; 2982 + 2983 + err = bridge_device->ops->vxlan_join(bridge_device, vxlan_dev, vid, 2984 + extack); 2985 + if (err) 2986 + return err; 2987 + 2988 + err = switchdev_bridge_port_offload(vxlan_dev, mlxsw_sp_port->dev, 2989 + NULL, NULL, NULL, false, extack); 2990 + if (err) 2991 + goto err_bridge_port_offload; 2992 + 2993 + return 0; 2994 + 2995 + 
err_bridge_port_offload: 2996 + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 2997 + return err; 2998 + } 2999 + 3000 + void mlxsw_sp_bridge_vxlan_leave(struct mlxsw_sp *mlxsw_sp, 3001 + struct net_device *vxlan_dev) 3002 + { 3003 + switchdev_bridge_port_unoffload(vxlan_dev, NULL, NULL, NULL); 3004 + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 2949 3005 } 2950 3006 2951 3007 static void ··· 3893 3867 mlxsw_sp_fid_put(fid); 3894 3868 return -EINVAL; 3895 3869 } 3896 - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3870 + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3897 3871 mlxsw_sp_fid_put(fid); 3898 3872 return 0; 3899 3873 } ··· 3909 3883 /* Fourth case: Thew new VLAN is PVID, which means the VLAN currently 3910 3884 * mapped to the VNI should be unmapped 3911 3885 */ 3912 - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3886 + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3913 3887 mlxsw_sp_fid_put(fid); 3914 3888 3915 3889 /* Fifth case: The new VLAN is also egress untagged, which means the ··· 3949 3923 if (mlxsw_sp_fid_8021q_vid(fid) != vid) 3950 3924 goto out; 3951 3925 3952 - mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3926 + __mlxsw_sp_bridge_vxlan_leave(mlxsw_sp, vxlan_dev); 3953 3927 3954 3928 out: 3955 3929 mlxsw_sp_fid_put(fid);
+6 -6
drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
··· 959 959 }, 960 960 { 961 961 .trap = MLXSW_SP_TRAP_CONTROL(ARP_REQUEST, NEIGH_DISCOVERY, 962 - MIRROR), 962 + TRAP), 963 963 .listeners_arr = { 964 - MLXSW_SP_RXL_MARK(ROUTER_ARPBC, NEIGH_DISCOVERY, 965 - TRAP_TO_CPU, false), 964 + MLXSW_SP_RXL_NO_MARK(ARPBC, NEIGH_DISCOVERY, 965 + TRAP_TO_CPU, false), 966 966 }, 967 967 }, 968 968 { 969 969 .trap = MLXSW_SP_TRAP_CONTROL(ARP_RESPONSE, NEIGH_DISCOVERY, 970 - MIRROR), 970 + TRAP), 971 971 .listeners_arr = { 972 - MLXSW_SP_RXL_MARK(ROUTER_ARPUC, NEIGH_DISCOVERY, 973 - TRAP_TO_CPU, false), 972 + MLXSW_SP_RXL_NO_MARK(ARPUC, NEIGH_DISCOVERY, 973 + TRAP_TO_CPU, false), 974 974 }, 975 975 }, 976 976 {
+2 -3
drivers/net/ethernet/mellanox/mlxsw/trap.h
··· 29 29 MLXSW_TRAP_ID_FDB_MISMATCH = 0x3B, 30 30 MLXSW_TRAP_ID_FID_MISS = 0x3D, 31 31 MLXSW_TRAP_ID_DECAP_ECN0 = 0x40, 32 + MLXSW_TRAP_ID_ARPBC = 0x50, 33 + MLXSW_TRAP_ID_ARPUC = 0x51, 32 34 MLXSW_TRAP_ID_MTUERROR = 0x52, 33 35 MLXSW_TRAP_ID_TTLERROR = 0x53, 34 36 MLXSW_TRAP_ID_LBERROR = 0x54, ··· 68 66 MLXSW_TRAP_ID_HOST_MISS_IPV6 = 0x92, 69 67 MLXSW_TRAP_ID_IPIP_DECAP_ERROR = 0xB1, 70 68 MLXSW_TRAP_ID_NVE_DECAP_ARP = 0xB8, 71 - MLXSW_TRAP_ID_NVE_ENCAP_ARP = 0xBD, 72 69 MLXSW_TRAP_ID_IPV4_BFD = 0xD0, 73 70 MLXSW_TRAP_ID_IPV6_BFD = 0xD1, 74 71 MLXSW_TRAP_ID_ROUTER_ALERT_IPV4 = 0xD6, 75 72 MLXSW_TRAP_ID_ROUTER_ALERT_IPV6 = 0xD7, 76 - MLXSW_TRAP_ID_ROUTER_ARPBC = 0xE0, 77 - MLXSW_TRAP_ID_ROUTER_ARPUC = 0xE1, 78 73 MLXSW_TRAP_ID_DISCARD_NON_ROUTABLE = 0x11A, 79 74 MLXSW_TRAP_ID_DISCARD_ROUTER2 = 0x130, 80 75 MLXSW_TRAP_ID_DISCARD_ROUTER3 = 0x131,
+8
tools/testing/selftests/net/forwarding/vxlan_bridge_1d.sh
··· 428 428 test_flood() 429 429 { 430 430 __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood" 431 + 432 + # Add an entry with arbitrary destination IP. Verify that packets are 433 + # not duplicated (this can happen if hardware floods the packets, and 434 + # then traps them due to misconfiguration, so software data path repeats 435 + # flooding and resends packets). 436 + bridge fdb append dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self 437 + __test_flood de:ad:be:ef:13:37 192.0.2.100 "flood, unresolved FDB entry" 438 + bridge fdb del dev vx1 00:00:00:00:00:00 dst 198.51.100.1 self 431 439 } 432 440 433 441 vxlan_fdb_add_del()
+15
tools/testing/selftests/net/forwarding/vxlan_bridge_1q.sh
··· 539 539 10 10 0 10 0 540 540 __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 "flood vlan 20" \ 541 541 10 0 10 0 10 542 + 543 + # Add entries with arbitrary destination IP. Verify that packets are 544 + # not duplicated (this can happen if hardware floods the packets, and 545 + # then traps them due to misconfiguration, so software data path repeats 546 + # flooding and resends packets). 547 + bridge fdb append dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self 548 + bridge fdb append dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self 549 + 550 + __test_flood de:ad:be:ef:13:37 192.0.2.100 10 \ 551 + "flood vlan 10, unresolved FDB entry" 10 10 0 10 0 552 + __test_flood ca:fe:be:ef:13:37 198.51.100.100 20 \ 553 + "flood vlan 20, unresolved FDB entry" 10 0 10 0 10 554 + 555 + bridge fdb del dev vx20 00:00:00:00:00:00 dst 203.0.113.2 self 556 + bridge fdb del dev vx10 00:00:00:00:00:00 dst 203.0.113.1 self 542 557 } 543 558 544 559 vxlan_fdb_add_del()