Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

net/mlx5: Propagate LAG effective max_tx_speed to vports

Currently, vports report only their parent's uplink speed, which in LAG
setups does not reflect the true aggregated bandwidth. This makes it
hard for upper-layer software to optimize load balancing decisions
based on accurate bandwidth information.

Fix the issue by calculating the maximum possible speed of a LAG as
the sum of the speeds of all active uplinks that are part of the LAG.
Propagate this effective max speed to the vports associated with the
LAG whenever a relevant event occurs, such as a physical port link
state change or LAG creation/modification.

With this change, upper-layer components receive accurate bandwidth
information corresponding to the active members of the LAG and can
make better load balancing decisions.
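
For illustration, here is a minimal user-space sketch of the aggregation idea
described above (all names and values are illustrative, not driver API; the
actual implementation is in the lag.c hunk below):

#include <stdio.h>

/* Illustrative stand-ins; the real constants live in the kernel headers. */
#define SPEED_UNKNOWN      (-1)
#define MAX_TX_SPEED_UNIT  100  /* firmware field is expressed in 100 Mbps units */

struct lag_member {
        int active;          /* link is up and the port is part of the LAG */
        int max_speed_mbps;  /* per-port maximum link speed in Mbps */
};

/* Sum the max speeds of all active members and convert to 100 Mbps units. */
static int lag_effective_speed_units(const struct lag_member *m, int n)
{
        long sum_mbps = 0;
        int i;

        for (i = 0; i < n; i++) {
                if (!m[i].active || m[i].max_speed_mbps == SPEED_UNKNOWN)
                        continue;
                sum_mbps += m[i].max_speed_mbps;
        }
        return (int)(sum_mbps / MAX_TX_SPEED_UNIT);
}

int main(void)
{
        struct lag_member members[] = {
                { .active = 1, .max_speed_mbps = 100000 }, /* 100 Gbps uplink */
                { .active = 1, .max_speed_mbps = 100000 }, /* 100 Gbps uplink */
        };

        /* Two active 100 Gbps ports -> 200000 Mbps -> 2000 units of 100 Mbps. */
        printf("effective max_tx_speed = %d (x100 Mbps)\n",
               lag_effective_speed_units(members, 2));
        return 0;
}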

Signed-off-by: Or Har-Toov <ohartoov@nvidia.com>
Reviewed-by: Maher Sanalla <msanalla@nvidia.com>
Reviewed-by: Mark Bloch <mbloch@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Or Har-Toov and committed by Leon Romanovsky
50f1d188 3df5dd46

241 additions across 6 files

drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c (+158)
···
                ldev->mode != MLX5_LAG_MODE_MPESW;
 }
 
+#ifdef CONFIG_MLX5_ESWITCH
+static int
+mlx5_lag_sum_devices_speed(struct mlx5_lag *ldev, u32 *sum_speed,
+                           int (*get_speed)(struct mlx5_core_dev *, u32 *))
+{
+        struct mlx5_core_dev *pf_mdev;
+        int pf_idx;
+        u32 speed;
+        int ret;
+
+        *sum_speed = 0;
+        mlx5_ldev_for_each(pf_idx, 0, ldev) {
+                pf_mdev = ldev->pf[pf_idx].dev;
+                if (!pf_mdev)
+                        continue;
+
+                ret = get_speed(pf_mdev, &speed);
+                if (ret) {
+                        mlx5_core_dbg(pf_mdev,
+                                      "Failed to get device speed using %ps. Device %s speed is not available (err=%d)\n",
+                                      get_speed, dev_name(pf_mdev->device),
+                                      ret);
+                        return ret;
+                }
+
+                *sum_speed += speed;
+        }
+
+        return 0;
+}
+
+static int mlx5_lag_sum_devices_max_speed(struct mlx5_lag *ldev, u32 *max_speed)
+{
+        return mlx5_lag_sum_devices_speed(ldev, max_speed,
+                                          mlx5_port_max_linkspeed);
+}
+
+static void mlx5_lag_modify_device_vports_speed(struct mlx5_core_dev *mdev,
+                                                u32 speed)
+{
+        u16 op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT;
+        struct mlx5_eswitch *esw = mdev->priv.eswitch;
+        struct mlx5_vport *vport;
+        unsigned long i;
+        int ret;
+
+        if (!esw)
+                return;
+
+        if (!MLX5_CAP_ESW(mdev, esw_vport_state_max_tx_speed))
+                return;
+
+        mlx5_esw_for_each_vport(esw, i, vport) {
+                if (!vport)
+                        continue;
+
+                if (vport->vport == MLX5_VPORT_UPLINK)
+                        continue;
+
+                ret = mlx5_modify_vport_max_tx_speed(mdev, op_mod,
+                                                     vport->vport, true, speed);
+                if (ret)
+                        mlx5_core_dbg(mdev,
+                                      "Failed to set vport %d speed %d, err=%d\n",
+                                      vport->vport, speed, ret);
+        }
+}
+
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev)
+{
+        struct mlx5_core_dev *mdev;
+        u32 speed;
+        int pf_idx;
+
+        speed = ldev->tracker.bond_speed_mbps;
+
+        if (speed == SPEED_UNKNOWN)
+                return;
+
+        /* If speed is not set, use the sum of max speeds of all PFs */
+        if (!speed && mlx5_lag_sum_devices_max_speed(ldev, &speed))
+                return;
+
+        speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+
+        mlx5_ldev_for_each(pf_idx, 0, ldev) {
+                mdev = ldev->pf[pf_idx].dev;
+                if (!mdev)
+                        continue;
+
+                mlx5_lag_modify_device_vports_speed(mdev, speed);
+        }
+}
+
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev)
+{
+        struct mlx5_core_dev *mdev;
+        u32 speed;
+        int pf_idx;
+        int ret;
+
+        mlx5_ldev_for_each(pf_idx, 0, ldev) {
+                mdev = ldev->pf[pf_idx].dev;
+                if (!mdev)
+                        continue;
+
+                ret = mlx5_port_oper_linkspeed(mdev, &speed);
+                if (ret) {
+                        mlx5_core_dbg(mdev,
+                                      "Failed to reset vports speed for device %s. Oper speed is not available (err=%d)\n",
+                                      dev_name(mdev->device), ret);
+                        continue;
+                }
+
+                speed = speed / MLX5_MAX_TX_SPEED_UNIT;
+                mlx5_lag_modify_device_vports_speed(mdev, speed);
+        }
+}
+#endif
+
 static void mlx5_do_bond(struct mlx5_lag *ldev)
 {
         int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
···
                                                     ndev);
                         dev_put(ndev);
                 }
+                mlx5_lag_set_vports_agg_speed(ldev);
         } else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
                 mlx5_modify_lag(ldev, &tracker);
+                mlx5_lag_set_vports_agg_speed(ldev);
         } else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
+                mlx5_lag_reset_vports_speed(ldev);
                 mlx5_disable_lag(ldev);
         }
 }
···
         return 1;
 }
 
+static void mlx5_lag_update_tracker_speed(struct lag_tracker *tracker,
+                                          struct net_device *ndev)
+{
+        struct ethtool_link_ksettings lksettings;
+        struct net_device *bond_dev;
+        int err;
+
+        if (netif_is_lag_master(ndev))
+                bond_dev = ndev;
+        else
+                bond_dev = netdev_master_upper_dev_get(ndev);
+
+        if (!bond_dev) {
+                tracker->bond_speed_mbps = SPEED_UNKNOWN;
+                return;
+        }
+
+        err = __ethtool_get_link_ksettings(bond_dev, &lksettings);
+        if (err) {
+                netdev_dbg(bond_dev,
+                           "Failed to get speed for bond dev %s, err=%d\n",
+                           bond_dev->name, err);
+                tracker->bond_speed_mbps = SPEED_UNKNOWN;
+                return;
+        }
+
+        if (lksettings.base.speed == SPEED_UNKNOWN)
+                tracker->bond_speed_mbps = 0;
+        else
+                tracker->bond_speed_mbps = lksettings.base.speed;
+}
+
 /* this handler is always registered to netdev events */
 static int mlx5_lag_netdev_event(struct notifier_block *this,
                                  unsigned long event, void *ptr)
···
                 changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
                 break;
         }
+
+        if (changed)
+                mlx5_lag_update_tracker_speed(&tracker, ndev);
 
         ldev->tracker = tracker;
drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h (+9)
···
         unsigned int is_bonded:1;
         unsigned int has_inactive:1;
         enum netdev_lag_hash hash_type;
+        u32 bond_speed_mbps;
 };
 
 /* LAG data of a ConnectX card.
···
 int mlx5_deactivate_lag(struct mlx5_lag *ldev);
 void mlx5_lag_add_devices(struct mlx5_lag *ldev);
 struct mlx5_devcom_comp_dev *mlx5_lag_get_devcom_comp(struct mlx5_lag *ldev);
+
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev);
+void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev);
+#else
+static inline void mlx5_lag_set_vports_agg_speed(struct mlx5_lag *ldev) {}
+static inline void mlx5_lag_reset_vports_speed(struct mlx5_lag *ldev) {}
+#endif
 
 static inline bool mlx5_lag_is_supported(struct mlx5_core_dev *dev)
 {
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h (+1)
···
 u32 mlx5_port_info2linkmodes(struct mlx5_core_dev *mdev,
                              struct mlx5_link_info *info,
                              bool force_legacy);
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed);
 
 #define MLX5_PPS_CAP(mdev) (MLX5_CAP_GEN((mdev), pps) && \
drivers/net/ethernet/mellanox/mlx5/core/port.c (+24)
···
         return link_modes;
 }
 
+int mlx5_port_oper_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
+{
+        const struct mlx5_link_info *table;
+        struct mlx5_port_eth_proto eproto;
+        u32 oper_speed = 0;
+        u32 max_size;
+        bool ext;
+        int err;
+        int i;
+
+        ext = mlx5_ptys_ext_supported(mdev);
+        err = mlx5_port_query_eth_proto(mdev, 1, ext, &eproto);
+        if (err)
+                return err;
+
+        mlx5e_port_get_link_mode_info_arr(mdev, &table, &max_size, false);
+        for (i = 0; i < max_size; ++i)
+                if (eproto.oper & MLX5E_PROT_MASK(i))
+                        oper_speed = max(oper_speed, table[i].speed);
+
+        *speed = oper_speed;
+        return 0;
+}
+
 int mlx5_port_max_linkspeed(struct mlx5_core_dev *mdev, u32 *speed)
 {
         const struct mlx5_link_info *table;
drivers/net/ethernet/mellanox/mlx5/core/vport.c (+45)
···
         return MLX5_GET(query_vport_state_out, out, state);
 }
 
+static int mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
+                                        u16 vport, u8 other_vport,
+                                        u8 *admin_state)
+{
+        u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
+        u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
+        int err;
+
+        MLX5_SET(query_vport_state_in, in, opcode,
+                 MLX5_CMD_OP_QUERY_VPORT_STATE);
+        MLX5_SET(query_vport_state_in, in, op_mod, opmod);
+        MLX5_SET(query_vport_state_in, in, vport_number, vport);
+        MLX5_SET(query_vport_state_in, in, other_vport, other_vport);
+
+        err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
+        if (err)
+                return err;
+
+        *admin_state = MLX5_GET(query_vport_state_out, out, admin_state);
+        return 0;
+}
+
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
                                   u16 vport, u8 other_vport, u8 state)
 {
···
         MLX5_SET(modify_vport_state_in, in, vport_number, vport);
         MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
         MLX5_SET(modify_vport_state_in, in, admin_state, state);
+
+        return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
+}
+
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+                                   u16 vport, u8 other_vport, u16 max_tx_speed)
+{
+        u32 in[MLX5_ST_SZ_DW(modify_vport_state_in)] = {};
+        u8 admin_state;
+        int err;
+
+        err = mlx5_query_vport_admin_state(mdev, opmod, vport, other_vport,
+                                           &admin_state);
+        if (err)
+                return err;
+
+        MLX5_SET(modify_vport_state_in, in, opcode,
+                 MLX5_CMD_OP_MODIFY_VPORT_STATE);
+        MLX5_SET(modify_vport_state_in, in, op_mod, opmod);
+        MLX5_SET(modify_vport_state_in, in, vport_number, vport);
+        MLX5_SET(modify_vport_state_in, in, other_vport, other_vport);
+        MLX5_SET(modify_vport_state_in, in, admin_state, admin_state);
+        MLX5_SET(modify_vport_state_in, in, max_tx_speed, max_tx_speed);
 
         return mlx5_cmd_exec_in(mdev, modify_vport_state, in);
 }
include/linux/mlx5/vport.h (+4)
···
         (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && \
          mlx5_core_is_pf(mdev))
 
+#define MLX5_MAX_TX_SPEED_UNIT 100
+
 enum {
         MLX5_CAP_INLINE_MODE_L2,
         MLX5_CAP_INLINE_MODE_VPORT_CONTEXT,
···
 u8 mlx5_query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport);
 int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod,
                                   u16 vport, u8 other_vport, u8 state);
+int mlx5_modify_vport_max_tx_speed(struct mlx5_core_dev *mdev, u8 opmod,
+                                   u16 vport, u8 other_vport, u16 max_tx_speed);
 int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev,
                                      u16 vport, bool other, u8 *addr);
 int mlx5_query_mac_address(struct mlx5_core_dev *mdev, u8 *addr);