Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

Pull networking fixes from Paolo Abeni:
"Including fixes from bpf and netfilter.

Current release - regressions:

- core: fix rc7's __skb_datagram_iter() regression

Current release - new code bugs:

- eth: bnxt: fix crashes when reducing ring count with active RSS
contexts

Previous releases - regressions:

- sched: fix UAF when resolving a clash

- skmsg: skip zero length skb in sk_msg_recvmsg2

- sunrpc: fix kernel free on connection failure in
xs_tcp_setup_socket

- tcp: avoid too many retransmit packets

- tcp: fix incorrect undo caused by DSACK of TLP retransmit

- udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().

- eth: ks8851: fix deadlock with the SPI chip variant

- eth: i40e: fix XDP program unloading while removing the driver

Previous releases - always broken:

- bpf:
- fix too early release of tcx_entry
- fail bpf_timer_cancel when callback is being cancelled
- bpf: fix order of args in call to bpf_map_kvcalloc

- netfilter: nf_tables: prefer nft_chain_validate

- ppp: reject claimed-as-LCP but actually malformed packets

- wireguard: avoid unaligned 64-bit memory accesses"

* tag 'net-6.10-rc8' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net: (33 commits)
net, sunrpc: Remap EPERM in case of connection failure in xs_tcp_setup_socket
net/sched: Fix UAF when resolving a clash
net: ks8851: Fix potential TX stall after interface reopen
udp: Set SOCK_RCU_FREE earlier in udp_lib_get_port().
netfilter: nf_tables: prefer nft_chain_validate
netfilter: nfnetlink_queue: drop bogus WARN_ON
ethtool: netlink: do not return SQI value if link is down
ppp: reject claimed-as-LCP but actually malformed packets
selftests/bpf: Add timer lockup selftest
net: ethernet: mtk-star-emac: set mac_managed_pm when probing
e1000e: fix force smbus during suspend flow
tcp: avoid too many retransmit packets
bpf: Defer work in bpf_timer_cancel_and_free
bpf: Fail bpf_timer_cancel when callback is being cancelled
bpf: fix order of args in call to bpf_map_kvcalloc
net: ethernet: lantiq_etop: fix double free in detach
i40e: Fix XDP program unloading while removing the driver
net: fix rc7's __skb_datagram_iter()
net: ks8851: Fix deadlock with the SPI chip variant
octeontx2-af: Fix incorrect value output on error path in rvu_check_rsrc_availability()
...

+561 -255
+10 -13
drivers/net/dsa/lan9303-core.c
··· 1047 1047 return ARRAY_SIZE(lan9303_mib); 1048 1048 } 1049 1049 1050 - static int lan9303_phy_read(struct dsa_switch *ds, int phy, int regnum) 1050 + static int lan9303_phy_read(struct dsa_switch *ds, int port, int regnum) 1051 1051 { 1052 1052 struct lan9303 *chip = ds->priv; 1053 1053 int phy_base = chip->phy_addr_base; 1054 1054 1055 - if (phy == phy_base) 1055 + if (port == 0) 1056 1056 return lan9303_virt_phy_reg_read(chip, regnum); 1057 - if (phy > phy_base + 2) 1057 + if (port > 2) 1058 1058 return -ENODEV; 1059 1059 1060 - return chip->ops->phy_read(chip, phy, regnum); 1060 + return chip->ops->phy_read(chip, phy_base + port, regnum); 1061 1061 } 1062 1062 1063 - static int lan9303_phy_write(struct dsa_switch *ds, int phy, int regnum, 1063 + static int lan9303_phy_write(struct dsa_switch *ds, int port, int regnum, 1064 1064 u16 val) 1065 1065 { 1066 1066 struct lan9303 *chip = ds->priv; 1067 1067 int phy_base = chip->phy_addr_base; 1068 1068 1069 - if (phy == phy_base) 1069 + if (port == 0) 1070 1070 return lan9303_virt_phy_reg_write(chip, regnum, val); 1071 - if (phy > phy_base + 2) 1071 + if (port > 2) 1072 1072 return -ENODEV; 1073 1073 1074 - return chip->ops->phy_write(chip, phy, regnum, val); 1074 + return chip->ops->phy_write(chip, phy_base + port, regnum, val); 1075 1075 } 1076 1076 1077 1077 static int lan9303_port_enable(struct dsa_switch *ds, int port, ··· 1099 1099 vlan_vid_del(dsa_port_to_conduit(dp), htons(ETH_P_8021Q), port); 1100 1100 1101 1101 lan9303_disable_processing_port(chip, port); 1102 - lan9303_phy_write(ds, chip->phy_addr_base + port, MII_BMCR, BMCR_PDOWN); 1102 + lan9303_phy_write(ds, port, MII_BMCR, BMCR_PDOWN); 1103 1103 } 1104 1104 1105 1105 static int lan9303_port_bridge_join(struct dsa_switch *ds, int port, ··· 1374 1374 1375 1375 static int lan9303_register_switch(struct lan9303 *chip) 1376 1376 { 1377 - int base; 1378 - 1379 1377 chip->ds = devm_kzalloc(chip->dev, sizeof(*chip->ds), GFP_KERNEL); 1380 1378 if (!chip->ds) 1381 1379 return -ENOMEM; ··· 1383 1385 chip->ds->priv = chip; 1384 1386 chip->ds->ops = &lan9303_switch_ops; 1385 1387 chip->ds->phylink_mac_ops = &lan9303_phylink_mac_ops; 1386 - base = chip->phy_addr_base; 1387 - chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1 + base, base); 1388 + chip->ds->phys_mii_mask = GENMASK(LAN9303_NUM_PORTS - 1, 0); 1388 1389 1389 1390 return dsa_register_switch(chip->ds); 1390 1391 }
+1
drivers/net/ethernet/broadcom/asp2/bcmasp.c
··· 1380 1380 dev_err(dev, "Cannot create eth interface %d\n", i); 1381 1381 bcmasp_remove_intfs(priv); 1382 1382 of_node_put(intf_node); 1383 + ret = -ENOMEM; 1383 1384 goto of_put_exit; 1384 1385 } 1385 1386 list_add_tail(&intf->list, &priv->intfs);
+15
drivers/net/ethernet/broadcom/bnxt/bnxt.c
··· 6146 6146 return max_ring; 6147 6147 } 6148 6148 6149 + u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp) 6150 + { 6151 + u16 i, tbl_size, max_ring = 0; 6152 + struct bnxt_rss_ctx *rss_ctx; 6153 + 6154 + tbl_size = bnxt_get_rxfh_indir_size(bp->dev); 6155 + 6156 + list_for_each_entry(rss_ctx, &bp->rss_ctx_list, list) { 6157 + for (i = 0; i < tbl_size; i++) 6158 + max_ring = max(max_ring, rss_ctx->rss_indir_tbl[i]); 6159 + } 6160 + 6161 + return max_ring; 6162 + } 6163 + 6149 6164 int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings) 6150 6165 { 6151 6166 if (bp->flags & BNXT_FLAG_CHIP_P5_PLUS) {
+1
drivers/net/ethernet/broadcom/bnxt/bnxt.h
··· 2776 2776 void bnxt_fill_ipv6_mask(__be32 mask[4]); 2777 2777 int bnxt_alloc_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); 2778 2778 void bnxt_set_dflt_rss_indir_tbl(struct bnxt *bp, struct bnxt_rss_ctx *rss_ctx); 2779 + u16 bnxt_get_max_rss_ctx_ring(struct bnxt *bp); 2779 2780 int bnxt_get_nr_rss_ctxs(struct bnxt *bp, int rx_rings); 2780 2781 int bnxt_hwrm_vnic_cfg(struct bnxt *bp, struct bnxt_vnic_info *vnic); 2781 2782 int bnxt_hwrm_vnic_alloc(struct bnxt *bp, struct bnxt_vnic_info *vnic,
+6
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
··· 961 961 return rc; 962 962 } 963 963 964 + if (req_rx_rings < bp->rx_nr_rings && 965 + req_rx_rings <= bnxt_get_max_rss_ctx_ring(bp)) { 966 + netdev_warn(dev, "Can't deactivate rings used by RSS contexts\n"); 967 + return -EINVAL; 968 + } 969 + 964 970 if (bnxt_get_nr_rss_ctxs(bp, req_rx_rings) != 965 971 bnxt_get_nr_rss_ctxs(bp, bp->rx_nr_rings) && 966 972 netif_is_rxfh_configured(dev)) {
+53 -20
drivers/net/ethernet/intel/e1000e/ich8lan.c
··· 1109 1109 } 1110 1110 1111 1111 /** 1112 + * e1000e_force_smbus - Force interfaces to transition to SMBUS mode. 1113 + * @hw: pointer to the HW structure 1114 + * 1115 + * Force the MAC and the PHY to SMBUS mode. Assumes semaphore already 1116 + * acquired. 1117 + * 1118 + * Return: 0 on success, negative errno on failure. 1119 + **/ 1120 + static s32 e1000e_force_smbus(struct e1000_hw *hw) 1121 + { 1122 + u16 smb_ctrl = 0; 1123 + u32 ctrl_ext; 1124 + s32 ret_val; 1125 + 1126 + /* Switching PHY interface always returns MDI error 1127 + * so disable retry mechanism to avoid wasting time 1128 + */ 1129 + e1000e_disable_phy_retry(hw); 1130 + 1131 + /* Force SMBus mode in the PHY */ 1132 + ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &smb_ctrl); 1133 + if (ret_val) { 1134 + e1000e_enable_phy_retry(hw); 1135 + return ret_val; 1136 + } 1137 + 1138 + smb_ctrl |= CV_SMB_CTRL_FORCE_SMBUS; 1139 + e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, smb_ctrl); 1140 + 1141 + e1000e_enable_phy_retry(hw); 1142 + 1143 + /* Force SMBus mode in the MAC */ 1144 + ctrl_ext = er32(CTRL_EXT); 1145 + ctrl_ext |= E1000_CTRL_EXT_FORCE_SMBUS; 1146 + ew32(CTRL_EXT, ctrl_ext); 1147 + 1148 + return 0; 1149 + } 1150 + 1151 + /** 1112 1152 * e1000_enable_ulp_lpt_lp - configure Ultra Low Power mode for LynxPoint-LP 1113 1153 * @hw: pointer to the HW structure 1114 1154 * @to_sx: boolean indicating a system power state transition to Sx ··· 1204 1164 ret_val = hw->phy.ops.acquire(hw); 1205 1165 if (ret_val) 1206 1166 goto out; 1167 + 1168 + if (hw->mac.type != e1000_pch_mtp) { 1169 + ret_val = e1000e_force_smbus(hw); 1170 + if (ret_val) { 1171 + e_dbg("Failed to force SMBUS: %d\n", ret_val); 1172 + goto release; 1173 + } 1174 + } 1207 1175 1208 1176 /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable 1209 1177 * LPLU and disable Gig speed when entering ULP ··· 1273 1225 } 1274 1226 1275 1227 release: 1276 - /* Switching PHY interface always returns MDI error 1277 - * so disable retry mechanism to avoid wasting time 1278 - */ 1279 - e1000e_disable_phy_retry(hw); 1280 - 1281 - /* Force SMBus mode in PHY */ 1282 - ret_val = e1000_read_phy_reg_hv_locked(hw, CV_SMB_CTRL, &phy_reg); 1283 - if (ret_val) { 1284 - e1000e_enable_phy_retry(hw); 1285 - hw->phy.ops.release(hw); 1286 - goto out; 1228 + if (hw->mac.type == e1000_pch_mtp) { 1229 + ret_val = e1000e_force_smbus(hw); 1230 + if (ret_val) 1231 + e_dbg("Failed to force SMBUS over MTL system: %d\n", 1232 + ret_val); 1287 1233 } 1288 - phy_reg |= CV_SMB_CTRL_FORCE_SMBUS; 1289 - e1000_write_phy_reg_hv_locked(hw, CV_SMB_CTRL, phy_reg); 1290 - 1291 - e1000e_enable_phy_retry(hw); 1292 - 1293 - /* Force SMBus mode in MAC */ 1294 - mac_reg = er32(CTRL_EXT); 1295 - mac_reg |= E1000_CTRL_EXT_FORCE_SMBUS; 1296 - ew32(CTRL_EXT, mac_reg); 1297 1234 1298 1235 hw->phy.ops.release(hw); 1299 1236 out:
+4 -5
drivers/net/ethernet/intel/i40e/i40e_main.c
··· 13293 13293 bool need_reset; 13294 13294 int i; 13295 13295 13296 + /* VSI shall be deleted in a moment, block loading new programs */ 13297 + if (prog && test_bit(__I40E_IN_REMOVE, pf->state)) 13298 + return -EINVAL; 13299 + 13296 13300 /* Don't allow frames that span over multiple buffers */ 13297 13301 if (vsi->netdev->mtu > frame_size - I40E_PACKET_HDR_PAD) { 13298 13302 NL_SET_ERR_MSG_MOD(extack, "MTU too large for linear frames and XDP prog does not support frags"); ··· 13305 13301 13306 13302 /* When turning XDP on->off/off->on we reset and rebuild the rings. */ 13307 13303 need_reset = (i40e_enabled_xdp_vsi(vsi) != !!prog); 13308 - 13309 13304 if (need_reset) 13310 13305 i40e_prep_for_reset(pf); 13311 - 13312 - /* VSI shall be deleted in a moment, just return EINVAL */ 13313 - if (test_bit(__I40E_IN_REMOVE, pf->state)) 13314 - return -EINVAL; 13315 13306 13316 13307 old_prog = xchg(&vsi->xdp_prog, prog); 13317 13308
+2 -2
drivers/net/ethernet/lantiq_etop.c
··· 217 217 if (ch->dma.irq) 218 218 free_irq(ch->dma.irq, priv); 219 219 if (IS_RX(ch->idx)) { 220 - int desc; 220 + struct ltq_dma_channel *dma = &ch->dma; 221 221 222 - for (desc = 0; desc < LTQ_DESC_NUM; desc++) 222 + for (dma->desc = 0; dma->desc < LTQ_DESC_NUM; dma->desc++) 223 223 dev_kfree_skb_any(ch->skb[ch->dma.desc]); 224 224 } 225 225 }
+1 -1
drivers/net/ethernet/marvell/octeontx2/af/rvu.c
··· 1643 1643 if (req->ssow > block->lf.max) { 1644 1644 dev_err(&rvu->pdev->dev, 1645 1645 "Func 0x%x: Invalid SSOW req, %d > max %d\n", 1646 - pcifunc, req->sso, block->lf.max); 1646 + pcifunc, req->ssow, block->lf.max); 1647 1647 return -EINVAL; 1648 1648 } 1649 1649 mappedlfs = rvu_get_rsrc_mapcount(pfvf, block->addr);
+7
drivers/net/ethernet/mediatek/mtk_star_emac.c
··· 1524 1524 { 1525 1525 struct device_node *of_node; 1526 1526 struct mtk_star_priv *priv; 1527 + struct phy_device *phydev; 1527 1528 struct net_device *ndev; 1528 1529 struct device *dev; 1529 1530 void __iomem *base; ··· 1649 1648 1650 1649 netif_napi_add(ndev, &priv->rx_napi, mtk_star_rx_poll); 1651 1650 netif_napi_add_tx(ndev, &priv->tx_napi, mtk_star_tx_poll); 1651 + 1652 + phydev = of_phy_find_device(priv->phy_node); 1653 + if (phydev) { 1654 + phydev->mac_managed_pm = true; 1655 + put_device(&phydev->mdio.dev); 1656 + } 1652 1657 1653 1658 return devm_register_netdev(dev, ndev); 1654 1659 }
+5 -5
drivers/net/ethernet/micrel/ks8851_common.c
··· 352 352 netif_dbg(ks, intr, ks->netdev, 353 353 "%s: txspace %d\n", __func__, tx_space); 354 354 355 - spin_lock(&ks->statelock); 355 + spin_lock_bh(&ks->statelock); 356 356 ks->tx_space = tx_space; 357 357 if (netif_queue_stopped(ks->netdev)) 358 358 netif_wake_queue(ks->netdev); 359 - spin_unlock(&ks->statelock); 359 + spin_unlock_bh(&ks->statelock); 360 360 } 361 361 362 362 if (status & IRQ_SPIBEI) { ··· 482 482 ks8851_wrreg16(ks, KS_IER, ks->rc_ier); 483 483 484 484 ks->queued_len = 0; 485 + ks->tx_space = ks8851_rdreg16(ks, KS_TXMIR); 485 486 netif_start_queue(ks->netdev); 486 487 487 488 netif_dbg(ks, ifup, ks->netdev, "network device up\n"); ··· 636 635 637 636 /* schedule work to do the actual set of the data if needed */ 638 637 639 - spin_lock(&ks->statelock); 638 + spin_lock_bh(&ks->statelock); 640 639 641 640 if (memcmp(&rxctrl, &ks->rxctrl, sizeof(rxctrl)) != 0) { 642 641 memcpy(&ks->rxctrl, &rxctrl, sizeof(ks->rxctrl)); 643 642 schedule_work(&ks->rxctrl_work); 644 643 } 645 644 646 - spin_unlock(&ks->statelock); 645 + spin_unlock_bh(&ks->statelock); 647 646 } 648 647 649 648 static int ks8851_set_mac_address(struct net_device *dev, void *addr) ··· 1102 1101 int ret; 1103 1102 1104 1103 ks->netdev = netdev; 1105 - ks->tx_space = 6144; 1106 1104 1107 1105 ks->gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); 1108 1106 ret = PTR_ERR_OR_ZERO(ks->gpio);
+2 -2
drivers/net/ethernet/micrel/ks8851_spi.c
··· 340 340 341 341 tx_space = ks8851_rdreg16_spi(ks, KS_TXMIR); 342 342 343 - spin_lock(&ks->statelock); 343 + spin_lock_bh(&ks->statelock); 344 344 ks->queued_len -= dequeued_len; 345 345 ks->tx_space = tx_space; 346 - spin_unlock(&ks->statelock); 346 + spin_unlock_bh(&ks->statelock); 347 347 348 348 ks8851_unlock_spi(ks, &flags); 349 349 }
+1 -1
drivers/net/phy/microchip_t1.c
··· 748 748 ethnl_cable_test_result(phydev, ETHTOOL_A_CABLE_PAIR_A, 749 749 lan87xx_cable_test_report_trans(detect)); 750 750 751 - return 0; 751 + return phy_init_hw(phydev); 752 752 } 753 753 754 754 static int lan87xx_cable_test_get_status(struct phy_device *phydev,
+15
drivers/net/ppp/ppp_generic.c
··· 70 70 #define MPHDRLEN_SSN 4 /* ditto with short sequence numbers */ 71 71 72 72 #define PPP_PROTO_LEN 2 73 + #define PPP_LCP_HDRLEN 4 73 74 74 75 /* 75 76 * An instance of /dev/ppp can be associated with either a ppp ··· 494 493 return ret; 495 494 } 496 495 496 + static bool ppp_check_packet(struct sk_buff *skb, size_t count) 497 + { 498 + /* LCP packets must include LCP header which 4 bytes long: 499 + * 1-byte code, 1-byte identifier, and 2-byte length. 500 + */ 501 + return get_unaligned_be16(skb->data) != PPP_LCP || 502 + count >= PPP_PROTO_LEN + PPP_LCP_HDRLEN; 503 + } 504 + 497 505 static ssize_t ppp_write(struct file *file, const char __user *buf, 498 506 size_t count, loff_t *ppos) 499 507 { ··· 522 512 skb_reserve(skb, pf->hdrlen); 523 513 ret = -EFAULT; 524 514 if (copy_from_user(skb_put(skb, count), buf, count)) { 515 + kfree_skb(skb); 516 + goto out; 517 + } 518 + ret = -EINVAL; 519 + if (unlikely(!ppp_check_packet(skb, count))) { 525 520 kfree_skb(skb); 526 521 goto out; 527 522 }
+2 -2
drivers/net/wireguard/allowedips.c
··· 15 15 if (bits == 32) { 16 16 *(u32 *)dst = be32_to_cpu(*(const __be32 *)src); 17 17 } else if (bits == 128) { 18 - ((u64 *)dst)[0] = be64_to_cpu(((const __be64 *)src)[0]); 19 - ((u64 *)dst)[1] = be64_to_cpu(((const __be64 *)src)[1]); 18 + ((u64 *)dst)[0] = get_unaligned_be64(src); 19 + ((u64 *)dst)[1] = get_unaligned_be64(src + 8); 20 20 } 21 21 } 22 22
+2 -2
drivers/net/wireguard/queueing.h
··· 124 124 */ 125 125 static inline int wg_cpumask_next_online(int *last_cpu) 126 126 { 127 - int cpu = cpumask_next(*last_cpu, cpu_online_mask); 127 + int cpu = cpumask_next(READ_ONCE(*last_cpu), cpu_online_mask); 128 128 if (cpu >= nr_cpu_ids) 129 129 cpu = cpumask_first(cpu_online_mask); 130 - *last_cpu = cpu; 130 + WRITE_ONCE(*last_cpu, cpu); 131 131 return cpu; 132 132 } 133 133
+1 -1
drivers/net/wireguard/send.c
··· 222 222 { 223 223 struct sk_buff *skb; 224 224 225 - if (skb_queue_empty(&peer->staged_packet_queue)) { 225 + if (skb_queue_empty_lockless(&peer->staged_packet_queue)) { 226 226 skb = alloc_skb(DATA_PACKET_HEAD_ROOM + MESSAGE_MINIMUM_LENGTH, 227 227 GFP_ATOMIC); 228 228 if (unlikely(!skb))
+9 -4
include/net/tcx.h
··· 13 13 struct tcx_entry { 14 14 struct mini_Qdisc __rcu *miniq; 15 15 struct bpf_mprog_bundle bundle; 16 - bool miniq_active; 16 + u32 miniq_active; 17 17 struct rcu_head rcu; 18 18 }; 19 19 ··· 125 125 tcx_dec(); 126 126 } 127 127 128 - static inline void tcx_miniq_set_active(struct bpf_mprog_entry *entry, 129 - const bool active) 128 + static inline void tcx_miniq_inc(struct bpf_mprog_entry *entry) 130 129 { 131 130 ASSERT_RTNL(); 132 - tcx_entry(entry)->miniq_active = active; 131 + tcx_entry(entry)->miniq_active++; 132 + } 133 + 134 + static inline void tcx_miniq_dec(struct bpf_mprog_entry *entry) 135 + { 136 + ASSERT_RTNL(); 137 + tcx_entry(entry)->miniq_active--; 133 138 } 134 139 135 140 static inline bool tcx_entry_is_active(struct bpf_mprog_entry *entry)
+2 -2
kernel/bpf/bpf_local_storage.c
··· 782 782 nbuckets = max_t(u32, 2, nbuckets); 783 783 smap->bucket_log = ilog2(nbuckets); 784 784 785 - smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), 786 - nbuckets, GFP_USER | __GFP_NOWARN); 785 + smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, 786 + sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); 787 787 if (!smap->buckets) { 788 788 err = -ENOMEM; 789 789 goto free_smap;
+82 -17
kernel/bpf/helpers.c
··· 1084 1084 struct bpf_prog *prog; 1085 1085 void __rcu *callback_fn; 1086 1086 void *value; 1087 - struct rcu_head rcu; 1087 + union { 1088 + struct rcu_head rcu; 1089 + struct work_struct delete_work; 1090 + }; 1088 1091 u64 flags; 1089 1092 }; 1090 1093 ··· 1110 1107 struct bpf_hrtimer { 1111 1108 struct bpf_async_cb cb; 1112 1109 struct hrtimer timer; 1110 + atomic_t cancelling; 1113 1111 }; 1114 1112 1115 1113 struct bpf_work { ··· 1223 1219 kfree_rcu(w, cb.rcu); 1224 1220 } 1225 1221 1222 + static void bpf_timer_delete_work(struct work_struct *work) 1223 + { 1224 + struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work); 1225 + 1226 + /* Cancel the timer and wait for callback to complete if it was running. 1227 + * If hrtimer_cancel() can be safely called it's safe to call 1228 + * kfree_rcu(t) right after for both preallocated and non-preallocated 1229 + * maps. The async->cb = NULL was already done and no code path can see 1230 + * address 't' anymore. Timer if armed for existing bpf_hrtimer before 1231 + * bpf_timer_cancel_and_free will have been cancelled. 1232 + */ 1233 + hrtimer_cancel(&t->timer); 1234 + kfree_rcu(t, cb.rcu); 1235 + } 1236 + 1226 1237 static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags, 1227 1238 enum bpf_async_type type) 1228 1239 { ··· 1281 1262 clockid = flags & (MAX_CLOCKS - 1); 1282 1263 t = (struct bpf_hrtimer *)cb; 1283 1264 1265 + atomic_set(&t->cancelling, 0); 1266 + INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work); 1284 1267 hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); 1285 1268 t->timer.function = bpf_timer_cb; 1286 1269 cb->value = (void *)async - map->record->timer_off; ··· 1461 1440 1462 1441 BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer) 1463 1442 { 1464 - struct bpf_hrtimer *t; 1443 + struct bpf_hrtimer *t, *cur_t; 1444 + bool inc = false; 1465 1445 int ret = 0; 1466 1446 1467 1447 if (in_nmi()) ··· 1474 1452 ret = -EINVAL; 1475 1453 goto out; 1476 1454 } 1477 - if (this_cpu_read(hrtimer_running) == t) { 1455 + 1456 + cur_t = this_cpu_read(hrtimer_running); 1457 + if (cur_t == t) { 1478 1458 /* If bpf callback_fn is trying to bpf_timer_cancel() 1479 1459 * its own timer the hrtimer_cancel() will deadlock 1480 - * since it waits for callback_fn to finish 1460 + * since it waits for callback_fn to finish. 1481 1461 */ 1482 1462 ret = -EDEADLK; 1483 1463 goto out; 1484 1464 } 1465 + 1466 + /* Only account in-flight cancellations when invoked from a timer 1467 + * callback, since we want to avoid waiting only if other _callbacks_ 1468 + * are waiting on us, to avoid introducing lockups. Non-callback paths 1469 + * are ok, since nobody would synchronously wait for their completion. 1470 + */ 1471 + if (!cur_t) 1472 + goto drop; 1473 + atomic_inc(&t->cancelling); 1474 + /* Need full barrier after relaxed atomic_inc */ 1475 + smp_mb__after_atomic(); 1476 + inc = true; 1477 + if (atomic_read(&cur_t->cancelling)) { 1478 + /* We're cancelling timer t, while some other timer callback is 1479 + * attempting to cancel us. In such a case, it might be possible 1480 + * that timer t belongs to the other callback, or some other 1481 + * callback waiting upon it (creating transitive dependencies 1482 + * upon us), and we will enter a deadlock if we continue 1483 + * cancelling and waiting for it synchronously, since it might 1484 + * do the same. Bail! 1485 + */ 1486 + ret = -EDEADLK; 1487 + goto out; 1488 + } 1489 + drop: 1485 1490 drop_prog_refcnt(&t->cb); 1486 1491 out: 1487 1492 __bpf_spin_unlock_irqrestore(&timer->lock); ··· 1516 1467 * if it was running. 1517 1468 */ 1518 1469 ret = ret ?: hrtimer_cancel(&t->timer); 1470 + if (inc) 1471 + atomic_dec(&t->cancelling); 1519 1472 rcu_read_unlock(); 1520 1473 return ret; 1521 1474 } ··· 1563 1512 1564 1513 if (!t) 1565 1514 return; 1566 - /* Cancel the timer and wait for callback to complete if it was running. 1567 - * If hrtimer_cancel() can be safely called it's safe to call kfree(t) 1568 - * right after for both preallocated and non-preallocated maps. 1569 - * The async->cb = NULL was already done and no code path can 1570 - * see address 't' anymore. 1571 - * 1572 - * Check that bpf_map_delete/update_elem() wasn't called from timer 1573 - * callback_fn. In such case don't call hrtimer_cancel() (since it will 1574 - * deadlock) and don't call hrtimer_try_to_cancel() (since it will just 1575 - * return -1). Though callback_fn is still running on this cpu it's 1515 + /* We check that bpf_map_delete/update_elem() was called from timer 1516 + * callback_fn. In such case we don't call hrtimer_cancel() (since it 1517 + * will deadlock) and don't call hrtimer_try_to_cancel() (since it will 1518 + * just return -1). Though callback_fn is still running on this cpu it's 1576 1519 * safe to do kfree(t) because bpf_timer_cb() read everything it needed 1577 1520 * from 't'. The bpf subprog callback_fn won't be able to access 't', 1578 1521 * since async->cb = NULL was already done. The timer will be 1579 1522 * effectively cancelled because bpf_timer_cb() will return 1580 1523 * HRTIMER_NORESTART. 1524 + * 1525 + * However, it is possible the timer callback_fn calling us armed the 1526 + * timer _before_ calling us, such that failing to cancel it here will 1527 + * cause it to possibly use struct hrtimer after freeing bpf_hrtimer. 1528 + * Therefore, we _need_ to cancel any outstanding timers before we do 1529 + * kfree_rcu, even though no more timers can be armed. 1530 + * 1531 + * Moreover, we need to schedule work even if timer does not belong to 1532 + * the calling callback_fn, as on two different CPUs, we can end up in a 1533 + * situation where both sides run in parallel, try to cancel one 1534 + * another, and we end up waiting on both sides in hrtimer_cancel 1535 + * without making forward progress, since timer1 depends on time2 1536 + * callback to finish, and vice versa. 1537 + * 1538 + * CPU 1 (timer1_cb) CPU 2 (timer2_cb) 1539 + * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1) 1540 + * 1541 + * To avoid these issues, punt to workqueue context when we are in a 1542 + * timer callback. 1581 1543 */ 1582 - if (this_cpu_read(hrtimer_running) != t) 1583 - hrtimer_cancel(&t->timer); 1584 - kfree_rcu(t, cb.rcu); 1544 + if (this_cpu_read(hrtimer_running)) 1545 + queue_work(system_unbound_wq, &t->cb.delete_work); 1546 + else 1547 + bpf_timer_delete_work(&t->cb.delete_work); 1585 1548 } 1586 1549 1587 1550 /* This function is called by map_delete/update_elem for individual element and
+2 -1
net/core/datagram.c
··· 423 423 if (copy > len) 424 424 copy = len; 425 425 426 + n = 0; 426 427 skb_frag_foreach_page(frag, 427 428 skb_frag_off(frag) + offset - start, 428 429 copy, p, p_off, p_len, copied) { 429 430 vaddr = kmap_local_page(p); 430 - n = INDIRECT_CALL_1(cb, simple_copy_to_iter, 431 + n += INDIRECT_CALL_1(cb, simple_copy_to_iter, 431 432 vaddr + p_off, p_len, data, to); 432 433 kunmap_local(vaddr); 433 434 }
+2 -1
net/core/skmsg.c
··· 434 434 page = sg_page(sge); 435 435 if (copied + copy > len) 436 436 copy = len - copied; 437 - copy = copy_page_to_iter(page, sge->offset, copy, iter); 437 + if (copy) 438 + copy = copy_page_to_iter(page, sge->offset, copy, iter); 438 439 if (!copy) { 439 440 copied = copied ? copied : -EFAULT; 440 441 goto out;
+28 -13
net/ethtool/linkstate.c
··· 37 37 mutex_lock(&phydev->lock); 38 38 if (!phydev->drv || !phydev->drv->get_sqi) 39 39 ret = -EOPNOTSUPP; 40 + else if (!phydev->link) 41 + ret = -ENETDOWN; 40 42 else 41 43 ret = phydev->drv->get_sqi(phydev); 42 44 mutex_unlock(&phydev->lock); ··· 57 55 mutex_lock(&phydev->lock); 58 56 if (!phydev->drv || !phydev->drv->get_sqi_max) 59 57 ret = -EOPNOTSUPP; 58 + else if (!phydev->link) 59 + ret = -ENETDOWN; 60 60 else 61 61 ret = phydev->drv->get_sqi_max(phydev); 62 62 mutex_unlock(&phydev->lock); 63 63 64 64 return ret; 65 65 }; 66 + 67 + static bool linkstate_sqi_critical_error(int sqi) 68 + { 69 + return sqi < 0 && sqi != -EOPNOTSUPP && sqi != -ENETDOWN; 70 + } 71 + 72 + static bool linkstate_sqi_valid(struct linkstate_reply_data *data) 73 + { 74 + return data->sqi >= 0 && data->sqi_max >= 0 && 75 + data->sqi <= data->sqi_max; 76 + } 66 77 67 78 static int linkstate_get_link_ext_state(struct net_device *dev, 68 79 struct linkstate_reply_data *data) ··· 108 93 data->link = __ethtool_get_link(dev); 109 94 110 95 ret = linkstate_get_sqi(dev); 111 - if (ret < 0 && ret != -EOPNOTSUPP) 96 + if (linkstate_sqi_critical_error(ret)) 112 97 goto out; 113 98 data->sqi = ret; 114 99 115 100 ret = linkstate_get_sqi_max(dev); 116 - if (ret < 0 && ret != -EOPNOTSUPP) 101 + if (linkstate_sqi_critical_error(ret)) 117 102 goto out; 118 103 data->sqi_max = ret; 119 104 ··· 151 136 len = nla_total_size(sizeof(u8)) /* LINKSTATE_LINK */ 152 137 + 0; 153 138 154 - if (data->sqi != -EOPNOTSUPP) 155 - len += nla_total_size(sizeof(u32)); 156 - 157 - if (data->sqi_max != -EOPNOTSUPP) 158 - len += nla_total_size(sizeof(u32)); 139 + if (linkstate_sqi_valid(data)) { 140 + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI */ 141 + len += nla_total_size(sizeof(u32)); /* LINKSTATE_SQI_MAX */ 142 + } 159 143 160 144 if (data->link_ext_state_provided) 161 145 len += nla_total_size(sizeof(u8)); /* LINKSTATE_EXT_STATE */ ··· 178 164 nla_put_u8(skb, ETHTOOL_A_LINKSTATE_LINK, !!data->link)) 179 165 return -EMSGSIZE; 180 166 181 - if (data->sqi != -EOPNOTSUPP && 182 - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) 183 - return -EMSGSIZE; 167 + if (linkstate_sqi_valid(data)) { 168 + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI, data->sqi)) 169 + return -EMSGSIZE; 184 170 185 - if (data->sqi_max != -EOPNOTSUPP && 186 - nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, data->sqi_max)) 187 - return -EMSGSIZE; 171 + if (nla_put_u32(skb, ETHTOOL_A_LINKSTATE_SQI_MAX, 172 + data->sqi_max)) 173 + return -EMSGSIZE; 174 + } 188 175 189 176 if (data->link_ext_state_provided) { 190 177 if (nla_put_u8(skb, ETHTOOL_A_LINKSTATE_EXT_STATE,
+10 -1
net/ipv4/tcp_input.c
··· 2129 2129 static inline void tcp_init_undo(struct tcp_sock *tp) 2130 2130 { 2131 2131 tp->undo_marker = tp->snd_una; 2132 + 2132 2133 /* Retransmission still in flight may cause DSACKs later. */ 2133 - tp->undo_retrans = tp->retrans_out ? : -1; 2134 + /* First, account for regular retransmits in flight: */ 2135 + tp->undo_retrans = tp->retrans_out; 2136 + /* Next, account for TLP retransmits in flight: */ 2137 + if (tp->tlp_high_seq && tp->tlp_retrans) 2138 + tp->undo_retrans++; 2139 + /* Finally, avoid 0, because undo_retrans==0 means "can undo now": */ 2140 + if (!tp->undo_retrans) 2141 + tp->undo_retrans = -1; 2134 2142 } 2135 2143 2136 2144 static bool tcp_is_rack(const struct sock *sk) ··· 2217 2209 2218 2210 tcp_set_ca_state(sk, TCP_CA_Loss); 2219 2211 tp->high_seq = tp->snd_nxt; 2212 + tp->tlp_high_seq = 0; 2220 2213 tcp_ecn_queue_cwr(tp); 2221 2214 2222 2215 /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous
+13 -4
net/ipv4/tcp_timer.c
··· 483 483 const struct sk_buff *skb, 484 484 u32 rtx_delta) 485 485 { 486 + const struct inet_connection_sock *icsk = inet_csk(sk); 487 + u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout); 486 488 const struct tcp_sock *tp = tcp_sk(sk); 487 - const int timeout = TCP_RTO_MAX * 2; 489 + int timeout = TCP_RTO_MAX * 2; 488 490 s32 rcv_delta; 489 491 492 + if (user_timeout) { 493 + /* If user application specified a TCP_USER_TIMEOUT, 494 + * it does not want win 0 packets to 'reset the timer' 495 + * while retransmits are not making progress. 496 + */ 497 + if (rtx_delta > user_timeout) 498 + return true; 499 + timeout = min_t(u32, timeout, msecs_to_jiffies(user_timeout)); 500 + } 490 501 /* Note: timer interrupt might have been delayed by at least one jiffy, 491 502 * and tp->rcv_tstamp might very well have been written recently. 492 503 * rcv_delta can thus be negative. 493 504 */ 494 - rcv_delta = inet_csk(sk)->icsk_timeout - tp->rcv_tstamp; 505 + rcv_delta = icsk->icsk_timeout - tp->rcv_tstamp; 495 506 if (rcv_delta <= timeout) 496 507 return false; 497 508 ··· 546 535 skb = tcp_rtx_queue_head(sk); 547 536 if (WARN_ON_ONCE(!skb)) 548 537 return; 549 - 550 - tp->tlp_high_seq = 0; 551 538 552 539 if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && 553 540 !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+3 -1
net/ipv4/udp.c
··· 326 326 goto fail_unlock; 327 327 } 328 328 329 + sock_set_flag(sk, SOCK_RCU_FREE); 330 + 329 331 sk_add_node_rcu(sk, &hslot->head); 330 332 hslot->count++; 331 333 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); ··· 344 342 hslot2->count++; 345 343 spin_unlock(&hslot2->lock); 346 344 } 347 - sock_set_flag(sk, SOCK_RCU_FREE); 345 + 348 346 error = 0; 349 347 fail_unlock: 350 348 spin_unlock_bh(&hslot->lock);
+13 -145
net/netfilter/nf_tables_api.c
··· 3823 3823 nf_tables_rule_destroy(ctx, rule); 3824 3824 } 3825 3825 3826 + /** nft_chain_validate - loop detection and hook validation 3827 + * 3828 + * @ctx: context containing call depth and base chain 3829 + * @chain: chain to validate 3830 + * 3831 + * Walk through the rules of the given chain and chase all jumps/gotos 3832 + * and set lookups until either the jump limit is hit or all reachable 3833 + * chains have been validated. 3834 + */ 3826 3835 int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) 3827 3836 { 3828 3837 struct nft_expr *expr, *last; ··· 3853 3844 if (!expr->ops->validate) 3854 3845 continue; 3855 3846 3847 + /* This may call nft_chain_validate() recursively, 3848 + * callers that do so must increment ctx->level. 3849 + */ 3856 3850 err = expr->ops->validate(ctx, expr, &data); 3857 3851 if (err < 0) 3858 3852 return err; ··· 10821 10809 } 10822 10810 EXPORT_SYMBOL_GPL(nft_chain_validate_hooks); 10823 10811 10824 - /* 10825 - * Loop detection - walk through the ruleset beginning at the destination chain 10826 - * of a new jump until either the source chain is reached (loop) or all 10827 - * reachable chains have been traversed. 10828 - * 10829 - * The loop check is performed whenever a new jump verdict is added to an 10830 - * expression or verdict map or a verdict map is bound to a new chain. 10831 - */ 10832 - 10833 - static int nf_tables_check_loops(const struct nft_ctx *ctx, 10834 - const struct nft_chain *chain); 10835 - 10836 - static int nft_check_loops(const struct nft_ctx *ctx, 10837 - const struct nft_set_ext *ext) 10838 - { 10839 - const struct nft_data *data; 10840 - int ret; 10841 - 10842 - data = nft_set_ext_data(ext); 10843 - switch (data->verdict.code) { 10844 - case NFT_JUMP: 10845 - case NFT_GOTO: 10846 - ret = nf_tables_check_loops(ctx, data->verdict.chain); 10847 - break; 10848 - default: 10849 - ret = 0; 10850 - break; 10851 - } 10852 - 10853 - return ret; 10854 - } 10855 - 10856 - static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx, 10857 - struct nft_set *set, 10858 - const struct nft_set_iter *iter, 10859 - struct nft_elem_priv *elem_priv) 10860 - { 10861 - const struct nft_set_ext *ext = nft_set_elem_ext(set, elem_priv); 10862 - 10863 - if (!nft_set_elem_active(ext, iter->genmask)) 10864 - return 0; 10865 - 10866 - if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && 10867 - *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) 10868 - return 0; 10869 - 10870 - return nft_check_loops(ctx, ext); 10871 - } 10872 - 10873 - static int nft_set_catchall_loops(const struct nft_ctx *ctx, 10874 - struct nft_set *set) 10875 - { 10876 - u8 genmask = nft_genmask_next(ctx->net); 10877 - struct nft_set_elem_catchall *catchall; 10878 - struct nft_set_ext *ext; 10879 - int ret = 0; 10880 - 10881 - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { 10882 - ext = nft_set_elem_ext(set, catchall->elem); 10883 - if (!nft_set_elem_active(ext, genmask)) 10884 - continue; 10885 - 10886 - ret = nft_check_loops(ctx, ext); 10887 - if (ret < 0) 10888 - return ret; 10889 - } 10890 - 10891 - return ret; 10892 - } 10893 - 10894 - static int nf_tables_check_loops(const struct nft_ctx *ctx, 10895 - const struct nft_chain *chain) 10896 - { 10897 - const struct nft_rule *rule; 10898 - const struct nft_expr *expr, *last; 10899 - struct nft_set *set; 10900 - struct nft_set_binding *binding; 10901 - struct nft_set_iter iter; 10902 - 10903 - if (ctx->chain == chain) 10904 - return -ELOOP; 10905 - 10906 - if (fatal_signal_pending(current)) 10907 - return -EINTR; 10908 - 10909 - list_for_each_entry(rule, &chain->rules, list) { 10910 - nft_rule_for_each_expr(expr, last, rule) { 10911 - struct nft_immediate_expr *priv; 10912 - const struct nft_data *data; 10913 - int err; 10914 - 10915 - if (strcmp(expr->ops->type->name, "immediate")) 10916 - continue; 10917 - 10918 - priv = nft_expr_priv(expr); 10919 - if (priv->dreg != NFT_REG_VERDICT) 10920 - continue; 10921 - 10922 - data = &priv->data; 10923 - switch (data->verdict.code) { 10924 - case NFT_JUMP: 10925 - case NFT_GOTO: 10926 - err = nf_tables_check_loops(ctx, 10927 - data->verdict.chain); 10928 - if (err < 0) 10929 - return err; 10930 - break; 10931 - default: 10932 - break; 10933 - } 10934 - } 10935 - } 10936 - 10937 - list_for_each_entry(set, &ctx->table->sets, list) { 10938 - if (!nft_is_active_next(ctx->net, set)) 10939 - continue; 10940 - if (!(set->flags & NFT_SET_MAP) || 10941 - set->dtype != NFT_DATA_VERDICT) 10942 - continue; 10943 - 10944 - list_for_each_entry(binding, &set->bindings, list) { 10945 - if (!(binding->flags & NFT_SET_MAP) || 10946 - binding->chain != chain) 10947 - continue; 10948 - 10949 - iter.genmask = nft_genmask_next(ctx->net); 10950 - iter.type = NFT_ITER_UPDATE; 10951 - iter.skip = 0; 10952 - iter.count = 0; 10953 - iter.err = 0; 10954 - iter.fn = nf_tables_loop_check_setelem; 10955 - 10956 - set->ops->walk(ctx, set, &iter); 10957 - if (!iter.err) 10958 - iter.err = nft_set_catchall_loops(ctx, set); 10959 - 10960 - if (iter.err < 0) 10961 - return iter.err; 10962 - } 10963 - } 10964 - 10965 - return 0; 10966 - } 10967 - 10968 10812 /** 10969 10813 * nft_parse_u32_check - fetch u32 attribute and check for maximum value 10970 10814 * ··· 10933 11065 if (data != NULL && 10934 11066 (data->verdict.code == NFT_GOTO || 10935 11067 data->verdict.code == NFT_JUMP)) { 10936 - err = nf_tables_check_loops(ctx, data->verdict.chain); 11068 + err = nft_chain_validate(ctx, data->verdict.chain); 10937 11069 if (err < 0) 10938 11070 return err; 10939 11071 }
+8
net/sched/act_ct.c
··· 1077 1077 */ 1078 1078 if (nf_conntrack_confirm(skb) != NF_ACCEPT) 1079 1079 goto drop; 1080 + 1081 + /* The ct may be dropped if a clash has been resolved, 1082 + * so it's necessary to retrieve it from skb again to 1083 + * prevent UAF. 1084 + */ 1085 + ct = nf_ct_get(skb, &ctinfo); 1086 + if (!ct) 1087 + skip_add = true; 1080 1088 } 1081 1089 1082 1090 if (!skip_add)
+6 -6
net/sched/sch_ingress.c
··· 91 91 entry = tcx_entry_fetch_or_create(dev, true, &created); 92 92 if (!entry) 93 93 return -ENOMEM; 94 - tcx_miniq_set_active(entry, true); 94 + tcx_miniq_inc(entry); 95 95 mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); 96 96 if (created) 97 97 tcx_entry_update(dev, entry, true); ··· 121 121 tcf_block_put_ext(q->block, sch, &q->block_info); 122 122 123 123 if (entry) { 124 - tcx_miniq_set_active(entry, false); 124 + tcx_miniq_dec(entry); 125 125 if (!tcx_entry_is_active(entry)) { 126 126 tcx_entry_update(dev, NULL, true); 127 127 tcx_entry_free(entry); ··· 257 257 entry = tcx_entry_fetch_or_create(dev, true, &created); 258 258 if (!entry) 259 259 return -ENOMEM; 260 - tcx_miniq_set_active(entry, true); 260 + tcx_miniq_inc(entry); 261 261 mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); 262 262 if (created) 263 263 tcx_entry_update(dev, entry, true); ··· 276 276 entry = tcx_entry_fetch_or_create(dev, false, &created); 277 277 if (!entry) 278 278 return -ENOMEM; 279 - tcx_miniq_set_active(entry, true); 279 + tcx_miniq_inc(entry); 280 280 mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); 281 281 if (created) 282 282 tcx_entry_update(dev, entry, false); ··· 302 302 tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); 303 303 304 304 if (ingress_entry) { 305 - tcx_miniq_set_active(ingress_entry, false); 305 + tcx_miniq_dec(ingress_entry); 306 306 if (!tcx_entry_is_active(ingress_entry)) { 307 307 tcx_entry_update(dev, NULL, true); 308 308 tcx_entry_free(ingress_entry); ··· 310 310 } 311 311 312 312 if (egress_entry) { 313 - tcx_miniq_set_active(egress_entry, false); 313 + tcx_miniq_dec(egress_entry); 314 314 if (!tcx_entry_is_active(egress_entry)) { 315 315 tcx_entry_update(dev, NULL, false); 316 316 tcx_entry_free(egress_entry);
+7
net/sunrpc/xprtsock.c
··· 2441 2441 transport->srcport = 0; 2442 2442 status = -EAGAIN; 2443 2443 break; 2444 + case -EPERM: 2445 + /* Happens, for instance, if a BPF program is preventing 2446 + * the connect. Remap the error so upper layers can better 2447 + * deal with it. 2448 + */ 2449 + status = -ECONNREFUSED; 2450 + fallthrough; 2444 2451 case -EINVAL: 2445 2452 /* Happens, for instance, if the user specified a link 2446 2453 * local IPv6 address without a scope-id.
+3
tools/testing/selftests/bpf/config
··· 58 58 CONFIG_MPLS_IPTUNNEL=y 59 59 CONFIG_MPLS_ROUTING=y 60 60 CONFIG_MPTCP=y 61 + CONFIG_NET_ACT_SKBMOD=y 62 + CONFIG_NET_CLS=y 61 63 CONFIG_NET_CLS_ACT=y 62 64 CONFIG_NET_CLS_BPF=y 63 65 CONFIG_NET_CLS_FLOWER=y 66 + CONFIG_NET_CLS_MATCHALL=y 64 67 CONFIG_NET_FOU=y 65 68 CONFIG_NET_FOU_IP_TUNNELS=y 66 69 CONFIG_NET_IPGRE=y
+61
tools/testing/selftests/bpf/prog_tests/tc_links.c
··· 9 9 #define ping_cmd "ping -q -c1 -w1 127.0.0.1 > /dev/null" 10 10 11 11 #include "test_tc_link.skel.h" 12 + 13 + #include "netlink_helpers.h" 12 14 #include "tc_helpers.h" 13 15 14 16 void serial_test_tc_links_basic(void) ··· 1787 1785 test_tc_links_ingress(BPF_TCX_INGRESS, true, true); 1788 1786 test_tc_links_ingress(BPF_TCX_INGRESS, true, false); 1789 1787 test_tc_links_ingress(BPF_TCX_INGRESS, false, false); 1788 + } 1789 + 1790 + struct qdisc_req { 1791 + struct nlmsghdr n; 1792 + struct tcmsg t; 1793 + char buf[1024]; 1794 + }; 1795 + 1796 + static int qdisc_replace(int ifindex, const char *kind, bool block) 1797 + { 1798 + struct rtnl_handle rth = { .fd = -1 }; 1799 + struct qdisc_req req; 1800 + int err; 1801 + 1802 + err = rtnl_open(&rth, 0); 1803 + if (!ASSERT_OK(err, "open_rtnetlink")) 1804 + return err; 1805 + 1806 + memset(&req, 0, sizeof(req)); 1807 + req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg)); 1808 + req.n.nlmsg_flags = NLM_F_CREATE | NLM_F_REPLACE | NLM_F_REQUEST; 1809 + req.n.nlmsg_type = RTM_NEWQDISC; 1810 + req.t.tcm_family = AF_UNSPEC; 1811 + req.t.tcm_ifindex = ifindex; 1812 + req.t.tcm_parent = 0xfffffff1; 1813 + 1814 + addattr_l(&req.n, sizeof(req), TCA_KIND, kind, strlen(kind) + 1); 1815 + if (block) 1816 + addattr32(&req.n, sizeof(req), TCA_INGRESS_BLOCK, 1); 1817 + 1818 + err = rtnl_talk(&rth, &req.n, NULL); 1819 + ASSERT_OK(err, "talk_rtnetlink"); 1820 + rtnl_close(&rth); 1821 + return err; 1822 + } 1823 + 1824 + void serial_test_tc_links_dev_chain0(void) 1825 + { 1826 + int err, ifindex; 1827 + 1828 + ASSERT_OK(system("ip link add dev foo type veth peer name bar"), "add veth"); 1829 + ifindex = if_nametoindex("foo"); 1830 + ASSERT_NEQ(ifindex, 0, "non_zero_ifindex"); 1831 + err = qdisc_replace(ifindex, "ingress", true); 1832 + if (!ASSERT_OK(err, "attaching ingress")) 1833 + goto cleanup; 1834 + ASSERT_OK(system("tc filter add block 1 matchall action skbmod swap mac"), "add block"); 1835 + err = qdisc_replace(ifindex, "clsact", false); 1836 + if (!ASSERT_OK(err, "attaching clsact")) 1837 + goto cleanup; 1838 + /* Heuristic: kern_sync_rcu() alone does not work; a wait-time of ~5s 1839 + * triggered the issue without the fix reliably 100% of the time. 1840 + */ 1841 + sleep(5); 1842 + ASSERT_OK(system("tc filter add dev foo ingress matchall action skbmod swap mac"), "add filter"); 1843 + cleanup: 1844 + ASSERT_OK(system("ip link del dev foo"), "del veth"); 1845 + ASSERT_EQ(if_nametoindex("foo"), 0, "foo removed"); 1846 + ASSERT_EQ(if_nametoindex("bar"), 0, "bar removed"); 1790 1847 } 1791 1848 1792 1849 static void test_tc_links_dev_mixed(int target)
+91
tools/testing/selftests/bpf/prog_tests/timer_lockup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #define _GNU_SOURCE 4 + #include <sched.h> 5 + #include <test_progs.h> 6 + #include <pthread.h> 7 + #include <network_helpers.h> 8 + 9 + #include "timer_lockup.skel.h" 10 + 11 + static long cpu; 12 + static int *timer1_err; 13 + static int *timer2_err; 14 + static bool skip; 15 + 16 + volatile int k = 0; 17 + 18 + static void *timer_lockup_thread(void *arg) 19 + { 20 + LIBBPF_OPTS(bpf_test_run_opts, opts, 21 + .data_in = &pkt_v4, 22 + .data_size_in = sizeof(pkt_v4), 23 + .repeat = 1000, 24 + ); 25 + int i, prog_fd = *(int *)arg; 26 + cpu_set_t cpuset; 27 + 28 + CPU_ZERO(&cpuset); 29 + CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset); 30 + ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset), 31 + &cpuset), 32 + "cpu affinity"); 33 + 34 + for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) { 35 + bpf_prog_test_run_opts(prog_fd, &opts); 36 + /* Skip the test if we can't reproduce the race in a reasonable 37 + * amount of time. 38 + */ 39 + if (i > 50) { 40 + WRITE_ONCE(skip, true); 41 + break; 42 + } 43 + } 44 + 45 + return NULL; 46 + } 47 + 48 + void test_timer_lockup(void) 49 + { 50 + int timer1_prog, timer2_prog; 51 + struct timer_lockup *skel; 52 + pthread_t thrds[2]; 53 + void *ret; 54 + 55 + skel = timer_lockup__open_and_load(); 56 + if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load")) 57 + return; 58 + 59 + timer1_prog = bpf_program__fd(skel->progs.timer1_prog); 60 + timer2_prog = bpf_program__fd(skel->progs.timer2_prog); 61 + 62 + timer1_err = &skel->bss->timer1_err; 63 + timer2_err = &skel->bss->timer2_err; 64 + 65 + if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread, 66 + &timer1_prog), 67 + "pthread_create thread1")) 68 + goto out; 69 + if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread, 70 + &timer2_prog), 71 + "pthread_create thread2")) { 72 + pthread_exit(&thrds[0]); 73 + goto out; 74 + } 75 + 76 + pthread_join(thrds[1], &ret); 77 + pthread_join(thrds[0], &ret); 78 + 79 + if (skip) { 80 + test__skip(); 81 + goto out; 82 + } 83 + 84 + if (*timer1_err != -EDEADLK && *timer1_err != 0) 85 + ASSERT_FAIL("timer1_err bad value"); 86 + if (*timer2_err != -EDEADLK && *timer2_err != 0) 87 + ASSERT_FAIL("timer2_err bad value"); 88 + out: 89 + timer_lockup__destroy(skel); 90 + return; 91 + }
+87
tools/testing/selftests/bpf/progs/timer_lockup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/bpf.h> 4 + #include <time.h> 5 + #include <errno.h> 6 + #include <bpf/bpf_helpers.h> 7 + #include <bpf/bpf_tracing.h> 8 + #include "bpf_misc.h" 9 + 10 + char _license[] SEC("license") = "GPL"; 11 + 12 + struct elem { 13 + struct bpf_timer t; 14 + }; 15 + 16 + struct { 17 + __uint(type, BPF_MAP_TYPE_ARRAY); 18 + __uint(max_entries, 1); 19 + __type(key, int); 20 + __type(value, struct elem); 21 + } timer1_map SEC(".maps"); 22 + 23 + struct { 24 + __uint(type, BPF_MAP_TYPE_ARRAY); 25 + __uint(max_entries, 1); 26 + __type(key, int); 27 + __type(value, struct elem); 28 + } timer2_map SEC(".maps"); 29 + 30 + int timer1_err; 31 + int timer2_err; 32 + 33 + static int timer_cb1(void *map, int *k, struct elem *v) 34 + { 35 + struct bpf_timer *timer; 36 + int key = 0; 37 + 38 + timer = bpf_map_lookup_elem(&timer2_map, &key); 39 + if (timer) 40 + timer2_err = bpf_timer_cancel(timer); 41 + 42 + return 0; 43 + } 44 + 45 + static int timer_cb2(void *map, int *k, struct elem *v) 46 + { 47 + struct bpf_timer *timer; 48 + int key = 0; 49 + 50 + timer = bpf_map_lookup_elem(&timer1_map, &key); 51 + if (timer) 52 + timer1_err = bpf_timer_cancel(timer); 53 + 54 + return 0; 55 + } 56 + 57 + SEC("tc") 58 + int timer1_prog(void *ctx) 59 + { 60 + struct bpf_timer *timer; 61 + int key = 0; 62 + 63 + timer = bpf_map_lookup_elem(&timer1_map, &key); 64 + if (timer) { 65 + bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME); 66 + bpf_timer_set_callback(timer, timer_cb1); 67 + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); 68 + } 69 + 70 + return 0; 71 + } 72 + 73 + SEC("tc") 74 + int timer2_prog(void *ctx) 75 + { 76 + struct bpf_timer *timer; 77 + int key = 0; 78 + 79 + timer = bpf_map_lookup_elem(&timer2_map, &key); 80 + if (timer) { 81 + bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME); 82 + bpf_timer_set_callback(timer, timer_cb2); 83 + bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN); 84 + } 85 + 86 + return 0; 87 + }
+4 -4
tools/testing/selftests/wireguard/qemu/Makefile
··· 109 109 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage 110 110 QEMU_VPORT_RESULT := virtio-serial-device 111 111 ifeq ($(HOST_ARCH),$(ARCH)) 112 - QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi 112 + QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off 113 113 else 114 - QEMU_MACHINE := -cpu max -machine microvm -no-acpi 114 + QEMU_MACHINE := -cpu max -machine microvm,acpi=off 115 115 endif 116 116 else ifeq ($(ARCH),i686) 117 117 CHOST := i686-linux-musl ··· 120 120 KERNEL_BZIMAGE := $(KERNEL_BUILD_PATH)/arch/x86/boot/bzImage 121 121 QEMU_VPORT_RESULT := virtio-serial-device 122 122 ifeq ($(subst x86_64,i686,$(HOST_ARCH)),$(ARCH)) 123 - QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off -no-acpi 123 + QEMU_MACHINE := -cpu host -machine microvm,accel=kvm,pit=off,pic=off,rtc=off,acpi=off 124 124 else 125 - QEMU_MACHINE := -cpu coreduo -machine microvm -no-acpi 125 + QEMU_MACHINE := -cpu coreduo -machine microvm,acpi=off 126 126 endif 127 127 else ifeq ($(ARCH),mips64) 128 128 CHOST := mips64-linux-musl