Merge tag 'net-6.12-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net

+1 -1

Documentation/networking/tproxy.rst

··· 17 17 socket on your box, set the packet mark to a certain value:: 18 18 19 19 # iptables -t mangle -N DIVERT 20 - # iptables -t mangle -A PREROUTING -p tcp -m socket -j DIVERT 20 + # iptables -t mangle -A PREROUTING -p tcp -m socket --transparent -j DIVERT 21 21 # iptables -t mangle -A DIVERT -j MARK --set-mark 1 22 22 # iptables -t mangle -A DIVERT -j ACCEPT 23 23

+1 -1

MAINTAINERS

··· 17316 17316 L: netdev@vger.kernel.org 17317 17317 S: Maintained 17318 17318 F: Documentation/networking/oa-tc6-framework.rst 17319 - F: drivers/include/linux/oa_tc6.h 17320 17319 F: drivers/net/ethernet/oa_tc6.c 17320 + F: include/linux/oa_tc6.h 17321 17321 17322 17322 OPEN FIRMWARE AND FLATTENED DEVICE TREE 17323 17323 M: Rob Herring <robh@kernel.org>

+3 -3

drivers/net/bonding/bond_main.c

··· 5610 5610 break; 5611 5611 5612 5612 default: 5613 - /* Should never happen. Mode guarded by bond_xdp_check() */ 5614 - netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond)); 5615 - WARN_ON_ONCE(1); 5613 + if (net_ratelimit()) 5614 + netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", 5615 + BOND_MODE(bond)); 5616 5616 return NULL; 5617 5617 } 5618 5618

+28

drivers/net/ethernet/realtek/r8169_main.c

··· 579 579 __le32 rx_multicast; 580 580 __le16 tx_aborted; 581 581 __le16 tx_underrun; 582 + /* new since RTL8125 */ 583 + __le64 tx_octets; 584 + __le64 rx_octets; 585 + __le64 rx_multicast64; 586 + __le64 tx_unicast64; 587 + __le64 tx_broadcast64; 588 + __le64 tx_multicast64; 589 + __le32 tx_pause_on; 590 + __le32 tx_pause_off; 591 + __le32 tx_pause_all; 592 + __le32 tx_deferred; 593 + __le32 tx_late_collision; 594 + __le32 tx_all_collision; 595 + __le32 tx_aborted32; 596 + __le32 align_errors32; 597 + __le32 rx_frame_too_long; 598 + __le32 rx_runt; 599 + __le32 rx_pause_on; 600 + __le32 rx_pause_off; 601 + __le32 rx_pause_all; 602 + __le32 rx_unknown_opcode; 603 + __le32 rx_mac_error; 604 + __le32 tx_underrun32; 605 + __le32 rx_mac_missed; 606 + __le32 rx_tcam_dropped; 607 + __le32 tdu; 608 + __le32 rdu; 582 609 }; 583 610 584 611 struct rtl8169_tc_offsets { ··· 708 681 MODULE_FIRMWARE(FIRMWARE_8125A_3); 709 682 MODULE_FIRMWARE(FIRMWARE_8125B_2); 710 683 MODULE_FIRMWARE(FIRMWARE_8126A_2); 684 + MODULE_FIRMWARE(FIRMWARE_8126A_3); 711 685 712 686 static inline struct device *tp_to_dev(struct rtl8169_private *tp) 713 687 {

+1

drivers/net/ethernet/renesas/ravb.h

··· 1052 1052 netdev_features_t net_features; 1053 1053 int stats_len; 1054 1054 u32 tccr_mask; 1055 + u32 tx_max_frame_size; 1055 1056 u32 rx_max_frame_size; 1056 1057 u32 rx_buffer_size; 1057 1058 u32 rx_desc_size;

+15 -3

drivers/net/ethernet/renesas/ravb_main.c

··· 555 555 556 556 static void ravb_emac_init_rcar(struct net_device *ndev) 557 557 { 558 - /* Receive frame limit set register */ 559 - ravb_write(ndev, ndev->mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN, RFLR); 558 + struct ravb_private *priv = netdev_priv(ndev); 559 + 560 + /* Set receive frame length 561 + * 562 + * The length set here describes the frame from the destination address 563 + * up to and including the CRC data. However only the frame data, 564 + * excluding the CRC, are transferred to memory. To allow for the 565 + * largest frames add the CRC length to the maximum Rx descriptor size. 566 + */ 567 + ravb_write(ndev, priv->info->rx_max_frame_size + ETH_FCS_LEN, RFLR); 560 568 561 569 /* EMAC Mode: PAUSE prohibition; Duplex; RX Checksum; TX; RX */ 562 570 ravb_write(ndev, ECMR_ZPF | ECMR_DM | ··· 2682 2674 .net_features = NETIF_F_RXCSUM, 2683 2675 .stats_len = ARRAY_SIZE(ravb_gstrings_stats), 2684 2676 .tccr_mask = TCCR_TSRQ0 | TCCR_TSRQ1 | TCCR_TSRQ2 | TCCR_TSRQ3, 2677 + .tx_max_frame_size = SZ_2K, 2685 2678 .rx_max_frame_size = SZ_2K, 2686 2679 .rx_buffer_size = SZ_2K + 2687 2680 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ··· 2705 2696 .net_features = NETIF_F_RXCSUM, 2706 2697 .stats_len = ARRAY_SIZE(ravb_gstrings_stats), 2707 2698 .tccr_mask = TCCR_TSRQ0 | TCCR_TSRQ1 | TCCR_TSRQ2 | TCCR_TSRQ3, 2699 + .tx_max_frame_size = SZ_2K, 2708 2700 .rx_max_frame_size = SZ_2K, 2709 2701 .rx_buffer_size = SZ_2K + 2710 2702 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ··· 2731 2721 .net_features = NETIF_F_RXCSUM, 2732 2722 .stats_len = ARRAY_SIZE(ravb_gstrings_stats), 2733 2723 .tccr_mask = TCCR_TSRQ0 | TCCR_TSRQ1 | TCCR_TSRQ2 | TCCR_TSRQ3, 2724 + .tx_max_frame_size = SZ_2K, 2734 2725 .rx_max_frame_size = SZ_2K, 2735 2726 .rx_buffer_size = SZ_2K + 2736 2727 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), ··· 2781 2770 .net_features = NETIF_F_RXCSUM | NETIF_F_HW_CSUM, 2782 2771 .stats_len = ARRAY_SIZE(ravb_gstrings_stats_gbeth), 2783 2772 .tccr_mask = 
TCCR_TSRQ0, 2773 + .tx_max_frame_size = 1522, 2784 2774 .rx_max_frame_size = SZ_8K, 2785 2775 .rx_buffer_size = SZ_2K, 2786 2776 .rx_desc_size = sizeof(struct ravb_rx_desc), ··· 2993 2981 priv->avb_link_active_low = 2994 2982 of_property_read_bool(np, "renesas,ether-link-active-low"); 2995 2983 2996 - ndev->max_mtu = info->rx_max_frame_size - 2984 + ndev->max_mtu = info->tx_max_frame_size - 2997 2985 (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN); 2998 2986 ndev->min_mtu = ETH_MIN_MTU; 2999 2987

+2

drivers/net/ethernet/seeq/ether3.c

··· 847 847 { 848 848 struct net_device *dev = ecard_get_drvdata(ec); 849 849 850 + ether3_outw(priv(dev)->regs.config2 |= CFG2_CTRLO, REG_CONFIG2); 850 851 ecard_set_drvdata(ec, NULL); 851 852 852 853 unregister_netdev(dev); 854 + del_timer_sync(&priv(dev)->timer); 853 855 free_netdev(dev); 854 856 ecard_release_resources(ec); 855 857 }

+1 -1

drivers/net/ethernet/stmicro/stmmac/stmmac_main.c

··· 2035 2035 rx_q->queue_index = queue; 2036 2036 rx_q->priv_data = priv; 2037 2037 2038 - pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; 2038 + pp_params.flags = PP_FLAG_DMA_MAP | (xdp_prog ? PP_FLAG_DMA_SYNC_DEV : 0); 2039 2039 pp_params.pool_size = dma_conf->dma_rx_size; 2040 2040 num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE); 2041 2041 pp_params.order = ilog2(num_pages);

+1

drivers/net/ethernet/stmicro/stmmac/stmmac_tc.c

··· 386 386 return ret; 387 387 388 388 priv->plat->tx_queues_cfg[queue].mode_to_use = MTL_QUEUE_DCB; 389 + return 0; 389 390 } 390 391 391 392 /* Final adjustments for HW */

+22 -15

drivers/net/ethernet/xilinx/xilinx_axienet_main.c

··· 736 736 * 737 737 * Would either be called after a successful transmit operation, or after 738 738 * there was an error when setting up the chain. 739 - * Returns the number of descriptors handled. 739 + * Returns the number of packets handled. 740 740 */ 741 741 static int axienet_free_tx_chain(struct axienet_local *lp, u32 first_bd, 742 742 int nr_bds, bool force, u32 *sizep, int budget) 743 743 { 744 744 struct axidma_bd *cur_p; 745 745 unsigned int status; 746 + int i, packets = 0; 746 747 dma_addr_t phys; 747 - int i; 748 748 749 749 for (i = 0; i < nr_bds; i++) { 750 750 cur_p = &lp->tx_bd_v[(first_bd + i) % lp->tx_bd_num]; ··· 763 763 (cur_p->cntrl & XAXIDMA_BD_CTRL_LENGTH_MASK), 764 764 DMA_TO_DEVICE); 765 765 766 - if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK)) 766 + if (cur_p->skb && (status & XAXIDMA_BD_STS_COMPLETE_MASK)) { 767 767 napi_consume_skb(cur_p->skb, budget); 768 + packets++; 769 + } 768 770 769 771 cur_p->app0 = 0; 770 772 cur_p->app1 = 0; ··· 782 780 *sizep += status & XAXIDMA_BD_STS_ACTUAL_LEN_MASK; 783 781 } 784 782 785 - return i; 783 + if (!force) { 784 + lp->tx_bd_ci += i; 785 + if (lp->tx_bd_ci >= lp->tx_bd_num) 786 + lp->tx_bd_ci %= lp->tx_bd_num; 787 + } 788 + 789 + return packets; 786 790 } 787 791 788 792 /** ··· 961 953 u32 size = 0; 962 954 int packets; 963 955 964 - packets = axienet_free_tx_chain(lp, lp->tx_bd_ci, budget, false, &size, budget); 956 + packets = axienet_free_tx_chain(lp, lp->tx_bd_ci, lp->tx_bd_num, false, 957 + &size, budget); 965 958 966 959 if (packets) { 967 - lp->tx_bd_ci += packets; 968 - if (lp->tx_bd_ci >= lp->tx_bd_num) 969 - lp->tx_bd_ci %= lp->tx_bd_num; 970 - 971 960 u64_stats_update_begin(&lp->tx_stat_sync); 972 961 u64_stats_add(&lp->tx_packets, packets); 973 962 u64_stats_add(&lp->tx_bytes, size); ··· 1287 1282 u32 cr = lp->tx_dma_cr; 1288 1283 1289 1284 cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); 1290 - axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr); 1291 - 1292 - 
napi_schedule(&lp->napi_tx); 1285 + if (napi_schedule_prep(&lp->napi_tx)) { 1286 + axienet_dma_out32(lp, XAXIDMA_TX_CR_OFFSET, cr); 1287 + __napi_schedule(&lp->napi_tx); 1288 + } 1293 1289 } 1294 1290 1295 1291 return IRQ_HANDLED; ··· 1332 1326 u32 cr = lp->rx_dma_cr; 1333 1327 1334 1328 cr &= ~(XAXIDMA_IRQ_IOC_MASK | XAXIDMA_IRQ_DELAY_MASK); 1335 - axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, cr); 1336 - 1337 - napi_schedule(&lp->napi_rx); 1329 + if (napi_schedule_prep(&lp->napi_rx)) { 1330 + axienet_dma_out32(lp, XAXIDMA_RX_CR_OFFSET, cr); 1331 + __napi_schedule(&lp->napi_rx); 1332 + } 1338 1333 } 1339 1334 1340 1335 return IRQ_HANDLED;

+23 -17

drivers/net/phy/aquantia/aquantia_firmware.c

··· 353 353 { 354 354 int ret; 355 355 356 - ret = aqr_wait_reset_complete(phydev); 357 - if (ret) 358 - return ret; 359 - 360 - /* Check if the firmware is not already loaded by pooling 361 - * the current version returned by the PHY. If 0 is returned, 362 - * no firmware is loaded. 356 + /* Check if the firmware is not already loaded by polling 357 + * the current version returned by the PHY. 363 358 */ 364 - ret = phy_read_mmd(phydev, MDIO_MMD_VEND1, VEND1_GLOBAL_FW_ID); 365 - if (ret > 0) 366 - goto exit; 359 + ret = aqr_wait_reset_complete(phydev); 360 + switch (ret) { 361 + case 0: 362 + /* Some firmware is loaded => do nothing */ 363 + return 0; 364 + case -ETIMEDOUT: 365 + /* VEND1_GLOBAL_FW_ID still reads 0 after 2 seconds of polling. 366 + * We don't have full confidence that no firmware is loaded (in 367 + * theory it might just not have loaded yet), but we will 368 + * assume that, and load a new image. 369 + */ 370 + ret = aqr_firmware_load_nvmem(phydev); 371 + if (!ret) 372 + return ret; 367 373 368 - ret = aqr_firmware_load_nvmem(phydev); 369 - if (!ret) 370 - goto exit; 371 - 372 - ret = aqr_firmware_load_fs(phydev); 373 - if (ret) 374 + ret = aqr_firmware_load_fs(phydev); 375 + if (ret) 376 + return ret; 377 + break; 378 + default: 379 + /* PHY read error, propagate it to the caller */ 374 380 return ret; 381 + } 375 382 376 - exit: 377 383 return 0; 378 384 }

+2 -1

drivers/net/phy/aquantia/aquantia_leds.c

··· 120 120 int aqr_phy_led_active_low_set(struct phy_device *phydev, int index, bool enable) 121 121 { 122 122 return phy_modify_mmd(phydev, MDIO_MMD_VEND1, AQR_LED_DRIVE(index), 123 - VEND1_GLOBAL_LED_DRIVE_VDD, enable); 123 + VEND1_GLOBAL_LED_DRIVE_VDD, 124 + enable ? VEND1_GLOBAL_LED_DRIVE_VDD : 0); 124 125 } 125 126 126 127 int aqr_phy_led_polarity_set(struct phy_device *phydev, int index, unsigned long modes)

+17 -7

drivers/net/phy/aquantia/aquantia_main.c

··· 435 435 } 436 436 } 437 437 438 + #define AQR_FW_WAIT_SLEEP_US 20000 439 + #define AQR_FW_WAIT_TIMEOUT_US 2000000 440 + 438 441 /* If we configure settings whilst firmware is still initializing the chip, 439 442 * then these settings may be overwritten. Therefore make sure chip 440 443 * initialization has completed. Use presence of the firmware ID as ··· 447 444 */ 448 445 int aqr_wait_reset_complete(struct phy_device *phydev) 449 446 { 450 - int val; 447 + int ret, val; 451 448 452 - return phy_read_mmd_poll_timeout(phydev, MDIO_MMD_VEND1, 453 - VEND1_GLOBAL_FW_ID, val, val != 0, 454 - 20000, 2000000, false); 449 + ret = read_poll_timeout(phy_read_mmd, val, val != 0, 450 + AQR_FW_WAIT_SLEEP_US, AQR_FW_WAIT_TIMEOUT_US, 451 + false, phydev, MDIO_MMD_VEND1, 452 + VEND1_GLOBAL_FW_ID); 453 + if (val < 0) { 454 + phydev_err(phydev, "Failed to read VEND1_GLOBAL_FW_ID: %pe\n", 455 + ERR_PTR(val)); 456 + return val; 457 + } 458 + 459 + return ret; 455 460 } 456 461 457 462 static void aqr107_chip_info(struct phy_device *phydev) ··· 489 478 { 490 479 struct aqr107_priv *priv = phydev->priv; 491 480 u32 led_active_low; 492 - int ret, index = 0; 481 + int ret; 493 482 494 483 /* Check that the PHY interface type is compatible */ 495 484 if (phydev->interface != PHY_INTERFACE_MODE_SGMII && ··· 516 505 517 506 /* Restore LED polarity state after reset */ 518 507 for_each_set_bit(led_active_low, &priv->leds_active_low, AQR_MAX_LEDS) { 519 - ret = aqr_phy_led_active_low_set(phydev, index, led_active_low); 508 + ret = aqr_phy_led_active_low_set(phydev, led_active_low, true); 520 509 if (ret) 521 510 return ret; 522 - index++; 523 511 } 524 512 525 513 return 0;

+28 -9

drivers/net/usb/usbnet.c

··· 464 464 void usbnet_defer_kevent (struct usbnet *dev, int work) 465 465 { 466 466 set_bit (work, &dev->flags); 467 - if (!schedule_work (&dev->kevent)) 468 - netdev_dbg(dev->net, "kevent %s may have been dropped\n", usbnet_event_names[work]); 469 - else 470 - netdev_dbg(dev->net, "kevent %s scheduled\n", usbnet_event_names[work]); 467 + if (!usbnet_going_away(dev)) { 468 + if (!schedule_work(&dev->kevent)) 469 + netdev_dbg(dev->net, 470 + "kevent %s may have been dropped\n", 471 + usbnet_event_names[work]); 472 + else 473 + netdev_dbg(dev->net, 474 + "kevent %s scheduled\n", usbnet_event_names[work]); 475 + } 471 476 } 472 477 EXPORT_SYMBOL_GPL(usbnet_defer_kevent); 473 478 ··· 540 535 tasklet_schedule (&dev->bh); 541 536 break; 542 537 case 0: 543 - __usbnet_queue_skb(&dev->rxq, skb, rx_start); 538 + if (!usbnet_going_away(dev)) 539 + __usbnet_queue_skb(&dev->rxq, skb, rx_start); 544 540 } 545 541 } else { 546 542 netif_dbg(dev, ifdown, dev->net, "rx: stopped\n"); ··· 849 843 850 844 /* deferred work (timer, softirq, task) must also stop */ 851 845 dev->flags = 0; 852 - del_timer_sync (&dev->delay); 853 - tasklet_kill (&dev->bh); 846 + del_timer_sync(&dev->delay); 847 + tasklet_kill(&dev->bh); 854 848 cancel_work_sync(&dev->kevent); 849 + 850 + /* We have cyclic dependencies. Those calls are needed 851 + * to break a cycle. 
We cannot fall into the gaps because 852 + * we have a flag 853 + */ 854 + tasklet_kill(&dev->bh); 855 + del_timer_sync(&dev->delay); 856 + cancel_work_sync(&dev->kevent); 857 + 855 858 if (!pm) 856 859 usb_autopm_put_interface(dev->intf); 857 860 ··· 1186 1171 status); 1187 1172 } else { 1188 1173 clear_bit (EVENT_RX_HALT, &dev->flags); 1189 - tasklet_schedule (&dev->bh); 1174 + if (!usbnet_going_away(dev)) 1175 + tasklet_schedule(&dev->bh); 1190 1176 } 1191 1177 } 1192 1178 ··· 1212 1196 usb_autopm_put_interface(dev->intf); 1213 1197 fail_lowmem: 1214 1198 if (resched) 1215 - tasklet_schedule (&dev->bh); 1199 + if (!usbnet_going_away(dev)) 1200 + tasklet_schedule(&dev->bh); 1216 1201 } 1217 1202 } 1218 1203 ··· 1576 1559 } else if (netif_running (dev->net) && 1577 1560 netif_device_present (dev->net) && 1578 1561 netif_carrier_ok(dev->net) && 1562 + !usbnet_going_away(dev) && 1579 1563 !timer_pending(&dev->delay) && 1580 1564 !test_bit(EVENT_RX_PAUSED, &dev->flags) && 1581 1565 !test_bit(EVENT_RX_HALT, &dev->flags)) { ··· 1624 1606 usb_set_intfdata(intf, NULL); 1625 1607 if (!dev) 1626 1608 return; 1609 + usbnet_mark_going_away(dev); 1627 1610 1628 1611 xdev = interface_to_usbdev (intf); 1629 1612

+8 -2

drivers/net/virtio_net.c

··· 1807 1807 struct page *page = virt_to_head_page(buf); 1808 1808 struct sk_buff *skb; 1809 1809 1810 + /* We passed the address of virtnet header to virtio-core, 1811 + * so truncate the padding. 1812 + */ 1813 + buf -= VIRTNET_RX_PAD + xdp_headroom; 1814 + 1810 1815 len -= vi->hdr_len; 1811 1816 u64_stats_add(&stats->bytes, len); 1812 1817 ··· 2427 2422 if (unlikely(!buf)) 2428 2423 return -ENOMEM; 2429 2424 2430 - virtnet_rq_init_one_sg(rq, buf + VIRTNET_RX_PAD + xdp_headroom, 2431 - vi->hdr_len + GOOD_PACKET_LEN); 2425 + buf += VIRTNET_RX_PAD + xdp_headroom; 2426 + 2427 + virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN); 2432 2428 2433 2429 err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp); 2434 2430 if (err < 0) {

-4

include/linux/netfilter.h

··· 376 376 struct nf_conn; 377 377 enum nf_nat_manip_type; 378 378 struct nlattr; 379 - enum ip_conntrack_dir; 380 379 381 380 struct nf_nat_hook { 382 381 int (*parse_nat_setup)(struct nf_conn *ct, enum nf_nat_manip_type manip, 383 382 const struct nlattr *attr); 384 383 void (*decode_session)(struct sk_buff *skb, struct flowi *fl); 385 - unsigned int (*manip_pkt)(struct sk_buff *skb, struct nf_conn *ct, 386 - enum nf_nat_manip_type mtype, 387 - enum ip_conntrack_dir dir); 388 384 void (*remove_nat_bysrc)(struct nf_conn *ct); 389 385 }; 390 386

+15

include/linux/usb/usbnet.h

··· 76 76 # define EVENT_LINK_CHANGE 11 77 77 # define EVENT_SET_RX_MODE 12 78 78 # define EVENT_NO_IP_ALIGN 13 79 + /* This one is special, as it indicates that the device is going away 80 + * there are cyclic dependencies between tasklet, timer and bh 81 + * that must be broken 82 + */ 83 + # define EVENT_UNPLUG 31 79 84 }; 85 + 86 + static inline bool usbnet_going_away(struct usbnet *ubn) 87 + { 88 + return test_bit(EVENT_UNPLUG, &ubn->flags); 89 + } 90 + 91 + static inline void usbnet_mark_going_away(struct usbnet *ubn) 92 + { 93 + set_bit(EVENT_UNPLUG, &ubn->flags); 94 + } 80 95 81 96 static inline struct usb_driver *driver_of(struct usb_interface *intf) 82 97 {

+19 -2

include/net/tcp.h

··· 2435 2435 { 2436 2436 const struct sk_buff *skb = tcp_rtx_queue_head(sk); 2437 2437 u32 rto = inet_csk(sk)->icsk_rto; 2438 - u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); 2439 2438 2440 - return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; 2439 + if (likely(skb)) { 2440 + u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); 2441 + 2442 + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; 2443 + } else { 2444 + WARN_ONCE(1, 2445 + "rtx queue emtpy: " 2446 + "out:%u sacked:%u lost:%u retrans:%u " 2447 + "tlp_high_seq:%u sk_state:%u ca_state:%u " 2448 + "advmss:%u mss_cache:%u pmtu:%u\n", 2449 + tcp_sk(sk)->packets_out, tcp_sk(sk)->sacked_out, 2450 + tcp_sk(sk)->lost_out, tcp_sk(sk)->retrans_out, 2451 + tcp_sk(sk)->tlp_high_seq, sk->sk_state, 2452 + inet_csk(sk)->icsk_ca_state, 2453 + tcp_sk(sk)->advmss, tcp_sk(sk)->mss_cache, 2454 + inet_csk(sk)->icsk_pmtu_cookie); 2455 + return jiffies_to_usecs(rto); 2456 + } 2457 + 2441 2458 } 2442 2459 2443 2460 /*

+4 -6

net/ipv4/netfilter/nf_reject_ipv4.c

··· 239 239 void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, 240 240 int hook) 241 241 { 242 - struct sk_buff *nskb; 243 - struct iphdr *niph; 244 242 const struct tcphdr *oth; 243 + struct sk_buff *nskb; 245 244 struct tcphdr _oth; 246 245 247 246 oth = nf_reject_ip_tcphdr_get(oldskb, &_oth, hook); ··· 265 266 nskb->mark = IP4_REPLY_MARK(net, oldskb->mark); 266 267 267 268 skb_reserve(nskb, LL_MAX_HEADER); 268 - niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, 269 - ip4_dst_hoplimit(skb_dst(nskb))); 269 + nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP, 270 + ip4_dst_hoplimit(skb_dst(nskb))); 270 271 nf_reject_ip_tcphdr_put(nskb, oldskb, oth); 271 272 if (ip_route_me_harder(net, sk, nskb, RTN_UNSPEC)) 272 273 goto free_nskb; 273 - 274 - niph = ip_hdr(nskb); 275 274 276 275 /* "Never happens" */ 277 276 if (nskb->len > dst_mtu(skb_dst(nskb))) ··· 287 290 */ 288 291 if (nf_bridge_info_exists(oldskb)) { 289 292 struct ethhdr *oeth = eth_hdr(oldskb); 293 + struct iphdr *niph = ip_hdr(nskb); 290 294 struct net_device *br_indev; 291 295 292 296 br_indev = nf_bridge_get_physindev(oldskb, net);

+1

net/ipv6/Kconfig

··· 323 323 bool "IPv6: RPL Source Routing Header support" 324 324 depends on IPV6 325 325 select LWTUNNEL 326 + select DST_CACHE 326 327 help 327 328 Support for RFC6554 RPL Source Routing Header using the lightweight 328 329 tunnels mechanism.

+4 -15

net/ipv6/netfilter/nf_reject_ipv6.c

··· 223 223 const struct tcphdr *oth, unsigned int otcplen) 224 224 { 225 225 struct tcphdr *tcph; 226 - int needs_ack; 227 226 228 227 skb_reset_transport_header(nskb); 229 - tcph = skb_put(nskb, sizeof(struct tcphdr)); 228 + tcph = skb_put_zero(nskb, sizeof(struct tcphdr)); 230 229 /* Truncate to length (no data) */ 231 230 tcph->doff = sizeof(struct tcphdr)/4; 232 231 tcph->source = oth->dest; 233 232 tcph->dest = oth->source; 234 233 235 234 if (oth->ack) { 236 - needs_ack = 0; 237 235 tcph->seq = oth->ack_seq; 238 - tcph->ack_seq = 0; 239 236 } else { 240 - needs_ack = 1; 241 237 tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + 242 238 otcplen - (oth->doff<<2)); 243 - tcph->seq = 0; 239 + tcph->ack = 1; 244 240 } 245 241 246 - /* Reset flags */ 247 - ((u_int8_t *)tcph)[13] = 0; 248 242 tcph->rst = 1; 249 - tcph->ack = needs_ack; 250 - tcph->window = 0; 251 - tcph->urg_ptr = 0; 252 - tcph->check = 0; 253 243 254 244 /* Adjust TCP checksum */ 255 245 tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, ··· 273 283 const struct tcphdr *otcph; 274 284 unsigned int otcplen, hh_len; 275 285 const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); 276 - struct ipv6hdr *ip6h; 277 286 struct dst_entry *dst = NULL; 278 287 struct flowi6 fl6; 279 288 ··· 328 339 nskb->mark = fl6.flowi6_mark; 329 340 330 341 skb_reserve(nskb, hh_len + dst->header_len); 331 - ip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, 332 - ip6_dst_hoplimit(dst)); 342 + nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); 333 343 nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); 334 344 335 345 nf_ct_attach(nskb, oldskb); ··· 343 355 */ 344 356 if (nf_bridge_info_exists(oldskb)) { 345 357 struct ethhdr *oeth = eth_hdr(oldskb); 358 + struct ipv6hdr *ip6h = ipv6_hdr(nskb); 346 359 struct net_device *br_indev; 347 360 348 361 br_indev = nf_bridge_get_physindev(oldskb, net);

+51 -90

net/netfilter/nf_conntrack_core.c

··· 988 988 tstamp->start = ktime_get_real_ns(); 989 989 } 990 990 991 + /** 992 + * nf_ct_match_reverse - check if ct1 and ct2 refer to identical flow 993 + * @ct1: conntrack in hash table to check against 994 + * @ct2: merge candidate 995 + * 996 + * returns true if ct1 and ct2 happen to refer to the same flow, but 997 + * in opposing directions, i.e. 998 + * ct1: a:b -> c:d 999 + * ct2: c:d -> a:b 1000 + * for both directions. If so, @ct2 should not have been created 1001 + * as the skb should have been picked up as ESTABLISHED flow. 1002 + * But ct1 was not yet committed to hash table before skb that created 1003 + * ct2 had arrived. 1004 + * 1005 + * Note we don't compare netns because ct entries in different net 1006 + * namespace cannot clash to begin with. 1007 + * 1008 + * @return: true if ct1 and ct2 are identical when swapping origin/reply. 1009 + */ 1010 + static bool 1011 + nf_ct_match_reverse(const struct nf_conn *ct1, const struct nf_conn *ct2) 1012 + { 1013 + u16 id1, id2; 1014 + 1015 + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 1016 + &ct2->tuplehash[IP_CT_DIR_REPLY].tuple)) 1017 + return false; 1018 + 1019 + if (!nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple, 1020 + &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) 1021 + return false; 1022 + 1023 + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_ORIGINAL); 1024 + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_REPLY); 1025 + if (id1 != id2) 1026 + return false; 1027 + 1028 + id1 = nf_ct_zone_id(nf_ct_zone(ct1), IP_CT_DIR_REPLY); 1029 + id2 = nf_ct_zone_id(nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL); 1030 + 1031 + return id1 == id2; 1032 + } 1033 + 1034 + static int nf_ct_can_merge(const struct nf_conn *ct, 1035 + const struct nf_conn *loser_ct) 1036 + { 1037 + return nf_ct_match(ct, loser_ct) || 1038 + nf_ct_match_reverse(ct, loser_ct); 1039 + } 1040 + 991 1041 /* caller must hold locks to prevent concurrent changes */ 992 1042 static int __nf_ct_resolve_clash(struct sk_buff 
*skb, 993 1043 struct nf_conntrack_tuple_hash *h) ··· 1049 999 1050 1000 loser_ct = nf_ct_get(skb, &ctinfo); 1051 1001 1052 - if (nf_ct_is_dying(ct)) 1053 - return NF_DROP; 1054 - 1055 - if (((ct->status & IPS_NAT_DONE_MASK) == 0) || 1056 - nf_ct_match(ct, loser_ct)) { 1002 + if (nf_ct_can_merge(ct, loser_ct)) { 1057 1003 struct net *net = nf_ct_net(ct); 1058 1004 1059 1005 nf_conntrack_get(&ct->ct_general); ··· 2197 2151 nf_conntrack_get(skb_nfct(nskb)); 2198 2152 } 2199 2153 2200 - static int __nf_conntrack_update(struct net *net, struct sk_buff *skb, 2201 - struct nf_conn *ct, 2202 - enum ip_conntrack_info ctinfo) 2203 - { 2204 - const struct nf_nat_hook *nat_hook; 2205 - struct nf_conntrack_tuple_hash *h; 2206 - struct nf_conntrack_tuple tuple; 2207 - unsigned int status; 2208 - int dataoff; 2209 - u16 l3num; 2210 - u8 l4num; 2211 - 2212 - l3num = nf_ct_l3num(ct); 2213 - 2214 - dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num); 2215 - if (dataoff <= 0) 2216 - return NF_DROP; 2217 - 2218 - if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, 2219 - l4num, net, &tuple)) 2220 - return NF_DROP; 2221 - 2222 - if (ct->status & IPS_SRC_NAT) { 2223 - memcpy(tuple.src.u3.all, 2224 - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.all, 2225 - sizeof(tuple.src.u3.all)); 2226 - tuple.src.u.all = 2227 - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all; 2228 - } 2229 - 2230 - if (ct->status & IPS_DST_NAT) { 2231 - memcpy(tuple.dst.u3.all, 2232 - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.all, 2233 - sizeof(tuple.dst.u3.all)); 2234 - tuple.dst.u.all = 2235 - ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.all; 2236 - } 2237 - 2238 - h = nf_conntrack_find_get(net, nf_ct_zone(ct), &tuple); 2239 - if (!h) 2240 - return NF_ACCEPT; 2241 - 2242 - /* Store status bits of the conntrack that is clashing to re-do NAT 2243 - * mangling according to what it has been done already to this packet. 
2244 - */ 2245 - status = ct->status; 2246 - 2247 - nf_ct_put(ct); 2248 - ct = nf_ct_tuplehash_to_ctrack(h); 2249 - nf_ct_set(skb, ct, ctinfo); 2250 - 2251 - nat_hook = rcu_dereference(nf_nat_hook); 2252 - if (!nat_hook) 2253 - return NF_ACCEPT; 2254 - 2255 - if (status & IPS_SRC_NAT) { 2256 - unsigned int verdict = nat_hook->manip_pkt(skb, ct, 2257 - NF_NAT_MANIP_SRC, 2258 - IP_CT_DIR_ORIGINAL); 2259 - if (verdict != NF_ACCEPT) 2260 - return verdict; 2261 - } 2262 - 2263 - if (status & IPS_DST_NAT) { 2264 - unsigned int verdict = nat_hook->manip_pkt(skb, ct, 2265 - NF_NAT_MANIP_DST, 2266 - IP_CT_DIR_ORIGINAL); 2267 - if (verdict != NF_ACCEPT) 2268 - return verdict; 2269 - } 2270 - 2271 - return NF_ACCEPT; 2272 - } 2273 - 2274 2154 /* This packet is coming from userspace via nf_queue, complete the packet 2275 2155 * processing after the helper invocation in nf_confirm(). 2276 2156 */ ··· 2259 2287 ct = nf_ct_get(skb, &ctinfo); 2260 2288 if (!ct) 2261 2289 return NF_ACCEPT; 2262 - 2263 - if (!nf_ct_is_confirmed(ct)) { 2264 - int ret = __nf_conntrack_update(net, skb, ct, ctinfo); 2265 - 2266 - if (ret != NF_ACCEPT) 2267 - return ret; 2268 - 2269 - ct = nf_ct_get(skb, &ctinfo); 2270 - if (!ct) 2271 - return NF_ACCEPT; 2272 - } 2273 2290 2274 2291 return nf_confirm_cthelper(skb, ct, ctinfo); 2275 2292 }

+3 -6

net/netfilter/nf_conntrack_netlink.c

··· 382 382 #define ctnetlink_dump_secctx(a, b) (0) 383 383 #endif 384 384 385 - #ifdef CONFIG_NF_CONNTRACK_LABELS 385 + #ifdef CONFIG_NF_CONNTRACK_EVENTS 386 386 static inline int ctnetlink_label_size(const struct nf_conn *ct) 387 387 { 388 388 struct nf_conn_labels *labels = nf_ct_labels_find(ct); ··· 391 391 return 0; 392 392 return nla_total_size(sizeof(labels->bits)); 393 393 } 394 + #endif 394 395 395 396 static int 396 397 ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) ··· 412 411 413 412 return 0; 414 413 } 415 - #else 416 - #define ctnetlink_dump_labels(a, b) (0) 417 - #define ctnetlink_label_size(a) (0) 418 - #endif 419 414 420 415 #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 421 416 ··· 649 652 650 653 return len + len4; 651 654 } 652 - #endif 653 655 654 656 static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) 655 657 { ··· 686 690 return 0; 687 691 #endif 688 692 } 693 + #endif 689 694 690 695 #ifdef CONFIG_NF_CONNTRACK_EVENTS 691 696 static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)

+118 -3

net/netfilter/nf_nat_core.c

··· 183 183 return reciprocal_scale(hash, nf_nat_htable_size); 184 184 } 185 185 186 - /* Is this tuple already taken? (not by us) */ 186 + /** 187 + * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry 188 + * @tuple: proposed NAT binding 189 + * @ignored_conntrack: our (unconfirmed) conntrack entry 190 + * 191 + * A conntrack entry can be inserted to the connection tracking table 192 + * if there is no existing entry with an identical tuple in either direction. 193 + * 194 + * Example: 195 + * INITIATOR -> NAT/PAT -> RESPONDER 196 + * 197 + * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). 198 + * Then, later, NAT/PAT itself also connects to RESPONDER. 199 + * 200 + * This will not work if the SNAT done earlier has same IP:PORT source pair. 201 + * 202 + * Conntrack table has: 203 + * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT 204 + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT 205 + * 206 + * and new locally originating connection wants: 207 + * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT 208 + * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT 209 + * 210 + * ... which would mean incoming packets cannot be distinguished between 211 + * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). 212 + * 213 + * @return: true if the proposed NAT mapping collides with an existing entry. 
214 + */ 187 215 static int 188 216 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, 189 217 const struct nf_conn *ignored_conntrack) ··· 226 198 227 199 nf_ct_invert_tuple(&reply, tuple); 228 200 return nf_conntrack_tuple_taken(&reply, ignored_conntrack); 201 + } 202 + 203 + static bool nf_nat_allow_clash(const struct nf_conn *ct) 204 + { 205 + return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; 206 + } 207 + 208 + /** 209 + * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry 210 + * @tuple: proposed NAT binding 211 + * @ignored_ct: our (unconfirmed) conntrack entry 212 + * 213 + * Same as nf_nat_used_tuple, but also check for rare clash in reverse 214 + * direction. Should be called only when @tuple has not been altered, i.e. 215 + * @ignored_conntrack will not be subject to NAT. 216 + * 217 + * @return: true if the proposed NAT mapping collides with existing entry. 218 + */ 219 + static noinline bool 220 + nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, 221 + const struct nf_conn *ignored_ct) 222 + { 223 + static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; 224 + const struct nf_conntrack_tuple_hash *thash; 225 + const struct nf_conntrack_zone *zone; 226 + struct nf_conn *ct; 227 + bool taken = true; 228 + struct net *net; 229 + 230 + if (!nf_nat_used_tuple(tuple, ignored_ct)) 231 + return false; 232 + 233 + if (!nf_nat_allow_clash(ignored_ct)) 234 + return true; 235 + 236 + /* Initial choice clashes with existing conntrack. 237 + * Check for (rare) reverse collision. 238 + * 239 + * This can happen when new packets are received in both directions 240 + * at the exact same time on different CPUs. 241 + * 242 + * Without SMP, first packet creates new conntrack entry and second 243 + * packet is resolved as established reply packet. 244 + * 245 + * With parallel processing, both packets could be picked up as 246 + * new and both get their own ct entry allocated. 
247 + * 248 + * If ignored_conntrack and colliding ct are not subject to NAT then 249 + * pretend the tuple is available and let later clash resolution 250 + * handle this at insertion time. 251 + * 252 + * Without it, the 'reply' packet has its source port rewritten 253 + * by nat engine. 254 + */ 255 + if (READ_ONCE(ignored_ct->status) & uses_nat) 256 + return true; 257 + 258 + net = nf_ct_net(ignored_ct); 259 + zone = nf_ct_zone(ignored_ct); 260 + 261 + thash = nf_conntrack_find_get(net, zone, tuple); 262 + if (unlikely(!thash)) /* clashing entry went away */ 263 + return false; 264 + 265 + ct = nf_ct_tuplehash_to_ctrack(thash); 266 + 267 + /* NB: IP_CT_DIR_ORIGINAL should be impossible because 268 + * nf_nat_used_tuple() handles origin collisions. 269 + * 270 + * Handle remote chance other CPU confirmed its ct right after. 271 + */ 272 + if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) 273 + goto out; 274 + 275 + /* clashing connection subject to NAT? Retry with new tuple. */ 276 + if (READ_ONCE(ct->status) & uses_nat) 277 + goto out; 278 + 279 + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, 280 + &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && 281 + nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, 282 + &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { 283 + taken = false; 284 + goto out; 285 + } 286 + out: 287 + nf_ct_put(ct); 288 + return taken; 229 289 } 230 290 231 291 static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) ··· 727 611 !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { 728 612 /* try the original tuple first */ 729 613 if (nf_in_range(orig_tuple, range)) { 730 - if (!nf_nat_used_tuple(orig_tuple, ct)) { 614 + if (!nf_nat_used_tuple_new(orig_tuple, ct)) { 731 615 *tuple = *orig_tuple; 732 616 return; 733 617 } ··· 1324 1208 #ifdef CONFIG_XFRM 1325 1209 .decode_session = __nf_nat_decode_session, 1326 1210 #endif 1327 - .manip_pkt = nf_nat_manip_pkt, 1328 1211 .remove_nat_bysrc = 
nf_nat_cleanup_conntrack, 1329 1212 }; 1330 1213

+3 -3

net/netfilter/nf_tables_api.c

··· 1849 1849 if (!hook_list) 1850 1850 hook_list = &basechain->hook_list; 1851 1851 1852 - list_for_each_entry(hook, hook_list, list) { 1852 + list_for_each_entry_rcu(hook, hook_list, list) { 1853 1853 if (!first) 1854 1854 first = hook; 1855 1855 ··· 6684 6684 } 6685 6685 } 6686 6686 6687 - catchall = kmalloc(sizeof(*catchall), GFP_KERNEL); 6687 + catchall = kmalloc(sizeof(*catchall), GFP_KERNEL_ACCOUNT); 6688 6688 if (!catchall) 6689 6689 return -ENOMEM; 6690 6690 ··· 9207 9207 flowtable->data.type->setup(&flowtable->data, hook->ops.dev, 9208 9208 FLOW_BLOCK_UNBIND); 9209 9209 list_del_rcu(&hook->list); 9210 - kfree(hook); 9210 + kfree_rcu(hook, rcu); 9211 9211 } 9212 9212 kfree(flowtable->name); 9213 9213 module_put(flowtable->data.type->owner);

+3 -3

net/netfilter/nft_compat.c

··· 535 535 struct xt_match *m = expr->ops->data; 536 536 int ret; 537 537 538 - priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL); 538 + priv->info = kmalloc(XT_ALIGN(m->matchsize), GFP_KERNEL_ACCOUNT); 539 539 if (!priv->info) 540 540 return -ENOMEM; 541 541 ··· 808 808 goto err; 809 809 } 810 810 811 - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); 811 + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); 812 812 if (!ops) { 813 813 err = -ENOMEM; 814 814 goto err; ··· 898 898 goto err; 899 899 } 900 900 901 - ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL); 901 + ops = kzalloc(sizeof(struct nft_expr_ops), GFP_KERNEL_ACCOUNT); 902 902 if (!ops) { 903 903 err = -ENOMEM; 904 904 goto err;

+1 -1

net/netfilter/nft_log.c

··· 163 163 164 164 nla = tb[NFTA_LOG_PREFIX]; 165 165 if (nla != NULL) { 166 - priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); 166 + priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL_ACCOUNT); 167 167 if (priv->prefix == NULL) 168 168 return -ENOMEM; 169 169 nla_strscpy(priv->prefix, nla, nla_len(nla) + 1);

+1 -1

net/netfilter/nft_meta.c

··· 952 952 if (tb[NFTA_SECMARK_CTX] == NULL) 953 953 return -EINVAL; 954 954 955 - priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); 955 + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL_ACCOUNT); 956 956 if (!priv->ctx) 957 957 return -ENOMEM; 958 958

+1 -1

net/netfilter/nft_numgen.c

··· 66 66 if (priv->offset + priv->modulus - 1 < priv->offset) 67 67 return -EOVERFLOW; 68 68 69 - priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL); 69 + priv->counter = kmalloc(sizeof(*priv->counter), GFP_KERNEL_ACCOUNT); 70 70 if (!priv->counter) 71 71 return -ENOMEM; 72 72

+7 -6

net/netfilter/nft_set_pipapo.c

··· 663 663 check_add_overflow(rules, extra, &rules_alloc)) 664 664 return -EOVERFLOW; 665 665 666 - new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL); 666 + new_mt = kvmalloc_array(rules_alloc, sizeof(*new_mt), GFP_KERNEL_ACCOUNT); 667 667 if (!new_mt) 668 668 return -ENOMEM; 669 669 ··· 936 936 return; 937 937 } 938 938 939 - new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL); 939 + new_lt = kvzalloc(lt_size + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT); 940 940 if (!new_lt) 941 941 return; 942 942 ··· 1212 1212 scratch = kzalloc_node(struct_size(scratch, map, 1213 1213 bsize_max * 2) + 1214 1214 NFT_PIPAPO_ALIGN_HEADROOM, 1215 - GFP_KERNEL, cpu_to_node(i)); 1215 + GFP_KERNEL_ACCOUNT, cpu_to_node(i)); 1216 1216 if (!scratch) { 1217 1217 /* On failure, there's no need to undo previous 1218 1218 * allocations: this means that some scratch maps have ··· 1427 1427 struct nft_pipapo_match *new; 1428 1428 int i; 1429 1429 1430 - new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL); 1430 + new = kmalloc(struct_size(new, f, old->field_count), GFP_KERNEL_ACCOUNT); 1431 1431 if (!new) 1432 1432 return NULL; 1433 1433 ··· 1457 1457 new_lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS(src->bb) * 1458 1458 src->bsize * sizeof(*dst->lt) + 1459 1459 NFT_PIPAPO_ALIGN_HEADROOM, 1460 - GFP_KERNEL); 1460 + GFP_KERNEL_ACCOUNT); 1461 1461 if (!new_lt) 1462 1462 goto out_lt; 1463 1463 ··· 1470 1470 1471 1471 if (src->rules > 0) { 1472 1472 dst->mt = kvmalloc_array(src->rules_alloc, 1473 - sizeof(*src->mt), GFP_KERNEL); 1473 + sizeof(*src->mt), 1474 + GFP_KERNEL_ACCOUNT); 1474 1475 if (!dst->mt) 1475 1476 goto out_mt; 1476 1477

+3 -2

net/netfilter/nft_tunnel.c

··· 509 509 return err; 510 510 } 511 511 512 - md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL); 512 + md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, 513 + GFP_KERNEL_ACCOUNT); 513 514 if (!md) 514 515 return -ENOMEM; 515 516 516 517 memcpy(&md->u.tun_info, &info, sizeof(info)); 517 518 #ifdef CONFIG_DST_CACHE 518 - err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL); 519 + err = dst_cache_init(&md->u.tun_info.dst_cache, GFP_KERNEL_ACCOUNT); 519 520 if (err < 0) { 520 521 metadata_dst_free(md); 521 522 return err;

+1 -1

net/qrtr/af_qrtr.c

··· 884 884 885 885 mutex_lock(&qrtr_node_lock); 886 886 list_for_each_entry(node, &qrtr_all_nodes, item) { 887 - skbn = skb_clone(skb, GFP_KERNEL); 887 + skbn = pskb_copy(skb, GFP_KERNEL); 888 888 if (!skbn) 889 889 break; 890 890 skb_set_owner_w(skbn, skb->sk);

+4

tools/testing/selftests/net/netfilter/Makefile

··· 13 13 TEST_PROGS += conntrack_tcp_unreplied.sh 14 14 TEST_PROGS += conntrack_sctp_collision.sh 15 15 TEST_PROGS += conntrack_vrf.sh 16 + TEST_PROGS += conntrack_reverse_clash.sh 16 17 TEST_PROGS += ipvs.sh 17 18 TEST_PROGS += nf_conntrack_packetdrill.sh 18 19 TEST_PROGS += nf_nat_edemux.sh ··· 27 26 TEST_PROGS += nft_nat_zones.sh 28 27 TEST_PROGS += nft_queue.sh 29 28 TEST_PROGS += nft_synproxy.sh 29 + TEST_PROGS += nft_tproxy_tcp.sh 30 + TEST_PROGS += nft_tproxy_udp.sh 30 31 TEST_PROGS += nft_zones_many.sh 31 32 TEST_PROGS += rpath.sh 32 33 TEST_PROGS += xt_string.sh ··· 39 36 40 37 TEST_GEN_FILES = audit_logread 41 38 TEST_GEN_FILES += connect_close nf_queue 39 + TEST_GEN_FILES += conntrack_reverse_clash 42 40 TEST_GEN_FILES += sctp_collision 43 41 44 42 include ../../lib.mk

+1

tools/testing/selftests/net/netfilter/config

··· 81 81 CONFIG_NFT_QUOTA=m 82 82 CONFIG_NFT_REDIR=m 83 83 CONFIG_NFT_SYNPROXY=m 84 + CONFIG_NFT_TPROXY=m 84 85 CONFIG_VETH=m 85 86 CONFIG_VLAN_8021Q=m 86 87 CONFIG_XFRM_USER=m

+125

tools/testing/selftests/net/netfilter/conntrack_reverse_clash.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Needs something like:
 *
 * iptables -t nat -A POSTROUTING -o nomatch -j MASQUERADE
 *
 * so NAT engine attaches a NAT null-binding to each connection.
 *
 * With unmodified kernels, child or parent will exit with
 * "Port number changed" error, even though no port translation
 * was requested.
 */

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <sys/wait.h>

#define LEN 512
#define PORT 56789
#define TEST_TIME 5

/* Print @e together with the current errno text and abort with status 111. */
static void die(const char *e)
{
	perror(e);
	exit(111);
}

/*
 * Report a datagram that arrived from an unexpected source port and exit
 * with status 1.  @got is in network byte order (taken from a sockaddr_in),
 * @want is in host byte order.
 */
static void die_port(uint16_t got, uint16_t want)
{
	fprintf(stderr, "Port number changed, wanted %d got %d\n", want, ntohs(got));
	exit(1);
}

/* Create an IPv4 UDP socket whose receive calls time out after one second. */
static int udp_socket(void)
{
	static const struct timeval tv = {
		.tv_sec = 1,
	};
	int fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);

	if (fd < 0)
		die("socket");

	/* Without the timeout a lost datagram would block recvfrom() forever. */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
		die("setsockopt");

	return fd;
}

/*
 * Send one LEN-byte datagram from @tx_fd to @dst, then read one datagram
 * from @rx_fd and verify that its source port is @want_port (host order).
 *
 * A failed or short send is not an error: the caller simply retries on its
 * next loop iteration.  A receive failure (including the receive timeout)
 * aborts via die(@rx_err); a port mismatch aborts via die_port().
 */
static void exchange(int tx_fd, int rx_fd, const struct sockaddr_in *dst,
		     uint16_t want_port, const char *rx_err)
{
	/* Zero-filled payload: never put uninitialised stack data on the wire. */
	static const char payload[LEN];
	struct sockaddr_in peer;
	socklen_t plen = sizeof(peer);
	char buf[LEN];

	if (sendto(tx_fd, payload, LEN, 0,
		   (const struct sockaddr *)dst, sizeof(*dst)) != LEN)
		return;

	if (recvfrom(rx_fd, buf, LEN, 0, (struct sockaddr *)&peer, &plen) < 0)
		die(rx_err);

	if (peer.sin_port != htons(want_port))
		die_port(peer.sin_port, want_port);
}

/*
 * Bind two UDP sockets (127.0.0.11:PORT and 127.0.0.12:PORT+1), fork, and
 * for TEST_TIME seconds let the child send s1 -> s2 while the parent sends
 * s2 -> s1.  Each side checks that the datagram it reads still carries the
 * sender's bound source port.
 *
 * Exit status: 0 on success, 1 if a source port changed, 111 on a system
 * call failure.  The parent returns the child's exit status.
 */
int main(int argc, char *argv[])
{
	struct sockaddr_in sa1 = {
		.sin_family = AF_INET,
	};
	struct sockaddr_in sa2 = {
		.sin_family = AF_INET,
	};
	int s1, s2, status;
	time_t end, now;
	bool child;
	pid_t pid;

	sa1.sin_port = htons(PORT);
	sa2.sin_port = htons(PORT + 1);

	s1 = udp_socket();
	s2 = udp_socket();

	inet_pton(AF_INET, "127.0.0.11", &sa1.sin_addr);
	inet_pton(AF_INET, "127.0.0.12", &sa2.sin_addr);

	if (bind(s1, (struct sockaddr *)&sa1, sizeof(sa1)) < 0)
		die("bind 1");
	if (bind(s2, (struct sockaddr *)&sa2, sizeof(sa2)) < 0)
		die("bind 2");

	/* A failed fork must not be mistaken for "we are the parent". */
	pid = fork();
	if (pid < 0)
		die("fork");
	child = pid == 0;

	now = time(NULL);
	end = now + TEST_TIME;

	while (now < end) {
		now = time(NULL);

		if (child)
			exchange(s1, s2, &sa2, PORT, "child recvfrom");
		else
			exchange(s2, s1, &sa1, PORT + 1, "parent recvfrom");
	}

	if (child)
		return 0;

	/* status is only meaningful when wait() actually reaped the child. */
	if (wait(&status) < 0)
		die("wait");

	if (WIFEXITED(status))
		return WEXITSTATUS(status);

	return 1;
}

+51

tools/testing/selftests/net/netfilter/conntrack_reverse_clash.sh

#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Runs ./conntrack_reverse_clash inside a namespace that has a NAT hook
# registered, while the conntrack table is flushed continuously in the
# background so that new entries keep being created in both directions.

source lib.sh

# PID of the background flush loop; empty until it has been started.
flush_pid=""

cleanup()
{
	# Stop the flusher before the namespace it operates on is removed.
	if [ -n "$flush_pid" ]; then
		kill "$flush_pid" 2>/dev/null
		wait "$flush_pid" 2>/dev/null
	fi

	cleanup_all_ns
}

checktool "nft --version" "run test without nft"
checktool "conntrack --version" "run test without conntrack"

trap cleanup EXIT

setup_ns ns0

# make loopback connections get nat null bindings assigned
if ! ip netns exec "$ns0" nft -f - <<EOF
table ip nat {
	chain POSTROUTING {
		type nat hook postrouting priority srcnat; policy accept;
		oifname "nomatch" counter packets 0 bytes 0 masquerade
	}
}
EOF
then
	# Without the nat hook the test program would pass trivially.
	echo "SKIP: Could not load nat ruleset"
	exit $ksft_skip
fi

# Flush the conntrack table in a tight loop for five seconds.
do_flush()
{
	local end
	local now

	now=$(date +%s)
	end=$((now + 5))

	while [ "$now" -lt "$end" ]; do
		ip netns exec "$ns0" conntrack -F 2>/dev/null
		now=$(date +%s)
	done
}

do_flush &
flush_pid=$!

if ip netns exec "$ns0" ./conntrack_reverse_clash; then
	echo "PASS: No SNAT performed for null bindings"
else
	echo "ERROR: SNAT performed without any matching snat rule"
	exit 1
fi

exit 0

+1 -1

tools/testing/selftests/net/netfilter/ipvs.sh

··· 97 97 } 98 98 99 99 server_listen() { 100 - ip netns exec "$ns2" socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & 100 + ip netns exec "$ns2" timeout 5 socat -u -4 TCP-LISTEN:8080,reuseaddr STDOUT > "${outfile}" & 101 101 server_pid=$! 102 102 sleep 0.2 103 103 }

+91 -1

tools/testing/selftests/net/netfilter/nft_queue.sh

··· 31 31 32 32 trap cleanup EXIT 33 33 34 - setup_ns ns1 ns2 nsrouter 34 + setup_ns ns1 ns2 ns3 nsrouter 35 35 36 36 TMPFILE0=$(mktemp) 37 37 TMPFILE1=$(mktemp) ··· 48 48 exit $ksft_skip 49 49 fi 50 50 ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" 51 + ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3" 51 52 52 53 ip -net "$nsrouter" link set veth0 up 53 54 ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 ··· 58 57 ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 59 58 ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad 60 59 60 + ip -net "$nsrouter" link set veth2 up 61 + ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2 62 + ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad 63 + 61 64 ip -net "$ns1" link set eth0 up 62 65 ip -net "$ns2" link set eth0 up 66 + ip -net "$ns3" link set eth0 up 63 67 64 68 ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 65 69 ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad ··· 75 69 ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad 76 70 ip -net "$ns2" route add default via 10.0.2.1 77 71 ip -net "$ns2" route add default via dead:2::1 72 + 73 + ip -net "$ns3" addr add 10.0.3.99/24 dev eth0 74 + ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad 75 + ip -net "$ns3" route add default via 10.0.3.1 76 + ip -net "$ns3" route add default via dead:3::1 78 77 79 78 load_ruleset() { 80 79 local name=$1 ··· 484 473 check_output_files "$TMPINPUT" "$TMPFILE1" "sctp output" 485 474 } 486 475 476 + udp_listener_ready() 477 + { 478 + ss -S -N "$1" -uln -o "sport = :12345" | grep -q 12345 479 + } 480 + 481 + output_files_written() 482 + { 483 + test -s "$1" && test -s "$2" 484 + } 485 + 486 + test_udp_ct_race() 487 + { 488 + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF 489 + flush ruleset 490 + table inet udpq { 491 + chain prerouting { 492 + type nat hook prerouting priority dstnat - 5; policy accept; 493 + ip daddr 10.6.6.6 udp dport 12345 counter dnat to numgen inc 
mod 2 map { 0 : 10.0.2.99, 1 : 10.0.3.99 } 494 + } 495 + chain postrouting { 496 + type filter hook postrouting priority srcnat - 5; policy accept; 497 + udp dport 12345 counter queue num 12 498 + } 499 + } 500 + EOF 501 + :> "$TMPFILE1" 502 + :> "$TMPFILE2" 503 + 504 + timeout 10 ip netns exec "$ns2" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE1",trunc & 505 + local rpid1=$! 506 + 507 + timeout 10 ip netns exec "$ns3" socat UDP-LISTEN:12345,fork OPEN:"$TMPFILE2",trunc & 508 + local rpid2=$! 509 + 510 + ip netns exec "$nsrouter" ./nf_queue -q 12 -d 1000 & 511 + local nfqpid=$! 512 + 513 + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns2" 514 + busywait "$BUSYWAIT_TIMEOUT" udp_listener_ready "$ns3" 515 + busywait "$BUSYWAIT_TIMEOUT" nf_queue_wait "$nsrouter" 12 516 + 517 + # Send two packets, one should end up in ns2, other in ns3. 518 + # This is because nfqueue will delay packet for long enough so that 519 + # second packet will not find existing conntrack entry. 520 + echo "Packet 1" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221 521 + echo "Packet 2" | ip netns exec "$ns1" socat STDIN UDP-DATAGRAM:10.6.6.6:12345,bind=0.0.0.0:55221 522 + 523 + busywait 10000 output_files_written "$TMPFILE1" "$TMPFILE2" 524 + 525 + kill "$nfqpid" 526 + 527 + if ! ip netns exec "$nsrouter" bash -c 'conntrack -L -p udp --dport 12345 2>/dev/null | wc -l | grep -q "^1"'; then 528 + echo "FAIL: Expected One udp conntrack entry" 529 + ip netns exec "$nsrouter" conntrack -L -p udp --dport 12345 530 + ret=1 531 + fi 532 + 533 + if !
ip netns exec "$nsrouter" nft delete table inet udpq; then 534 + echo "FAIL: Could not delete udpq table" 535 + ret=1 536 + return 537 + fi 538 + 539 + NUMLINES1=$(wc -l < "$TMPFILE1") 540 + NUMLINES2=$(wc -l < "$TMPFILE2") 541 + 542 + if [ "$NUMLINES1" -ne 1 ] || [ "$NUMLINES2" -ne 1 ]; then 543 + ret=1 544 + echo "FAIL: uneven udp packet distribution: $NUMLINES1 $NUMLINES2" 545 + echo -n "$TMPFILE1: ";cat "$TMPFILE1" 546 + echo -n "$TMPFILE2: ";cat "$TMPFILE2" 547 + return 548 + fi 549 + 550 + echo "PASS: both udp receivers got one packet each" 551 + } 552 + 487 553 test_queue_removal() 488 554 { 489 555 read tainted_then < /proc/sys/kernel/tainted ··· 600 512 ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 601 513 ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null 602 514 ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null 515 + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null 603 516 604 517 load_ruleset "filter" 0 605 518 ··· 638 549 test_tcp_localhost_requeue 639 550 test_sctp_forward 640 551 test_sctp_output 552 + test_udp_ct_race 641 553 642 554 # should be last, adds vrf device in ns1 and changes routes 643 555 test_icmp_vrf

+358

tools/testing/selftests/net/netfilter/nft_tproxy_tcp.sh

#!/bin/bash
#
# This tests tproxy on the following scenario:
#
#                         +------------+
# +-------+               |  nsrouter  |                  +-------+
# |ns1    |.99          .1|            |.1             .99|    ns2|
# |   eth0|---------------|veth0  veth1|------------------|eth0   |
# |       |  10.0.1.0/24  |            |   10.0.2.0/24    |       |
# +-------+  dead:1::/64  |    veth2   |   dead:2::/64    +-------+
#                         +------------+
#                                |.1
#                                |
#                                |
#                                |                        +-------+
#                                |                     .99|    ns3|
#                                +------------------------|eth0   |
#                                       10.0.3.0/24       |       |
#                                       dead:3::/64       +-------+
#
# The tproxy implementation acts as an echo server so the client
# must receive the same message it sent if it has been proxied.
# If it is not proxied the servers return PONG_NS# with the number
# of the namespace the server is running.
#
# shellcheck disable=SC2162,SC2317

source lib.sh
ret=0
timeout=5

cleanup()
{
	ip netns pids "$ns1" | xargs kill 2>/dev/null
	ip netns pids "$ns2" | xargs kill 2>/dev/null
	ip netns pids "$ns3" | xargs kill 2>/dev/null
	ip netns pids "$nsrouter" | xargs kill 2>/dev/null

	cleanup_all_ns
}

checktool "nft --version" "test without nft tool"
checktool "socat -h" "run test without socat"

trap cleanup EXIT
setup_ns ns1 ns2 ns3 nsrouter

if ! ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then
	echo "SKIP: No virtual ethernet pair device support in kernel"
	exit $ksft_skip
fi
ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2"
ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3"

ip -net "$nsrouter" link set veth0 up
ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0
ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad

ip -net "$nsrouter" link set veth1 up
ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1
ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad

ip -net "$nsrouter" link set veth2 up
ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2
ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad

ip -net "$ns1" link set eth0 up
ip -net "$ns2" link set eth0 up
ip -net "$ns3" link set eth0 up

ip -net "$ns1" addr add 10.0.1.99/24 dev eth0
ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad
ip -net "$ns1" route add default via 10.0.1.1
ip -net "$ns1" route add default via dead:1::1

ip -net "$ns2" addr add 10.0.2.99/24 dev eth0
ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad
ip -net "$ns2" route add default via 10.0.2.1
ip -net "$ns2" route add default via dead:2::1

ip -net "$ns3" addr add 10.0.3.99/24 dev eth0
ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad
ip -net "$ns3" route add default via 10.0.3.1
ip -net "$ns3" route add default via dead:3::1

ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null
ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null

# Ping ns2 and ns3 from ns1 over IPv4 and IPv6.
# Returns 0 on success, 1 if an IPv4 ping failed, 2 if an IPv6 ping failed.
test_ping() {
	if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then
		return 1
	fi

	if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then
		return 2
	fi

	if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then
		return 1
	fi

	if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then
		return 2
	fi

	return 0
}

# Ping the router's veth1 addresses from ns1.
# Returns 0 on success, 3 if the IPv4 ping failed, 4 if the IPv6 ping failed.
test_ping_router() {
	if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then
		return 3
	fi

	if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then
		return 4
	fi

	return 0
}


# listener_ready NS PORT SS_PROTO_FLAG
# Succeeds once ss reports a listening socket on PORT in namespace NS.
listener_ready()
{
	local ns="$1"
	local port="$2"
	local proto="$3"
	ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port"
}

# test_tproxy ORIGIN IP_PROTO EXPECT_NS1_NS2 EXPECT_NS1_NS3 \
#             EXPECT_NSROUTER_NS2 EXPECT_NSROUTER_NS3
#
# ORIGIN is "forward" or "local", IP_PROTO is "ip" or "ip6".  The four
# EXPECT_* arguments are the replies each client must read back.  Sets the
# global ret to 1 when a reply differs from what was expected.
test_tproxy()
{
	local traffic_origin="$1"
	local ip_proto="$2"
	local expect_ns1_ns2="$3"
	local expect_ns1_ns3="$4"
	local expect_nsrouter_ns2="$5"
	local expect_nsrouter_ns3="$6"

	# derived variables
	local testname="test_${ip_proto}_tcp_${traffic_origin}"
	local socat_ipproto
	local ns1_ip
	local ns2_ip
	local ns3_ip
	local ns2_target
	local ns3_target
	local nftables_subject
	local nftables_rules
	# Kept as an array so that "ip -6" expands to two words without
	# relying on unquoted word splitting.
	local -a ip_command

	# socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1)
	case $ip_proto in
	"ip")
		socat_ipproto="-4"
		ns1_ip=10.0.1.99
		ns2_ip=10.0.2.99
		ns3_ip=10.0.3.99
		ns2_target="tcp:$ns2_ip:8080"
		ns3_target="tcp:$ns3_ip:8080"
		nftables_subject="ip daddr $ns2_ip tcp dport 8080"
		ip_command=(ip)
	;;
	"ip6")
		socat_ipproto="-6"
		ns1_ip=dead:1::99
		ns2_ip=dead:2::99
		ns3_ip=dead:3::99
		ns2_target="tcp:[$ns2_ip]:8080"
		ns3_target="tcp:[$ns3_ip]:8080"
		nftables_subject="ip6 daddr $ns2_ip tcp dport 8080"
		ip_command=(ip -6)
	;;
	*)
	echo "FAIL: unsupported protocol"
	exit 255
	;;
	esac

	case $traffic_origin in
	# to capture the local originated traffic we need to mark the outgoing
	# traffic so the policy based routing rule redirects it and can be processed
	# in the prerouting chain.
	"local")
		nftables_rules="
flush ruleset
table inet filter {
	chain divert {
		type filter hook prerouting priority 0; policy accept;
		$nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
	}
	chain output {
		type route hook output priority 0; policy accept;
		$nftables_subject meta mark set 1 accept
	}
}"
	;;
	"forward")
		nftables_rules="
flush ruleset
table inet filter {
	chain divert {
		type filter hook prerouting priority 0; policy accept;
		$nftables_subject tproxy $ip_proto to :12345 meta mark set 1 accept
	}
}"
	;;
	*)
	echo "FAIL: unsupported parameter for traffic origin"
	exit 255
	;;
	esac

	ip netns exec "$nsrouter" "${ip_command[@]}" rule add fwmark 1 table 100
	ip netns exec "$nsrouter" "${ip_command[@]}" route add local "${ns2_ip}" dev lo table 100
	echo "$nftables_rules" | ip netns exec "$nsrouter" nft -f /dev/stdin

	timeout "$timeout" ip netns exec "$nsrouter" socat "$socat_ipproto" tcp-listen:12345,fork,ip-transparent SYSTEM:"cat" 2>/dev/null &
	local tproxy_pid=$!

	timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null &
	local server2_pid=$!

	timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" tcp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null &
	local server3_pid=$!

	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-t"
	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-t"
	busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-t"

	local result
	# request from ns1 to ns2 (forwarded traffic)
	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns2_target")
	if [ "$result" = "$expect_ns1_ns2" ] ;then
		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2"
	else
		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended"
		ret=1
	fi

	# request from ns1 to ns3 (forwarded traffic)
	result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO "$ns3_target")
	if [ "$result" = "$expect_ns1_ns3" ] ;then
		echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3"
	else
		echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended"
		ret=1
	fi

	# request from nsrouter to ns2 (locally originated traffic)
	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns2_target")
	if [ "$result" = "$expect_nsrouter_ns2" ] ;then
		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2"
	else
		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended"
		ret=1
	fi

	# request from nsrouter to ns3 (locally originated traffic)
	result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO "$ns3_target")
	if [ "$result" = "$expect_nsrouter_ns3" ] ;then
		echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3"
	else
		echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended"
		ret=1
	fi

	# cleanup
	kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
	# Reap the listeners so the next case does not mistake a dying
	# listener on the same port for its own freshly started one.
	wait "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null
	ip netns exec "$nsrouter" "${ip_command[@]}" rule del fwmark 1 table 100
	ip netns exec "$nsrouter" "${ip_command[@]}" route flush table 100
}


test_ipv4_tcp_forward()
{
	local traffic_origin="forward"
	local ip_proto="ip"
	local expect_ns1_ns2="I_M_PROXIED"
	local expect_ns1_ns3="PONG_NS3"
	local expect_nsrouter_ns2="PONG_NS2"
	local expect_nsrouter_ns3="PONG_NS3"

	test_tproxy	"$traffic_origin" \
			"$ip_proto" \
			"$expect_ns1_ns2" \
			"$expect_ns1_ns3" \
			"$expect_nsrouter_ns2" \
			"$expect_nsrouter_ns3"
}

test_ipv4_tcp_local()
{
	local traffic_origin="local"
	local ip_proto="ip"
	local expect_ns1_ns2="I_M_PROXIED"
	local expect_ns1_ns3="PONG_NS3"
	local expect_nsrouter_ns2="I_M_PROXIED"
	local expect_nsrouter_ns3="PONG_NS3"

	test_tproxy	"$traffic_origin" \
			"$ip_proto" \
			"$expect_ns1_ns2" \
			"$expect_ns1_ns3" \
			"$expect_nsrouter_ns2" \
			"$expect_nsrouter_ns3"
}

test_ipv6_tcp_forward()
{
	local traffic_origin="forward"
	local ip_proto="ip6"
	local expect_ns1_ns2="I_M_PROXIED"
	local expect_ns1_ns3="PONG_NS3"
	local expect_nsrouter_ns2="PONG_NS2"
	local expect_nsrouter_ns3="PONG_NS3"

	test_tproxy	"$traffic_origin" \
			"$ip_proto" \
			"$expect_ns1_ns2" \
			"$expect_ns1_ns3" \
			"$expect_nsrouter_ns2" \
			"$expect_nsrouter_ns3"
}

test_ipv6_tcp_local()
{
	local traffic_origin="local"
	local ip_proto="ip6"
	local expect_ns1_ns2="I_M_PROXIED"
	local expect_ns1_ns3="PONG_NS3"
	local expect_nsrouter_ns2="I_M_PROXIED"
	local expect_nsrouter_ns3="PONG_NS3"

	test_tproxy	"$traffic_origin" \
			"$ip_proto" \
			"$expect_ns1_ns2" \
			"$expect_ns1_ns3" \
			"$expect_nsrouter_ns2" \
			"$expect_nsrouter_ns3"
}

# Capture test_ping's status explicitly: testing it directly in "if" would
# leave ret at 0, and a broken topology would then exit with success.
test_ping
ret=$?
if [ "$ret" -eq 0 ]; then
	# basic connectivity works before any tproxy rules are installed
	echo "PASS: ${ns1} can reach ${ns2}"
else
	echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2
	exit "$ret"
fi

test_ipv4_tcp_forward
test_ipv4_tcp_local
test_ipv6_tcp_forward
test_ipv6_tcp_local

exit $ret

+262

tools/testing/selftests/net/netfilter/nft_tproxy_udp.sh

··· 1 + #!/bin/bash 2 + # 3 + # This tests tproxy on the following scenario: 4 + # 5 + # +------------+ 6 + # +-------+ | nsrouter | +-------+ 7 + # |ns1 |.99 .1| |.1 .99| ns2| 8 + # | eth0|---------------|veth0 veth1|------------------|eth0 | 9 + # | | 10.0.1.0/24 | | 10.0.2.0/24 | | 10 + # +-------+ dead:1::/64 | veth2 | dead:2::/64 +-------+ 11 + # +------------+ 12 + # |.1 13 + # | 14 + # | 15 + # | +-------+ 16 + # | .99| ns3| 17 + # +------------------------|eth0 | 18 + # 10.0.3.0/24 | | 19 + # dead:3::/64 +-------+ 20 + # 21 + # The tproxy implementation acts as an echo server so the client 22 + # must receive the same message it sent if it has been proxied. 23 + # If is not proxied the servers return PONG_NS# with the number 24 + # of the namespace the server is running. 25 + # shellcheck disable=SC2162,SC2317 26 + 27 + source lib.sh 28 + ret=0 29 + # UDP is slow 30 + timeout=15 31 + 32 + cleanup() 33 + { 34 + ip netns pids "$ns1" | xargs kill 2>/dev/null 35 + ip netns pids "$ns2" | xargs kill 2>/dev/null 36 + ip netns pids "$ns3" | xargs kill 2>/dev/null 37 + ip netns pids "$nsrouter" | xargs kill 2>/dev/null 38 + 39 + cleanup_all_ns 40 + } 41 + 42 + checktool "nft --version" "test without nft tool" 43 + checktool "socat -h" "run test without socat" 44 + 45 + trap cleanup EXIT 46 + setup_ns ns1 ns2 ns3 nsrouter 47 + 48 + if ! 
ip link add veth0 netns "$nsrouter" type veth peer name eth0 netns "$ns1" > /dev/null 2>&1; then 49 + echo "SKIP: No virtual ethernet pair device support in kernel" 50 + exit $ksft_skip 51 + fi 52 + ip link add veth1 netns "$nsrouter" type veth peer name eth0 netns "$ns2" 53 + ip link add veth2 netns "$nsrouter" type veth peer name eth0 netns "$ns3" 54 + 55 + ip -net "$nsrouter" link set veth0 up 56 + ip -net "$nsrouter" addr add 10.0.1.1/24 dev veth0 57 + ip -net "$nsrouter" addr add dead:1::1/64 dev veth0 nodad 58 + 59 + ip -net "$nsrouter" link set veth1 up 60 + ip -net "$nsrouter" addr add 10.0.2.1/24 dev veth1 61 + ip -net "$nsrouter" addr add dead:2::1/64 dev veth1 nodad 62 + 63 + ip -net "$nsrouter" link set veth2 up 64 + ip -net "$nsrouter" addr add 10.0.3.1/24 dev veth2 65 + ip -net "$nsrouter" addr add dead:3::1/64 dev veth2 nodad 66 + 67 + ip -net "$ns1" link set eth0 up 68 + ip -net "$ns2" link set eth0 up 69 + ip -net "$ns3" link set eth0 up 70 + 71 + ip -net "$ns1" addr add 10.0.1.99/24 dev eth0 72 + ip -net "$ns1" addr add dead:1::99/64 dev eth0 nodad 73 + ip -net "$ns1" route add default via 10.0.1.1 74 + ip -net "$ns1" route add default via dead:1::1 75 + 76 + ip -net "$ns2" addr add 10.0.2.99/24 dev eth0 77 + ip -net "$ns2" addr add dead:2::99/64 dev eth0 nodad 78 + ip -net "$ns2" route add default via 10.0.2.1 79 + ip -net "$ns2" route add default via dead:2::1 80 + 81 + ip -net "$ns3" addr add 10.0.3.99/24 dev eth0 82 + ip -net "$ns3" addr add dead:3::99/64 dev eth0 nodad 83 + ip -net "$ns3" route add default via 10.0.3.1 84 + ip -net "$ns3" route add default via dead:3::1 85 + 86 + ip netns exec "$nsrouter" sysctl net.ipv6.conf.all.forwarding=1 > /dev/null 87 + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null 88 + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth1.forwarding=1 > /dev/null 89 + ip netns exec "$nsrouter" sysctl net.ipv4.conf.veth2.forwarding=1 > /dev/null 90 + 91 + test_ping() { 92 + if ! 
ip netns exec "$ns1" ping -c 1 -q 10.0.2.99 > /dev/null; then 93 + return 1 94 + fi 95 + 96 + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::99 > /dev/null; then 97 + return 2 98 + fi 99 + 100 + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.3.99 > /dev/null; then 101 + return 1 102 + fi 103 + 104 + if ! ip netns exec "$ns1" ping -c 1 -q dead:3::99 > /dev/null; then 105 + return 2 106 + fi 107 + 108 + return 0 109 + } 110 + 111 + test_ping_router() { 112 + if ! ip netns exec "$ns1" ping -c 1 -q 10.0.2.1 > /dev/null; then 113 + return 3 114 + fi 115 + 116 + if ! ip netns exec "$ns1" ping -c 1 -q dead:2::1 > /dev/null; then 117 + return 4 118 + fi 119 + 120 + return 0 121 + } 122 + 123 + 124 + listener_ready() 125 + { 126 + local ns="$1" 127 + local port="$2" 128 + local proto="$3" 129 + ss -N "$ns" -ln "$proto" -o "sport = :$port" | grep -q "$port" 130 + } 131 + 132 + test_tproxy_udp_forward() 133 + { 134 + local ip_proto="$1" 135 + 136 + local expect_ns1_ns2="I_M_PROXIED" 137 + local expect_ns1_ns3="PONG_NS3" 138 + local expect_nsrouter_ns2="PONG_NS2" 139 + local expect_nsrouter_ns3="PONG_NS3" 140 + 141 + # derived variables 142 + local testname="test_${ip_proto}_udp_forward" 143 + local socat_ipproto 144 + local ns1_ip 145 + local ns2_ip 146 + local ns3_ip 147 + local ns1_ip_port 148 + local ns2_ip_port 149 + local ns3_ip_port 150 + local ip_command 151 + 152 + # socat 1.8.0 has a bug that requires to specify the IP family to bind (fixed in 1.8.0.1) 153 + case $ip_proto in 154 + "ip") 155 + socat_ipproto="-4" 156 + ns1_ip=10.0.1.99 157 + ns2_ip=10.0.2.99 158 + ns3_ip=10.0.3.99 159 + ns1_ip_port="$ns1_ip:18888" 160 + ns2_ip_port="$ns2_ip:8080" 161 + ns3_ip_port="$ns3_ip:8080" 162 + ip_command="ip" 163 + ;; 164 + "ip6") 165 + socat_ipproto="-6" 166 + ns1_ip=dead:1::99 167 + ns2_ip=dead:2::99 168 + ns3_ip=dead:3::99 169 + ns1_ip_port="[$ns1_ip]:18888" 170 + ns2_ip_port="[$ns2_ip]:8080" 171 + ns3_ip_port="[$ns3_ip]:8080" 172 + ip_command="ip -6" 173 + ;; 174 + *) 175 + 
echo "FAIL: unsupported protocol" 176 + exit 255 177 + ;; 178 + esac 179 + 180 + # shellcheck disable=SC2046 # Intended splitting of ip_command 181 + ip netns exec "$nsrouter" $ip_command rule add fwmark 1 table 100 182 + ip netns exec "$nsrouter" $ip_command route add local "$ns2_ip" dev lo table 100 183 + ip netns exec "$nsrouter" nft -f /dev/stdin <<EOF 184 + flush ruleset 185 + table inet filter { 186 + chain divert { 187 + type filter hook prerouting priority 0; policy accept; 188 + $ip_proto daddr $ns2_ip udp dport 8080 tproxy $ip_proto to :12345 meta mark set 1 accept 189 + } 190 + } 191 + EOF 192 + 193 + timeout "$timeout" ip netns exec "$nsrouter" socat -u "$socat_ipproto" udp-listen:12345,fork,ip-transparent,reuseport udp:"$ns1_ip_port",ip-transparent,reuseport,bind="$ns2_ip_port" 2>/dev/null & 194 + local tproxy_pid=$! 195 + 196 + timeout "$timeout" ip netns exec "$ns2" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS2" 2>/dev/null & 197 + local server2_pid=$! 198 + 199 + timeout "$timeout" ip netns exec "$ns3" socat "$socat_ipproto" udp-listen:8080,fork SYSTEM:"echo PONG_NS3" 2>/dev/null & 200 + local server3_pid=$! 
201 + 202 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$nsrouter" 12345 "-u" 203 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns2" 8080 "-u" 204 + busywait "$BUSYWAIT_TIMEOUT" listener_ready "$ns3" 8080 "-u" 205 + 206 + local result 207 + # request from ns1 to ns2 (forwarded traffic) 208 + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port",sourceport=18888) 209 + if [ "$result" == "$expect_ns1_ns2" ] ;then 210 + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2" 211 + else 212 + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns2, not \"${expect_ns1_ns2}\" as intended" 213 + ret=1 214 + fi 215 + 216 + # request from ns1 to ns3 (forwarded traffic) 217 + result=$(echo I_M_PROXIED | ip netns exec "$ns1" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port") 218 + if [ "$result" = "$expect_ns1_ns3" ] ;then 219 + echo "PASS: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3" 220 + else 221 + echo "ERROR: tproxy test $testname: ns1 got reply \"$result\" connecting to ns3, not \"$expect_ns1_ns3\" as intended" 222 + ret=1 223 + fi 224 + 225 + # request from nsrouter to ns2 (localy originated traffic) 226 + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns2_ip_port") 227 + if [ "$result" == "$expect_nsrouter_ns2" ] ;then 228 + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2" 229 + else 230 + echo "ERROR: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns2, not \"$expect_nsrouter_ns2\" as intended" 231 + ret=1 232 + fi 233 + 234 + # request from nsrouter to ns3 (localy originated traffic) 235 + result=$(echo I_M_PROXIED | ip netns exec "$nsrouter" socat -t 2 -T 2 STDIO udp:"$ns3_ip_port") 236 + if [ "$result" = "$expect_nsrouter_ns3" ] ;then 237 + echo "PASS: tproxy test $testname: nsrouter got reply \"$result\" connecting to ns3" 238 + else 239 + echo "ERROR: tproxy test $testname: 
nsrouter got reply \"$result\" connecting to ns3, not \"$expect_nsrouter_ns3\" as intended" 240 + ret=1 241 + fi 242 + 243 + # cleanup 244 + kill "$tproxy_pid" "$server2_pid" "$server3_pid" 2>/dev/null 245 + # shellcheck disable=SC2046 # Intended splitting of ip_command 246 + ip netns exec "$nsrouter" $ip_command rule del fwmark 1 table 100 247 + ip netns exec "$nsrouter" $ip_command route flush table 100 248 + } 249 + 250 + 251 + if test_ping; then 252 + # queue bypass works (rules were skipped, no listener) 253 + echo "PASS: ${ns1} can reach ${ns2}" 254 + else 255 + echo "FAIL: ${ns1} cannot reach ${ns2}: $ret" 1>&2 256 + exit $ret 257 + fi 258 + 259 + test_tproxy_udp_forward "ip" 260 + test_tproxy_udp_forward "ip6" 261 + 262 + exit $ret

+7 -2

tools/testing/selftests/net/packetdrill/ksft_runner.sh

··· 30 30 exit "$KSFT_SKIP" 31 31 fi 32 32 33 + declare -a optargs 34 + if [[ -n "${KSFT_MACHINE_SLOW}" ]]; then 35 + optargs+=('--tolerance_usecs=14000') 36 + fi 37 + 33 38 ktap_print_header 34 39 ktap_set_plan 2 35 40 36 - unshare -n packetdrill ${ipv4_args[@]} $(basename $script) > /dev/null \ 41 + unshare -n packetdrill ${ipv4_args[@]} ${optargs[@]} $(basename $script) > /dev/null \ 37 42 && ktap_test_pass "ipv4" || ktap_test_fail "ipv4" 38 - unshare -n packetdrill ${ipv6_args[@]} $(basename $script) > /dev/null \ 43 + unshare -n packetdrill ${ipv6_args[@]} ${optargs[@]} $(basename $script) > /dev/null \ 39 44 && ktap_test_pass "ipv6" || ktap_test_fail "ipv6" 40 45 41 46 ktap_finished

Configure Feed

Configure Feed