Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'add-tso-map-once-dma-helpers-and-bnxt-sw-uso-support'

Joe Damato says:

====================
Add TSO map-once DMA helpers and bnxt SW USO support

Greetings:

This series extends net/tso to add a data structure and some helpers allowing
drivers to DMA map headers and packet payloads a single time. The helpers can
then be used to reference slices of the shared mapping for each segment. This
helps to avoid the cost of repeated DMA mappings, especially on systems which
use an IOMMU. N per-packet DMA maps are replaced with a single map for the
entire GSO skb. As of v3, the series uses the DMA IOVA API (as suggested by
Leon [1]) and provides a fallback path when an IOMMU is not in use. The DMA
IOVA API provides even better efficiency than the v2; see below.

The added helpers are then used in bnxt to add support for software UDP
Segmentation Offloading (SW USO) for older bnxt devices which do not have
support for USO in hardware. Since the helpers are generic, other drivers
can be extended similarly.

The v2 showed a ~4x reduction in DMA mapping calls at the same wire packet
rate on production traffic with a bnxt device. The v3, however, shows a larger
reduction of about 6x at the same wire packet rate. This is thanks to Leon's
suggestion of using the DMA IOVA API [1].

Special care is taken to make bnxt ethtool operations work correctly: the ring
size cannot be reduced below a minimum threshold while USO is enabled, and
growing the ring automatically re-enables USO if it was previously blocked.

This v10 contains some cosmetic changes (wrapping long lines), moves the test
to the correct directory, and attempts to fix the slot availability check
added in the v9.

I re-ran the python test and the test passed on my bnxt system. I also ran
this on a production system.
====================

Link: https://patch.msgid.link/20260408230607.2019402-1-joe@dama.to
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+967 -39
+1 -1
drivers/net/ethernet/broadcom/bnxt/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 obj-$(CONFIG_BNXT) += bnxt_en.o 3 3 4 - bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o 4 + bnxt_en-y := bnxt.o bnxt_hwrm.o bnxt_sriov.o bnxt_ethtool.o bnxt_dcb.o bnxt_ulp.o bnxt_xdp.o bnxt_ptp.o bnxt_vfr.o bnxt_devlink.o bnxt_dim.o bnxt_coredump.o bnxt_gso.o 5 5 bnxt_en-$(CONFIG_BNXT_FLOWER_OFFLOAD) += bnxt_tc.o 6 6 bnxt_en-$(CONFIG_DEBUG_FS) += bnxt_debugfs.o 7 7 bnxt_en-$(CONFIG_BNXT_HWMON) += bnxt_hwmon.o
+147 -36
drivers/net/ethernet/broadcom/bnxt/bnxt.c
··· 74 74 #include "bnxt_debugfs.h" 75 75 #include "bnxt_coredump.h" 76 76 #include "bnxt_hwmon.h" 77 + #include "bnxt_gso.h" 78 + #include <net/tso.h> 77 79 78 80 #define BNXT_TX_TIMEOUT (5 * HZ) 79 81 #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \ ··· 449 447 TX_BD_FLAGS_LHINT_2048_AND_LARGER, 450 448 }; 451 449 452 - static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) 450 + u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) 453 451 { 454 452 struct metadata_dst *md_dst = skb_metadata_dst(skb); 455 453 ··· 508 506 } 509 507 } 510 508 #endif 509 + if (skb_is_gso(skb) && 510 + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) && 511 + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) 512 + return bnxt_sw_udp_gso_xmit(bp, txr, txq, skb); 513 + 511 514 free_size = bnxt_tx_avail(bp, txr); 512 515 if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { 513 516 /* We must have raced with NAPI cleanup */ ··· 663 656 goto tx_free; 664 657 665 658 dma_unmap_addr_set(tx_buf, mapping, mapping); 659 + dma_unmap_len_set(tx_buf, len, len); 666 660 flags = (len << TX_BD_LEN_SHIFT) | TX_BD_TYPE_LONG_TX_BD | 667 661 TX_BD_CNT(last_frag + 2); 668 662 ··· 671 663 txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 2 + last_frag); 672 664 673 665 prod = NEXT_TX(prod); 674 - txbd1 = (struct tx_bd_ext *) 675 - &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; 666 + txbd1 = bnxt_init_ext_bd(bp, txr, prod, lflags, vlan_tag_flags, 667 + cfa_action); 676 668 677 - txbd1->tx_bd_hsize_lflags = lflags; 678 669 if (skb_is_gso(skb)) { 679 670 bool udp_gso = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4); 680 671 u32 hdr_len; ··· 700 693 } else if (skb->ip_summed == CHECKSUM_PARTIAL) { 701 694 txbd1->tx_bd_hsize_lflags |= 702 695 cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); 703 - txbd1->tx_bd_mss = 0; 704 696 } 705 697 706 698 length >>= 9; ··· 712 706 flags |= bnxt_lhint_arr[length]; 713 707 txbd->tx_bd_len_flags_type = cpu_to_le32(flags); 714 708 715 - txbd1->tx_bd_cfa_meta = 
cpu_to_le32(vlan_tag_flags); 716 - txbd1->tx_bd_cfa_action = 717 - cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); 718 709 txbd0 = txbd; 719 710 for (i = 0; i < last_frag; i++) { 720 711 frag = &skb_shinfo(skb)->frags[i]; ··· 728 725 tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; 729 726 netmem_dma_unmap_addr_set(skb_frag_netmem(frag), tx_buf, 730 727 mapping, mapping); 728 + dma_unmap_len_set(tx_buf, len, len); 731 729 732 730 txbd->tx_bd_haddr = cpu_to_le64(mapping); 733 731 ··· 818 814 u16 hw_cons = txr->tx_hw_cons; 819 815 unsigned int tx_bytes = 0; 820 816 u16 cons = txr->tx_cons; 821 - skb_frag_t *frag; 817 + unsigned int dma_len; 818 + dma_addr_t dma_addr; 822 819 int tx_pkts = 0; 823 820 bool rc = false; 824 821 825 822 while (RING_TX(bp, cons) != hw_cons) { 826 - struct bnxt_sw_tx_bd *tx_buf; 823 + struct bnxt_sw_tx_bd *tx_buf, *head_buf; 827 824 struct sk_buff *skb; 828 825 bool is_ts_pkt; 829 826 int j, last; 830 827 831 828 tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; 829 + head_buf = tx_buf; 832 830 skb = tx_buf->skb; 833 831 834 832 if (unlikely(!skb)) { ··· 855 849 goto next_tx_int; 856 850 } 857 851 858 - dma_unmap_single(&pdev->dev, dma_unmap_addr(tx_buf, mapping), 859 - skb_headlen(skb), DMA_TO_DEVICE); 852 + if (dma_unmap_len(tx_buf, len)) { 853 + dma_addr = dma_unmap_addr(tx_buf, mapping); 854 + dma_len = dma_unmap_len(tx_buf, len); 855 + 856 + dma_unmap_single(&pdev->dev, dma_addr, dma_len, 857 + DMA_TO_DEVICE); 858 + } 859 + 860 860 last = tx_buf->nr_frags; 861 861 862 862 for (j = 0; j < last; j++) { 863 - frag = &skb_shinfo(skb)->frags[j]; 864 863 cons = NEXT_TX(cons); 865 864 tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)]; 866 - netmem_dma_unmap_page_attrs(&pdev->dev, 867 - dma_unmap_addr(tx_buf, 868 - mapping), 869 - skb_frag_size(frag), 870 - DMA_TO_DEVICE, 0); 865 + if (dma_unmap_len(tx_buf, len)) { 866 + dma_addr = dma_unmap_addr(tx_buf, mapping); 867 + dma_len = dma_unmap_len(tx_buf, len); 868 + 869 + 
netmem_dma_unmap_page_attrs(&pdev->dev, 870 + dma_addr, dma_len, 871 + DMA_TO_DEVICE, 0); 872 + } 871 873 } 874 + 875 + if (unlikely(head_buf->is_sw_gso)) { 876 + u16 inline_cons = txr->tx_inline_cons + 1; 877 + 878 + WRITE_ONCE(txr->tx_inline_cons, inline_cons); 879 + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { 880 + tso_dma_map_complete(&pdev->dev, 881 + &head_buf->sw_gso_cstate); 882 + } else { 883 + tx_pkts--; 884 + tx_bytes -= skb->len; 885 + skb = NULL; 886 + } 887 + head_buf->is_sw_gso = 0; 888 + } 889 + 872 890 if (unlikely(is_ts_pkt)) { 873 891 if (BNXT_CHIP_P5(bp)) { 874 892 /* PTP worker takes ownership of the skb */ ··· 3429 3399 { 3430 3400 int i, max_idx; 3431 3401 struct pci_dev *pdev = bp->pdev; 3402 + unsigned int dma_len; 3403 + dma_addr_t dma_addr; 3432 3404 3433 3405 max_idx = bp->tx_nr_pages * TX_DESC_CNT; 3434 3406 3435 3407 for (i = 0; i < max_idx;) { 3436 3408 struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i]; 3409 + struct bnxt_sw_tx_bd *head_buf = tx_buf; 3437 3410 struct sk_buff *skb; 3438 3411 int j, last; 3439 3412 3440 3413 if (idx < bp->tx_nr_rings_xdp && 3441 3414 tx_buf->action == XDP_REDIRECT) { 3442 - dma_unmap_single(&pdev->dev, 3443 - dma_unmap_addr(tx_buf, mapping), 3444 - dma_unmap_len(tx_buf, len), 3415 + dma_addr = dma_unmap_addr(tx_buf, mapping); 3416 + dma_len = dma_unmap_len(tx_buf, len); 3417 + 3418 + dma_unmap_single(&pdev->dev, dma_addr, dma_len, 3445 3419 DMA_TO_DEVICE); 3446 3420 xdp_return_frame(tx_buf->xdpf); 3447 3421 tx_buf->action = 0; ··· 3468 3434 continue; 3469 3435 } 3470 3436 3471 - dma_unmap_single(&pdev->dev, 3472 - dma_unmap_addr(tx_buf, mapping), 3473 - skb_headlen(skb), 3474 - DMA_TO_DEVICE); 3437 + if (dma_unmap_len(tx_buf, len)) { 3438 + dma_addr = dma_unmap_addr(tx_buf, mapping); 3439 + dma_len = dma_unmap_len(tx_buf, len); 3440 + 3441 + dma_unmap_single(&pdev->dev, dma_addr, dma_len, 3442 + DMA_TO_DEVICE); 3443 + } 3475 3444 3476 3445 last = tx_buf->nr_frags; 3477 3446 i += 2; 3478 3447 for (j 
= 0; j < last; j++, i++) { 3479 3448 int ring_idx = i & bp->tx_ring_mask; 3480 - skb_frag_t *frag = &skb_shinfo(skb)->frags[j]; 3481 3449 3482 3450 tx_buf = &txr->tx_buf_ring[ring_idx]; 3483 - netmem_dma_unmap_page_attrs(&pdev->dev, 3484 - dma_unmap_addr(tx_buf, 3485 - mapping), 3486 - skb_frag_size(frag), 3487 - DMA_TO_DEVICE, 0); 3451 + if (dma_unmap_len(tx_buf, len)) { 3452 + dma_addr = dma_unmap_addr(tx_buf, mapping); 3453 + dma_len = dma_unmap_len(tx_buf, len); 3454 + 3455 + netmem_dma_unmap_page_attrs(&pdev->dev, 3456 + dma_addr, dma_len, 3457 + DMA_TO_DEVICE, 0); 3458 + } 3488 3459 } 3489 - dev_kfree_skb(skb); 3460 + if (head_buf->is_sw_gso) { 3461 + u16 inline_cons = txr->tx_inline_cons + 1; 3462 + 3463 + WRITE_ONCE(txr->tx_inline_cons, inline_cons); 3464 + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) { 3465 + tso_dma_map_complete(&pdev->dev, 3466 + &head_buf->sw_gso_cstate); 3467 + } else { 3468 + skb = NULL; 3469 + } 3470 + head_buf->is_sw_gso = 0; 3471 + } 3472 + if (skb) 3473 + dev_kfree_skb(skb); 3490 3474 } 3491 3475 netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx)); 3492 3476 } ··· 4017 3965 return rc; 4018 3966 } 4019 3967 3968 + static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr, 3969 + struct pci_dev *pdev) 3970 + { 3971 + if (!txr->tx_inline_buf) 3972 + return; 3973 + 3974 + dma_unmap_single(&pdev->dev, txr->tx_inline_dma, 3975 + txr->tx_inline_size, DMA_TO_DEVICE); 3976 + kfree(txr->tx_inline_buf); 3977 + txr->tx_inline_buf = NULL; 3978 + txr->tx_inline_size = 0; 3979 + } 3980 + 3981 + static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr, 3982 + struct pci_dev *pdev, 3983 + unsigned int size) 3984 + { 3985 + txr->tx_inline_buf = kmalloc(size, GFP_KERNEL); 3986 + if (!txr->tx_inline_buf) 3987 + return -ENOMEM; 3988 + 3989 + txr->tx_inline_dma = dma_map_single(&pdev->dev, txr->tx_inline_buf, 3990 + size, DMA_TO_DEVICE); 3991 + if (dma_mapping_error(&pdev->dev, txr->tx_inline_dma)) { 3992 + 
kfree(txr->tx_inline_buf); 3993 + txr->tx_inline_buf = NULL; 3994 + return -ENOMEM; 3995 + } 3996 + txr->tx_inline_size = size; 3997 + 3998 + return 0; 3999 + } 4000 + 4020 4001 static void bnxt_free_tx_rings(struct bnxt *bp) 4021 4002 { 4022 4003 int i; ··· 4067 3982 txr->tx_push, txr->tx_push_mapping); 4068 3983 txr->tx_push = NULL; 4069 3984 } 3985 + 3986 + bnxt_free_tx_inline_buf(txr, pdev); 4070 3987 4071 3988 ring = &txr->tx_ring_struct; 4072 3989 ··· 4134 4047 mapping = txr->tx_push_mapping + 4135 4048 sizeof(struct tx_push_bd); 4136 4049 txr->data_mapping = cpu_to_le64(mapping); 4050 + } 4051 + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { 4052 + rc = bnxt_alloc_tx_inline_buf(txr, pdev, 4053 + BNXT_SW_USO_MAX_SEGS * 4054 + TSO_HEADER_SIZE); 4055 + if (rc) 4056 + return rc; 4137 4057 } 4138 4058 qidx = bp->tc_to_qidx[j]; 4139 4059 ring->queue_id = bp->q_info[qidx].queue_id; ··· 4680 4586 4681 4587 static int bnxt_init_tx_rings(struct bnxt *bp) 4682 4588 { 4589 + netdev_features_t features; 4683 4590 u16 i; 4684 4591 4592 + features = bp->dev->features; 4593 + 4685 4594 bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, 4686 - BNXT_MIN_TX_DESC_CNT); 4595 + bnxt_min_tx_desc_cnt(bp, features)); 4687 4596 4688 4597 for (i = 0; i < bp->tx_nr_rings; i++) { 4689 4598 struct bnxt_tx_ring_info *txr = &bp->tx_ring[i]; ··· 13885 13788 if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false)) 13886 13789 features &= ~NETIF_F_NTUPLE; 13887 13790 13791 + if ((features & NETIF_F_GSO_UDP_L4) && 13792 + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && 13793 + bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS) 13794 + features &= ~NETIF_F_GSO_UDP_L4; 13795 + 13888 13796 if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog) 13889 13797 features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW); 13890 13798 ··· 13934 13832 u32 changes; 13935 13833 int rc = 0; 13936 13834 bool re_init = false; 13835 + 13836 + bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2, 13837 + bnxt_min_tx_desc_cnt(bp, 
features)); 13937 13838 13938 13839 flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS; 13939 13840 if (features & NETIF_F_GRO_HW) ··· 16963 16858 NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM | 16964 16859 NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH | 16965 16860 NETIF_F_RXCSUM | NETIF_F_GRO; 16966 - if (bp->flags & BNXT_FLAG_UDP_GSO_CAP) 16967 - dev->hw_features |= NETIF_F_GSO_UDP_L4; 16861 + dev->hw_features |= NETIF_F_GSO_UDP_L4; 16968 16862 16969 16863 if (BNXT_SUPPORTS_TPA(bp)) 16970 16864 dev->hw_features |= NETIF_F_LRO; ··· 16996 16892 dev->priv_flags |= IFF_UNICAST_FLT; 16997 16893 16998 16894 netif_set_tso_max_size(dev, GSO_MAX_SIZE); 16999 - if (bp->tso_max_segs) 16895 + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) { 16896 + u16 max_segs = BNXT_SW_USO_MAX_SEGS; 16897 + 16898 + if (bp->tso_max_segs) 16899 + max_segs = min_t(u16, max_segs, bp->tso_max_segs); 16900 + netif_set_tso_max_segs(dev, max_segs); 16901 + } else if (bp->tso_max_segs) { 17000 16902 netif_set_tso_max_segs(dev, bp->tso_max_segs); 16903 + } 17001 16904 17002 16905 dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | 17003 16906 NETDEV_XDP_ACT_RX_SG;
+32
drivers/net/ethernet/broadcom/bnxt/bnxt.h
··· 11 11 #ifndef BNXT_H 12 12 #define BNXT_H 13 13 14 + #include <net/tso.h> 15 + 14 16 #define DRV_MODULE_NAME "bnxt_en" 15 17 16 18 /* DO NOT CHANGE DRV_VER_* defines ··· 894 892 struct page *page; 895 893 u8 is_ts_pkt; 896 894 u8 is_push; 895 + u8 is_sw_gso; 897 896 u8 action; 898 897 unsigned short nr_frags; 899 898 union { 900 899 u16 rx_prod; 901 900 u16 txts_prod; 902 901 }; 902 + struct tso_dma_map_completion_state sw_gso_cstate; 903 903 }; 904 + 905 + #define BNXT_SW_GSO_MID 1 906 + #define BNXT_SW_GSO_LAST 2 904 907 905 908 struct bnxt_sw_rx_bd { 906 909 void *data; ··· 1002 995 struct tx_push_buffer *tx_push; 1003 996 dma_addr_t tx_push_mapping; 1004 997 __le64 data_mapping; 998 + 999 + void *tx_inline_buf; 1000 + dma_addr_t tx_inline_dma; 1001 + unsigned int tx_inline_size; 1002 + u16 tx_inline_prod; 1003 + u16 tx_inline_cons; 1005 1004 1006 1005 #define BNXT_DEV_STATE_CLOSING 0x1 1007 1006 u32 dev_state; ··· 2849 2836 return bp->tx_ring_size - (used & bp->tx_ring_mask); 2850 2837 } 2851 2838 2839 + static inline struct tx_bd_ext * 2840 + bnxt_init_ext_bd(struct bnxt *bp, struct bnxt_tx_ring_info *txr, 2841 + u16 prod, __le32 lflags, u32 vlan_tag_flags, 2842 + u32 cfa_action) 2843 + { 2844 + struct tx_bd_ext *txbd1; 2845 + 2846 + txbd1 = (struct tx_bd_ext *) 2847 + &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; 2848 + txbd1->tx_bd_hsize_lflags = lflags; 2849 + txbd1->tx_bd_mss = 0; 2850 + txbd1->tx_bd_cfa_meta = cpu_to_le32(vlan_tag_flags); 2851 + txbd1->tx_bd_cfa_action = 2852 + cpu_to_le32(cfa_action << TX_BD_CFA_ACTION_SHIFT); 2853 + 2854 + return txbd1; 2855 + } 2856 + 2852 2857 static inline void bnxt_writeq(struct bnxt *bp, u64 val, 2853 2858 volatile void __iomem *addr) 2854 2859 { ··· 3000 2969 int bnxt_reserve_rings(struct bnxt *bp, bool irq_re_init); 3001 2970 void bnxt_tx_disable(struct bnxt *bp); 3002 2971 void bnxt_tx_enable(struct bnxt *bp); 2972 + u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb); 3003 2973 void 
bnxt_sched_reset_txr(struct bnxt *bp, struct bnxt_tx_ring_info *txr, 3004 2974 u16 curr); 3005 2975 void bnxt_report_link(struct bnxt *bp);
+17 -2
drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
··· 33 33 #include "bnxt_xdp.h" 34 34 #include "bnxt_ptp.h" 35 35 #include "bnxt_ethtool.h" 36 + #include "bnxt_gso.h" 36 37 #include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */ 37 38 #include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */ 38 39 #include "bnxt_coredump.h" ··· 853 852 u8 tcp_data_split = kernel_ering->tcp_data_split; 854 853 struct bnxt *bp = netdev_priv(dev); 855 854 u8 hds_config_mod; 855 + int rc; 856 856 857 857 if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) || 858 858 (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) || 859 859 (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) 860 + return -EINVAL; 861 + 862 + if ((dev->features & NETIF_F_GSO_UDP_L4) && 863 + !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && 864 + ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS) 860 865 return -EINVAL; 861 866 862 867 hds_config_mod = tcp_data_split != dev->cfg->hds_config; ··· 889 882 bp->tx_ring_size = ering->tx_pending; 890 883 bnxt_set_ring_params(bp); 891 884 892 - if (netif_running(dev)) 893 - return bnxt_open_nic(bp, false, false); 885 + if (netif_running(dev)) { 886 + rc = bnxt_open_nic(bp, false, false); 887 + if (rc) 888 + return rc; 889 + } 894 890 891 + /* ring size changes may affect features (SW USO requires a minimum 892 + * ring size), so recalculate features to ensure the correct features 893 + * are blocked/available. 894 + */ 895 + netdev_update_features(dev); 895 896 return 0; 896 897 } 897 898
+240
drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* Broadcom NetXtreme-C/E network driver. 3 + * 4 + * This program is free software; you can redistribute it and/or modify 5 + * it under the terms of the GNU General Public License as published by 6 + * the Free Software Foundation. 7 + */ 8 + 9 + #include <linux/pci.h> 10 + #include <linux/netdevice.h> 11 + #include <linux/skbuff.h> 12 + #include <net/netdev_queues.h> 13 + #include <net/ip.h> 14 + #include <net/ipv6.h> 15 + #include <net/udp.h> 16 + #include <net/tso.h> 17 + #include <linux/bnxt/hsi.h> 18 + 19 + #include "bnxt.h" 20 + #include "bnxt_gso.h" 21 + 22 + static u32 bnxt_sw_gso_lhint(unsigned int len) 23 + { 24 + if (len <= 512) 25 + return TX_BD_FLAGS_LHINT_512_AND_SMALLER; 26 + else if (len <= 1023) 27 + return TX_BD_FLAGS_LHINT_512_TO_1023; 28 + else if (len <= 2047) 29 + return TX_BD_FLAGS_LHINT_1024_TO_2047; 30 + else 31 + return TX_BD_FLAGS_LHINT_2048_AND_LARGER; 32 + } 33 + 34 + netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, 35 + struct bnxt_tx_ring_info *txr, 36 + struct netdev_queue *txq, 37 + struct sk_buff *skb) 38 + { 39 + unsigned int last_unmap_len __maybe_unused = 0; 40 + dma_addr_t last_unmap_addr __maybe_unused = 0; 41 + struct bnxt_sw_tx_bd *last_unmap_buf = NULL; 42 + unsigned int hdr_len, mss, num_segs; 43 + struct pci_dev *pdev = bp->pdev; 44 + unsigned int total_payload; 45 + struct tso_dma_map map; 46 + u32 vlan_tag_flags = 0; 47 + int i, bds_needed; 48 + struct tso_t tso; 49 + u16 cfa_action; 50 + __le32 csum; 51 + u16 prod; 52 + 53 + hdr_len = tso_start(skb, &tso); 54 + mss = skb_shinfo(skb)->gso_size; 55 + total_payload = skb->len - hdr_len; 56 + num_segs = DIV_ROUND_UP(total_payload, mss); 57 + 58 + if (unlikely(num_segs <= 1)) 59 + goto drop; 60 + 61 + /* Upper bound on the number of descriptors needed. 
62 + * 63 + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is 64 + * at most num_segs + nr_frags (each frag boundary crossing adds at 65 + * most 1 extra BD). 66 + */ 67 + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1; 68 + 69 + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) { 70 + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), 71 + bp->tx_wake_thresh); 72 + return NETDEV_TX_BUSY; 73 + } 74 + 75 + /* BD backpressure alone cannot prevent overwriting in-flight 76 + * headers in the inline buffer. Check slot availability directly. 77 + */ 78 + if (!netif_txq_maybe_stop(txq, bnxt_inline_avail(txr), 79 + num_segs, num_segs)) 80 + return NETDEV_TX_BUSY; 81 + 82 + if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len))) 83 + goto drop; 84 + 85 + cfa_action = bnxt_xmit_get_cfa_action(skb); 86 + if (skb_vlan_tag_present(skb)) { 87 + vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN | 88 + skb_vlan_tag_get(skb); 89 + if (skb->vlan_proto == htons(ETH_P_8021Q)) 90 + vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT; 91 + } 92 + 93 + csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM); 94 + if (!tso.ipv6) 95 + csum |= cpu_to_le32(TX_BD_FLAGS_IP_CKSUM); 96 + 97 + prod = txr->tx_prod; 98 + 99 + for (i = 0; i < num_segs; i++) { 100 + unsigned int seg_payload = min_t(unsigned int, mss, 101 + total_payload - i * mss); 102 + u16 slot = (txr->tx_inline_prod + i) & 103 + (BNXT_SW_USO_MAX_SEGS - 1); 104 + struct bnxt_sw_tx_bd *tx_buf; 105 + unsigned int mapping_len; 106 + dma_addr_t this_hdr_dma; 107 + unsigned int chunk_len; 108 + unsigned int offset; 109 + dma_addr_t dma_addr; 110 + struct tx_bd *txbd; 111 + struct udphdr *uh; 112 + void *this_hdr; 113 + int bd_count; 114 + bool last; 115 + u32 flags; 116 + 117 + last = (i == num_segs - 1); 118 + offset = slot * TSO_HEADER_SIZE; 119 + this_hdr = txr->tx_inline_buf + offset; 120 + this_hdr_dma = txr->tx_inline_dma + offset; 121 + 122 + tso_build_hdr(skb, this_hdr, &tso, seg_payload, last); 123 + 124 + /* Zero 
stale csum fields copied from the original skb; 125 + * HW offload recomputes from scratch. 126 + */ 127 + uh = this_hdr + skb_transport_offset(skb); 128 + uh->check = 0; 129 + if (!tso.ipv6) { 130 + struct iphdr *iph = this_hdr + skb_network_offset(skb); 131 + 132 + iph->check = 0; 133 + } 134 + 135 + dma_sync_single_for_device(&pdev->dev, this_hdr_dma, 136 + hdr_len, DMA_TO_DEVICE); 137 + 138 + bd_count = tso_dma_map_count(&map, seg_payload); 139 + 140 + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; 141 + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; 142 + 143 + tx_buf->skb = skb; 144 + tx_buf->nr_frags = bd_count; 145 + tx_buf->is_push = 0; 146 + tx_buf->is_ts_pkt = 0; 147 + 148 + dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma); 149 + dma_unmap_len_set(tx_buf, len, 0); 150 + 151 + if (last) { 152 + tx_buf->is_sw_gso = BNXT_SW_GSO_LAST; 153 + tso_dma_map_completion_save(&map, &tx_buf->sw_gso_cstate); 154 + } else { 155 + tx_buf->is_sw_gso = BNXT_SW_GSO_MID; 156 + } 157 + 158 + flags = (hdr_len << TX_BD_LEN_SHIFT) | 159 + TX_BD_TYPE_LONG_TX_BD | 160 + TX_BD_CNT(2 + bd_count); 161 + 162 + flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload); 163 + 164 + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); 165 + txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma); 166 + txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod, 167 + 2 + bd_count); 168 + 169 + prod = NEXT_TX(prod); 170 + bnxt_init_ext_bd(bp, txr, prod, csum, 171 + vlan_tag_flags, cfa_action); 172 + 173 + /* set dma_unmap_len on the LAST BD touching each 174 + * region. Since completions are in-order, the last segment 175 + * completes after all earlier ones, so the unmap is safe. 
176 + */ 177 + while (tso_dma_map_next(&map, &dma_addr, &chunk_len, 178 + &mapping_len, seg_payload)) { 179 + prod = NEXT_TX(prod); 180 + txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)]; 181 + tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)]; 182 + 183 + txbd->tx_bd_haddr = cpu_to_le64(dma_addr); 184 + dma_unmap_addr_set(tx_buf, mapping, dma_addr); 185 + dma_unmap_len_set(tx_buf, len, 0); 186 + tx_buf->skb = NULL; 187 + tx_buf->is_sw_gso = 0; 188 + 189 + if (mapping_len) { 190 + if (last_unmap_buf) { 191 + dma_unmap_addr_set(last_unmap_buf, 192 + mapping, 193 + last_unmap_addr); 194 + dma_unmap_len_set(last_unmap_buf, 195 + len, 196 + last_unmap_len); 197 + } 198 + last_unmap_addr = dma_addr; 199 + last_unmap_len = mapping_len; 200 + } 201 + last_unmap_buf = tx_buf; 202 + 203 + flags = chunk_len << TX_BD_LEN_SHIFT; 204 + txbd->tx_bd_len_flags_type = cpu_to_le32(flags); 205 + txbd->tx_bd_opaque = 0; 206 + 207 + seg_payload -= chunk_len; 208 + } 209 + 210 + txbd->tx_bd_len_flags_type |= 211 + cpu_to_le32(TX_BD_FLAGS_PACKET_END); 212 + 213 + prod = NEXT_TX(prod); 214 + } 215 + 216 + if (last_unmap_buf) { 217 + dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr); 218 + dma_unmap_len_set(last_unmap_buf, len, last_unmap_len); 219 + } 220 + 221 + txr->tx_inline_prod += num_segs; 222 + 223 + netdev_tx_sent_queue(txq, skb->len); 224 + 225 + WRITE_ONCE(txr->tx_prod, prod); 226 + /* Sync BDs before doorbell */ 227 + wmb(); 228 + bnxt_db_write(bp, &txr->tx_db, prod); 229 + 230 + if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh)) 231 + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr), 232 + bp->tx_wake_thresh); 233 + 234 + return NETDEV_TX_OK; 235 + 236 + drop: 237 + dev_kfree_skb_any(skb); 238 + dev_core_stats_tx_dropped_inc(bp->dev); 239 + return NETDEV_TX_OK; 240 + }
+46
drivers/net/ethernet/broadcom/bnxt/bnxt_gso.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Broadcom NetXtreme-C/E network driver. 4 + * 5 + * This program is free software; you can redistribute it and/or modify 6 + * it under the terms of the GNU General Public License as published by 7 + * the Free Software Foundation. 8 + */ 9 + 10 + #ifndef BNXT_GSO_H 11 + #define BNXT_GSO_H 12 + 13 + /* Maximum segments the stack may send in a single SW USO skb. 14 + * This caps gso_max_segs for NICs without HW USO support. 15 + */ 16 + #define BNXT_SW_USO_MAX_SEGS 64 17 + 18 + /* Worst-case TX descriptors consumed by one SW USO packet: 19 + * Each segment: 1 long BD + 1 ext BD + payload BDs. 20 + * Total payload BDs across all segs <= num_segs + nr_frags (each frag 21 + * boundary crossing adds at most 1 extra BD). 22 + * So: 3 * max_segs + MAX_SKB_FRAGS + 1 = 3 * 64 + 17 + 1 = 210. 23 + */ 24 + #define BNXT_SW_USO_MAX_DESCS (3 * BNXT_SW_USO_MAX_SEGS + MAX_SKB_FRAGS + 1) 25 + 26 + static inline u16 bnxt_inline_avail(struct bnxt_tx_ring_info *txr) 27 + { 28 + return BNXT_SW_USO_MAX_SEGS - 29 + (u16)(txr->tx_inline_prod - READ_ONCE(txr->tx_inline_cons)); 30 + } 31 + 32 + static inline int bnxt_min_tx_desc_cnt(struct bnxt *bp, 33 + netdev_features_t features) 34 + { 35 + if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) && 36 + (features & NETIF_F_GSO_UDP_L4)) 37 + return BNXT_SW_USO_MAX_DESCS; 38 + return BNXT_MIN_TX_DESC_CNT; 39 + } 40 + 41 + netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp, 42 + struct bnxt_tx_ring_info *txr, 43 + struct netdev_queue *txq, 44 + struct sk_buff *skb); 45 + 46 + #endif
+11
include/linux/skbuff.h
··· 3764 3764 } 3765 3765 3766 3766 /** 3767 + * skb_frag_phys - gets the physical address of the data in a paged fragment 3768 + * @frag: the paged fragment buffer 3769 + * 3770 + * Returns: the physical address of the data within @frag. 3771 + */ 3772 + static inline phys_addr_t skb_frag_phys(const skb_frag_t *frag) 3773 + { 3774 + return page_to_phys(skb_frag_page(frag)) + skb_frag_off(frag); 3775 + } 3776 + 3777 + /** 3767 3778 * skb_frag_page_copy() - sets the page in a fragment from another fragment 3768 3779 * @fragto: skb fragment where page is set 3769 3780 * @fragfrom: skb fragment page is copied from
+100
include/net/tso.h
··· 3 3 #define _TSO_H 4 4 5 5 #include <linux/skbuff.h> 6 + #include <linux/dma-mapping.h> 6 7 #include <net/ip.h> 7 8 8 9 #define TSO_HEADER_SIZE 256 ··· 28 27 int size, bool is_last); 29 28 void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); 30 29 int tso_start(struct sk_buff *skb, struct tso_t *tso); 30 + 31 + /** 32 + * struct tso_dma_map - DMA mapping state for GSO payload 33 + * @dev: device used for DMA mapping 34 + * @skb: the GSO skb being mapped 35 + * @hdr_len: per-segment header length 36 + * @iova_state: DMA IOVA state (when IOMMU available) 37 + * @iova_offset: global byte offset into IOVA range (IOVA path only) 38 + * @total_len: total payload length 39 + * @frag_idx: current region (-1 = linear, 0..nr_frags-1 = frag) 40 + * @offset: byte offset within current region 41 + * @linear_dma: DMA address of the linear payload 42 + * @linear_len: length of the linear payload 43 + * @nr_frags: number of frags successfully DMA-mapped 44 + * @frags: per-frag DMA address and length 45 + * 46 + * DMA-maps the payload regions of a GSO skb (linear data + frags). 47 + * Prefers the DMA IOVA API for a single contiguous mapping with one 48 + * IOTLB sync; falls back to per-region dma_map_phys() otherwise. 
49 + */ 50 + struct tso_dma_map { 51 + struct device *dev; 52 + const struct sk_buff *skb; 53 + unsigned int hdr_len; 54 + /* IOVA path */ 55 + struct dma_iova_state iova_state; 56 + size_t iova_offset; 57 + size_t total_len; 58 + /* Fallback path if IOVA path fails */ 59 + int frag_idx; 60 + unsigned int offset; 61 + dma_addr_t linear_dma; 62 + unsigned int linear_len; 63 + unsigned int nr_frags; 64 + struct { 65 + dma_addr_t dma; 66 + unsigned int len; 67 + } frags[MAX_SKB_FRAGS]; 68 + }; 69 + 70 + /** 71 + * struct tso_dma_map_completion_state - Completion-time cleanup state 72 + * @iova_state: DMA IOVA state (when IOMMU available) 73 + * @total_len: total payload length of the IOVA mapping 74 + * 75 + * Drivers store this on their SW ring at xmit time via 76 + * tso_dma_map_completion_save(), then call tso_dma_map_complete() at 77 + * completion time. 78 + */ 79 + struct tso_dma_map_completion_state { 80 + struct dma_iova_state iova_state; 81 + size_t total_len; 82 + }; 83 + 84 + int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, 85 + const struct sk_buff *skb, unsigned int hdr_len); 86 + void tso_dma_map_cleanup(struct tso_dma_map *map); 87 + unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len); 88 + bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, 89 + unsigned int *chunk_len, unsigned int *mapping_len, 90 + unsigned int seg_remaining); 91 + 92 + /** 93 + * tso_dma_map_completion_save - save state needed for completion-time cleanup 94 + * @map: the xmit-time DMA map 95 + * @cstate: driver-owned storage that persists until completion 96 + * 97 + * Should be called at xmit time to update the completion state and later passed 98 + * to tso_dma_map_complete(). 
99 + */ 100 + static inline void 101 + tso_dma_map_completion_save(const struct tso_dma_map *map, 102 + struct tso_dma_map_completion_state *cstate) 103 + { 104 + cstate->iova_state = map->iova_state; 105 + cstate->total_len = map->total_len; 106 + } 107 + 108 + /** 109 + * tso_dma_map_complete - tear down mapping at completion time 110 + * @dev: the device that owns the mapping 111 + * @cstate: state saved by tso_dma_map_completion_save() 112 + * 113 + * Return: true if the IOVA path was used and the mapping has been 114 + * destroyed; false if the fallback per-region path was used and the 115 + * driver must unmap via its normal completion path. 116 + */ 117 + static inline bool 118 + tso_dma_map_complete(struct device *dev, 119 + struct tso_dma_map_completion_state *cstate) 120 + { 121 + if (dma_use_iova(&cstate->iova_state)) { 122 + dma_iova_destroy(dev, &cstate->iova_state, cstate->total_len, 123 + DMA_TO_DEVICE, 0); 124 + return true; 125 + } 126 + 127 + return false; 128 + } 31 129 32 130 #endif /* _TSO_H */
+269
net/core/tso.c
··· 3 3 #include <linux/if_vlan.h> 4 4 #include <net/ip.h> 5 5 #include <net/tso.h> 6 + #include <linux/dma-mapping.h> 6 7 #include <linux/unaligned.h> 7 8 8 9 void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, ··· 88 87 return hdr_len; 89 88 } 90 89 EXPORT_SYMBOL(tso_start); 90 + 91 + static int tso_dma_iova_try(struct device *dev, struct tso_dma_map *map, 92 + phys_addr_t phys, size_t linear_len, 93 + size_t total_len, size_t *offset) 94 + { 95 + const struct sk_buff *skb; 96 + unsigned int nr_frags; 97 + int i; 98 + 99 + if (!dma_iova_try_alloc(dev, &map->iova_state, phys, total_len)) 100 + return 1; 101 + 102 + skb = map->skb; 103 + nr_frags = skb_shinfo(skb)->nr_frags; 104 + 105 + if (linear_len) { 106 + if (dma_iova_link(dev, &map->iova_state, 107 + phys, *offset, linear_len, 108 + DMA_TO_DEVICE, 0)) 109 + goto iova_fail; 110 + map->linear_len = linear_len; 111 + *offset += linear_len; 112 + } 113 + 114 + for (i = 0; i < nr_frags; i++) { 115 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 116 + unsigned int frag_len = skb_frag_size(frag); 117 + 118 + if (dma_iova_link(dev, &map->iova_state, 119 + skb_frag_phys(frag), *offset, 120 + frag_len, DMA_TO_DEVICE, 0)) { 121 + map->nr_frags = i; 122 + goto iova_fail; 123 + } 124 + map->frags[i].len = frag_len; 125 + *offset += frag_len; 126 + map->nr_frags = i + 1; 127 + } 128 + 129 + if (dma_iova_sync(dev, &map->iova_state, 0, total_len)) 130 + goto iova_fail; 131 + 132 + return 0; 133 + 134 + iova_fail: 135 + dma_iova_destroy(dev, &map->iova_state, *offset, 136 + DMA_TO_DEVICE, 0); 137 + memset(&map->iova_state, 0, sizeof(map->iova_state)); 138 + 139 + /* reset map state */ 140 + map->frag_idx = -1; 141 + map->offset = 0; 142 + map->linear_len = 0; 143 + map->nr_frags = 0; 144 + 145 + return 1; 146 + } 147 + 148 + /** 149 + * tso_dma_map_init - DMA-map GSO payload regions 150 + * @map: map struct to initialize 151 + * @dev: device for DMA mapping 152 + * @skb: the GSO skb 153 + * @hdr_len: 
per-segment header length in bytes 154 + * 155 + * DMA-maps the linear payload (after headers) and all frags. 156 + * Prefers the DMA IOVA API (one contiguous mapping, one IOTLB sync); 157 + * falls back to per-region dma_map_phys() when IOVA is not available. 158 + * Positions the iterator at byte 0 of the payload. 159 + * 160 + * Return: 0 on success, -ENOMEM on DMA mapping failure (partial mappings 161 + * are cleaned up internally). 162 + */ 163 + int tso_dma_map_init(struct tso_dma_map *map, struct device *dev, 164 + const struct sk_buff *skb, unsigned int hdr_len) 165 + { 166 + unsigned int linear_len = skb_headlen(skb) - hdr_len; 167 + unsigned int nr_frags = skb_shinfo(skb)->nr_frags; 168 + size_t total_len = skb->len - hdr_len; 169 + size_t offset = 0; 170 + phys_addr_t phys; 171 + int i; 172 + 173 + map->dev = dev; 174 + map->skb = skb; 175 + map->hdr_len = hdr_len; 176 + map->frag_idx = -1; 177 + map->offset = 0; 178 + map->iova_offset = 0; 179 + map->total_len = total_len; 180 + map->linear_len = 0; 181 + map->nr_frags = 0; 182 + memset(&map->iova_state, 0, sizeof(map->iova_state)); 183 + 184 + if (!total_len) 185 + return 0; 186 + 187 + if (linear_len) 188 + phys = virt_to_phys(skb->data + hdr_len); 189 + else 190 + phys = skb_frag_phys(&skb_shinfo(skb)->frags[0]); 191 + 192 + if (tso_dma_iova_try(dev, map, phys, linear_len, total_len, &offset)) { 193 + /* IOVA path failed, map state was reset. 
Fallback to 194 + * per-region dma_map_phys() 195 + */ 196 + if (linear_len) { 197 + map->linear_dma = dma_map_phys(dev, phys, linear_len, 198 + DMA_TO_DEVICE, 0); 199 + if (dma_mapping_error(dev, map->linear_dma)) 200 + return -ENOMEM; 201 + map->linear_len = linear_len; 202 + } 203 + 204 + for (i = 0; i < nr_frags; i++) { 205 + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; 206 + unsigned int frag_len = skb_frag_size(frag); 207 + 208 + map->frags[i].len = frag_len; 209 + map->frags[i].dma = dma_map_phys(dev, skb_frag_phys(frag), 210 + frag_len, DMA_TO_DEVICE, 0); 211 + if (dma_mapping_error(dev, map->frags[i].dma)) { 212 + tso_dma_map_cleanup(map); 213 + return -ENOMEM; 214 + } 215 + map->nr_frags = i + 1; 216 + } 217 + } 218 + 219 + if (linear_len == 0 && nr_frags > 0) 220 + map->frag_idx = 0; 221 + 222 + return 0; 223 + } 224 + EXPORT_SYMBOL(tso_dma_map_init); 225 + 226 + /** 227 + * tso_dma_map_cleanup - unmap all DMA regions in a tso_dma_map 228 + * @map: the map to clean up 229 + * 230 + * Handles both IOVA and fallback paths. For IOVA, calls 231 + * dma_iova_destroy(). For fallback, unmaps each region individually. 
232 + */ 233 + void tso_dma_map_cleanup(struct tso_dma_map *map) 234 + { 235 + int i; 236 + 237 + if (dma_use_iova(&map->iova_state)) { 238 + dma_iova_destroy(map->dev, &map->iova_state, map->total_len, 239 + DMA_TO_DEVICE, 0); 240 + memset(&map->iova_state, 0, sizeof(map->iova_state)); 241 + } else { 242 + if (map->linear_len) 243 + dma_unmap_phys(map->dev, map->linear_dma, 244 + map->linear_len, DMA_TO_DEVICE, 0); 245 + 246 + for (i = 0; i < map->nr_frags; i++) 247 + dma_unmap_phys(map->dev, map->frags[i].dma, 248 + map->frags[i].len, DMA_TO_DEVICE, 0); 249 + } 250 + 251 + map->linear_len = 0; 252 + map->nr_frags = 0; 253 + } 254 + EXPORT_SYMBOL(tso_dma_map_cleanup); 255 + 256 + /** 257 + * tso_dma_map_count - count descriptors for a payload range 258 + * @map: the payload map 259 + * @len: number of payload bytes in this segment 260 + * 261 + * Counts how many contiguous DMA region chunks the next @len bytes 262 + * will span, without advancing the iterator. On the IOVA path this 263 + * is always 1 (contiguous). On the fallback path, uses region sizes 264 + * from the current position. 265 + * 266 + * Return: the number of descriptors needed for @len bytes of payload. 
267 + */ 268 + unsigned int tso_dma_map_count(struct tso_dma_map *map, unsigned int len) 269 + { 270 + unsigned int offset = map->offset; 271 + int idx = map->frag_idx; 272 + unsigned int count = 0; 273 + 274 + if (!len) 275 + return 0; 276 + 277 + if (dma_use_iova(&map->iova_state)) 278 + return 1; 279 + 280 + while (len > 0) { 281 + unsigned int region_len, chunk; 282 + 283 + if (idx == -1) 284 + region_len = map->linear_len; 285 + else 286 + region_len = map->frags[idx].len; 287 + 288 + chunk = min(len, region_len - offset); 289 + len -= chunk; 290 + count++; 291 + offset = 0; 292 + idx++; 293 + } 294 + 295 + return count; 296 + } 297 + EXPORT_SYMBOL(tso_dma_map_count); 298 + 299 + /** 300 + * tso_dma_map_next - yield the next DMA address range 301 + * @map: the payload map 302 + * @addr: output DMA address 303 + * @chunk_len: output chunk length 304 + * @mapping_len: full DMA mapping length when this chunk starts a new 305 + * mapping region, or 0 when continuing a previous one. 306 + * On the IOVA path this is always 0 (driver must not 307 + * do per-region unmaps; use tso_dma_map_cleanup instead). 308 + * @seg_remaining: bytes left in current segment 309 + * 310 + * Yields the next (dma_addr, chunk_len) pair and advances the iterator. 311 + * On the IOVA path, the entire payload is contiguous so each segment 312 + * is always a single chunk. 313 + * 314 + * Return: true if a chunk was yielded, false when @seg_remaining is 0. 
315 + */ 316 + bool tso_dma_map_next(struct tso_dma_map *map, dma_addr_t *addr, 317 + unsigned int *chunk_len, unsigned int *mapping_len, 318 + unsigned int seg_remaining) 319 + { 320 + unsigned int region_len, chunk; 321 + 322 + if (!seg_remaining) 323 + return false; 324 + 325 + /* IOVA path: contiguous DMA range, no region boundaries */ 326 + if (dma_use_iova(&map->iova_state)) { 327 + *addr = map->iova_state.addr + map->iova_offset; 328 + *chunk_len = seg_remaining; 329 + *mapping_len = 0; 330 + map->iova_offset += seg_remaining; 331 + return true; 332 + } 333 + 334 + /* Fallback path: per-region iteration */ 335 + 336 + if (map->frag_idx == -1) { 337 + region_len = map->linear_len; 338 + chunk = min(seg_remaining, region_len - map->offset); 339 + *addr = map->linear_dma + map->offset; 340 + } else { 341 + region_len = map->frags[map->frag_idx].len; 342 + chunk = min(seg_remaining, region_len - map->offset); 343 + *addr = map->frags[map->frag_idx].dma + map->offset; 344 + } 345 + 346 + *mapping_len = (map->offset == 0) ? region_len : 0; 347 + *chunk_len = chunk; 348 + map->offset += chunk; 349 + 350 + if (map->offset >= region_len) { 351 + map->frag_idx++; 352 + map->offset = 0; 353 + } 354 + 355 + return true; 356 + } 357 + EXPORT_SYMBOL(tso_dma_map_next);
+1
tools/testing/selftests/drivers/net/hw/Makefile
··· 45 45 rss_input_xfrm.py \ 46 46 toeplitz.py \ 47 47 tso.py \ 48 + uso.py \ 48 49 xdp_metadata.py \ 49 50 xsk_reconfig.py \ 50 51 #
+103
tools/testing/selftests/drivers/net/hw/uso.py
#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0

"""Test USO

Sends large UDP datagrams with UDP_SEGMENT and verifies that the peer
receives the expected total payload and that the NIC transmitted at least
the expected number of segments.
"""
import random
import socket
import string

from lib.py import ksft_run, ksft_exit, KsftSkipEx
from lib.py import ksft_eq, ksft_ge, ksft_variants, KsftNamedVariant
from lib.py import NetDrvEpEnv
from lib.py import bkg, defer, ethtool, ip, rand_port, wait_port_listen

# python doesn't expose this constant, so we need to hardcode it to enable UDP
# segmentation for large payloads
UDP_SEGMENT = 103


def _send_uso(cfg, ipver, mss, total_payload, port):
    """Send one UDP datagram of total_payload bytes with UDP_SEGMENT set to mss.

    The payload is random lowercase ASCII. The destination is the remote
    address of the requested IP version ("4", anything else means IPv6).
    """
    if ipver == "4":
        family = socket.AF_INET
        dst = (cfg.remote_addr_v["4"], port)
    else:
        family = socket.AF_INET6
        dst = (cfg.remote_addr_v["6"], port)

    payload = ''.join(random.choice(string.ascii_lowercase)
                      for _ in range(total_payload))

    # The context manager closes the socket even if setsockopt() or sendto()
    # raises, so a failing send does not leak the file descriptor.
    with socket.socket(family, socket.SOCK_DGRAM) as sock:
        sock.setsockopt(socket.IPPROTO_UDP, UDP_SEGMENT, mss)
        sock.sendto(payload.encode(), dst)


def _get_tx_packets(cfg):
    """Return the tx packet counter of cfg.ifname as reported by `ip -s link`."""
    stats = ip(f"-s link show dev {cfg.ifname}", json=True)[0]
    return stats['stats64']['tx']['packets']


def _test_uso(cfg, ipver, mss, total_payload):
    """Enable tx-udp-segmentation, send one USO datagram and check the result.

    Checks that the remote socat listener received exactly total_payload
    bytes and that the local tx packet counter grew by at least
    ceil(total_payload / mss). Skips if the feature cannot be enabled.
    """
    cfg.require_ipver(ipver)
    cfg.require_cmd("socat", remote=True)

    features = ethtool(f"-k {cfg.ifname}", json=True)
    uso_was_on = features[0]["tx-udp-segmentation"]["active"]

    try:
        ethtool(f"-K {cfg.ifname} tx-udp-segmentation on")
    except Exception as exc:
        raise KsftSkipEx(
            "Device does not support tx-udp-segmentation") from exc
    # Only schedule turning the feature back off if this test turned it on.
    if not uso_was_on:
        defer(ethtool, f"-K {cfg.ifname} tx-udp-segmentation off")

    # Ceiling division: a trailing partial segment still costs one packet.
    expected_segs = (total_payload + mss - 1) // mss

    port = rand_port(stype=socket.SOCK_DGRAM)
    rx_cmd = f"socat -{ipver} -T 2 -u UDP-LISTEN:{port},reuseport STDOUT"

    tx_before = _get_tx_packets(cfg)

    with bkg(rx_cmd, host=cfg.remote, exit_wait=True) as rx:
        wait_port_listen(port, proto="udp", host=cfg.remote)
        _send_uso(cfg, ipver, mss, total_payload, port)

    ksft_eq(len(rx.stdout), total_payload,
            comment=f"Received {len(rx.stdout)}B, expected {total_payload}B")

    cfg.wait_hw_stats_settle()

    tx_after = _get_tx_packets(cfg)
    tx_delta = tx_after - tx_before

    ksft_ge(tx_delta, expected_segs,
            comment=f"Expected >= {expected_segs} tx packets, got {tx_delta}")


def _uso_variants():
    """Yield, per IP version, a payload with a partial last segment and an exact multiple of the MSS."""
    for ipver in ["4", "6"]:
        yield KsftNamedVariant(f"v{ipver}_partial", ipver, 1400, 1400 * 10 + 500)
        yield KsftNamedVariant(f"v{ipver}_exact", ipver, 1400, 1400 * 5)


@ksft_variants(_uso_variants())
def test_uso(cfg, ipver, mss, total_payload):
    """Send a USO datagram and verify the peer receives the expected segments."""
    _test_uso(cfg, ipver, mss, total_payload)


def main() -> None:
    """Run USO tests."""
    with NetDrvEpEnv(__file__) as cfg:
        ksft_run([test_uso],
                 args=(cfg, ))
    ksft_exit()


if __name__ == "__main__":
    main()