Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'devmem-io_uring-allow-more-flexibility-for-zc-dma-devices'

Dragos Tatulea says:

====================
devmem/io_uring: allow more flexibility for ZC DMA devices

For TCP zerocopy rx (io_uring, devmem), there is an assumption that the
parent device can do DMA. However, that is not always the case:
- Scalable Function netdevs [1] have the DMA device in the grandparent.
- For Multi-PF netdevs [2], queues can be associated with different DMA
devices.

The series adds an API for getting the DMA device for a netdev queue.
Drivers that have special requirements can implement the newly added
queue management op. Otherwise the parent will still be used as before.

This series continues with switching to this API for io_uring zcrx and
devmem and adds an ndo_queue_get_dma_dev op for mlx5.

The last part of the series changes devmem rx bind to get the DMA device
per queue and blocks the case when multiple queues use different DMA
devices. The tx bind is left as is.

[1] Documentation/networking/device_drivers/ethernet/mellanox/mlx5/switchdev.rst
[2] Documentation/networking/multi-pf-netdev.rst
====================

Link: https://patch.msgid.link/20250827144017.1529208-2-dtatulea@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+163 -31
+24
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
··· 5625 5625 return 0; 5626 5626 } 5627 5627 5628 + static struct device *mlx5e_queue_get_dma_dev(struct net_device *dev, 5629 + int queue_index) 5630 + { 5631 + struct mlx5e_priv *priv = netdev_priv(dev); 5632 + struct mlx5e_channels *channels; 5633 + struct device *pdev = NULL; 5634 + struct mlx5e_channel *ch; 5635 + 5636 + channels = &priv->channels; 5637 + 5638 + mutex_lock(&priv->state_lock); 5639 + 5640 + if (queue_index >= channels->num) 5641 + goto out; 5642 + 5643 + ch = channels->c[queue_index]; 5644 + pdev = ch->pdev; 5645 + out: 5646 + mutex_unlock(&priv->state_lock); 5647 + 5648 + return pdev; 5649 + } 5650 + 5628 5651 static const struct netdev_queue_mgmt_ops mlx5e_queue_mgmt_ops = { 5629 5652 .ndo_queue_mem_size = sizeof(struct mlx5_qmgmt_data), 5630 5653 .ndo_queue_mem_alloc = mlx5e_queue_mem_alloc, 5631 5654 .ndo_queue_mem_free = mlx5e_queue_mem_free, 5632 5655 .ndo_queue_start = mlx5e_queue_start, 5633 5656 .ndo_queue_stop = mlx5e_queue_stop, 5657 + .ndo_queue_get_dma_dev = mlx5e_queue_get_dma_dev, 5634 5658 }; 5635 5659 5636 5660 static void mlx5e_build_nic_netdev(struct net_device *netdev)
+7
include/net/netdev_queues.h
··· 127 127 * @ndo_queue_stop: Stop the RX queue at the specified index. The stopped 128 128 * queue's memory is written at the specified address. 129 129 * 130 + * @ndo_queue_get_dma_dev: Get dma device for zero-copy operations to be used 131 + * for this queue. Return NULL on error. 132 + * 130 133 * Note that @ndo_queue_mem_alloc and @ndo_queue_mem_free may be called while 131 134 * the interface is closed. @ndo_queue_start and @ndo_queue_stop will only 132 135 * be called for an interface which is open. ··· 147 144 int (*ndo_queue_stop)(struct net_device *dev, 148 145 void *per_queue_mem, 149 146 int idx); 147 + struct device * (*ndo_queue_get_dma_dev)(struct net_device *dev, 148 + int idx); 150 149 }; 151 150 152 151 /** ··· 325 320 netif_txq_completed_wake(_txq, pkts, bytes, \ 326 321 get_desc, start_thrs); \ 327 322 }) 323 + 324 + struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx); 328 325 329 326 #endif
+2 -1
io_uring/zcrx.c
··· 12 12 #include <net/page_pool/helpers.h> 13 13 #include <net/page_pool/memory_provider.h> 14 14 #include <net/netlink.h> 15 + #include <net/netdev_queues.h> 15 16 #include <net/netdev_rx_queue.h> 16 17 #include <net/tcp.h> 17 18 #include <net/rps.h> ··· 600 599 goto err; 601 600 } 602 601 603 - ifq->dev = ifq->netdev->dev.parent; 602 + ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, ifq->if_rxq); 604 603 if (!ifq->dev) { 605 604 ret = -EOPNOTSUPP; 606 605 goto err;
+1
net/core/Makefile
··· 20 20 obj-y += net-sysfs.o 21 21 obj-y += hotdata.o 22 22 obj-y += netdev_rx_queue.o 23 + obj-y += netdev_queues.o 23 24 obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o 24 25 obj-$(CONFIG_PROC_FS) += net-procfs.o 25 26 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
+7 -1
net/core/devmem.c
··· 176 176 177 177 struct net_devmem_dmabuf_binding * 178 178 net_devmem_bind_dmabuf(struct net_device *dev, 179 + struct device *dma_dev, 179 180 enum dma_data_direction direction, 180 181 unsigned int dmabuf_fd, struct netdev_nl_sock *priv, 181 182 struct netlink_ext_ack *extack) ··· 188 187 unsigned int sg_idx, i; 189 188 unsigned long virtual; 190 189 int err; 190 + 191 + if (!dma_dev) { 192 + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); 193 + return ERR_PTR(-EOPNOTSUPP); 194 + } 191 195 192 196 dmabuf = dma_buf_get(dmabuf_fd); 193 197 if (IS_ERR(dmabuf)) ··· 215 209 binding->dmabuf = dmabuf; 216 210 binding->direction = direction; 217 211 218 - binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); 212 + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); 219 213 if (IS_ERR(binding->attachment)) { 220 214 err = PTR_ERR(binding->attachment); 221 215 NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
+2
net/core/devmem.h
··· 85 85 void __net_devmem_dmabuf_binding_free(struct work_struct *wq); 86 86 struct net_devmem_dmabuf_binding * 87 87 net_devmem_bind_dmabuf(struct net_device *dev, 88 + struct device *dma_dev, 88 89 enum dma_data_direction direction, 89 90 unsigned int dmabuf_fd, struct netdev_nl_sock *priv, 90 91 struct netlink_ext_ack *extack); ··· 171 170 172 171 static inline struct net_devmem_dmabuf_binding * 173 172 net_devmem_bind_dmabuf(struct net_device *dev, 173 + struct device *dma_dev, 174 174 enum dma_data_direction direction, 175 175 unsigned int dmabuf_fd, 176 176 struct netdev_nl_sock *priv,
+93 -29
net/core/netdev-genl.c
··· 869 869 return err; 870 870 } 871 871 872 + static int netdev_nl_read_rxq_bitmap(struct genl_info *info, 873 + u32 rxq_bitmap_len, 874 + unsigned long *rxq_bitmap) 875 + { 876 + const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; 877 + struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; 878 + struct nlattr *attr; 879 + int rem, err = 0; 880 + u32 rxq_idx; 881 + 882 + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, 883 + genlmsg_data(info->genlhdr), 884 + genlmsg_len(info->genlhdr), rem) { 885 + err = nla_parse_nested(tb, maxtype, attr, 886 + netdev_queue_id_nl_policy, info->extack); 887 + if (err < 0) 888 + return err; 889 + 890 + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || 891 + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) 892 + return -EINVAL; 893 + 894 + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { 895 + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); 896 + return -EINVAL; 897 + } 898 + 899 + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); 900 + if (rxq_idx >= rxq_bitmap_len) { 901 + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); 902 + return -EINVAL; 903 + } 904 + 905 + bitmap_set(rxq_bitmap, rxq_idx, 1); 906 + } 907 + 908 + return 0; 909 + } 910 + 911 + static struct device * 912 + netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, 913 + struct netlink_ext_ack *extack) 914 + { 915 + struct device *dma_dev = NULL; 916 + u32 rxq_idx, prev_rxq_idx; 917 + 918 + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { 919 + struct device *rxq_dma_dev; 920 + 921 + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); 922 + if (dma_dev && rxq_dma_dev != dma_dev) { 923 + NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", 924 + rxq_idx, prev_rxq_idx); 925 + return ERR_PTR(-EOPNOTSUPP); 926 + } 927 + 928 + dma_dev = rxq_dma_dev; 929 + prev_rxq_idx = rxq_idx; 930 + } 931 + 932 + return 
dma_dev; 933 + } 934 + 872 935 int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) 873 936 { 874 - struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; 875 937 struct net_devmem_dmabuf_binding *binding; 876 938 u32 ifindex, dmabuf_fd, rxq_idx; 877 939 struct netdev_nl_sock *priv; 878 940 struct net_device *netdev; 941 + unsigned long *rxq_bitmap; 942 + struct device *dma_dev; 879 943 struct sk_buff *rsp; 880 - struct nlattr *attr; 881 - int rem, err = 0; 944 + int err = 0; 882 945 void *hdr; 883 946 884 947 if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || ··· 984 921 goto err_unlock; 985 922 } 986 923 987 - binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, 988 - priv, info->extack); 989 - if (IS_ERR(binding)) { 990 - err = PTR_ERR(binding); 924 + rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); 925 + if (!rxq_bitmap) { 926 + err = -ENOMEM; 991 927 goto err_unlock; 992 928 } 993 929 994 - nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, 995 - genlmsg_data(info->genlhdr), 996 - genlmsg_len(info->genlhdr), rem) { 997 - err = nla_parse_nested( 998 - tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, 999 - netdev_queue_id_nl_policy, info->extack); 1000 - if (err < 0) 1001 - goto err_unbind; 930 + err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, 931 + rxq_bitmap); 932 + if (err) 933 + goto err_rxq_bitmap; 1002 934 1003 - if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || 1004 - NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { 1005 - err = -EINVAL; 1006 - goto err_unbind; 1007 - } 935 + dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); 936 + if (IS_ERR(dma_dev)) { 937 + err = PTR_ERR(dma_dev); 938 + goto err_rxq_bitmap; 939 + } 1008 940 1009 - if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { 1010 - NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); 1011 - err = -EINVAL; 1012 - goto err_unbind; 1013 - } 941 + 
binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, 942 + dmabuf_fd, priv, info->extack); 943 + if (IS_ERR(binding)) { 944 + err = PTR_ERR(binding); 945 + goto err_rxq_bitmap; 946 + } 1014 947 1015 - rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); 1016 - 948 + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { 1017 949 err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, 1018 950 info->extack); 1019 951 if (err) ··· 1022 964 if (err) 1023 965 goto err_unbind; 1024 966 967 + bitmap_free(rxq_bitmap); 968 + 1025 969 netdev_unlock(netdev); 1026 970 1027 971 mutex_unlock(&priv->lock); ··· 1032 972 1033 973 err_unbind: 1034 974 net_devmem_unbind_dmabuf(binding); 975 + err_rxq_bitmap: 976 + bitmap_free(rxq_bitmap); 1035 977 err_unlock: 1036 978 netdev_unlock(netdev); 1037 979 err_unlock_sock: ··· 1048 986 struct net_devmem_dmabuf_binding *binding; 1049 987 struct netdev_nl_sock *priv; 1050 988 struct net_device *netdev; 989 + struct device *dma_dev; 1051 990 u32 ifindex, dmabuf_fd; 1052 991 struct sk_buff *rsp; 1053 992 int err = 0; ··· 1095 1032 goto err_unlock_netdev; 1096 1033 } 1097 1034 1098 - binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, 1099 - info->extack); 1035 + dma_dev = netdev_queue_get_dma_dev(netdev, 0); 1036 + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, 1037 + dmabuf_fd, priv, info->extack); 1100 1038 if (IS_ERR(binding)) { 1101 1039 err = PTR_ERR(binding); 1102 1040 goto err_unlock_netdev;
+27
net/core/netdev_queues.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + 3 + #include <net/netdev_queues.h> 4 + 5 + /** 6 + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations 7 + * @dev: net_device 8 + * @idx: queue index 9 + * 10 + * Get dma device for zero-copy operations to be used for this queue. 11 + * When such device is not available or valid, the function will return NULL. 12 + * 13 + * Return: Device or NULL on error 14 + */ 15 + struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) 16 + { 17 + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; 18 + struct device *dma_dev; 19 + 20 + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) 21 + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); 22 + else 23 + dma_dev = dev->dev.parent; 24 + 25 + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; 26 + } 27 +