Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/uverbs: Add DMABUF object type and operations

Expose DMABUF functionality to userspace through the uverbs interface,
enabling InfiniBand/RDMA devices to export PCI-based memory regions
(e.g. device memory) as DMABUF file descriptors. This allows
zero-copy sharing of RDMA memory with other subsystems that support the
dma-buf framework.

Introduce a new UVERBS_OBJECT_DMABUF object type together with its
allocation method, UVERBS_METHOD_DMABUF_ALLOC.

During allocation, uverbs invokes the driver to supply the
rdma_user_mmap_entry associated with the given page offset (pgoff).

Based on the returned rdma_user_mmap_entry, uverbs requests the driver
to provide the corresponding physical-memory details as well as the
driver’s PCI provider information.

Using this information, dma_buf_export() is called; if it succeeds,
uobj->object is set to the underlying file pointer returned by the
dma-buf framework.

The file descriptor number follows the standard uverbs allocation flow,
but the file pointer comes from the dma-buf subsystem, including its own
fops and private data.

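When an mmap entry is removed, uverbs iterates over its associated
DMABUFs, marks them as revoked, and calls dma_buf_move_notify() so that
their importers are notified. It then waits for the fences on each
buffer's reservation object and for all outstanding DMA mappings to be
unmapped before continuing.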

The same procedure applies during the disassociate flow; final cleanup
occurs when the application closes the file.

Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
Signed-off-by: Edward Srouji <edwards@nvidia.com>
Link: https://patch.msgid.link/20260201-dmabuf-export-v3-2-da238b614fe3@nvidia.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>

authored by

Yishai Hadas and committed by
Leon Romanovsky
0ac6f405 9ad95a0f

+286 -12
+1
drivers/infiniband/core/Makefile
··· 33 33 ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ 34 34 rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ 35 35 uverbs_std_types_cq.o \ 36 + uverbs_std_types_dmabuf.o \ 36 37 uverbs_std_types_dmah.o \ 37 38 uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ 38 39 uverbs_std_types_mr.o uverbs_std_types_counters.o \
+2
drivers/infiniband/core/device.c
··· 2765 2765 SET_DEVICE_OP(dev_ops, map_mr_sg); 2766 2766 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2767 2767 SET_DEVICE_OP(dev_ops, mmap); 2768 + SET_DEVICE_OP(dev_ops, mmap_get_pfns); 2768 2769 SET_DEVICE_OP(dev_ops, mmap_free); 2769 2770 SET_DEVICE_OP(dev_ops, modify_ah); 2770 2771 SET_DEVICE_OP(dev_ops, modify_cq); ··· 2776 2775 SET_DEVICE_OP(dev_ops, modify_srq); 2777 2776 SET_DEVICE_OP(dev_ops, modify_wq); 2778 2777 SET_DEVICE_OP(dev_ops, peek_cq); 2778 + SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); 2779 2779 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2780 2780 SET_DEVICE_OP(dev_ops, poll_cq); 2781 2781 SET_DEVICE_OP(dev_ops, port_groups);
+24
drivers/infiniband/core/ib_core_uverbs.c
··· 5 5 * Copyright 2019 Marvell. All rights reserved. 6 6 */ 7 7 #include <linux/xarray.h> 8 + #include <linux/dma-buf.h> 9 + #include <linux/dma-resv.h> 8 10 #include "uverbs.h" 9 11 #include "core_priv.h" 12 + 13 + MODULE_IMPORT_NS("DMA_BUF"); 10 14 11 15 /** 12 16 * rdma_umap_priv_init() - Initialize the private data of a vma ··· 233 229 */ 234 230 void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) 235 231 { 232 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp; 233 + 236 234 if (!entry) 237 235 return; 238 236 237 + mutex_lock(&entry->dmabufs_lock); 239 238 xa_lock(&entry->ucontext->mmap_xa); 240 239 entry->driver_removed = true; 241 240 xa_unlock(&entry->ucontext->mmap_xa); 241 + list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) { 242 + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); 243 + list_del(&uverbs_dmabuf->dmabufs_elm); 244 + uverbs_dmabuf->revoked = true; 245 + dma_buf_move_notify(uverbs_dmabuf->dmabuf); 246 + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, 247 + DMA_RESV_USAGE_BOOKKEEP, false, 248 + MAX_SCHEDULE_TIMEOUT); 249 + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); 250 + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); 251 + wait_for_completion(&uverbs_dmabuf->comp); 252 + } 253 + mutex_unlock(&entry->dmabufs_lock); 254 + 242 255 kref_put(&entry->ref, rdma_user_mmap_entry_free); 243 256 } 244 257 EXPORT_SYMBOL(rdma_user_mmap_entry_remove); ··· 295 274 return -EINVAL; 296 275 297 276 kref_init(&entry->ref); 277 + INIT_LIST_HEAD(&entry->dmabufs); 278 + mutex_init(&entry->dmabufs_lock); 279 + 298 280 entry->ucontext = ucontext; 299 281 300 282 /*
+16 -12
drivers/infiniband/core/rdma_core.c
··· 809 809 }; 810 810 EXPORT_SYMBOL(uverbs_idr_class); 811 811 812 - /* 813 - * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct 814 - * file_operations release method. 815 - */ 816 - int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) 812 + int uverbs_uobject_release(struct ib_uobject *uobj) 817 813 { 818 814 struct ib_uverbs_file *ufile; 819 - struct ib_uobject *uobj; 820 815 821 - /* 822 - * This can only happen if the fput came from alloc_abort_fd_uobject() 823 - */ 824 - if (!filp->private_data) 825 - return 0; 826 - uobj = filp->private_data; 827 816 ufile = uobj->ufile; 828 817 829 818 if (down_read_trylock(&ufile->hw_destroy_rwsem)) { ··· 838 849 /* Pairs with filp->private_data in alloc_begin_fd_uobject */ 839 850 uverbs_uobject_put(uobj); 840 851 return 0; 852 + } 853 + 854 + /* 855 + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct 856 + * file_operations release method. 857 + */ 858 + int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) 859 + { 860 + /* 861 + * This can only happen if the fput came from alloc_abort_fd_uobject() 862 + */ 863 + if (!filp->private_data) 864 + return 0; 865 + 866 + return uverbs_uobject_release(filp->private_data); 841 867 } 842 868 EXPORT_SYMBOL(uverbs_uobject_fd_release); 843 869
+1
drivers/infiniband/core/rdma_core.h
··· 156 156 extern const struct uapi_definition uverbs_def_obj_cq[]; 157 157 extern const struct uapi_definition uverbs_def_obj_device[]; 158 158 extern const struct uapi_definition uverbs_def_obj_dm[]; 159 + extern const struct uapi_definition uverbs_def_obj_dmabuf[]; 159 160 extern const struct uapi_definition uverbs_def_obj_dmah[]; 160 161 extern const struct uapi_definition uverbs_def_obj_flow_action[]; 161 162 extern const struct uapi_definition uverbs_def_obj_intf[];
+21
drivers/infiniband/core/uverbs.h
··· 133 133 struct ib_uverbs_event_queue ev_queue; 134 134 }; 135 135 136 + struct ib_uverbs_dmabuf_file { 137 + struct ib_uobject uobj; 138 + struct dma_buf *dmabuf; 139 + struct list_head dmabufs_elm; 140 + struct rdma_user_mmap_entry *mmap_entry; 141 + struct phys_vec phys_vec; 142 + struct p2pdma_provider *provider; 143 + struct kref kref; 144 + struct completion comp; 145 + u8 revoked :1; 146 + }; 147 + 136 148 struct ib_uverbs_event { 137 149 union { 138 150 struct ib_uverbs_async_event_desc async; ··· 302 290 void copy_port_attr_to_resp(struct ib_port_attr *attr, 303 291 struct ib_uverbs_query_port_resp *resp, 304 292 struct ib_device *ib_dev, u8 port_num); 293 + 294 + static inline void ib_uverbs_dmabuf_done(struct kref *kref) 295 + { 296 + struct ib_uverbs_dmabuf_file *priv = 297 + container_of(kref, struct ib_uverbs_dmabuf_file, kref); 298 + 299 + complete(&priv->comp); 300 + } 301 + 305 302 #endif /* UVERBS_H */
+200
drivers/infiniband/core/uverbs_std_types_dmabuf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 + /* 3 + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + */ 5 + 6 + #include <linux/dma-buf-mapping.h> 7 + #include <linux/pci-p2pdma.h> 8 + #include <linux/dma-resv.h> 9 + #include <rdma/uverbs_std_types.h> 10 + #include "rdma_core.h" 11 + #include "uverbs.h" 12 + 13 + static int uverbs_dmabuf_attach(struct dma_buf *dmabuf, 14 + struct dma_buf_attachment *attachment) 15 + { 16 + if (!attachment->peer2peer) 17 + return -EOPNOTSUPP; 18 + 19 + return 0; 20 + } 21 + 22 + static struct sg_table * 23 + uverbs_dmabuf_map(struct dma_buf_attachment *attachment, 24 + enum dma_data_direction dir) 25 + { 26 + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; 27 + struct sg_table *ret; 28 + 29 + dma_resv_assert_held(priv->dmabuf->resv); 30 + 31 + if (priv->revoked) 32 + return ERR_PTR(-ENODEV); 33 + 34 + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, 35 + &priv->phys_vec, 1, priv->phys_vec.len, 36 + dir); 37 + if (IS_ERR(ret)) 38 + return ret; 39 + 40 + kref_get(&priv->kref); 41 + return ret; 42 + } 43 + 44 + static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment, 45 + struct sg_table *sgt, 46 + enum dma_data_direction dir) 47 + { 48 + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; 49 + 50 + dma_resv_assert_held(priv->dmabuf->resv); 51 + dma_buf_free_sgt(attachment, sgt, dir); 52 + kref_put(&priv->kref, ib_uverbs_dmabuf_done); 53 + } 54 + 55 + static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach) 56 + { 57 + return -EOPNOTSUPP; 58 + } 59 + 60 + static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach) 61 + { 62 + } 63 + 64 + static void uverbs_dmabuf_release(struct dma_buf *dmabuf) 65 + { 66 + struct ib_uverbs_dmabuf_file *priv = dmabuf->priv; 67 + 68 + /* 69 + * This can only happen if the fput came from alloc_abort_fd_uobject() 70 + */ 71 + if (!priv->uobj.context) 72 + return; 73 + 74 + 
uverbs_uobject_release(&priv->uobj); 75 + } 76 + 77 + static const struct dma_buf_ops uverbs_dmabuf_ops = { 78 + .attach = uverbs_dmabuf_attach, 79 + .map_dma_buf = uverbs_dmabuf_map, 80 + .unmap_dma_buf = uverbs_dmabuf_unmap, 81 + .pin = uverbs_dmabuf_pin, 82 + .unpin = uverbs_dmabuf_unpin, 83 + .release = uverbs_dmabuf_release, 84 + }; 85 + 86 + static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)( 87 + struct uverbs_attr_bundle *attrs) 88 + { 89 + struct ib_uobject *uobj = 90 + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE) 91 + ->obj_attr.uobject; 92 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = 93 + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); 94 + struct ib_device *ib_dev = attrs->context->device; 95 + struct rdma_user_mmap_entry *mmap_entry; 96 + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); 97 + off_t pg_off; 98 + int ret; 99 + 100 + ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF); 101 + if (ret) 102 + return ret; 103 + 104 + mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off); 105 + if (!mmap_entry) 106 + return -EINVAL; 107 + 108 + ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec, 109 + &uverbs_dmabuf->provider); 110 + if (ret) 111 + goto err; 112 + 113 + exp_info.ops = &uverbs_dmabuf_ops; 114 + exp_info.size = uverbs_dmabuf->phys_vec.len; 115 + exp_info.flags = O_CLOEXEC; 116 + exp_info.priv = uverbs_dmabuf; 117 + 118 + uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info); 119 + if (IS_ERR(uverbs_dmabuf->dmabuf)) { 120 + ret = PTR_ERR(uverbs_dmabuf->dmabuf); 121 + goto err; 122 + } 123 + 124 + kref_init(&uverbs_dmabuf->kref); 125 + init_completion(&uverbs_dmabuf->comp); 126 + INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm); 127 + mutex_lock(&mmap_entry->dmabufs_lock); 128 + if (mmap_entry->driver_removed) 129 + ret = -EIO; 130 + else 131 + list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs); 132 + mutex_unlock(&mmap_entry->dmabufs_lock); 133 + if (ret) 134 + goto 
err_revoked; 135 + 136 + uobj->object = uverbs_dmabuf->dmabuf->file; 137 + uverbs_dmabuf->mmap_entry = mmap_entry; 138 + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE); 139 + return 0; 140 + 141 + err_revoked: 142 + dma_buf_put(uverbs_dmabuf->dmabuf); 143 + err: 144 + rdma_user_mmap_entry_put(mmap_entry); 145 + return ret; 146 + } 147 + 148 + DECLARE_UVERBS_NAMED_METHOD( 149 + UVERBS_METHOD_DMABUF_ALLOC, 150 + UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE, 151 + UVERBS_OBJECT_DMABUF, 152 + UVERBS_ACCESS_NEW, 153 + UA_MANDATORY), 154 + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF, 155 + UVERBS_ATTR_TYPE(u64), 156 + UA_MANDATORY)); 157 + 158 + static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj, 159 + enum rdma_remove_reason why) 160 + { 161 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = 162 + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); 163 + bool wait_for_comp = false; 164 + 165 + mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); 166 + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); 167 + if (!uverbs_dmabuf->revoked) { 168 + uverbs_dmabuf->revoked = true; 169 + list_del(&uverbs_dmabuf->dmabufs_elm); 170 + dma_buf_move_notify(uverbs_dmabuf->dmabuf); 171 + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, 172 + DMA_RESV_USAGE_BOOKKEEP, false, 173 + MAX_SCHEDULE_TIMEOUT); 174 + wait_for_comp = true; 175 + } 176 + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); 177 + if (wait_for_comp) { 178 + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); 179 + /* Let's wait till all DMA unmap are completed. 
*/ 180 + wait_for_completion(&uverbs_dmabuf->comp); 181 + } 182 + mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); 183 + 184 + /* Matches the get done as part of pgoff_to_mmap_entry() */ 185 + rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry); 186 + } 187 + 188 + DECLARE_UVERBS_NAMED_OBJECT( 189 + UVERBS_OBJECT_DMABUF, 190 + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file), 191 + uverbs_dmabuf_fd_destroy_uobj, 192 + NULL, NULL, O_RDONLY), 193 + &UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC)); 194 + 195 + const struct uapi_definition uverbs_def_obj_dmabuf[] = { 196 + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF), 197 + UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns), 198 + UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry), 199 + {} 200 + };
+1
drivers/infiniband/core/uverbs_uapi.c
··· 631 631 UAPI_DEF_CHAIN(uverbs_def_obj_cq), 632 632 UAPI_DEF_CHAIN(uverbs_def_obj_device), 633 633 UAPI_DEF_CHAIN(uverbs_def_obj_dm), 634 + UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf), 634 635 UAPI_DEF_CHAIN(uverbs_def_obj_dmah), 635 636 UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), 636 637 UAPI_DEF_CHAIN(uverbs_def_obj_intf),
+9
include/rdma/ib_verbs.h
··· 44 44 #include <uapi/rdma/rdma_user_ioctl.h> 45 45 #include <uapi/rdma/ib_user_ioctl_verbs.h> 46 46 #include <linux/pci-tph.h> 47 + #include <linux/dma-buf.h> 47 48 48 49 #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN 49 50 ··· 2365 2364 unsigned long start_pgoff; 2366 2365 size_t npages; 2367 2366 bool driver_removed; 2367 + /* protects access to dmabufs */ 2368 + struct mutex dmabufs_lock; 2369 + struct list_head dmabufs; 2368 2370 }; 2369 2371 2370 2372 /* Return the offset (in bytes) the user should pass to libc's mmap() */ ··· 2505 2501 * Therefore needs to be implemented by the driver in mmap_free. 2506 2502 */ 2507 2503 void (*mmap_free)(struct rdma_user_mmap_entry *entry); 2504 + int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry, 2505 + struct phys_vec *phys_vec, 2506 + struct p2pdma_provider **provider); 2507 + struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext, 2508 + off_t pg_off); 2508 2509 void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); 2509 2510 int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); 2510 2511 int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
+1
include/rdma/uverbs_types.h
··· 186 186 extern const struct uverbs_obj_type_class uverbs_idr_class; 187 187 extern const struct uverbs_obj_type_class uverbs_fd_class; 188 188 int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); 189 + int uverbs_uobject_release(struct ib_uobject *uobj); 189 190 190 191 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ 191 192 sizeof(char))
+10
include/uapi/rdma/ib_user_ioctl_cmds.h
··· 56 56 UVERBS_OBJECT_COUNTERS, 57 57 UVERBS_OBJECT_ASYNC_EVENT, 58 58 UVERBS_OBJECT_DMAH, 59 + UVERBS_OBJECT_DMABUF, 59 60 }; 60 61 61 62 enum { ··· 262 261 enum uverbs_methods_dmah { 263 262 UVERBS_METHOD_DMAH_ALLOC, 264 263 UVERBS_METHOD_DMAH_FREE, 264 + }; 265 + 266 + enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids { 267 + UVERBS_ATTR_ALLOC_DMABUF_HANDLE, 268 + UVERBS_ATTR_ALLOC_DMABUF_PGOFF, 269 + }; 270 + 271 + enum uverbs_methods_dmabuf { 272 + UVERBS_METHOD_DMABUF_ALLOC, 265 273 }; 266 274 267 275 enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {