Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma updates from Jason Gunthorpe:
"Usual smallish cycle. The NFS biovec work to push it down into RDMA
instead of indirecting through a scatterlist is pretty nice to see,
been talked about for a long time now.

- Various code improvements in irdma, rtrs, qedr, ocrdma, rxe

- Small driver improvements and minor bug fixes to hns, mlx5, rxe,
mana, irdma

- Robustness improvements in completion processing for EFA

- New query_port_speed() verb to move past limited IBA defined speed
steps

- Support for SG_GAPS in rtrs and many other small improvements

- Rare list corruption fix in iwcm

- Better support for different page sizes in rxe

- Device memory support for mana

- Direct bio vec to kernel MR for use by NFS-RDMA

- QP rate limiting for bnxt_re

- Fix for a remotely triggerable NULL pointer crash in siw

- DMA-buf exporter support for RDMA mmaps like doorbells"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (66 commits)
RDMA/mlx5: Implement DMABUF export ops
RDMA/uverbs: Add DMABUF object type and operations
RDMA/uverbs: Support external FD uobjects
RDMA/siw: Fix potential NULL pointer dereference in header processing
RDMA/umad: Reject negative data_len in ib_umad_write
IB/core: Extend rate limit support for RC QPs
RDMA/mlx5: Support rate limit only for Raw Packet QP
RDMA/bnxt_re: Report QP rate limit in debugfs
RDMA/bnxt_re: Report packet pacing capabilities when querying device
RDMA/bnxt_re: Add support for QP rate limiting
MAINTAINERS: Drop RDMA files from Hyper-V section
RDMA/uverbs: Add __GFP_NOWARN to ib_uverbs_unmarshall_recv() kmalloc
svcrdma: use bvec-based RDMA read/write API
RDMA/core: add rdma_rw_max_sge() helper for SQ sizing
RDMA/core: add MR support for bvec-based RDMA operations
RDMA/core: use IOVA-based DMA mapping for bvec RDMA operations
RDMA/core: add bio_vec based RDMA read/write API
RDMA/irdma: Use kvzalloc for paged memory DMA address array
RDMA/rxe: Fix race condition in QP timer handlers
RDMA/mana_ib: Add device‑memory support
...

+2648 -722
+1 -2
MAINTAINERS
··· 11842 11842 F: drivers/clocksource/hyperv_timer.c 11843 11843 F: drivers/hid/hid-hyperv.c 11844 11844 F: drivers/hv/ 11845 - F: drivers/infiniband/hw/mana/ 11846 11845 F: drivers/input/serio/hyperv-keyboard.c 11847 11846 F: drivers/iommu/hyperv-iommu.c 11848 11847 F: drivers/net/ethernet/microsoft/ ··· 11860 11861 F: include/linux/hyperv.h 11861 11862 F: include/net/mana 11862 11863 F: include/uapi/linux/hyperv.h 11863 - F: include/uapi/rdma/mana-abi.h 11864 11864 F: net/vmw_vsock/hyperv_transport.c 11865 11865 F: tools/hv/ 11866 11866 ··· 17466 17468 M: Long Li <longli@microsoft.com> 17467 17469 M: Konstantin Taranov <kotaranov@microsoft.com> 17468 17470 L: linux-rdma@vger.kernel.org 17471 + L: linux-hyperv@vger.kernel.org 17469 17472 S: Supported 17470 17473 F: drivers/infiniband/hw/mana/ 17471 17474 F: include/net/mana
+1
drivers/infiniband/core/Makefile
··· 33 33 ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o \ 34 34 rdma_core.o uverbs_std_types.o uverbs_ioctl.o \ 35 35 uverbs_std_types_cq.o \ 36 + uverbs_std_types_dmabuf.o \ 36 37 uverbs_std_types_dmah.o \ 37 38 uverbs_std_types_flow_action.o uverbs_std_types_dm.o \ 38 39 uverbs_std_types_mr.o uverbs_std_types_counters.o \
+2 -1
drivers/infiniband/core/cache.c
··· 1537 1537 * the cache. 1538 1538 */ 1539 1539 ret = ib_cache_update(work->event.device, work->event.element.port_num, 1540 - work->event.event == IB_EVENT_GID_CHANGE, 1540 + work->event.event == IB_EVENT_GID_CHANGE || 1541 + work->event.event == IB_EVENT_CLIENT_REREGISTER, 1541 1542 work->event.event == IB_EVENT_PKEY_CHANGE, 1542 1543 work->enforce_security); 1543 1544
+3 -30
drivers/infiniband/core/device.c
··· 361 361 return NULL; 362 362 } 363 363 364 - /** 365 - * ib_device_get_by_name - Find an IB device by name 366 - * @name: The name to look for 367 - * @driver_id: The driver ID that must match (RDMA_DRIVER_UNKNOWN matches all) 368 - * 369 - * Find and hold an ib_device by its name. The caller must call 370 - * ib_device_put() on the returned pointer. 371 - */ 372 - struct ib_device *ib_device_get_by_name(const char *name, 373 - enum rdma_driver_id driver_id) 374 - { 375 - struct ib_device *device; 376 - 377 - down_read(&devices_rwsem); 378 - device = __ib_device_get_by_name(name); 379 - if (device && driver_id != RDMA_DRIVER_UNKNOWN && 380 - device->ops.driver_id != driver_id) 381 - device = NULL; 382 - 383 - if (device) { 384 - if (!ib_device_try_get(device)) 385 - device = NULL; 386 - } 387 - up_read(&devices_rwsem); 388 - return device; 389 - } 390 - EXPORT_SYMBOL(ib_device_get_by_name); 391 - 392 364 static int rename_compat_devs(struct ib_device *device) 393 365 { 394 366 struct ib_core_device *cdev; ··· 2765 2793 SET_DEVICE_OP(dev_ops, map_mr_sg); 2766 2794 SET_DEVICE_OP(dev_ops, map_mr_sg_pi); 2767 2795 SET_DEVICE_OP(dev_ops, mmap); 2796 + SET_DEVICE_OP(dev_ops, mmap_get_pfns); 2768 2797 SET_DEVICE_OP(dev_ops, mmap_free); 2769 2798 SET_DEVICE_OP(dev_ops, modify_ah); 2770 2799 SET_DEVICE_OP(dev_ops, modify_cq); ··· 2776 2803 SET_DEVICE_OP(dev_ops, modify_srq); 2777 2804 SET_DEVICE_OP(dev_ops, modify_wq); 2778 2805 SET_DEVICE_OP(dev_ops, peek_cq); 2806 + SET_DEVICE_OP(dev_ops, pgoff_to_mmap_entry); 2779 2807 SET_DEVICE_OP(dev_ops, pre_destroy_cq); 2780 2808 SET_DEVICE_OP(dev_ops, poll_cq); 2781 2809 SET_DEVICE_OP(dev_ops, port_groups); ··· 2790 2816 SET_DEVICE_OP(dev_ops, query_gid); 2791 2817 SET_DEVICE_OP(dev_ops, query_pkey); 2792 2818 SET_DEVICE_OP(dev_ops, query_port); 2819 + SET_DEVICE_OP(dev_ops, query_port_speed); 2793 2820 SET_DEVICE_OP(dev_ops, query_qp); 2794 2821 SET_DEVICE_OP(dev_ops, query_srq); 2795 2822 SET_DEVICE_OP(dev_ops, 
query_ucontext); ··· 2850 2875 2851 2876 return ret; 2852 2877 } 2853 - EXPORT_SYMBOL(ib_add_sub_device); 2854 2878 2855 2879 int ib_del_sub_device_and_put(struct ib_device *sub) 2856 2880 { ··· 2870 2896 2871 2897 return 0; 2872 2898 } 2873 - EXPORT_SYMBOL(ib_del_sub_device_and_put); 2874 2899 2875 2900 #ifdef CONFIG_INFINIBAND_VIRT_DMA 2876 2901 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents)
+24
drivers/infiniband/core/ib_core_uverbs.c
··· 5 5 * Copyright 2019 Marvell. All rights reserved. 6 6 */ 7 7 #include <linux/xarray.h> 8 + #include <linux/dma-buf.h> 9 + #include <linux/dma-resv.h> 8 10 #include "uverbs.h" 9 11 #include "core_priv.h" 12 + 13 + MODULE_IMPORT_NS("DMA_BUF"); 10 14 11 15 /** 12 16 * rdma_umap_priv_init() - Initialize the private data of a vma ··· 233 229 */ 234 230 void rdma_user_mmap_entry_remove(struct rdma_user_mmap_entry *entry) 235 231 { 232 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf, *tmp; 233 + 236 234 if (!entry) 237 235 return; 238 236 237 + mutex_lock(&entry->dmabufs_lock); 239 238 xa_lock(&entry->ucontext->mmap_xa); 240 239 entry->driver_removed = true; 241 240 xa_unlock(&entry->ucontext->mmap_xa); 241 + list_for_each_entry_safe(uverbs_dmabuf, tmp, &entry->dmabufs, dmabufs_elm) { 242 + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); 243 + list_del(&uverbs_dmabuf->dmabufs_elm); 244 + uverbs_dmabuf->revoked = true; 245 + dma_buf_move_notify(uverbs_dmabuf->dmabuf); 246 + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, 247 + DMA_RESV_USAGE_BOOKKEEP, false, 248 + MAX_SCHEDULE_TIMEOUT); 249 + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); 250 + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); 251 + wait_for_completion(&uverbs_dmabuf->comp); 252 + } 253 + mutex_unlock(&entry->dmabufs_lock); 254 + 242 255 kref_put(&entry->ref, rdma_user_mmap_entry_free); 243 256 } 244 257 EXPORT_SYMBOL(rdma_user_mmap_entry_remove); ··· 295 274 return -EINVAL; 296 275 297 276 kref_init(&entry->ref); 277 + INIT_LIST_HEAD(&entry->dmabufs); 278 + mutex_init(&entry->dmabufs_lock); 279 + 298 280 entry->ucontext = ucontext; 299 281 300 282 /*
+21 -35
drivers/infiniband/core/iwcm.c
··· 95 95 struct iwcm_work { 96 96 struct work_struct work; 97 97 struct iwcm_id_private *cm_id; 98 - struct list_head list; 99 98 struct iw_cm_event event; 100 99 struct list_head free_list; 101 100 }; ··· 177 178 return -ENOMEM; 178 179 } 179 180 work->cm_id = cm_id_priv; 180 - INIT_LIST_HEAD(&work->list); 181 181 put_work(work); 182 182 } 183 183 return 0; ··· 211 213 static bool iwcm_deref_id(struct iwcm_id_private *cm_id_priv) 212 214 { 213 215 if (refcount_dec_and_test(&cm_id_priv->refcount)) { 214 - BUG_ON(!list_empty(&cm_id_priv->work_list)); 215 216 free_cm_id(cm_id_priv); 216 217 return true; 217 218 } ··· 257 260 refcount_set(&cm_id_priv->refcount, 1); 258 261 init_waitqueue_head(&cm_id_priv->connect_wait); 259 262 init_completion(&cm_id_priv->destroy_comp); 260 - INIT_LIST_HEAD(&cm_id_priv->work_list); 261 263 INIT_LIST_HEAD(&cm_id_priv->work_free_list); 262 264 263 265 return &cm_id_priv->id; ··· 1003 1007 } 1004 1008 1005 1009 /* 1006 - * Process events on the work_list for the cm_id. If the callback 1007 - * function requests that the cm_id be deleted, a flag is set in the 1008 - * cm_id flags to indicate that when the last reference is 1009 - * removed, the cm_id is to be destroyed. This is necessary to 1010 - * distinguish between an object that will be destroyed by the app 1011 - * thread asleep on the destroy_comp list vs. an object destroyed 1012 - * here synchronously when the last reference is removed. 1010 + * Process events for the cm_id. If the callback function requests 1011 + * that the cm_id be deleted, a flag is set in the cm_id flags to 1012 + * indicate that when the last reference is removed, the cm_id is 1013 + * to be destroyed. This is necessary to distinguish between an 1014 + * object that will be destroyed by the app thread asleep on the 1015 + * destroy_comp list vs. an object destroyed here synchronously 1016 + * when the last reference is removed. 
1013 1017 */ 1014 1018 static void cm_work_handler(struct work_struct *_work) 1015 1019 { ··· 1020 1024 int ret = 0; 1021 1025 1022 1026 spin_lock_irqsave(&cm_id_priv->lock, flags); 1023 - while (!list_empty(&cm_id_priv->work_list)) { 1024 - work = list_first_entry(&cm_id_priv->work_list, 1025 - struct iwcm_work, list); 1026 - list_del_init(&work->list); 1027 - levent = work->event; 1028 - put_work(work); 1029 - spin_unlock_irqrestore(&cm_id_priv->lock, flags); 1030 - 1031 - if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { 1032 - ret = process_event(cm_id_priv, &levent); 1033 - if (ret) { 1034 - destroy_cm_id(&cm_id_priv->id); 1035 - WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); 1036 - } 1037 - } else 1038 - pr_debug("dropping event %d\n", levent.event); 1039 - if (iwcm_deref_id(cm_id_priv)) 1040 - return; 1041 - spin_lock_irqsave(&cm_id_priv->lock, flags); 1042 - } 1027 + levent = work->event; 1028 + put_work(work); 1043 1029 spin_unlock_irqrestore(&cm_id_priv->lock, flags); 1030 + 1031 + if (!test_bit(IWCM_F_DROP_EVENTS, &cm_id_priv->flags)) { 1032 + ret = process_event(cm_id_priv, &levent); 1033 + if (ret) { 1034 + destroy_cm_id(&cm_id_priv->id); 1035 + WARN_ON_ONCE(iwcm_deref_id(cm_id_priv)); 1036 + } 1037 + } else 1038 + pr_debug("dropping event %d\n", levent.event); 1039 + if (iwcm_deref_id(cm_id_priv)) 1040 + return; 1044 1041 } 1045 1042 1046 1043 /* 1047 1044 * This function is called on interrupt context. Schedule events on 1048 1045 * the iwcm_wq thread to allow callback functions to downcall into 1049 - * the CM and/or block. Events are queued to a per-CM_ID 1050 - * work_list. If this is the first event on the work_list, the work 1051 - * element is also queued on the iwcm_wq thread. 1046 + * the CM and/or block. 1052 1047 * 1053 1048 * Each event holds a reference on the cm_id. 
Until the last posted 1054 1049 * event has been delivered and processed, the cm_id cannot be ··· 1081 1094 } 1082 1095 1083 1096 refcount_inc(&cm_id_priv->refcount); 1084 - list_add_tail(&work->list, &cm_id_priv->work_list); 1085 1097 queue_work(iwcm_wq, &work->work); 1086 1098 out: 1087 1099 spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-1
drivers/infiniband/core/iwcm.h
··· 50 50 struct ib_qp *qp; 51 51 struct completion destroy_comp; 52 52 wait_queue_head_t connect_wait; 53 - struct list_head work_list; 54 53 spinlock_t lock; 55 54 refcount_t refcount; 56 55 struct list_head work_free_list;
+37 -26
drivers/infiniband/core/rdma_core.c
··· 465 465 466 466 fd_type = 467 467 container_of(obj->type_attrs, struct uverbs_obj_fd_type, type); 468 - if (WARN_ON(fd_type->fops->release != &uverbs_uobject_fd_release && 468 + if (WARN_ON(fd_type->fops && fd_type->fops->release != &uverbs_uobject_fd_release && 469 469 fd_type->fops->release != &uverbs_async_event_release)) { 470 470 ret = ERR_PTR(-EINVAL); 471 471 goto err_fd; ··· 477 477 goto err_fd; 478 478 } 479 479 480 - /* Note that uverbs_uobject_fd_release() is called during abort */ 481 - filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, 482 - fd_type->flags); 483 - if (IS_ERR(filp)) { 484 - ret = ERR_CAST(filp); 485 - goto err_getfile; 480 + if (fd_type->fops) { 481 + /* Note that uverbs_uobject_fd_release() is called during abort */ 482 + filp = anon_inode_getfile(fd_type->name, fd_type->fops, NULL, 483 + fd_type->flags); 484 + if (IS_ERR(filp)) { 485 + ret = ERR_CAST(filp); 486 + goto err_getfile; 487 + } 488 + uobj->object = filp; 486 489 } 487 - uobj->object = filp; 488 490 489 491 uobj->id = new_fd; 490 492 return uobj; ··· 563 561 { 564 562 struct file *filp = uobj->object; 565 563 566 - fput(filp); 564 + if (filp) 565 + fput(filp); 566 + 567 567 put_unused_fd(uobj->id); 568 568 } 569 569 ··· 632 628 /* This shouldn't be used anymore. Use the file object instead */ 633 629 uobj->id = 0; 634 630 635 - /* 636 - * NOTE: Once we install the file we loose ownership of our kref on 637 - * uobj. It will be put by uverbs_uobject_fd_release() 638 - */ 639 - filp->private_data = uobj; 631 + if (!filp->private_data) { 632 + /* 633 + * NOTE: Once we install the file we loose ownership of our kref on 634 + * uobj. 
It will be put by uverbs_uobject_fd_release() 635 + */ 636 + filp->private_data = uobj; 637 + } 638 + 640 639 fd_install(fd, filp); 641 640 } 642 641 ··· 809 802 }; 810 803 EXPORT_SYMBOL(uverbs_idr_class); 811 804 812 - /* 813 - * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct 814 - * file_operations release method. 815 - */ 816 - int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) 805 + int uverbs_uobject_release(struct ib_uobject *uobj) 817 806 { 818 807 struct ib_uverbs_file *ufile; 819 - struct ib_uobject *uobj; 820 808 821 - /* 822 - * This can only happen if the fput came from alloc_abort_fd_uobject() 823 - */ 824 - if (!filp->private_data) 825 - return 0; 826 - uobj = filp->private_data; 827 809 ufile = uobj->ufile; 828 810 829 811 if (down_read_trylock(&ufile->hw_destroy_rwsem)) { ··· 838 842 /* Pairs with filp->private_data in alloc_begin_fd_uobject */ 839 843 uverbs_uobject_put(uobj); 840 844 return 0; 845 + } 846 + 847 + /* 848 + * Users of UVERBS_TYPE_ALLOC_FD should set this function as the struct 849 + * file_operations release method. 850 + */ 851 + int uverbs_uobject_fd_release(struct inode *inode, struct file *filp) 852 + { 853 + /* 854 + * This can only happen if the fput came from alloc_abort_fd_uobject() 855 + */ 856 + if (!filp->private_data) 857 + return 0; 858 + 859 + return uverbs_uobject_release(filp->private_data); 841 860 } 842 861 EXPORT_SYMBOL(uverbs_uobject_fd_release); 843 862
+1
drivers/infiniband/core/rdma_core.h
··· 156 156 extern const struct uapi_definition uverbs_def_obj_cq[]; 157 157 extern const struct uapi_definition uverbs_def_obj_device[]; 158 158 extern const struct uapi_definition uverbs_def_obj_dm[]; 159 + extern const struct uapi_definition uverbs_def_obj_dmabuf[]; 159 160 extern const struct uapi_definition uverbs_def_obj_dmah[]; 160 161 extern const struct uapi_definition uverbs_def_obj_flow_action[]; 161 162 extern const struct uapi_definition uverbs_def_obj_intf[];
+482 -39
drivers/infiniband/core/rw.c
··· 14 14 RDMA_RW_MULTI_WR, 15 15 RDMA_RW_MR, 16 16 RDMA_RW_SIG_MR, 17 + RDMA_RW_IOVA, 17 18 }; 18 19 19 20 static bool rdma_rw_force_mr; ··· 122 121 return count; 123 122 } 124 123 124 + static int rdma_rw_init_reg_wr(struct rdma_rw_reg_ctx *reg, 125 + struct rdma_rw_reg_ctx *prev, struct ib_qp *qp, u32 port_num, 126 + u64 remote_addr, u32 rkey, enum dma_data_direction dir) 127 + { 128 + if (prev) { 129 + if (reg->mr->need_inval) 130 + prev->wr.wr.next = &reg->inv_wr; 131 + else 132 + prev->wr.wr.next = &reg->reg_wr.wr; 133 + } 134 + 135 + reg->reg_wr.wr.next = &reg->wr.wr; 136 + 137 + reg->wr.wr.sg_list = &reg->sge; 138 + reg->wr.wr.num_sge = 1; 139 + reg->wr.remote_addr = remote_addr; 140 + reg->wr.rkey = rkey; 141 + 142 + if (dir == DMA_TO_DEVICE) { 143 + reg->wr.wr.opcode = IB_WR_RDMA_WRITE; 144 + } else if (!rdma_cap_read_inv(qp->device, port_num)) { 145 + reg->wr.wr.opcode = IB_WR_RDMA_READ; 146 + } else { 147 + reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; 148 + reg->wr.wr.ex.invalidate_rkey = reg->mr->lkey; 149 + } 150 + 151 + return 1; 152 + } 153 + 125 154 static int rdma_rw_init_mr_wrs(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 126 155 u32 port_num, struct scatterlist *sg, u32 sg_cnt, u32 offset, 127 156 u64 remote_addr, u32 rkey, enum dma_data_direction dir) ··· 177 146 if (ret < 0) 178 147 goto out_free; 179 148 count += ret; 180 - 181 - if (prev) { 182 - if (reg->mr->need_inval) 183 - prev->wr.wr.next = &reg->inv_wr; 184 - else 185 - prev->wr.wr.next = &reg->reg_wr.wr; 186 - } 187 - 188 - reg->reg_wr.wr.next = &reg->wr.wr; 189 - 190 - reg->wr.wr.sg_list = &reg->sge; 191 - reg->wr.wr.num_sge = 1; 192 - reg->wr.remote_addr = remote_addr; 193 - reg->wr.rkey = rkey; 194 - if (dir == DMA_TO_DEVICE) { 195 - reg->wr.wr.opcode = IB_WR_RDMA_WRITE; 196 - } else if (!rdma_cap_read_inv(qp->device, port_num)) { 197 - reg->wr.wr.opcode = IB_WR_RDMA_READ; 198 - } else { 199 - reg->wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV; 200 - reg->wr.wr.ex.invalidate_rkey = 
reg->mr->lkey; 201 - } 202 - count++; 203 - 149 + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, 150 + remote_addr, rkey, dir); 204 151 remote_addr += reg->sge.length; 205 152 sg_cnt -= nents; 206 153 for (j = 0; j < nents; j++) ··· 198 189 ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 199 190 kfree(ctx->reg); 200 191 out: 192 + return ret; 193 + } 194 + 195 + static int rdma_rw_init_mr_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 196 + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, 197 + struct bvec_iter *iter, u64 remote_addr, u32 rkey, 198 + enum dma_data_direction dir) 199 + { 200 + struct ib_device *dev = qp->pd->device; 201 + struct rdma_rw_reg_ctx *prev = NULL; 202 + u32 pages_per_mr = rdma_rw_fr_page_list_len(dev, qp->integrity_en); 203 + struct scatterlist *sg; 204 + int i, ret, count = 0; 205 + u32 nents = 0; 206 + 207 + ctx->reg = kcalloc(DIV_ROUND_UP(nr_bvec, pages_per_mr), 208 + sizeof(*ctx->reg), GFP_KERNEL); 209 + if (!ctx->reg) 210 + return -ENOMEM; 211 + 212 + /* 213 + * Build scatterlist from bvecs using the iterator. This follows 214 + * the pattern from __blk_rq_map_sg. 
215 + */ 216 + ctx->reg[0].sgt.sgl = kmalloc_array(nr_bvec, 217 + sizeof(*ctx->reg[0].sgt.sgl), 218 + GFP_KERNEL); 219 + if (!ctx->reg[0].sgt.sgl) { 220 + ret = -ENOMEM; 221 + goto out_free_reg; 222 + } 223 + sg_init_table(ctx->reg[0].sgt.sgl, nr_bvec); 224 + 225 + for (sg = ctx->reg[0].sgt.sgl; iter->bi_size; sg = sg_next(sg)) { 226 + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); 227 + 228 + if (nents >= nr_bvec) { 229 + ret = -EINVAL; 230 + goto out_free_sgl; 231 + } 232 + sg_set_page(sg, bv.bv_page, bv.bv_len, bv.bv_offset); 233 + bvec_iter_advance(bvecs, iter, bv.bv_len); 234 + nents++; 235 + } 236 + sg_mark_end(sg_last(ctx->reg[0].sgt.sgl, nents)); 237 + ctx->reg[0].sgt.orig_nents = nents; 238 + 239 + /* DMA map the scatterlist */ 240 + ret = ib_dma_map_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 241 + if (ret) 242 + goto out_free_sgl; 243 + 244 + ctx->nr_ops = DIV_ROUND_UP(ctx->reg[0].sgt.nents, pages_per_mr); 245 + 246 + sg = ctx->reg[0].sgt.sgl; 247 + nents = ctx->reg[0].sgt.nents; 248 + for (i = 0; i < ctx->nr_ops; i++) { 249 + struct rdma_rw_reg_ctx *reg = &ctx->reg[i]; 250 + u32 sge_cnt = min(nents, pages_per_mr); 251 + 252 + ret = rdma_rw_init_one_mr(qp, port_num, reg, sg, sge_cnt, 0); 253 + if (ret < 0) 254 + goto out_free_mrs; 255 + count += ret; 256 + count += rdma_rw_init_reg_wr(reg, prev, qp, port_num, 257 + remote_addr, rkey, dir); 258 + remote_addr += reg->sge.length; 259 + nents -= sge_cnt; 260 + sg += sge_cnt; 261 + prev = reg; 262 + } 263 + 264 + if (prev) 265 + prev->wr.wr.next = NULL; 266 + 267 + ctx->type = RDMA_RW_MR; 268 + return count; 269 + 270 + out_free_mrs: 271 + while (--i >= 0) 272 + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 273 + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 274 + out_free_sgl: 275 + kfree(ctx->reg[0].sgt.sgl); 276 + out_free_reg: 277 + kfree(ctx->reg); 201 278 return ret; 202 279 } 203 280 ··· 369 274 return 1; 370 275 } 371 276 277 + static int rdma_rw_init_single_wr_bvec(struct 
rdma_rw_ctx *ctx, 278 + struct ib_qp *qp, const struct bio_vec *bvecs, 279 + struct bvec_iter *iter, u64 remote_addr, u32 rkey, 280 + enum dma_data_direction dir) 281 + { 282 + struct ib_device *dev = qp->pd->device; 283 + struct ib_rdma_wr *rdma_wr = &ctx->single.wr; 284 + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); 285 + u64 dma_addr; 286 + 287 + ctx->nr_ops = 1; 288 + 289 + dma_addr = ib_dma_map_bvec(dev, &bv, dir); 290 + if (ib_dma_mapping_error(dev, dma_addr)) 291 + return -ENOMEM; 292 + 293 + ctx->single.sge.lkey = qp->pd->local_dma_lkey; 294 + ctx->single.sge.addr = dma_addr; 295 + ctx->single.sge.length = bv.bv_len; 296 + 297 + memset(rdma_wr, 0, sizeof(*rdma_wr)); 298 + if (dir == DMA_TO_DEVICE) 299 + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; 300 + else 301 + rdma_wr->wr.opcode = IB_WR_RDMA_READ; 302 + rdma_wr->wr.sg_list = &ctx->single.sge; 303 + rdma_wr->wr.num_sge = 1; 304 + rdma_wr->remote_addr = remote_addr; 305 + rdma_wr->rkey = rkey; 306 + 307 + ctx->type = RDMA_RW_SINGLE_WR; 308 + return 1; 309 + } 310 + 311 + static int rdma_rw_init_map_wrs_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 312 + const struct bio_vec *bvecs, u32 nr_bvec, struct bvec_iter *iter, 313 + u64 remote_addr, u32 rkey, enum dma_data_direction dir) 314 + { 315 + struct ib_device *dev = qp->pd->device; 316 + u32 max_sge = dir == DMA_TO_DEVICE ? 
qp->max_write_sge : 317 + qp->max_read_sge; 318 + struct ib_sge *sge; 319 + u32 total_len = 0, i, j; 320 + u32 mapped_bvecs = 0; 321 + u32 nr_ops = DIV_ROUND_UP(nr_bvec, max_sge); 322 + size_t sges_size = array_size(nr_bvec, sizeof(*ctx->map.sges)); 323 + size_t wrs_offset = ALIGN(sges_size, __alignof__(*ctx->map.wrs)); 324 + size_t wrs_size = array_size(nr_ops, sizeof(*ctx->map.wrs)); 325 + void *mem; 326 + 327 + if (sges_size == SIZE_MAX || wrs_size == SIZE_MAX || 328 + check_add_overflow(wrs_offset, wrs_size, &wrs_size)) 329 + return -ENOMEM; 330 + 331 + mem = kzalloc(wrs_size, GFP_KERNEL); 332 + if (!mem) 333 + return -ENOMEM; 334 + 335 + ctx->map.sges = sge = mem; 336 + ctx->map.wrs = mem + wrs_offset; 337 + 338 + for (i = 0; i < nr_ops; i++) { 339 + struct ib_rdma_wr *rdma_wr = &ctx->map.wrs[i]; 340 + u32 nr_sge = min(nr_bvec - mapped_bvecs, max_sge); 341 + 342 + if (dir == DMA_TO_DEVICE) 343 + rdma_wr->wr.opcode = IB_WR_RDMA_WRITE; 344 + else 345 + rdma_wr->wr.opcode = IB_WR_RDMA_READ; 346 + rdma_wr->remote_addr = remote_addr + total_len; 347 + rdma_wr->rkey = rkey; 348 + rdma_wr->wr.num_sge = nr_sge; 349 + rdma_wr->wr.sg_list = sge; 350 + 351 + for (j = 0; j < nr_sge; j++) { 352 + struct bio_vec bv = mp_bvec_iter_bvec(bvecs, *iter); 353 + u64 dma_addr; 354 + 355 + dma_addr = ib_dma_map_bvec(dev, &bv, dir); 356 + if (ib_dma_mapping_error(dev, dma_addr)) 357 + goto out_unmap; 358 + 359 + mapped_bvecs++; 360 + sge->addr = dma_addr; 361 + sge->length = bv.bv_len; 362 + sge->lkey = qp->pd->local_dma_lkey; 363 + 364 + total_len += bv.bv_len; 365 + sge++; 366 + 367 + bvec_iter_advance_single(bvecs, iter, bv.bv_len); 368 + } 369 + 370 + rdma_wr->wr.next = i + 1 < nr_ops ? 
371 + &ctx->map.wrs[i + 1].wr : NULL; 372 + } 373 + 374 + ctx->nr_ops = nr_ops; 375 + ctx->type = RDMA_RW_MULTI_WR; 376 + return nr_ops; 377 + 378 + out_unmap: 379 + for (i = 0; i < mapped_bvecs; i++) 380 + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, 381 + ctx->map.sges[i].length, dir); 382 + kfree(ctx->map.sges); 383 + return -ENOMEM; 384 + } 385 + 386 + /* 387 + * Try to use the two-step IOVA API to map bvecs into a contiguous DMA range. 388 + * This reduces IOTLB sync overhead by doing one sync at the end instead of 389 + * one per bvec, and produces a contiguous DMA address range that can be 390 + * described by a single SGE. 391 + * 392 + * Returns the number of WQEs (always 1) on success, -EOPNOTSUPP if IOVA 393 + * mapping is not available, or another negative error code on failure. 394 + */ 395 + static int rdma_rw_init_iova_wrs_bvec(struct rdma_rw_ctx *ctx, 396 + struct ib_qp *qp, const struct bio_vec *bvec, 397 + struct bvec_iter *iter, u64 remote_addr, u32 rkey, 398 + enum dma_data_direction dir) 399 + { 400 + struct ib_device *dev = qp->pd->device; 401 + struct device *dma_dev = dev->dma_device; 402 + size_t total_len = iter->bi_size; 403 + struct bio_vec first_bv; 404 + size_t mapped_len = 0; 405 + int ret; 406 + 407 + /* Virtual DMA devices cannot support IOVA allocators */ 408 + if (ib_uses_virt_dma(dev)) 409 + return -EOPNOTSUPP; 410 + 411 + /* Try to allocate contiguous IOVA space */ 412 + first_bv = mp_bvec_iter_bvec(bvec, *iter); 413 + if (!dma_iova_try_alloc(dma_dev, &ctx->iova.state, 414 + bvec_phys(&first_bv), total_len)) 415 + return -EOPNOTSUPP; 416 + 417 + /* Link all bvecs into the IOVA space */ 418 + while (iter->bi_size) { 419 + struct bio_vec bv = mp_bvec_iter_bvec(bvec, *iter); 420 + 421 + ret = dma_iova_link(dma_dev, &ctx->iova.state, bvec_phys(&bv), 422 + mapped_len, bv.bv_len, dir, 0); 423 + if (ret) 424 + goto out_destroy; 425 + 426 + mapped_len += bv.bv_len; 427 + bvec_iter_advance(bvec, iter, bv.bv_len); 428 + } 429 + 430 + /* 
Sync the IOTLB once for all linked pages */ 431 + ret = dma_iova_sync(dma_dev, &ctx->iova.state, 0, mapped_len); 432 + if (ret) 433 + goto out_destroy; 434 + 435 + ctx->iova.mapped_len = mapped_len; 436 + 437 + /* Single SGE covers the entire contiguous IOVA range */ 438 + ctx->iova.sge.addr = ctx->iova.state.addr; 439 + ctx->iova.sge.length = mapped_len; 440 + ctx->iova.sge.lkey = qp->pd->local_dma_lkey; 441 + 442 + /* Single WR for the whole transfer */ 443 + memset(&ctx->iova.wr, 0, sizeof(ctx->iova.wr)); 444 + if (dir == DMA_TO_DEVICE) 445 + ctx->iova.wr.wr.opcode = IB_WR_RDMA_WRITE; 446 + else 447 + ctx->iova.wr.wr.opcode = IB_WR_RDMA_READ; 448 + ctx->iova.wr.wr.num_sge = 1; 449 + ctx->iova.wr.wr.sg_list = &ctx->iova.sge; 450 + ctx->iova.wr.remote_addr = remote_addr; 451 + ctx->iova.wr.rkey = rkey; 452 + 453 + ctx->type = RDMA_RW_IOVA; 454 + ctx->nr_ops = 1; 455 + return 1; 456 + 457 + out_destroy: 458 + /* 459 + * dma_iova_destroy() expects the actual mapped length, not the 460 + * total allocation size. It unlinks only the successfully linked 461 + * range and frees the entire IOVA allocation. 
462 + */ 463 + dma_iova_destroy(dma_dev, &ctx->iova.state, mapped_len, dir, 0); 464 + return ret; 465 + } 466 + 372 467 /** 373 468 * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context 374 469 * @ctx: context to initialize ··· 628 343 return ret; 629 344 } 630 345 EXPORT_SYMBOL(rdma_rw_ctx_init); 346 + 347 + /** 348 + * rdma_rw_ctx_init_bvec - initialize a RDMA READ/WRITE context from bio_vec 349 + * @ctx: context to initialize 350 + * @qp: queue pair to operate on 351 + * @port_num: port num to which the connection is bound 352 + * @bvecs: bio_vec array to READ/WRITE from/to 353 + * @nr_bvec: number of entries in @bvecs 354 + * @iter: bvec iterator describing offset and length 355 + * @remote_addr: remote address to read/write (relative to @rkey) 356 + * @rkey: remote key to operate on 357 + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ 358 + * 359 + * Maps the bio_vec array directly, avoiding intermediate scatterlist 360 + * conversion. Supports MR registration for iWARP devices and force_mr mode. 361 + * 362 + * Returns the number of WQEs that will be needed on the workqueue if 363 + * successful, or a negative error code: 364 + * 365 + * * -EINVAL - @nr_bvec is zero or @iter.bi_size is zero 366 + * * -ENOMEM - DMA mapping or memory allocation failed 367 + */ 368 + int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 369 + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, 370 + struct bvec_iter iter, u64 remote_addr, u32 rkey, 371 + enum dma_data_direction dir) 372 + { 373 + struct ib_device *dev = qp->pd->device; 374 + int ret; 375 + 376 + if (nr_bvec == 0 || iter.bi_size == 0) 377 + return -EINVAL; 378 + 379 + /* 380 + * iWARP requires MR registration for all RDMA READs. The force_mr 381 + * debug option also mandates MR usage. 
382 + */ 383 + if (dir == DMA_FROM_DEVICE && rdma_protocol_iwarp(dev, port_num)) 384 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 385 + nr_bvec, &iter, remote_addr, 386 + rkey, dir); 387 + if (unlikely(rdma_rw_force_mr)) 388 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 389 + nr_bvec, &iter, remote_addr, 390 + rkey, dir); 391 + 392 + if (nr_bvec == 1) 393 + return rdma_rw_init_single_wr_bvec(ctx, qp, bvecs, &iter, 394 + remote_addr, rkey, dir); 395 + 396 + /* 397 + * Try IOVA-based mapping first for multi-bvec transfers. 398 + * IOVA coalesces bvecs into a single DMA-contiguous region, 399 + * reducing the number of WRs needed and avoiding MR overhead. 400 + */ 401 + ret = rdma_rw_init_iova_wrs_bvec(ctx, qp, bvecs, &iter, remote_addr, 402 + rkey, dir); 403 + if (ret != -EOPNOTSUPP) 404 + return ret; 405 + 406 + /* 407 + * IOVA mapping not available. Check if MR registration provides 408 + * better performance than multiple SGE entries. 409 + */ 410 + if (rdma_rw_io_needs_mr(dev, port_num, dir, nr_bvec)) 411 + return rdma_rw_init_mr_wrs_bvec(ctx, qp, port_num, bvecs, 412 + nr_bvec, &iter, remote_addr, 413 + rkey, dir); 414 + 415 + return rdma_rw_init_map_wrs_bvec(ctx, qp, bvecs, nr_bvec, &iter, 416 + remote_addr, rkey, dir); 417 + } 418 + EXPORT_SYMBOL(rdma_rw_ctx_init_bvec); 631 419 632 420 /** 633 421 * rdma_rw_ctx_signature_init - initialize a RW context with signature offload ··· 873 515 first_wr = &ctx->reg[0].reg_wr.wr; 874 516 last_wr = &ctx->reg[ctx->nr_ops - 1].wr.wr; 875 517 break; 518 + case RDMA_RW_IOVA: 519 + first_wr = &ctx->iova.wr.wr; 520 + last_wr = &ctx->iova.wr.wr; 521 + break; 876 522 case RDMA_RW_MULTI_WR: 877 523 first_wr = &ctx->map.wrs[0].wr; 878 524 last_wr = &ctx->map.wrs[ctx->nr_ops - 1].wr; ··· 941 579 942 580 switch (ctx->type) { 943 581 case RDMA_RW_MR: 582 + /* Bvec MR contexts must use rdma_rw_ctx_destroy_bvec() */ 583 + WARN_ON_ONCE(ctx->reg[0].sgt.sgl); 944 584 for (i = 0; i < ctx->nr_ops; i++) 945 585 
ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 946 586 kfree(ctx->reg); ··· 953 589 break; 954 590 case RDMA_RW_SINGLE_WR: 955 591 break; 592 + case RDMA_RW_IOVA: 593 + /* IOVA contexts must use rdma_rw_ctx_destroy_bvec() */ 594 + WARN_ON_ONCE(1); 595 + return; 956 596 default: 957 597 BUG(); 958 598 break; ··· 965 597 ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir); 966 598 } 967 599 EXPORT_SYMBOL(rdma_rw_ctx_destroy); 600 + 601 + /** 602 + * rdma_rw_ctx_destroy_bvec - release resources from rdma_rw_ctx_init_bvec 603 + * @ctx: context to release 604 + * @qp: queue pair to operate on 605 + * @port_num: port num to which the connection is bound (unused) 606 + * @bvecs: bio_vec array that was used for the READ/WRITE (unused) 607 + * @nr_bvec: number of entries in @bvecs 608 + * @dir: %DMA_TO_DEVICE for RDMA WRITE, %DMA_FROM_DEVICE for RDMA READ 609 + * 610 + * Releases all resources allocated by a successful rdma_rw_ctx_init_bvec() 611 + * call. Must not be called if rdma_rw_ctx_init_bvec() returned an error. 612 + * 613 + * The @port_num and @bvecs parameters are unused but present for API 614 + * symmetry with rdma_rw_ctx_destroy(). 
615 + */ 616 + void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 617 + u32 __maybe_unused port_num, 618 + const struct bio_vec __maybe_unused *bvecs, 619 + u32 nr_bvec, enum dma_data_direction dir) 620 + { 621 + struct ib_device *dev = qp->pd->device; 622 + u32 i; 623 + 624 + switch (ctx->type) { 625 + case RDMA_RW_MR: 626 + for (i = 0; i < ctx->nr_ops; i++) 627 + ib_mr_pool_put(qp, &qp->rdma_mrs, ctx->reg[i].mr); 628 + ib_dma_unmap_sgtable_attrs(dev, &ctx->reg[0].sgt, dir, 0); 629 + kfree(ctx->reg[0].sgt.sgl); 630 + kfree(ctx->reg); 631 + break; 632 + case RDMA_RW_IOVA: 633 + dma_iova_destroy(dev->dma_device, &ctx->iova.state, 634 + ctx->iova.mapped_len, dir, 0); 635 + break; 636 + case RDMA_RW_MULTI_WR: 637 + for (i = 0; i < nr_bvec; i++) 638 + ib_dma_unmap_bvec(dev, ctx->map.sges[i].addr, 639 + ctx->map.sges[i].length, dir); 640 + kfree(ctx->map.sges); 641 + break; 642 + case RDMA_RW_SINGLE_WR: 643 + ib_dma_unmap_bvec(dev, ctx->single.sge.addr, 644 + ctx->single.sge.length, dir); 645 + break; 646 + default: 647 + WARN_ON_ONCE(1); 648 + return; 649 + } 650 + } 651 + EXPORT_SYMBOL(rdma_rw_ctx_destroy_bvec); 968 652 969 653 /** 970 654 * rdma_rw_ctx_destroy_signature - release all resources allocated by ··· 1071 651 } 1072 652 EXPORT_SYMBOL(rdma_rw_mr_factor); 1073 653 654 + /** 655 + * rdma_rw_max_send_wr - compute max Send WRs needed for RDMA R/W contexts 656 + * @dev: RDMA device 657 + * @port_num: port number 658 + * @max_rdma_ctxs: number of rdma_rw_ctx structures 659 + * @create_flags: QP create flags (pass IB_QP_CREATE_INTEGRITY_EN if 660 + * data integrity will be enabled on the QP) 661 + * 662 + * Returns the total number of Send Queue entries needed for 663 + * @max_rdma_ctxs. The result accounts for memory registration and 664 + * invalidation work requests when the device requires them. 665 + * 666 + * ULPs use this to size Send Queues and Send CQs before creating a 667 + * Queue Pair. 
668 + */ 669 + unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, 670 + unsigned int max_rdma_ctxs, u32 create_flags) 671 + { 672 + unsigned int factor = 1; 673 + unsigned int result; 674 + 675 + if (create_flags & IB_QP_CREATE_INTEGRITY_EN || 676 + rdma_rw_can_use_mr(dev, port_num)) 677 + factor += 2; /* reg + inv */ 678 + 679 + if (check_mul_overflow(factor, max_rdma_ctxs, &result)) 680 + return UINT_MAX; 681 + return result; 682 + } 683 + EXPORT_SYMBOL(rdma_rw_max_send_wr); 684 + 1074 685 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr) 1075 686 { 1076 - u32 factor; 687 + unsigned int factor = 1; 1077 688 1078 689 WARN_ON_ONCE(attr->port_num == 0); 1079 690 1080 691 /* 1081 - * Each context needs at least one RDMA READ or WRITE WR. 1082 - * 1083 - * For some hardware we might need more, eventually we should ask the 1084 - * HCA driver for a multiplier here. 1085 - */ 1086 - factor = 1; 1087 - 1088 - /* 1089 - * If the device needs MRs to perform RDMA READ or WRITE operations, 1090 - * we'll need two additional MRs for the registrations and the 1091 - * invalidation. 692 + * If the device uses MRs to perform RDMA READ or WRITE operations, 693 + * or if data integrity is enabled, account for registration and 694 + * invalidation work requests. 1092 695 */ 1093 696 if (attr->create_flags & IB_QP_CREATE_INTEGRITY_EN || 1094 697 rdma_rw_can_use_mr(dev, attr->port_num)) 1095 - factor += 2; /* inv + reg */ 698 + factor += 2; /* reg + inv */ 1096 699 1097 700 attr->cap.max_send_wr += factor * attr->cap.max_rdma_ctxs; 1098 701 1099 702 /* 1100 - * But maybe we were just too high in the sky and the device doesn't 1101 - * even support all we need, and we'll have to live with what we get.. 703 + * The device might not support all we need, and we'll have to 704 + * live with what we get. 1102 705 */ 1103 706 attr->cap.max_send_wr = 1104 707 min_t(u32, attr->cap.max_send_wr, dev->attrs.max_qp_wr);
+8 -48
drivers/infiniband/core/sysfs.c
··· 292 292 static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, 293 293 struct ib_port_attribute *unused, char *buf) 294 294 { 295 + struct ib_port_speed_info speed_info; 295 296 struct ib_port_attr attr; 296 - char *speed = ""; 297 - int rate; /* in deci-Gb/sec */ 298 297 ssize_t ret; 299 298 300 299 ret = ib_query_port(ibdev, port_num, &attr); 301 300 if (ret) 302 301 return ret; 303 302 304 - switch (attr.active_speed) { 305 - case IB_SPEED_DDR: 306 - speed = " DDR"; 307 - rate = 50; 308 - break; 309 - case IB_SPEED_QDR: 310 - speed = " QDR"; 311 - rate = 100; 312 - break; 313 - case IB_SPEED_FDR10: 314 - speed = " FDR10"; 315 - rate = 100; 316 - break; 317 - case IB_SPEED_FDR: 318 - speed = " FDR"; 319 - rate = 140; 320 - break; 321 - case IB_SPEED_EDR: 322 - speed = " EDR"; 323 - rate = 250; 324 - break; 325 - case IB_SPEED_HDR: 326 - speed = " HDR"; 327 - rate = 500; 328 - break; 329 - case IB_SPEED_NDR: 330 - speed = " NDR"; 331 - rate = 1000; 332 - break; 333 - case IB_SPEED_XDR: 334 - speed = " XDR"; 335 - rate = 2000; 336 - break; 337 - case IB_SPEED_SDR: 338 - default: /* default to SDR for invalid rates */ 339 - speed = " SDR"; 340 - rate = 25; 341 - break; 342 - } 303 + ret = ib_port_attr_to_speed_info(&attr, &speed_info); 304 + if (ret) 305 + return ret; 343 306 344 - rate *= ib_width_enum_to_int(attr.active_width); 345 - if (rate < 0) 346 - return -EINVAL; 347 - 348 - return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, 349 - rate % 10 ? ".5" : "", 350 - ib_width_enum_to_int(attr.active_width), speed); 307 + return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", speed_info.rate / 10, 308 + speed_info.rate % 10 ? ".5" : "", 309 + ib_width_enum_to_int(attr.active_width), 310 + speed_info.str); 351 311 } 352 312 353 313 static const char *phys_state_to_str(enum ib_port_phys_state phys_state)
-3
drivers/infiniband/core/umem_dmabuf.c
··· 129 129 if (check_add_overflow(offset, (unsigned long)size, &end)) 130 130 return ret; 131 131 132 - if (unlikely(!ops || !ops->move_notify)) 133 - return ret; 134 - 135 132 dmabuf = dma_buf_get(fd); 136 133 if (IS_ERR(dmabuf)) 137 134 return ERR_CAST(dmabuf);
+6 -2
drivers/infiniband/core/user_mad.c
··· 514 514 struct rdma_ah_attr ah_attr; 515 515 struct ib_ah *ah; 516 516 __be64 *tid; 517 - int ret, data_len, hdr_len, copy_offset, rmpp_active; 517 + int ret, hdr_len, copy_offset, rmpp_active; 518 + size_t data_len; 518 519 u8 base_version; 519 520 520 521 if (count < hdr_size(file) + IB_MGMT_RMPP_HDR) ··· 589 588 } 590 589 591 590 base_version = ((struct ib_mad_hdr *)&packet->mad.data)->base_version; 592 - data_len = count - hdr_size(file) - hdr_len; 591 + if (check_sub_overflow(count, hdr_size(file) + hdr_len, &data_len)) { 592 + ret = -EINVAL; 593 + goto err_ah; 594 + } 593 595 packet->msg = ib_create_send_mad(agent, 594 596 be32_to_cpu(packet->mad.hdr.qpn), 595 597 packet->mad.hdr.pkey_index, rmpp_active,
+21
drivers/infiniband/core/uverbs.h
··· 133 133 struct ib_uverbs_event_queue ev_queue; 134 134 }; 135 135 136 + struct ib_uverbs_dmabuf_file { 137 + struct ib_uobject uobj; 138 + struct dma_buf *dmabuf; 139 + struct list_head dmabufs_elm; 140 + struct rdma_user_mmap_entry *mmap_entry; 141 + struct phys_vec phys_vec; 142 + struct p2pdma_provider *provider; 143 + struct kref kref; 144 + struct completion comp; 145 + u8 revoked :1; 146 + }; 147 + 136 148 struct ib_uverbs_event { 137 149 union { 138 150 struct ib_uverbs_async_event_desc async; ··· 302 290 void copy_port_attr_to_resp(struct ib_port_attr *attr, 303 291 struct ib_uverbs_query_port_resp *resp, 304 292 struct ib_device *ib_dev, u8 port_num); 293 + 294 + static inline void ib_uverbs_dmabuf_done(struct kref *kref) 295 + { 296 + struct ib_uverbs_dmabuf_file *priv = 297 + container_of(kref, struct ib_uverbs_dmabuf_file, kref); 298 + 299 + complete(&priv->comp); 300 + } 301 + 305 302 #endif /* UVERBS_H */
+5 -2
drivers/infiniband/core/uverbs_cmd.c
··· 2049 2049 if (ret) 2050 2050 return ret; 2051 2051 2052 - user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL); 2052 + if (cmd.wqe_size < sizeof(struct ib_uverbs_send_wr)) 2053 + return -EINVAL; 2054 + 2055 + user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL | __GFP_NOWARN); 2053 2056 if (!user_wr) 2054 2057 return -ENOMEM; 2055 2058 ··· 2242 2239 if (ret) 2243 2240 return ERR_PTR(ret); 2244 2241 2245 - user_wr = kmalloc(wqe_size, GFP_KERNEL); 2242 + user_wr = kmalloc(wqe_size, GFP_KERNEL | __GFP_NOWARN); 2246 2243 if (!user_wr) 2247 2244 return ERR_PTR(-ENOMEM); 2248 2245
+42
drivers/infiniband/core/uverbs_std_types_device.c
··· 209 209 &resp, sizeof(resp)); 210 210 } 211 211 212 + static int UVERBS_HANDLER(UVERBS_METHOD_QUERY_PORT_SPEED)( 213 + struct uverbs_attr_bundle *attrs) 214 + { 215 + struct ib_ucontext *ucontext; 216 + struct ib_device *ib_dev; 217 + u32 port_num; 218 + u64 speed; 219 + int ret; 220 + 221 + ucontext = ib_uverbs_get_ucontext(attrs); 222 + if (IS_ERR(ucontext)) 223 + return PTR_ERR(ucontext); 224 + ib_dev = ucontext->device; 225 + 226 + if (!ib_dev->ops.query_port_speed) 227 + return -EOPNOTSUPP; 228 + 229 + ret = uverbs_get_const(&port_num, attrs, 230 + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM); 231 + if (ret) 232 + return ret; 233 + 234 + if (!rdma_is_port_valid(ib_dev, port_num)) 235 + return -EINVAL; 236 + 237 + ret = ib_dev->ops.query_port_speed(ib_dev, port_num, &speed); 238 + if (ret) 239 + return ret; 240 + 241 + return uverbs_copy_to(attrs, UVERBS_ATTR_QUERY_PORT_SPEED_RESP, 242 + &speed, sizeof(speed)); 243 + } 244 + 212 245 static int UVERBS_HANDLER(UVERBS_METHOD_GET_CONTEXT)( 213 246 struct uverbs_attr_bundle *attrs) 214 247 { ··· 503 470 UA_MANDATORY)); 504 471 505 472 DECLARE_UVERBS_NAMED_METHOD( 473 + UVERBS_METHOD_QUERY_PORT_SPEED, 474 + UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, u32, 475 + UA_MANDATORY), 476 + UVERBS_ATTR_PTR_OUT(UVERBS_ATTR_QUERY_PORT_SPEED_RESP, 477 + UVERBS_ATTR_TYPE(u64), 478 + UA_MANDATORY)); 479 + 480 + DECLARE_UVERBS_NAMED_METHOD( 506 481 UVERBS_METHOD_QUERY_GID_TABLE, 507 482 UVERBS_ATTR_CONST_IN(UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, u64, 508 483 UA_MANDATORY), ··· 539 498 &UVERBS_METHOD(UVERBS_METHOD_INVOKE_WRITE), 540 499 &UVERBS_METHOD(UVERBS_METHOD_INFO_HANDLES), 541 500 &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT), 501 + &UVERBS_METHOD(UVERBS_METHOD_QUERY_PORT_SPEED), 542 502 &UVERBS_METHOD(UVERBS_METHOD_QUERY_CONTEXT), 543 503 &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_TABLE), 544 504 &UVERBS_METHOD(UVERBS_METHOD_QUERY_GID_ENTRY));
+200
drivers/infiniband/core/uverbs_std_types_dmabuf.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB 2 + /* 3 + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved 4 + */ 5 + 6 + #include <linux/dma-buf-mapping.h> 7 + #include <linux/pci-p2pdma.h> 8 + #include <linux/dma-resv.h> 9 + #include <rdma/uverbs_std_types.h> 10 + #include "rdma_core.h" 11 + #include "uverbs.h" 12 + 13 + static int uverbs_dmabuf_attach(struct dma_buf *dmabuf, 14 + struct dma_buf_attachment *attachment) 15 + { 16 + if (!attachment->peer2peer) 17 + return -EOPNOTSUPP; 18 + 19 + return 0; 20 + } 21 + 22 + static struct sg_table * 23 + uverbs_dmabuf_map(struct dma_buf_attachment *attachment, 24 + enum dma_data_direction dir) 25 + { 26 + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; 27 + struct sg_table *ret; 28 + 29 + dma_resv_assert_held(priv->dmabuf->resv); 30 + 31 + if (priv->revoked) 32 + return ERR_PTR(-ENODEV); 33 + 34 + ret = dma_buf_phys_vec_to_sgt(attachment, priv->provider, 35 + &priv->phys_vec, 1, priv->phys_vec.len, 36 + dir); 37 + if (IS_ERR(ret)) 38 + return ret; 39 + 40 + kref_get(&priv->kref); 41 + return ret; 42 + } 43 + 44 + static void uverbs_dmabuf_unmap(struct dma_buf_attachment *attachment, 45 + struct sg_table *sgt, 46 + enum dma_data_direction dir) 47 + { 48 + struct ib_uverbs_dmabuf_file *priv = attachment->dmabuf->priv; 49 + 50 + dma_resv_assert_held(priv->dmabuf->resv); 51 + dma_buf_free_sgt(attachment, sgt, dir); 52 + kref_put(&priv->kref, ib_uverbs_dmabuf_done); 53 + } 54 + 55 + static int uverbs_dmabuf_pin(struct dma_buf_attachment *attach) 56 + { 57 + return -EOPNOTSUPP; 58 + } 59 + 60 + static void uverbs_dmabuf_unpin(struct dma_buf_attachment *attach) 61 + { 62 + } 63 + 64 + static void uverbs_dmabuf_release(struct dma_buf *dmabuf) 65 + { 66 + struct ib_uverbs_dmabuf_file *priv = dmabuf->priv; 67 + 68 + /* 69 + * This can only happen if the fput came from alloc_abort_fd_uobject() 70 + */ 71 + if (!priv->uobj.context) 72 + return; 73 + 74 + 
uverbs_uobject_release(&priv->uobj); 75 + } 76 + 77 + static const struct dma_buf_ops uverbs_dmabuf_ops = { 78 + .attach = uverbs_dmabuf_attach, 79 + .map_dma_buf = uverbs_dmabuf_map, 80 + .unmap_dma_buf = uverbs_dmabuf_unmap, 81 + .pin = uverbs_dmabuf_pin, 82 + .unpin = uverbs_dmabuf_unpin, 83 + .release = uverbs_dmabuf_release, 84 + }; 85 + 86 + static int UVERBS_HANDLER(UVERBS_METHOD_DMABUF_ALLOC)( 87 + struct uverbs_attr_bundle *attrs) 88 + { 89 + struct ib_uobject *uobj = 90 + uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE) 91 + ->obj_attr.uobject; 92 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = 93 + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); 94 + struct ib_device *ib_dev = attrs->context->device; 95 + struct rdma_user_mmap_entry *mmap_entry; 96 + DEFINE_DMA_BUF_EXPORT_INFO(exp_info); 97 + off_t pg_off; 98 + int ret; 99 + 100 + ret = uverbs_get_const(&pg_off, attrs, UVERBS_ATTR_ALLOC_DMABUF_PGOFF); 101 + if (ret) 102 + return ret; 103 + 104 + mmap_entry = ib_dev->ops.pgoff_to_mmap_entry(attrs->context, pg_off); 105 + if (!mmap_entry) 106 + return -EINVAL; 107 + 108 + ret = ib_dev->ops.mmap_get_pfns(mmap_entry, &uverbs_dmabuf->phys_vec, 109 + &uverbs_dmabuf->provider); 110 + if (ret) 111 + goto err; 112 + 113 + exp_info.ops = &uverbs_dmabuf_ops; 114 + exp_info.size = uverbs_dmabuf->phys_vec.len; 115 + exp_info.flags = O_CLOEXEC; 116 + exp_info.priv = uverbs_dmabuf; 117 + 118 + uverbs_dmabuf->dmabuf = dma_buf_export(&exp_info); 119 + if (IS_ERR(uverbs_dmabuf->dmabuf)) { 120 + ret = PTR_ERR(uverbs_dmabuf->dmabuf); 121 + goto err; 122 + } 123 + 124 + kref_init(&uverbs_dmabuf->kref); 125 + init_completion(&uverbs_dmabuf->comp); 126 + INIT_LIST_HEAD(&uverbs_dmabuf->dmabufs_elm); 127 + mutex_lock(&mmap_entry->dmabufs_lock); 128 + if (mmap_entry->driver_removed) 129 + ret = -EIO; 130 + else 131 + list_add_tail(&uverbs_dmabuf->dmabufs_elm, &mmap_entry->dmabufs); 132 + mutex_unlock(&mmap_entry->dmabufs_lock); 133 + if (ret) 134 + goto 
err_revoked; 135 + 136 + uobj->object = uverbs_dmabuf->dmabuf->file; 137 + uverbs_dmabuf->mmap_entry = mmap_entry; 138 + uverbs_finalize_uobj_create(attrs, UVERBS_ATTR_ALLOC_DMABUF_HANDLE); 139 + return 0; 140 + 141 + err_revoked: 142 + dma_buf_put(uverbs_dmabuf->dmabuf); 143 + err: 144 + rdma_user_mmap_entry_put(mmap_entry); 145 + return ret; 146 + } 147 + 148 + DECLARE_UVERBS_NAMED_METHOD( 149 + UVERBS_METHOD_DMABUF_ALLOC, 150 + UVERBS_ATTR_FD(UVERBS_ATTR_ALLOC_DMABUF_HANDLE, 151 + UVERBS_OBJECT_DMABUF, 152 + UVERBS_ACCESS_NEW, 153 + UA_MANDATORY), 154 + UVERBS_ATTR_PTR_IN(UVERBS_ATTR_ALLOC_DMABUF_PGOFF, 155 + UVERBS_ATTR_TYPE(u64), 156 + UA_MANDATORY)); 157 + 158 + static void uverbs_dmabuf_fd_destroy_uobj(struct ib_uobject *uobj, 159 + enum rdma_remove_reason why) 160 + { 161 + struct ib_uverbs_dmabuf_file *uverbs_dmabuf = 162 + container_of(uobj, struct ib_uverbs_dmabuf_file, uobj); 163 + bool wait_for_comp = false; 164 + 165 + mutex_lock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); 166 + dma_resv_lock(uverbs_dmabuf->dmabuf->resv, NULL); 167 + if (!uverbs_dmabuf->revoked) { 168 + uverbs_dmabuf->revoked = true; 169 + list_del(&uverbs_dmabuf->dmabufs_elm); 170 + dma_buf_move_notify(uverbs_dmabuf->dmabuf); 171 + dma_resv_wait_timeout(uverbs_dmabuf->dmabuf->resv, 172 + DMA_RESV_USAGE_BOOKKEEP, false, 173 + MAX_SCHEDULE_TIMEOUT); 174 + wait_for_comp = true; 175 + } 176 + dma_resv_unlock(uverbs_dmabuf->dmabuf->resv); 177 + if (wait_for_comp) { 178 + kref_put(&uverbs_dmabuf->kref, ib_uverbs_dmabuf_done); 179 + /* Let's wait till all DMA unmap are completed. 
*/ 180 + wait_for_completion(&uverbs_dmabuf->comp); 181 + } 182 + mutex_unlock(&uverbs_dmabuf->mmap_entry->dmabufs_lock); 183 + 184 + /* Matches the get done as part of pgoff_to_mmap_entry() */ 185 + rdma_user_mmap_entry_put(uverbs_dmabuf->mmap_entry); 186 + } 187 + 188 + DECLARE_UVERBS_NAMED_OBJECT( 189 + UVERBS_OBJECT_DMABUF, 190 + UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_dmabuf_file), 191 + uverbs_dmabuf_fd_destroy_uobj, 192 + NULL, NULL, O_RDONLY), 193 + &UVERBS_METHOD(UVERBS_METHOD_DMABUF_ALLOC)); 194 + 195 + const struct uapi_definition uverbs_def_obj_dmabuf[] = { 196 + UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_DMABUF), 197 + UAPI_DEF_OBJ_NEEDS_FN(mmap_get_pfns), 198 + UAPI_DEF_OBJ_NEEDS_FN(pgoff_to_mmap_entry), 199 + {} 200 + };
+1
drivers/infiniband/core/uverbs_uapi.c
··· 631 631 UAPI_DEF_CHAIN(uverbs_def_obj_cq), 632 632 UAPI_DEF_CHAIN(uverbs_def_obj_device), 633 633 UAPI_DEF_CHAIN(uverbs_def_obj_dm), 634 + UAPI_DEF_CHAIN(uverbs_def_obj_dmabuf), 634 635 UAPI_DEF_CHAIN(uverbs_def_obj_dmah), 635 636 UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), 636 637 UAPI_DEF_CHAIN(uverbs_def_obj_intf),
+58 -3
drivers/infiniband/core/verbs.c
··· 78 78 [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", 79 79 [IB_EVENT_CLIENT_REREGISTER] = "client reregister", 80 80 [IB_EVENT_GID_CHANGE] = "GID changed", 81 + [IB_EVENT_DEVICE_SPEED_CHANGE] = "device speed change" 81 82 }; 82 83 83 84 const char *__attribute_const__ ib_event_msg(enum ib_event_type event) ··· 216 215 } 217 216 } 218 217 EXPORT_SYMBOL(ib_rate_to_mbps); 218 + 219 + struct ib_speed_attr { 220 + const char *str; 221 + int speed; 222 + }; 223 + 224 + #define IB_SPEED_ATTR(speed_type, _str, _speed) \ 225 + [speed_type] = {.str = _str, .speed = _speed} 226 + 227 + static const struct ib_speed_attr ib_speed_attrs[] = { 228 + IB_SPEED_ATTR(IB_SPEED_SDR, " SDR", 25), 229 + IB_SPEED_ATTR(IB_SPEED_DDR, " DDR", 50), 230 + IB_SPEED_ATTR(IB_SPEED_QDR, " QDR", 100), 231 + IB_SPEED_ATTR(IB_SPEED_FDR10, " FDR10", 100), 232 + IB_SPEED_ATTR(IB_SPEED_FDR, " FDR", 140), 233 + IB_SPEED_ATTR(IB_SPEED_EDR, " EDR", 250), 234 + IB_SPEED_ATTR(IB_SPEED_HDR, " HDR", 500), 235 + IB_SPEED_ATTR(IB_SPEED_NDR, " NDR", 1000), 236 + IB_SPEED_ATTR(IB_SPEED_XDR, " XDR", 2000), 237 + }; 238 + 239 + int ib_port_attr_to_speed_info(struct ib_port_attr *attr, 240 + struct ib_port_speed_info *speed_info) 241 + { 242 + int speed_idx = attr->active_speed; 243 + 244 + switch (attr->active_speed) { 245 + case IB_SPEED_DDR: 246 + case IB_SPEED_QDR: 247 + case IB_SPEED_FDR10: 248 + case IB_SPEED_FDR: 249 + case IB_SPEED_EDR: 250 + case IB_SPEED_HDR: 251 + case IB_SPEED_NDR: 252 + case IB_SPEED_XDR: 253 + case IB_SPEED_SDR: 254 + break; 255 + default: 256 + speed_idx = IB_SPEED_SDR; /* Default to SDR for invalid rates */ 257 + break; 258 + } 259 + 260 + speed_info->str = ib_speed_attrs[speed_idx].str; 261 + speed_info->rate = ib_speed_attrs[speed_idx].speed; 262 + speed_info->rate *= ib_width_enum_to_int(attr->active_width); 263 + if (speed_info->rate < 0) 264 + return -EINVAL; 265 + 266 + return 0; 267 + } 268 + EXPORT_SYMBOL(ib_port_attr_to_speed_info); 219 269 220 270 
__attribute_const__ enum rdma_transport_type 221 271 rdma_node_get_transport(unsigned int node_type) ··· 1537 1485 IB_QP_PKEY_INDEX), 1538 1486 [IB_QPT_RC] = (IB_QP_ALT_PATH | 1539 1487 IB_QP_ACCESS_FLAGS | 1540 - IB_QP_PKEY_INDEX), 1488 + IB_QP_PKEY_INDEX | 1489 + IB_QP_RATE_LIMIT), 1541 1490 [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | 1542 1491 IB_QP_ACCESS_FLAGS | 1543 1492 IB_QP_PKEY_INDEX), ··· 1586 1533 IB_QP_ALT_PATH | 1587 1534 IB_QP_ACCESS_FLAGS | 1588 1535 IB_QP_MIN_RNR_TIMER | 1589 - IB_QP_PATH_MIG_STATE), 1536 + IB_QP_PATH_MIG_STATE | 1537 + IB_QP_RATE_LIMIT), 1590 1538 [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | 1591 1539 IB_QP_ALT_PATH | 1592 1540 IB_QP_ACCESS_FLAGS | ··· 1621 1567 IB_QP_ACCESS_FLAGS | 1622 1568 IB_QP_ALT_PATH | 1623 1569 IB_QP_PATH_MIG_STATE | 1624 - IB_QP_MIN_RNR_TIMER), 1570 + IB_QP_MIN_RNR_TIMER | 1571 + IB_QP_RATE_LIMIT), 1625 1572 [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | 1626 1573 IB_QP_ACCESS_FLAGS | 1627 1574 IB_QP_ALT_PATH |
+12 -2
drivers/infiniband/hw/bnxt_re/debugfs.c
··· 87 87 size_t count, loff_t *ppos) 88 88 { 89 89 struct bnxt_re_qp *qp = filep->private_data; 90 + struct bnxt_qplib_qp *qplib_qp; 91 + u32 rate_limit = 0; 90 92 char *buf; 91 93 int len; 92 94 93 95 if (*ppos) 94 96 return 0; 97 + 98 + qplib_qp = &qp->qplib_qp; 99 + if (qplib_qp->shaper_allocation_status) 100 + rate_limit = qplib_qp->rate_limit; 95 101 96 102 buf = kasprintf(GFP_KERNEL, 97 103 "QPN\t\t: %d\n" ··· 105 99 "state\t\t: %s\n" 106 100 "mtu\t\t: %d\n" 107 101 "timeout\t\t: %d\n" 108 - "remote QPN\t: %d\n", 102 + "remote QPN\t: %d\n" 103 + "shaper allocated : %d\n" 104 + "rate limit\t: %d kbps\n", 109 105 qp->qplib_qp.id, 110 106 bnxt_re_qp_type_str(qp->qplib_qp.type), 111 107 bnxt_re_qp_state_str(qp->qplib_qp.state), 112 108 qp->qplib_qp.mtu, 113 109 qp->qplib_qp.timeout, 114 - qp->qplib_qp.dest_qpn); 110 + qp->qplib_qp.dest_qpn, 111 + qplib_qp->shaper_allocation_status, 112 + rate_limit); 115 113 if (!buf) 116 114 return -ENOMEM; 117 115 if (count < strlen(buf)) {
+32 -2
drivers/infiniband/hw/bnxt_re/ib_verbs.c
··· 186 186 { 187 187 struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev); 188 188 struct bnxt_qplib_dev_attr *dev_attr = rdev->dev_attr; 189 + struct bnxt_re_query_device_ex_resp resp = {}; 190 + size_t outlen = (udata) ? udata->outlen : 0; 191 + int rc = 0; 189 192 190 193 memset(ib_attr, 0, sizeof(*ib_attr)); 191 194 memcpy(&ib_attr->fw_ver, dev_attr->fw_ver, ··· 253 250 254 251 ib_attr->max_pkeys = 1; 255 252 ib_attr->local_ca_ack_delay = BNXT_RE_DEFAULT_ACK_DELAY; 256 - return 0; 253 + 254 + if ((offsetofend(typeof(resp), packet_pacing_caps) <= outlen) && 255 + _is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) { 256 + resp.packet_pacing_caps.qp_rate_limit_min = 257 + dev_attr->rate_limit_min; 258 + resp.packet_pacing_caps.qp_rate_limit_max = 259 + dev_attr->rate_limit_max; 260 + resp.packet_pacing_caps.supported_qpts = 261 + 1 << IB_QPT_RC; 262 + } 263 + if (outlen) 264 + rc = ib_copy_to_udata(udata, &resp, 265 + min(sizeof(resp), outlen)); 266 + 267 + return rc; 257 268 } 258 269 259 270 int bnxt_re_modify_device(struct ib_device *ibdev, ··· 2106 2089 unsigned int flags; 2107 2090 u8 nw_type; 2108 2091 2109 - if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS) 2092 + if (qp_attr_mask & ~(IB_QP_ATTR_STANDARD_BITS | IB_QP_RATE_LIMIT)) 2110 2093 return -EOPNOTSUPP; 2111 2094 2112 2095 qp->qplib_qp.modify_flags = 0; 2096 + qp->qplib_qp.ext_modify_flags = 0; 2113 2097 if (qp_attr_mask & IB_QP_STATE) { 2114 2098 curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); 2115 2099 new_qp_state = qp_attr->qp_state; ··· 2146 2128 bnxt_qplib_clean_qp(&qp->qplib_qp); 2147 2129 bnxt_re_unlock_cqs(qp, flags); 2148 2130 } 2131 + } 2132 + 2133 + if (qp_attr_mask & IB_QP_RATE_LIMIT) { 2134 + if (qp->qplib_qp.type != IB_QPT_RC || 2135 + !_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) 2136 + return -EOPNOTSUPP; 2137 + qp->qplib_qp.ext_modify_flags |= 2138 + CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID; 2139 + qp->qplib_qp.rate_limit = 
qp_attr->rate_limit; 2149 2140 } 2150 2141 if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) { 2151 2142 qp->qplib_qp.modify_flags |= ··· 4412 4385 4413 4386 if (_is_host_msn_table(rdev->qplib_res.dattr->dev_cap_flags2)) 4414 4387 resp.comp_mask |= BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED; 4388 + 4389 + if (_is_modify_qp_rate_limit_supported(dev_attr->dev_cap_flags2)) 4390 + resp.comp_mask |= BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED; 4415 4391 4416 4392 if (udata->inlen >= sizeof(ureq)) { 4417 4393 rc = ib_copy_from_udata(&ureq, udata, min(udata->inlen, sizeof(ureq)));
+11 -1
drivers/infiniband/hw/bnxt_re/qplib_fp.c
··· 1313 1313 struct bnxt_qplib_cmdqmsg msg = {}; 1314 1314 struct cmdq_modify_qp req = {}; 1315 1315 u16 vlan_pcp_vlan_dei_vlan_id; 1316 + u32 bmask, bmask_ext; 1316 1317 u32 temp32[4]; 1317 - u32 bmask; 1318 1318 int rc; 1319 1319 1320 1320 bnxt_qplib_rcfw_cmd_prep((struct cmdq_base *)&req, ··· 1329 1329 is_optimized_state_transition(qp)) 1330 1330 bnxt_set_mandatory_attributes(res, qp, &req); 1331 1331 } 1332 + 1332 1333 bmask = qp->modify_flags; 1333 1334 req.modify_mask = cpu_to_le32(qp->modify_flags); 1335 + bmask_ext = qp->ext_modify_flags; 1336 + req.ext_modify_mask = cpu_to_le32(qp->ext_modify_flags); 1334 1337 req.qp_cid = cpu_to_le32(qp->id); 1338 + 1339 + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) 1340 + req.rate_limit = cpu_to_le32(qp->rate_limit); 1341 + 1335 1342 if (bmask & CMDQ_MODIFY_QP_MODIFY_MASK_STATE) { 1336 1343 req.network_type_en_sqd_async_notify_new_state = 1337 1344 (qp->state & CMDQ_MODIFY_QP_NEW_STATE_MASK) | ··· 1436 1429 rc = bnxt_qplib_rcfw_send_message(rcfw, &msg); 1437 1430 if (rc) 1438 1431 return rc; 1432 + 1433 + if (bmask_ext & CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID) 1434 + qp->shaper_allocation_status = resp.shaper_allocation_status; 1439 1435 qp->cur_qp_state = qp->state; 1440 1436 return 0; 1441 1437 }
+3
drivers/infiniband/hw/bnxt_re/qplib_fp.h
··· 280 280 u8 state; 281 281 u8 cur_qp_state; 282 282 u64 modify_flags; 283 + u32 ext_modify_flags; 283 284 u32 max_inline_data; 284 285 u32 mtu; 285 286 u8 path_mtu; ··· 347 346 bool is_host_msn_tbl; 348 347 u8 tos_dscp; 349 348 u32 ugid_index; 349 + u32 rate_limit; 350 + u8 shaper_allocation_status; 350 351 }; 351 352 352 353 #define BNXT_RE_MAX_MSG_SIZE 0x80000000
+6
drivers/infiniband/hw/bnxt_re/qplib_res.h
··· 623 623 return !!(dev_cap_ext_flags_2 & CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED); 624 624 } 625 625 626 + static inline bool _is_modify_qp_rate_limit_supported(u16 dev_cap_ext_flags2) 627 + { 628 + return dev_cap_ext_flags2 & 629 + CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED; 630 + } 631 + 626 632 #endif /* __BNXT_QPLIB_RES_H__ */
+5
drivers/infiniband/hw/bnxt_re/qplib_sp.c
··· 193 193 attr->max_dpi = le32_to_cpu(sb->max_dpi); 194 194 195 195 attr->is_atomic = bnxt_qplib_is_atomic_cap(rcfw); 196 + 197 + if (_is_modify_qp_rate_limit_supported(attr->dev_cap_flags2)) { 198 + attr->rate_limit_min = le16_to_cpu(sb->rate_limit_min); 199 + attr->rate_limit_max = le32_to_cpu(sb->rate_limit_max); 200 + } 196 201 bail: 197 202 dma_free_coherent(&rcfw->pdev->dev, sbuf.size, 198 203 sbuf.sb, sbuf.dma_addr);
+2
drivers/infiniband/hw/bnxt_re/qplib_sp.h
··· 76 76 u16 dev_cap_flags; 77 77 u16 dev_cap_flags2; 78 78 u32 max_dpi; 79 + u16 rate_limit_min; 80 + u32 rate_limit_max; 79 81 }; 80 82 81 83 struct bnxt_qplib_pd {
+9 -4
drivers/infiniband/hw/bnxt_re/roce_hsi.h
··· 690 690 __le32 ext_modify_mask; 691 691 #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_EXT_STATS_CTX 0x1UL 692 692 #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_SCHQ_ID_VALID 0x2UL 693 + #define CMDQ_MODIFY_QP_EXT_MODIFY_MASK_RATE_LIMIT_VALID 0x8UL 693 694 __le32 ext_stats_ctx_id; 694 695 __le16 schq_id; 695 696 __le16 unused_0; 696 - __le32 reserved32; 697 + __le32 rate_limit; 697 698 }; 698 699 699 700 /* creq_modify_qp_resp (size:128b/16B) */ ··· 717 716 #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_MASK 0xeUL 718 717 #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_INDEX_SFT 1 719 718 #define CREQ_MODIFY_QP_RESP_PINGPONG_PUSH_STATE 0x10UL 720 - u8 reserved8; 719 + u8 shaper_allocation_status; 720 + #define CREQ_MODIFY_QP_RESP_SHAPER_ALLOCATED 0x1UL 721 721 __le32 lag_src_mac; 722 722 }; 723 723 ··· 2181 2179 u8 reserved48[6]; 2182 2180 }; 2183 2181 2184 - /* creq_query_func_resp_sb (size:1088b/136B) */ 2182 + /* creq_query_func_resp_sb (size:1280b/160B) */ 2185 2183 struct creq_query_func_resp_sb { 2186 2184 u8 opcode; 2187 2185 #define CREQ_QUERY_FUNC_RESP_SB_OPCODE_QUERY_FUNC 0x83UL ··· 2258 2256 #define CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_LAST \ 2259 2257 CREQ_QUERY_FUNC_RESP_SB_REQ_RETRANSMISSION_SUPPORT_IQM_MSN_TABLE 2260 2258 #define CREQ_QUERY_FUNC_RESP_SB_MAX_SRQ_EXTENDED 0x40UL 2259 + #define CREQ_QUERY_FUNC_RESP_SB_MODIFY_QP_RATE_LIMIT_SUPPORTED 0x400UL 2261 2260 #define CREQ_QUERY_FUNC_RESP_SB_MIN_RNR_RTR_RTS_OPT_SUPPORTED 0x1000UL 2262 2261 __le16 max_xp_qp_size; 2263 2262 __le16 create_qp_batch_size; 2264 2263 __le16 destroy_qp_batch_size; 2265 2264 __le16 max_srq_ext; 2266 - __le64 reserved64; 2265 + __le16 reserved16; 2266 + __le16 rate_limit_min; 2267 + __le32 rate_limit_max; 2267 2268 }; 2268 2269 2269 2270 /* cmdq_set_func_resources (size:448b/56B) */
+55 -44
drivers/infiniband/hw/efa/efa_com.c
··· 3 3 * Copyright 2018-2025 Amazon.com, Inc. or its affiliates. All rights reserved. 4 4 */ 5 5 6 + #include <linux/log2.h> 7 + 6 8 #include "efa_com.h" 7 9 #include "efa_regs_defs.h" 8 10 ··· 23 21 #define EFA_CTRL_SUB_MINOR 1 24 22 25 23 enum efa_cmd_status { 24 + EFA_CMD_UNUSED, 25 + EFA_CMD_ALLOCATED, 26 26 EFA_CMD_SUBMITTED, 27 27 EFA_CMD_COMPLETED, 28 28 }; ··· 36 32 enum efa_cmd_status status; 37 33 u16 cmd_id; 38 34 u8 cmd_opcode; 39 - u8 occupied; 40 35 }; 41 36 42 37 static const char *efa_com_cmd_str(u8 cmd) ··· 244 241 return 0; 245 242 } 246 243 247 - /* ID to be used with efa_com_get_comp_ctx */ 248 244 static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq) 249 245 { 250 246 u16 ctx_id; ··· 265 263 spin_unlock(&aq->comp_ctx_lock); 266 264 } 267 265 268 - static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq, 269 - struct efa_comp_ctx *comp_ctx) 266 + static struct efa_comp_ctx *efa_com_alloc_comp_ctx(struct efa_com_admin_queue *aq) 270 267 { 271 - u16 cmd_id = EFA_GET(&comp_ctx->user_cqe->acq_common_descriptor.command, 272 - EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); 273 - u16 ctx_id = cmd_id & (aq->depth - 1); 268 + struct efa_comp_ctx *comp_ctx; 269 + u16 ctx_id; 274 270 275 - ibdev_dbg(aq->efa_dev, "Put completion command_id %#x\n", cmd_id); 276 - comp_ctx->occupied = 0; 277 - efa_com_dealloc_ctx_id(aq, ctx_id); 278 - } 271 + ctx_id = efa_com_alloc_ctx_id(aq); 279 272 280 - static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq, 281 - u16 cmd_id, bool capture) 282 - { 283 - u16 ctx_id = cmd_id & (aq->depth - 1); 284 - 285 - if (aq->comp_ctx[ctx_id].occupied && capture) { 286 - ibdev_err_ratelimited( 287 - aq->efa_dev, 288 - "Completion context for command_id %#x is occupied\n", 289 - cmd_id); 273 + comp_ctx = &aq->comp_ctx[ctx_id]; 274 + if (comp_ctx->status != EFA_CMD_UNUSED) { 275 + efa_com_dealloc_ctx_id(aq, ctx_id); 276 + ibdev_err_ratelimited(aq->efa_dev, 277 + "Completion context[%u] is 
used[%u]\n", 278 + ctx_id, comp_ctx->status); 290 279 return NULL; 291 280 } 292 281 293 - if (capture) { 294 - aq->comp_ctx[ctx_id].occupied = 1; 295 - ibdev_dbg(aq->efa_dev, 296 - "Take completion ctxt for command_id %#x\n", cmd_id); 297 - } 282 + comp_ctx->status = EFA_CMD_ALLOCATED; 283 + ibdev_dbg(aq->efa_dev, "Take completion context[%u]\n", ctx_id); 284 + return comp_ctx; 285 + } 286 + 287 + static inline u16 efa_com_get_comp_ctx_id(struct efa_com_admin_queue *aq, 288 + struct efa_comp_ctx *comp_ctx) 289 + { 290 + return comp_ctx - aq->comp_ctx; 291 + } 292 + 293 + static inline void efa_com_dealloc_comp_ctx(struct efa_com_admin_queue *aq, 294 + struct efa_comp_ctx *comp_ctx) 295 + { 296 + u16 ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx); 297 + 298 + ibdev_dbg(aq->efa_dev, "Put completion context[%u]\n", ctx_id); 299 + comp_ctx->status = EFA_CMD_UNUSED; 300 + efa_com_dealloc_ctx_id(aq, ctx_id); 301 + } 302 + 303 + static inline struct efa_comp_ctx *efa_com_get_comp_ctx_by_cmd_id(struct efa_com_admin_queue *aq, 304 + u16 cmd_id) 305 + { 306 + u16 ctx_id = cmd_id & (aq->depth - 1); 298 307 299 308 return &aq->comp_ctx[ctx_id]; 300 309 } ··· 323 310 u16 ctx_id; 324 311 u16 pi; 325 312 313 + comp_ctx = efa_com_alloc_comp_ctx(aq); 314 + if (!comp_ctx) 315 + return ERR_PTR(-EINVAL); 316 + 326 317 queue_size_mask = aq->depth - 1; 327 318 pi = aq->sq.pc & queue_size_mask; 328 - 329 - ctx_id = efa_com_alloc_ctx_id(aq); 319 + ctx_id = efa_com_get_comp_ctx_id(aq, comp_ctx); 330 320 331 321 /* cmd_id LSBs are the ctx_id and MSBs are entropy bits from pc */ 332 322 cmd_id = ctx_id & queue_size_mask; 333 - cmd_id |= aq->sq.pc & ~queue_size_mask; 323 + cmd_id |= aq->sq.pc << ilog2(aq->depth); 334 324 cmd_id &= EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK; 335 325 336 326 cmd->aq_common_descriptor.command_id = cmd_id; 337 327 EFA_SET(&cmd->aq_common_descriptor.flags, 338 328 EFA_ADMIN_AQ_COMMON_DESC_PHASE, aq->sq.phase); 339 - 340 - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, 
true); 341 - if (!comp_ctx) { 342 - efa_com_dealloc_ctx_id(aq, ctx_id); 343 - return ERR_PTR(-EINVAL); 344 - } 345 329 346 330 comp_ctx->status = EFA_CMD_SUBMITTED; 347 331 comp_ctx->comp_size = comp_size_in_bytes; ··· 380 370 } 381 371 382 372 for (i = 0; i < aq->depth; i++) { 383 - comp_ctx = efa_com_get_comp_ctx(aq, i, false); 384 - if (comp_ctx) 385 - init_completion(&comp_ctx->wait_event); 373 + comp_ctx = &aq->comp_ctx[i]; 374 + comp_ctx->status = EFA_CMD_UNUSED; 375 + init_completion(&comp_ctx->wait_event); 386 376 387 377 aq->comp_ctx_pool[i] = i; 388 378 } ··· 427 417 cmd_id = EFA_GET(&cqe->acq_common_descriptor.command, 428 418 EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID); 429 419 430 - comp_ctx = efa_com_get_comp_ctx(aq, cmd_id, false); 431 - if (comp_ctx->status != EFA_CMD_SUBMITTED) { 420 + comp_ctx = efa_com_get_comp_ctx_by_cmd_id(aq, cmd_id); 421 + if (comp_ctx->status != EFA_CMD_SUBMITTED || comp_ctx->cmd_id != cmd_id) { 432 422 ibdev_err(aq->efa_dev, 433 - "Received completion with unexpected command id[%d], sq producer: %d, sq consumer: %d, cq consumer: %d\n", 434 - cmd_id, aq->sq.pc, aq->sq.cc, aq->cq.cc); 423 + "Received completion with unexpected command id[%x], status[%d] sq producer[%d], sq consumer[%d], cq consumer[%d]\n", 424 + cmd_id, comp_ctx->status, aq->sq.pc, aq->sq.cc, 425 + aq->cq.cc); 435 426 return -EINVAL; 436 427 } 437 428 ··· 541 530 542 531 err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); 543 532 out: 544 - efa_com_put_comp_ctx(aq, comp_ctx); 533 + efa_com_dealloc_comp_ctx(aq, comp_ctx); 545 534 return err; 546 535 } 547 536 ··· 591 580 592 581 err = efa_com_comp_status_to_errno(comp_ctx->user_cqe->acq_common_descriptor.status); 593 582 out: 594 - efa_com_put_comp_ctx(aq, comp_ctx); 583 + efa_com_dealloc_comp_ctx(aq, comp_ctx); 595 584 return err; 596 585 } 597 586
+11 -12
drivers/infiniband/hw/hns/hns_roce_ah.c
··· 60 60 u8 tclass = get_tclass(grh); 61 61 u8 priority = 0; 62 62 u8 tc_mode = 0; 63 - int ret; 63 + int ret = 0; 64 64 65 65 if (hr_dev->pci_dev->revision == PCI_REVISION_ID_HIP08 && udata) { 66 66 ret = -EOPNOTSUPP; ··· 77 77 ah->av.flowlabel = grh->flow_label; 78 78 ah->av.udp_sport = get_ah_udp_sport(ah_attr); 79 79 ah->av.tclass = tclass; 80 + ah->av.sl = rdma_ah_get_sl(ah_attr); 80 81 81 - ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); 82 - if (ret == -EOPNOTSUPP) 83 - ret = 0; 82 + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { 83 + ret = hr_dev->hw->get_dscp(hr_dev, tclass, &tc_mode, &priority); 84 + if (ret == -EOPNOTSUPP) 85 + ret = 0; 86 + else if (ret) 87 + goto err_out; 84 88 85 - if (ret && grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) 86 - goto err_out; 87 - 88 - if (tc_mode == HNAE3_TC_MAP_MODE_DSCP && 89 - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) 90 - ah->av.sl = priority; 91 - else 92 - ah->av.sl = rdma_ah_get_sl(ah_attr); 89 + if (tc_mode == HNAE3_TC_MAP_MODE_DSCP) 90 + ah->av.sl = priority; 91 + } 93 92 94 93 if (!check_sl_valid(hr_dev, ah->av.sl)) { 95 94 ret = -EINVAL;
+10 -2
drivers/infiniband/hw/hns/hns_roce_cq.c
··· 55 55 { 56 56 struct hns_roce_dev *hr_dev = to_hr_dev(uctx->ibucontext.device); 57 57 struct hns_roce_cq_table *cq_table = &hr_dev->cq_table; 58 - u32 least_load = cq_table->ctx_num[0]; 58 + u32 least_load = U32_MAX; 59 59 u8 bankid = 0; 60 60 u8 i; 61 61 ··· 63 63 return; 64 64 65 65 mutex_lock(&cq_table->bank_mutex); 66 - for (i = 1; i < HNS_ROCE_CQ_BANK_NUM; i++) { 66 + for (i = 0; i < HNS_ROCE_CQ_BANK_NUM; i++) { 67 + if (!(cq_table->valid_cq_bank_mask & BIT(i))) 68 + continue; 69 + 67 70 if (cq_table->ctx_num[i] < least_load) { 68 71 least_load = cq_table->ctx_num[i]; 69 72 bankid = i; ··· 584 581 cq_table->bank[i].max = hr_dev->caps.num_cqs / 585 582 HNS_ROCE_CQ_BANK_NUM - 1; 586 583 } 584 + 585 + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) 586 + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_LIMIT; 587 + else 588 + cq_table->valid_cq_bank_mask = VALID_CQ_BANK_MASK_DEFAULT; 587 589 } 588 590 589 591 void hns_roce_cleanup_cq_table(struct hns_roce_dev *hr_dev)
+6
drivers/infiniband/hw/hns/hns_roce_device.h
··· 103 103 104 104 #define CQ_BANKID_SHIFT 2 105 105 #define CQ_BANKID_MASK GENMASK(1, 0) 106 + #define VALID_CQ_BANK_MASK_DEFAULT 0xF 107 + #define VALID_CQ_BANK_MASK_LIMIT 0x9 108 + 109 + #define VALID_EXT_SGE_QP_BANK_MASK_LIMIT 0x42 106 110 107 111 #define HNS_ROCE_MAX_CQ_COUNT 0xFFFF 108 112 #define HNS_ROCE_MAX_CQ_PERIOD 0xFFFF ··· 160 156 HNS_ROCE_CAP_FLAG_CQE_INLINE = BIT(19), 161 157 HNS_ROCE_CAP_FLAG_BOND = BIT(21), 162 158 HNS_ROCE_CAP_FLAG_SRQ_RECORD_DB = BIT(22), 159 + HNS_ROCE_CAP_FLAG_LIMIT_BANK = BIT(23), 163 160 }; 164 161 165 162 #define HNS_ROCE_DB_TYPE_COUNT 2 ··· 505 500 struct hns_roce_bank bank[HNS_ROCE_CQ_BANK_NUM]; 506 501 struct mutex bank_mutex; 507 502 u32 ctx_num[HNS_ROCE_CQ_BANK_NUM]; 503 + u8 valid_cq_bank_mask; 508 504 }; 509 505 510 506 struct hns_roce_srq_table {
+206 -14
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
··· 876 876 return ret; 877 877 } 878 878 879 + static int hns_roce_push_drain_wr(struct hns_roce_wq *wq, struct ib_cq *cq, 880 + u64 wr_id) 881 + { 882 + unsigned long flags; 883 + int ret = 0; 884 + 885 + spin_lock_irqsave(&wq->lock, flags); 886 + if (hns_roce_wq_overflow(wq, 1, cq)) { 887 + ret = -ENOMEM; 888 + goto out; 889 + } 890 + 891 + wq->wrid[wq->head & (wq->wqe_cnt - 1)] = wr_id; 892 + wq->head++; 893 + 894 + out: 895 + spin_unlock_irqrestore(&wq->lock, flags); 896 + return ret; 897 + } 898 + 899 + struct hns_roce_drain_cqe { 900 + struct ib_cqe cqe; 901 + struct completion done; 902 + }; 903 + 904 + static void hns_roce_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) 905 + { 906 + struct hns_roce_drain_cqe *cqe = container_of(wc->wr_cqe, 907 + struct hns_roce_drain_cqe, 908 + cqe); 909 + complete(&cqe->done); 910 + } 911 + 912 + static void handle_drain_completion(struct ib_cq *ibcq, 913 + struct hns_roce_drain_cqe *drain, 914 + struct hns_roce_dev *hr_dev) 915 + { 916 + #define TIMEOUT (HZ / 10) 917 + struct hns_roce_cq *hr_cq = to_hr_cq(ibcq); 918 + unsigned long flags; 919 + bool triggered; 920 + 921 + if (ibcq->poll_ctx == IB_POLL_DIRECT) { 922 + while (wait_for_completion_timeout(&drain->done, TIMEOUT) <= 0) 923 + ib_process_cq_direct(ibcq, -1); 924 + return; 925 + } 926 + 927 + if (hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) 928 + goto waiting_done; 929 + 930 + spin_lock_irqsave(&hr_cq->lock, flags); 931 + triggered = hr_cq->is_armed; 932 + hr_cq->is_armed = 1; 933 + spin_unlock_irqrestore(&hr_cq->lock, flags); 934 + 935 + /* Triggered means this cq is processing or has been processed 936 + * by hns_roce_handle_device_err() or this function. We need to 937 + * cancel the already invoked comp_handler() to avoid concurrency. 938 + * If it has not been triggered, we can directly invoke 939 + * comp_handler(). 
940 + */ 941 + if (triggered) { 942 + switch (ibcq->poll_ctx) { 943 + case IB_POLL_SOFTIRQ: 944 + irq_poll_disable(&ibcq->iop); 945 + irq_poll_enable(&ibcq->iop); 946 + break; 947 + case IB_POLL_WORKQUEUE: 948 + case IB_POLL_UNBOUND_WORKQUEUE: 949 + cancel_work_sync(&ibcq->work); 950 + break; 951 + default: 952 + WARN_ON_ONCE(1); 953 + } 954 + } 955 + 956 + if (ibcq->comp_handler) 957 + ibcq->comp_handler(ibcq, ibcq->cq_context); 958 + 959 + waiting_done: 960 + if (ibcq->comp_handler) 961 + wait_for_completion(&drain->done); 962 + } 963 + 964 + static void hns_roce_v2_drain_rq(struct ib_qp *ibqp) 965 + { 966 + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); 967 + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; 968 + struct hns_roce_qp *hr_qp = to_hr_qp(ibqp); 969 + struct hns_roce_drain_cqe rdrain = {}; 970 + const struct ib_recv_wr *bad_rwr; 971 + struct ib_cq *cq = ibqp->recv_cq; 972 + struct ib_recv_wr rwr = {}; 973 + int ret; 974 + 975 + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); 976 + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { 977 + ibdev_err_ratelimited(&hr_dev->ib_dev, 978 + "failed to modify qp during drain rq, ret = %d.\n", 979 + ret); 980 + return; 981 + } 982 + 983 + rwr.wr_cqe = &rdrain.cqe; 984 + rdrain.cqe.done = hns_roce_drain_qp_done; 985 + init_completion(&rdrain.done); 986 + 987 + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) 988 + ret = hns_roce_push_drain_wr(&hr_qp->rq, cq, rwr.wr_id); 989 + else 990 + ret = hns_roce_v2_post_recv(ibqp, &rwr, &bad_rwr); 991 + if (ret) { 992 + ibdev_err_ratelimited(&hr_dev->ib_dev, 993 + "failed to post recv for drain rq, ret = %d.\n", 994 + ret); 995 + return; 996 + } 997 + 998 + handle_drain_completion(cq, &rdrain, hr_dev); 999 + } 1000 + 1001 + static void hns_roce_v2_drain_sq(struct ib_qp *ibqp) 1002 + { 1003 + struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device); 1004 + struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; 1005 + struct hns_roce_qp *hr_qp = 
to_hr_qp(ibqp); 1006 + struct hns_roce_drain_cqe sdrain = {}; 1007 + const struct ib_send_wr *bad_swr; 1008 + struct ib_cq *cq = ibqp->send_cq; 1009 + struct ib_rdma_wr swr = { 1010 + .wr = { 1011 + .next = NULL, 1012 + { .wr_cqe = &sdrain.cqe, }, 1013 + .opcode = IB_WR_RDMA_WRITE, 1014 + }, 1015 + }; 1016 + int ret; 1017 + 1018 + ret = ib_modify_qp(ibqp, &attr, IB_QP_STATE); 1019 + if (ret && hr_dev->state < HNS_ROCE_DEVICE_STATE_RST_DOWN) { 1020 + ibdev_err_ratelimited(&hr_dev->ib_dev, 1021 + "failed to modify qp during drain sq, ret = %d.\n", 1022 + ret); 1023 + return; 1024 + } 1025 + 1026 + sdrain.cqe.done = hns_roce_drain_qp_done; 1027 + init_completion(&sdrain.done); 1028 + 1029 + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) 1030 + ret = hns_roce_push_drain_wr(&hr_qp->sq, cq, swr.wr.wr_id); 1031 + else 1032 + ret = hns_roce_v2_post_send(ibqp, &swr.wr, &bad_swr); 1033 + if (ret) { 1034 + ibdev_err_ratelimited(&hr_dev->ib_dev, 1035 + "failed to post send for drain sq, ret = %d.\n", 1036 + ret); 1037 + return; 1038 + } 1039 + 1040 + handle_drain_completion(cq, &sdrain, hr_dev); 1041 + } 1042 + 879 1043 static void *get_srq_wqe_buf(struct hns_roce_srq *srq, u32 n) 880 1044 { 881 1045 return hns_roce_buf_offset(srq->buf_mtr.kmem, n << srq->wqe_shift); ··· 3903 3739 HNS_ROCE_V2_CQ_DEFAULT_INTERVAL); 3904 3740 } 3905 3741 3742 + static bool left_sw_wc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq) 3743 + { 3744 + struct hns_roce_qp *hr_qp; 3745 + 3746 + list_for_each_entry(hr_qp, &hr_cq->sq_list, sq_node) { 3747 + if (hr_qp->sq.head != hr_qp->sq.tail) 3748 + return true; 3749 + } 3750 + 3751 + list_for_each_entry(hr_qp, &hr_cq->rq_list, rq_node) { 3752 + if (hr_qp->rq.head != hr_qp->rq.tail) 3753 + return true; 3754 + } 3755 + 3756 + return false; 3757 + } 3758 + 3906 3759 static int hns_roce_v2_req_notify_cq(struct ib_cq *ibcq, 3907 3760 enum ib_cq_notify_flags flags) 3908 3761 { ··· 3928 3747 struct hns_roce_v2_db cq_db = {}; 3929 3748 u32 
notify_flag; 3930 3749 3750 + if (hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN) { 3751 + if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && 3752 + left_sw_wc(hr_dev, hr_cq)) 3753 + return 1; 3754 + return 0; 3755 + } 3931 3756 /* 3932 3757 * flags = 0, then notify_flag : next 3933 3758 * flags = 1, then notify flag : solocited ··· 5240 5053 struct ib_device *ibdev = &hr_dev->ib_dev; 5241 5054 int ret; 5242 5055 5243 - ret = hns_roce_hw_v2_get_dscp(hr_dev, get_tclass(&attr->ah_attr.grh), 5244 - &hr_qp->tc_mode, &hr_qp->priority); 5245 - if (ret && ret != -EOPNOTSUPP && 5246 - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { 5247 - ibdev_err_ratelimited(ibdev, 5248 - "failed to get dscp, ret = %d.\n", ret); 5249 - return ret; 5250 - } 5056 + hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); 5251 5057 5252 - if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP && 5253 - grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) 5254 - hr_qp->sl = hr_qp->priority; 5255 - else 5256 - hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr); 5058 + if (grh->sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { 5059 + ret = hns_roce_hw_v2_get_dscp(hr_dev, 5060 + get_tclass(&attr->ah_attr.grh), 5061 + &hr_qp->tc_mode, &hr_qp->priority); 5062 + if (ret && ret != -EOPNOTSUPP) { 5063 + ibdev_err_ratelimited(ibdev, 5064 + "failed to get dscp, ret = %d.\n", 5065 + ret); 5066 + return ret; 5067 + } 5068 + 5069 + if (hr_qp->tc_mode == HNAE3_TC_MAP_MODE_DSCP) 5070 + hr_qp->sl = hr_qp->priority; 5071 + } 5257 5072 5258 5073 if (!check_sl_valid(hr_dev, hr_qp->sl)) 5259 5074 return -EINVAL; ··· 7145 6956 7146 6957 INIT_WORK(&hr_dev->ecc_work, fmea_ram_ecc_work); 7147 6958 7148 - hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 0); 6959 + hr_dev->irq_workq = alloc_ordered_workqueue("hns_roce_irq_workq", 6960 + WQ_MEM_RECLAIM); 7149 6961 if (!hr_dev->irq_workq) { 7150 6962 dev_err(dev, "failed to create irq workqueue.\n"); 7151 6963 ret = -ENOMEM; ··· 7204 7014 .post_send = 
hns_roce_v2_post_send, 7205 7015 .query_qp = hns_roce_v2_query_qp, 7206 7016 .req_notify_cq = hns_roce_v2_req_notify_cq, 7017 + .drain_rq = hns_roce_v2_drain_rq, 7018 + .drain_sq = hns_roce_v2_drain_sq, 7207 7019 }; 7208 7020 7209 7021 static const struct ib_device_ops hns_roce_v2_dev_srq_ops = {
+5
drivers/infiniband/hw/hns/hns_roce_main.c
··· 259 259 props->max_srq_sge = hr_dev->caps.max_srq_sges; 260 260 } 261 261 262 + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) { 263 + props->max_cq >>= 1; 264 + props->max_qp >>= 1; 265 + } 266 + 262 267 if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR && 263 268 hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09) { 264 269 props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+39 -10
drivers/infiniband/hw/hns/hns_roce_qp.c
··· 197 197 return (qp_bank >> 1) & CQ_BANKID_MASK; 198 198 } 199 199 200 - static u8 get_least_load_bankid_for_qp(struct ib_qp_init_attr *init_attr, 201 - struct hns_roce_bank *bank) 200 + static u8 get_least_load_bankid_for_qp(struct hns_roce_bank *bank, u8 valid_qp_bank_mask) 202 201 { 203 202 #define INVALID_LOAD_QPNUM 0xFFFFFFFF 204 - struct ib_cq *scq = init_attr->send_cq; 205 203 u32 least_load = INVALID_LOAD_QPNUM; 206 - unsigned long cqn = 0; 207 204 u8 bankid = 0; 208 205 u32 bankcnt; 209 206 u8 i; 210 207 211 - if (scq) 212 - cqn = to_hr_cq(scq)->cqn; 213 - 214 208 for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { 215 - if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) 209 + if (!(valid_qp_bank_mask & BIT(i))) 216 210 continue; 217 211 218 212 bankcnt = bank[i].inuse; ··· 240 246 241 247 return 0; 242 248 } 249 + 250 + static bool use_ext_sge(struct ib_qp_init_attr *init_attr) 251 + { 252 + return init_attr->cap.max_send_sge > HNS_ROCE_SGE_IN_WQE || 253 + init_attr->qp_type == IB_QPT_UD || 254 + init_attr->qp_type == IB_QPT_GSI; 255 + } 256 + 257 + static u8 select_qp_bankid(struct hns_roce_dev *hr_dev, 258 + struct ib_qp_init_attr *init_attr) 259 + { 260 + struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; 261 + struct hns_roce_bank *bank = qp_table->bank; 262 + struct ib_cq *scq = init_attr->send_cq; 263 + u8 valid_qp_bank_mask = 0; 264 + unsigned long cqn = 0; 265 + u8 i; 266 + 267 + if (scq) 268 + cqn = to_hr_cq(scq)->cqn; 269 + 270 + for (i = 0; i < HNS_ROCE_QP_BANK_NUM; i++) { 271 + if (scq && (get_affinity_cq_bank(i) != (cqn & CQ_BANKID_MASK))) 272 + continue; 273 + 274 + if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_LIMIT_BANK) && 275 + use_ext_sge(init_attr) && 276 + !(VALID_EXT_SGE_QP_BANK_MASK_LIMIT & BIT(i))) 277 + continue; 278 + 279 + valid_qp_bank_mask |= BIT(i); 280 + } 281 + 282 + return get_least_load_bankid_for_qp(bank, valid_qp_bank_mask); 283 + } 284 + 243 285 static int alloc_qpn(struct hns_roce_dev *hr_dev, struct 
hns_roce_qp *hr_qp, 244 286 struct ib_qp_init_attr *init_attr) 245 287 { ··· 288 258 num = 1; 289 259 } else { 290 260 mutex_lock(&qp_table->bank_mutex); 291 - bankid = get_least_load_bankid_for_qp(init_attr, qp_table->bank); 292 - 261 + bankid = select_qp_bankid(hr_dev, init_attr); 293 262 ret = alloc_qpn_with_bankid(&qp_table->bank[bankid], bankid, 294 263 &num); 295 264 if (ret) {
+2 -2
drivers/infiniband/hw/hns/hns_roce_restrack.c
··· 51 51 52 52 ret = hr_dev->hw->query_cqc(hr_dev, hr_cq->cqn, &context); 53 53 if (ret) 54 - return -EINVAL; 54 + return ret; 55 55 56 56 ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); 57 57 ··· 177 177 178 178 ret = hr_dev->hw->query_mpt(hr_dev, hr_mr->key, &context); 179 179 if (ret) 180 - return -EINVAL; 180 + return ret; 181 181 182 182 ret = nla_put(msg, RDMA_NLDEV_ATTR_RES_RAW, sizeof(context), &context); 183 183
+25 -42
drivers/infiniband/hw/irdma/ctrl.c
··· 2887 2887 } 2888 2888 2889 2889 /** 2890 - * irdma_sc_cq_ack - acknowledge completion q 2891 - * @cq: cq struct 2892 - */ 2893 - static inline void irdma_sc_cq_ack(struct irdma_sc_cq *cq) 2894 - { 2895 - writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); 2896 - } 2897 - 2898 - /** 2899 2890 * irdma_sc_cq_init - initialize completion q 2900 2891 * @cq: cq struct 2901 2892 * @info: cq initialization info ··· 2947 2956 return -ENOMEM; 2948 2957 2949 2958 set_64bit_val(wqe, 0, cq->cq_uk.cq_size); 2950 - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); 2959 + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); 2951 2960 set_64bit_val(wqe, 16, 2952 2961 FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, cq->shadow_read_threshold)); 2953 2962 set_64bit_val(wqe, 32, (cq->virtual_map ? 0 : cq->cq_pa)); ··· 3004 3013 return -ENOMEM; 3005 3014 3006 3015 set_64bit_val(wqe, 0, cq->cq_uk.cq_size); 3007 - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); 3016 + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); 3008 3017 set_64bit_val(wqe, 40, cq->shadow_area_pa); 3009 3018 set_64bit_val(wqe, 48, 3010 3019 (cq->virtual_map ? cq->first_pm_pbl_idx : 0)); ··· 3073 3082 return -ENOMEM; 3074 3083 3075 3084 set_64bit_val(wqe, 0, info->cq_size); 3076 - set_64bit_val(wqe, 8, (uintptr_t)cq >> 1); 3085 + set_64bit_val(wqe, 8, cq->cq_uk.cq_id); 3077 3086 set_64bit_val(wqe, 16, 3078 3087 FIELD_PREP(IRDMA_CQPSQ_CQ_SHADOW_READ_THRESHOLD, info->shadow_read_threshold)); 3079 3088 set_64bit_val(wqe, 32, info->cq_pa); ··· 3878 3887 set_64bit_val(ccq->cq_uk.shadow_area, 32, temp_val); 3879 3888 spin_unlock_irqrestore(&ccq->dev->cqp_lock, flags); 3880 3889 3881 - dma_wmb(); /* make sure shadow area is updated before arming */ 3882 - 3883 3890 writel(ccq->cq_uk.cq_id, ccq->dev->cq_arm_db); 3884 3891 } 3885 3892 ··· 4449 4460 * irdma_sc_process_ceq - process ceq 4450 4461 * @dev: sc device struct 4451 4462 * @ceq: ceq sc structure 4463 + * @cq_idx: Pointer to a CQ ID that will be populated. 
4452 4464 * 4453 4465 * It is expected caller serializes this function with cleanup_ceqes() 4454 4466 * because these functions manipulate the same ceq 4467 + * 4468 + * Return: True if cq_idx has been populated with a CQ ID. 4455 4469 */ 4456 - void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq) 4470 + bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, 4471 + u32 *cq_idx) 4457 4472 { 4458 4473 u64 temp; 4459 4474 __le64 *ceqe; 4460 - struct irdma_sc_cq *cq = NULL; 4461 - struct irdma_sc_cq *temp_cq; 4462 4475 u8 polarity; 4463 - u32 cq_idx; 4464 4476 4465 4477 do { 4466 - cq_idx = 0; 4467 4478 ceqe = IRDMA_GET_CURRENT_CEQ_ELEM(ceq); 4468 4479 get_64bit_val(ceqe, 0, &temp); 4469 4480 polarity = (u8)FIELD_GET(IRDMA_CEQE_VALID, temp); 4470 4481 if (polarity != ceq->polarity) 4471 - return NULL; 4482 + return false; 4472 4483 4473 - temp_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); 4474 - if (!temp_cq) { 4475 - cq_idx = IRDMA_INVALID_CQ_IDX; 4476 - IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); 4477 - 4478 - if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) 4479 - ceq->polarity ^= 1; 4480 - continue; 4481 - } 4482 - 4483 - cq = temp_cq; 4484 + /* Truncate. Discard valid bit which is MSb of temp. 
*/ 4485 + *cq_idx = temp; 4486 + if (*cq_idx >= dev->hmc_info->hmc_obj[IRDMA_HMC_IW_CQ].cnt) 4487 + *cq_idx = IRDMA_INVALID_CQ_IDX; 4484 4488 4485 4489 IRDMA_RING_MOVE_TAIL(ceq->ceq_ring); 4486 4490 if (!IRDMA_RING_CURRENT_TAIL(ceq->ceq_ring)) 4487 4491 ceq->polarity ^= 1; 4488 - } while (cq_idx == IRDMA_INVALID_CQ_IDX); 4492 + } while (*cq_idx == IRDMA_INVALID_CQ_IDX); 4489 4493 4490 - if (cq) 4491 - irdma_sc_cq_ack(cq); 4492 - return cq; 4494 + return true; 4493 4495 } 4494 4496 4495 4497 /** ··· 4494 4514 */ 4495 4515 void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq) 4496 4516 { 4497 - struct irdma_sc_cq *next_cq; 4498 4517 u8 ceq_polarity = ceq->polarity; 4499 4518 __le64 *ceqe; 4500 4519 u8 polarity; 4520 + u32 cq_idx; 4501 4521 u64 temp; 4502 4522 int next; 4503 4523 u32 i; ··· 4512 4532 if (polarity != ceq_polarity) 4513 4533 return; 4514 4534 4515 - next_cq = (struct irdma_sc_cq *)(unsigned long)(temp << 1); 4516 - if (cq == next_cq) 4517 - set_64bit_val(ceqe, 0, temp & IRDMA_CEQE_VALID); 4535 + cq_idx = temp; 4536 + if (cq_idx == cq->cq_uk.cq_id) 4537 + set_64bit_val(ceqe, 0, (temp & IRDMA_CEQE_VALID) | 4538 + IRDMA_INVALID_CQ_IDX); 4518 4539 4519 4540 next = IRDMA_RING_GET_NEXT_TAIL(ceq->ceq_ring, i); 4520 4541 if (!next) ··· 4956 4975 return -ENOMEM; 4957 4976 4958 4977 set_64bit_val(wqe, 0, ccq->cq_uk.cq_size); 4959 - set_64bit_val(wqe, 8, (uintptr_t)ccq >> 1); 4978 + set_64bit_val(wqe, 8, ccq->cq_uk.cq_id); 4960 4979 set_64bit_val(wqe, 40, ccq->shadow_area_pa); 4961 4980 4962 4981 hdr = ccq->cq_uk.cq_id | ··· 5769 5788 bool is_mrte_loc_mem; 5770 5789 5771 5790 loc_mem_pages = hmc_fpm_misc->loc_mem_pages; 5772 - is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds ? 
5773 - true : false; 5791 + is_mrte_loc_mem = hmc_fpm_misc->loc_mem_pages == hmc_fpm_misc->max_sds; 5774 5792 5775 5793 irdma_get_rsrc_mem_config(dev, is_mrte_loc_mem); 5776 5794 mrte_loc = hmc_info->hmc_obj[IRDMA_HMC_IW_MR].mem_loc; ··· 6442 6462 int ret_code = 0; 6443 6463 u8 db_size; 6444 6464 6465 + spin_lock_init(&dev->puda_cq_lock); 6466 + dev->ilq_cq = NULL; 6467 + dev->ieq_cq = NULL; 6445 6468 INIT_LIST_HEAD(&dev->cqp_cmd_head); /* for CQP command backlog */ 6446 6469 mutex_init(&dev->ws_mutex); 6447 6470 dev->hmc_fn_id = info->hmc_fn_id;
+88 -20
drivers/infiniband/hw/irdma/hw.c
··· 99 99 } 100 100 101 101 /** 102 + * irdma_process_normal_ceqe - Handle a CEQE for a normal CQ. 103 + * @rf: RDMA PCI function. 104 + * @dev: iWARP device. 105 + * @cq_idx: CQ ID. Must be in table bounds. 106 + * 107 + * Context: Atomic (CEQ lock must be held) 108 + */ 109 + static void irdma_process_normal_ceqe(struct irdma_pci_f *rf, 110 + struct irdma_sc_dev *dev, u32 cq_idx) 111 + { 112 + /* cq_idx bounds validated in irdma_sc_process_ceq. */ 113 + struct irdma_cq *icq = READ_ONCE(rf->cq_table[cq_idx]); 114 + struct irdma_sc_cq *cq; 115 + 116 + if (unlikely(!icq)) { 117 + /* Should not happen since CEQ is scrubbed upon CQ delete. */ 118 + ibdev_warn_ratelimited(to_ibdev(dev), "Stale CEQE for CQ %u", 119 + cq_idx); 120 + return; 121 + } 122 + 123 + cq = &icq->sc_cq; 124 + 125 + if (unlikely(cq->cq_type != IRDMA_CQ_TYPE_IWARP)) { 126 + ibdev_warn_ratelimited(to_ibdev(dev), "Unexpected CQ type %u", 127 + cq->cq_type); 128 + return; 129 + } 130 + 131 + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); 132 + irdma_iwarp_ce_handler(cq); 133 + } 134 + 135 + /** 136 + * irdma_process_reserved_ceqe - Handle a CEQE for a reserved CQ. 137 + * @rf: RDMA PCI function. 138 + * @dev: iWARP device. 139 + * @cq_idx: CQ ID. 140 + * 141 + * Context: Atomic 142 + */ 143 + static void irdma_process_reserved_ceqe(struct irdma_pci_f *rf, 144 + struct irdma_sc_dev *dev, u32 cq_idx) 145 + { 146 + struct irdma_sc_cq *cq; 147 + 148 + if (cq_idx == IRDMA_RSVD_CQ_ID_CQP) { 149 + cq = &rf->ccq.sc_cq; 150 + /* CQP CQ lifetime > CEQ. */ 151 + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); 152 + queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); 153 + } else if (cq_idx == IRDMA_RSVD_CQ_ID_ILQ || 154 + cq_idx == IRDMA_RSVD_CQ_ID_IEQ) { 155 + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { 156 + cq = (cq_idx == IRDMA_RSVD_CQ_ID_ILQ) ? 
157 + dev->ilq_cq : dev->ieq_cq; 158 + if (!cq) { 159 + ibdev_warn_ratelimited(to_ibdev(dev), 160 + "Stale ILQ/IEQ CEQE"); 161 + return; 162 + } 163 + writel(cq->cq_uk.cq_id, cq->cq_uk.cq_ack_db); 164 + irdma_puda_ce_handler(rf, cq); 165 + } 166 + } 167 + } 168 + 169 + /** 102 170 * irdma_process_ceq - handle ceq for completions 103 171 * @rf: RDMA PCI function 104 172 * @ceq: ceq having cq for completion ··· 175 107 { 176 108 struct irdma_sc_dev *dev = &rf->sc_dev; 177 109 struct irdma_sc_ceq *sc_ceq; 178 - struct irdma_sc_cq *cq; 179 110 unsigned long flags; 111 + u32 cq_idx; 180 112 181 113 sc_ceq = &ceq->sc_ceq; 182 114 do { 183 115 spin_lock_irqsave(&ceq->ce_lock, flags); 184 - cq = irdma_sc_process_ceq(dev, sc_ceq); 185 - if (!cq) { 116 + 117 + if (!irdma_sc_process_ceq(dev, sc_ceq, &cq_idx)) { 186 118 spin_unlock_irqrestore(&ceq->ce_lock, flags); 187 119 break; 188 120 } 189 121 190 - if (cq->cq_type == IRDMA_CQ_TYPE_IWARP) 191 - irdma_iwarp_ce_handler(cq); 122 + /* Normal CQs must be handled while holding CEQ lock. 
*/ 123 + if (likely(cq_idx > IRDMA_RSVD_CQ_ID_IEQ)) { 124 + irdma_process_normal_ceqe(rf, dev, cq_idx); 125 + spin_unlock_irqrestore(&ceq->ce_lock, flags); 126 + continue; 127 + } 192 128 193 129 spin_unlock_irqrestore(&ceq->ce_lock, flags); 194 130 195 - if (cq->cq_type == IRDMA_CQ_TYPE_CQP) 196 - queue_work(rf->cqp_cmpl_wq, &rf->cqp_cmpl_work); 197 - else if (cq->cq_type == IRDMA_CQ_TYPE_ILQ || 198 - cq->cq_type == IRDMA_CQ_TYPE_IEQ) 199 - irdma_puda_ce_handler(rf, cq); 131 + irdma_process_reserved_ceqe(rf, dev, cq_idx); 200 132 } while (1); 201 133 } 202 134 ··· 1600 1532 int status; 1601 1533 1602 1534 info.type = IRDMA_PUDA_RSRC_TYPE_ILQ; 1603 - info.cq_id = 1; 1604 - info.qp_id = 1; 1535 + info.cq_id = IRDMA_RSVD_CQ_ID_ILQ; 1536 + info.qp_id = IRDMA_RSVD_QP_ID_GSI_ILQ; 1605 1537 info.count = 1; 1606 1538 info.pd_id = 1; 1607 1539 info.abi_ver = IRDMA_ABI_VER; ··· 1630 1562 int status; 1631 1563 1632 1564 info.type = IRDMA_PUDA_RSRC_TYPE_IEQ; 1633 - info.cq_id = 2; 1565 + info.cq_id = IRDMA_RSVD_CQ_ID_IEQ; 1634 1566 info.qp_id = iwdev->vsi.exception_lan_q; 1635 1567 info.count = 1; 1636 1568 info.pd_id = 2; ··· 1936 1868 vsi_info.pf_data_vsi_num = iwdev->vsi_num; 1937 1869 vsi_info.register_qset = rf->gen_ops.register_qset; 1938 1870 vsi_info.unregister_qset = rf->gen_ops.unregister_qset; 1939 - vsi_info.exception_lan_q = 2; 1871 + vsi_info.exception_lan_q = IRDMA_RSVD_QP_ID_IEQ; 1940 1872 irdma_sc_vsi_init(&iwdev->vsi, &vsi_info); 1941 1873 1942 1874 status = irdma_setup_cm_core(iwdev, rf->rdma_ver); ··· 2167 2099 irdma_set_hw_rsrc(rf); 2168 2100 2169 2101 set_bit(0, rf->allocated_mrs); 2170 - set_bit(0, rf->allocated_qps); 2171 - set_bit(0, rf->allocated_cqs); 2102 + set_bit(IRDMA_RSVD_QP_ID_0, rf->allocated_qps); 2103 + set_bit(IRDMA_RSVD_CQ_ID_CQP, rf->allocated_cqs); 2172 2104 set_bit(0, rf->allocated_srqs); 2173 2105 set_bit(0, rf->allocated_pds); 2174 2106 set_bit(0, rf->allocated_arps); 2175 2107 set_bit(0, rf->allocated_ahs); 2176 2108 set_bit(0, 
rf->allocated_mcgs); 2177 - set_bit(2, rf->allocated_qps); /* qp 2 IEQ */ 2178 - set_bit(1, rf->allocated_qps); /* qp 1 ILQ */ 2179 - set_bit(1, rf->allocated_cqs); 2109 + set_bit(IRDMA_RSVD_QP_ID_IEQ, rf->allocated_qps); 2110 + set_bit(IRDMA_RSVD_QP_ID_GSI_ILQ, rf->allocated_qps); 2111 + set_bit(IRDMA_RSVD_CQ_ID_ILQ, rf->allocated_cqs); 2180 2112 set_bit(1, rf->allocated_pds); 2181 - set_bit(2, rf->allocated_cqs); 2113 + set_bit(IRDMA_RSVD_CQ_ID_IEQ, rf->allocated_cqs); 2182 2114 set_bit(2, rf->allocated_pds); 2183 2115 2184 2116 INIT_LIST_HEAD(&rf->mc_qht_list.list);
+2
drivers/infiniband/hw/irdma/main.h
··· 23 23 #include <linux/workqueue.h> 24 24 #include <linux/slab.h> 25 25 #include <linux/io.h> 26 + #include <linux/iopoll.h> 26 27 #include <linux/crc32c.h> 27 28 #include <linux/kthread.h> 28 29 #ifndef CONFIG_64BIT ··· 529 528 void irdma_srq_event(struct irdma_sc_srq *srq); 530 529 void irdma_srq_wq_destroy(struct irdma_pci_f *rf, struct irdma_sc_srq *srq); 531 530 void irdma_cleanup_pending_cqp_op(struct irdma_pci_f *rf); 531 + int irdma_get_timeout_threshold(struct irdma_sc_dev *dev); 532 532 int irdma_hw_modify_qp(struct irdma_device *iwdev, struct irdma_qp *iwqp, 533 533 struct irdma_modify_qp_info *info, bool wait); 534 534 int irdma_qp_suspend_resume(struct irdma_sc_qp *qp, bool suspend);
+14
drivers/infiniband/hw/irdma/puda.c
··· 809 809 dma_free_coherent(dev->hw->device, rsrc->cqmem.size, 810 810 rsrc->cqmem.va, rsrc->cqmem.pa); 811 811 rsrc->cqmem.va = NULL; 812 + } else { 813 + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { 814 + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) 815 + dev->ilq_cq = cq; 816 + else 817 + dev->ieq_cq = cq; 818 + } 812 819 } 813 820 814 821 return ret; ··· 862 855 int ret; 863 856 struct irdma_ccq_cqe_info compl_info; 864 857 struct irdma_sc_dev *dev = rsrc->dev; 858 + 859 + scoped_guard(spinlock_irqsave, &dev->puda_cq_lock) { 860 + if (rsrc->type == IRDMA_PUDA_RSRC_TYPE_ILQ) 861 + dev->ilq_cq = NULL; 862 + else 863 + dev->ieq_cq = NULL; 864 + } 865 865 866 866 if (rsrc->dev->ceq_valid) { 867 867 irdma_cqp_cq_destroy_cmd(dev, &rsrc->cq);
+17 -1
drivers/infiniband/hw/irdma/type.h
··· 239 239 IRDMA_QUEUE_TYPE_SRQ, 240 240 }; 241 241 242 + enum irdma_rsvd_cq_id { 243 + IRDMA_RSVD_CQ_ID_CQP, 244 + IRDMA_RSVD_CQ_ID_ILQ, 245 + IRDMA_RSVD_CQ_ID_IEQ, 246 + }; 247 + 248 + enum irdma_rsvd_qp_id { 249 + IRDMA_RSVD_QP_ID_0, 250 + IRDMA_RSVD_QP_ID_GSI_ILQ, 251 + IRDMA_RSVD_QP_ID_IEQ, 252 + }; 253 + 242 254 struct irdma_sc_dev; 243 255 struct irdma_vsi_pestat; 244 256 ··· 707 695 struct irdma_sc_aeq *aeq; 708 696 struct irdma_sc_ceq *ceq[IRDMA_CEQ_MAX_COUNT]; 709 697 struct irdma_sc_cq *ccq; 698 + spinlock_t puda_cq_lock; 699 + struct irdma_sc_cq *ilq_cq; 700 + struct irdma_sc_cq *ieq_cq; 710 701 const struct irdma_irq_ops *irq_ops; 711 702 struct irdma_qos qos[IRDMA_MAX_USER_PRIORITY]; 712 703 struct irdma_hmc_fpm_misc hmc_fpm_misc; ··· 1347 1332 int irdma_sc_ceq_init(struct irdma_sc_ceq *ceq, 1348 1333 struct irdma_ceq_init_info *info); 1349 1334 void irdma_sc_cleanup_ceqes(struct irdma_sc_cq *cq, struct irdma_sc_ceq *ceq); 1350 - void *irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq); 1335 + bool irdma_sc_process_ceq(struct irdma_sc_dev *dev, struct irdma_sc_ceq *ceq, 1336 + u32 *cq_idx); 1351 1337 1352 1338 int irdma_sc_aeq_init(struct irdma_sc_aeq *aeq, 1353 1339 struct irdma_aeq_init_info *info);
+1 -5
drivers/infiniband/hw/irdma/uk.c
··· 114 114 */ 115 115 void irdma_uk_qp_post_wr(struct irdma_qp_uk *qp) 116 116 { 117 - dma_wmb(); 118 117 writel(qp->qp_id, qp->wqe_alloc_db); 119 118 } 120 119 ··· 1106 1107 1107 1108 set_64bit_val(cq->shadow_area, 32, temp_val); 1108 1109 1109 - dma_wmb(); /* make sure WQE is populated before valid bit is set */ 1110 - 1111 1110 writel(cq->cq_id, cq->cqe_alloc_db); 1112 1111 } 1113 1112 ··· 1405 1408 * from SW for all unprocessed WQEs. For GEN3 and beyond 1406 1409 * FW will generate/flush these CQEs so move to the next CQE 1407 1410 */ 1408 - move_cq_head = qp->uk_attrs->hw_rev <= IRDMA_GEN_2 ? 1409 - false : true; 1411 + move_cq_head = qp->uk_attrs->hw_rev > IRDMA_GEN_2; 1410 1412 } 1411 1413 1412 1414 if (move_cq_head) {
+6 -5
drivers/infiniband/hw/irdma/utils.c
··· 573 573 } 574 574 } 575 575 576 - static int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) 576 + int irdma_get_timeout_threshold(struct irdma_sc_dev *dev) 577 577 { 578 578 u16 time_s = dev->vc_caps.cqp_timeout_s; 579 579 ··· 830 830 return; 831 831 } 832 832 833 - iwdev->rf->cq_table[iwcq->cq_num] = NULL; 833 + /* May be asynchronously sampled by CEQ ISR without holding tbl lock. */ 834 + WRITE_ONCE(iwdev->rf->cq_table[iwcq->cq_num], NULL); 834 835 spin_unlock_irqrestore(&iwdev->rf->cqtable_lock, flags); 835 836 complete(&iwcq->free_cq); 836 837 } ··· 2240 2239 chunk->pg_cnt); 2241 2240 2242 2241 done: 2243 - kfree(chunk->dmainfo.dmaaddrs); 2242 + kvfree(chunk->dmainfo.dmaaddrs); 2244 2243 chunk->dmainfo.dmaaddrs = NULL; 2245 2244 vfree(chunk->vaddr); 2246 2245 chunk->vaddr = NULL; ··· 2257 2256 u32 size; 2258 2257 void *va; 2259 2258 2260 - chunk->dmainfo.dmaaddrs = kzalloc(pg_cnt << 3, GFP_KERNEL); 2259 + chunk->dmainfo.dmaaddrs = kvzalloc(pg_cnt << 3, GFP_KERNEL); 2261 2260 if (!chunk->dmainfo.dmaaddrs) 2262 2261 return -ENOMEM; 2263 2262 ··· 2278 2277 2279 2278 return 0; 2280 2279 err: 2281 - kfree(chunk->dmainfo.dmaaddrs); 2280 + kvfree(chunk->dmainfo.dmaaddrs); 2282 2281 chunk->dmainfo.dmaaddrs = NULL; 2283 2282 2284 2283 return -ENOMEM;
+12 -9
drivers/infiniband/hw/irdma/verbs.c
··· 2669 2669 goto cq_destroy; 2670 2670 } 2671 2671 } 2672 - rf->cq_table[cq_num] = iwcq; 2672 + 2673 2673 init_completion(&iwcq->free_cq); 2674 + 2675 + /* Populate table entry after CQ is fully created. */ 2676 + smp_store_release(&rf->cq_table[cq_num], iwcq); 2674 2677 2675 2678 return 0; 2676 2679 cq_destroy: ··· 5030 5027 } 5031 5028 5032 5029 if (!sleep) { 5033 - int cnt = CQP_COMPL_WAIT_TIME_MS * CQP_TIMEOUT_THRESHOLD; 5030 + const u64 tmout_ms = irdma_get_timeout_threshold(&rf->sc_dev) * 5031 + CQP_COMPL_WAIT_TIME_MS; 5034 5032 5035 - do { 5036 - irdma_cqp_ce_handler(rf, &rf->ccq.sc_cq); 5037 - mdelay(1); 5038 - } while (!ah->sc_ah.ah_info.ah_valid && --cnt); 5039 - 5040 - if (!cnt) { 5041 - ibdev_dbg(&iwdev->ibdev, "VERBS: CQP create AH timed out"); 5033 + if (poll_timeout_us_atomic(irdma_cqp_ce_handler(rf, 5034 + &rf->ccq.sc_cq), 5035 + ah->sc_ah.ah_info.ah_valid, 1, 5036 + tmout_ms * USEC_PER_MSEC, false)) { 5037 + ibdev_dbg(&iwdev->ibdev, 5038 + "VERBS: CQP create AH timed out"); 5042 5039 err = -ETIMEDOUT; 5043 5040 goto err_ah_create; 5044 5041 }
+1 -3
drivers/infiniband/hw/mana/cq.c
··· 24 24 25 25 cq->comp_vector = attr->comp_vector % ibdev->num_comp_vectors; 26 26 cq->cq_handle = INVALID_MANA_HANDLE; 27 + is_rnic_cq = mana_ib_is_rnic(mdev); 27 28 28 29 if (udata) { 29 30 if (udata->inlen < offsetof(struct mana_ib_create_cq, flags)) ··· 35 34 ibdev_dbg(ibdev, "Failed to copy from udata for create cq, %d\n", err); 36 35 return err; 37 36 } 38 - 39 - is_rnic_cq = !!(ucmd.flags & MANA_IB_CREATE_RNIC_CQ); 40 37 41 38 if ((!is_rnic_cq && attr->cqe > mdev->adapter_caps.max_qp_wr) || 42 39 attr->cqe > U32_MAX / COMP_ENTRY_SIZE) { ··· 54 55 ibucontext); 55 56 doorbell = mana_ucontext->doorbell; 56 57 } else { 57 - is_rnic_cq = true; 58 58 if (attr->cqe > U32_MAX / COMP_ENTRY_SIZE / 2 + 1) { 59 59 ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); 60 60 return -EINVAL;
+7
drivers/infiniband/hw/mana/device.c
··· 69 69 .alloc_hw_device_stats = mana_ib_alloc_hw_device_stats, 70 70 }; 71 71 72 + const struct ib_device_ops mana_ib_dev_dm_ops = { 73 + .alloc_dm = mana_ib_alloc_dm, 74 + .dealloc_dm = mana_ib_dealloc_dm, 75 + .reg_dm_mr = mana_ib_reg_dm_mr, 76 + }; 77 + 72 78 static int mana_ib_netdev_event(struct notifier_block *this, 73 79 unsigned long event, void *ptr) 74 80 { ··· 145 139 ib_set_device_ops(&dev->ib_dev, &mana_ib_stats_ops); 146 140 if (dev->adapter_caps.feature_flags & MANA_IB_FEATURE_DEV_COUNTERS_SUPPORT) 147 141 ib_set_device_ops(&dev->ib_dev, &mana_ib_device_stats_ops); 142 + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_dm_ops); 148 143 149 144 ret = mana_ib_create_eqs(dev); 150 145 if (ret) {
+12
drivers/infiniband/hw/mana/mana_ib.h
··· 131 131 mana_handle_t mr_handle; 132 132 }; 133 133 134 + struct mana_ib_dm { 135 + struct ib_dm ibdm; 136 + mana_handle_t dm_handle; 137 + }; 138 + 134 139 struct mana_ib_cq { 135 140 struct ib_cq ibcq; 136 141 struct mana_ib_queue queue; ··· 740 735 u64 iova, int fd, int mr_access_flags, 741 736 struct ib_dmah *dmah, 742 737 struct uverbs_attr_bundle *attrs); 738 + 739 + struct ib_dm *mana_ib_alloc_dm(struct ib_device *dev, struct ib_ucontext *context, 740 + struct ib_dm_alloc_attr *attr, struct uverbs_attr_bundle *attrs); 741 + int mana_ib_dealloc_dm(struct ib_dm *dm, struct uverbs_attr_bundle *attrs); 742 + struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, 743 + struct uverbs_attr_bundle *attrs); 744 + 743 745 #endif
+130
drivers/infiniband/hw/mana/mr.c
··· 40 40 41 41 mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), 42 42 sizeof(resp)); 43 + req.hdr.req.msg_version = GDMA_MESSAGE_V2; 43 44 req.pd_handle = mr_params->pd_handle; 44 45 req.mr_type = mr_params->mr_type; 45 46 ··· 55 54 case GDMA_MR_TYPE_ZBVA: 56 55 req.zbva.dma_region_handle = mr_params->zbva.dma_region_handle; 57 56 req.zbva.access_flags = mr_params->zbva.access_flags; 57 + break; 58 + case GDMA_MR_TYPE_DM: 59 + req.da_ext.length = mr_params->da.length; 60 + req.da.dm_handle = mr_params->da.dm_handle; 61 + req.da.offset = mr_params->da.offset; 62 + req.da.access_flags = mr_params->da.access_flags; 58 63 break; 59 64 default: 60 65 ibdev_dbg(&dev->ib_dev, ··· 323 316 kfree(mr); 324 317 325 318 return 0; 319 + } 320 + 321 + static int mana_ib_gd_alloc_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm, 322 + struct ib_dm_alloc_attr *attr) 323 + { 324 + struct gdma_context *gc = mdev_to_gc(mdev); 325 + struct gdma_alloc_dm_resp resp = {}; 326 + struct gdma_alloc_dm_req req = {}; 327 + int err; 328 + 329 + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOC_DM, sizeof(req), sizeof(resp)); 330 + req.length = attr->length; 331 + req.alignment = attr->alignment; 332 + req.flags = attr->flags; 333 + 334 + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); 335 + if (err || resp.hdr.status) { 336 + if (!err) 337 + err = -EPROTO; 338 + 339 + return err; 340 + } 341 + 342 + dm->dm_handle = resp.dm_handle; 343 + 344 + return 0; 345 + } 346 + 347 + struct ib_dm *mana_ib_alloc_dm(struct ib_device *ibdev, 348 + struct ib_ucontext *context, 349 + struct ib_dm_alloc_attr *attr, 350 + struct uverbs_attr_bundle *attrs) 351 + { 352 + struct mana_ib_dev *dev = container_of(ibdev, struct mana_ib_dev, ib_dev); 353 + struct mana_ib_dm *dm; 354 + int err; 355 + 356 + dm = kzalloc(sizeof(*dm), GFP_KERNEL); 357 + if (!dm) 358 + return ERR_PTR(-ENOMEM); 359 + 360 + err = mana_ib_gd_alloc_dm(dev, dm, attr); 361 + if (err) 362 + goto err_free; 363 + 364 + 
return &dm->ibdm; 365 + 366 + err_free: 367 + kfree(dm); 368 + return ERR_PTR(err); 369 + } 370 + 371 + static int mana_ib_gd_destroy_dm(struct mana_ib_dev *mdev, struct mana_ib_dm *dm) 372 + { 373 + struct gdma_context *gc = mdev_to_gc(mdev); 374 + struct gdma_destroy_dm_resp resp = {}; 375 + struct gdma_destroy_dm_req req = {}; 376 + int err; 377 + 378 + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DM, sizeof(req), sizeof(resp)); 379 + req.dm_handle = dm->dm_handle; 380 + 381 + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); 382 + if (err || resp.hdr.status) { 383 + if (!err) 384 + err = -EPROTO; 385 + 386 + return err; 387 + } 388 + 389 + return 0; 390 + } 391 + 392 + int mana_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs) 393 + { 394 + struct mana_ib_dev *dev = container_of(ibdm->device, struct mana_ib_dev, ib_dev); 395 + struct mana_ib_dm *dm = container_of(ibdm, struct mana_ib_dm, ibdm); 396 + int err; 397 + 398 + err = mana_ib_gd_destroy_dm(dev, dm); 399 + if (err) 400 + return err; 401 + 402 + kfree(dm); 403 + return 0; 404 + } 405 + 406 + struct ib_mr *mana_ib_reg_dm_mr(struct ib_pd *ibpd, struct ib_dm *ibdm, 407 + struct ib_dm_mr_attr *attr, 408 + struct uverbs_attr_bundle *attrs) 409 + { 410 + struct mana_ib_dev *dev = container_of(ibpd->device, struct mana_ib_dev, ib_dev); 411 + struct mana_ib_dm *mana_dm = container_of(ibdm, struct mana_ib_dm, ibdm); 412 + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); 413 + struct gdma_create_mr_params mr_params = {}; 414 + struct mana_ib_mr *mr; 415 + int err; 416 + 417 + attr->access_flags &= ~IB_ACCESS_OPTIONAL; 418 + if (attr->access_flags & ~VALID_MR_FLAGS) 419 + return ERR_PTR(-EOPNOTSUPP); 420 + 421 + mr = kzalloc(sizeof(*mr), GFP_KERNEL); 422 + if (!mr) 423 + return ERR_PTR(-ENOMEM); 424 + 425 + mr_params.pd_handle = pd->pd_handle; 426 + mr_params.mr_type = GDMA_MR_TYPE_DM; 427 + mr_params.da.dm_handle = mana_dm->dm_handle; 428 + 
mr_params.da.offset = attr->offset; 429 + mr_params.da.length = attr->length; 430 + mr_params.da.access_flags = 431 + mana_ib_verbs_to_gdma_access_flags(attr->access_flags); 432 + 433 + err = mana_ib_gd_create_mr(dev, mr, &mr_params); 434 + if (err) 435 + goto err_free; 436 + 437 + return &mr->ibmr; 438 + 439 + err_free: 440 + kfree(mr); 441 + return ERR_PTR(err); 326 442 }
+289 -16
drivers/infiniband/hw/mlx5/main.c
··· 561 561 * of an error it will still be zeroed out. 562 562 * Use native port in case of reps 563 563 */ 564 - if (dev->is_rep) 565 - err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 566 - 1, 0); 567 - else 568 - err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 569 - mdev_port_num, 0); 564 + if (dev->is_rep) { 565 + struct mlx5_eswitch_rep *rep; 566 + 567 + rep = dev->port[port_num - 1].rep; 568 + if (rep) { 569 + mdev = mlx5_eswitch_get_core_dev(rep->esw); 570 + WARN_ON(!mdev); 571 + } 572 + mdev_port_num = 1; 573 + } 574 + 575 + err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN, 576 + mdev_port_num, 0); 577 + 570 578 if (err) 571 579 goto out; 572 580 ext = !!MLX5_GET_ETH_PROTO(ptys_reg, out, true, eth_proto_capability); ··· 1589 1581 return 0; 1590 1582 } 1591 1583 1584 + static int mlx5_ib_query_port_speed_from_port(struct mlx5_ib_dev *dev, 1585 + u32 port_num, u64 *speed) 1586 + { 1587 + struct ib_port_speed_info speed_info; 1588 + struct ib_port_attr attr = {}; 1589 + int err; 1590 + 1591 + err = mlx5_ib_query_port(&dev->ib_dev, port_num, &attr); 1592 + if (err) 1593 + return err; 1594 + 1595 + if (attr.state == IB_PORT_DOWN) { 1596 + *speed = 0; 1597 + return 0; 1598 + } 1599 + 1600 + err = ib_port_attr_to_speed_info(&attr, &speed_info); 1601 + if (err) 1602 + return err; 1603 + 1604 + *speed = speed_info.rate; 1605 + return 0; 1606 + } 1607 + 1608 + static int mlx5_ib_query_port_speed_from_vport(struct mlx5_core_dev *mdev, 1609 + u8 op_mod, u16 vport, 1610 + u8 other_vport, u64 *speed, 1611 + struct mlx5_ib_dev *dev, 1612 + u32 port_num) 1613 + { 1614 + u32 max_tx_speed; 1615 + int err; 1616 + 1617 + err = mlx5_query_vport_max_tx_speed(mdev, op_mod, vport, other_vport, 1618 + &max_tx_speed); 1619 + if (err) 1620 + return err; 1621 + 1622 + if (max_tx_speed == 0) 1623 + /* Value 0 indicates field not supported, fallback */ 1624 + return mlx5_ib_query_port_speed_from_port(dev, port_num, 1625 + speed); 1626 + 
1627 + *speed = max_tx_speed; 1628 + return 0; 1629 + } 1630 + 1631 + static int mlx5_ib_query_port_speed_from_bond(struct mlx5_ib_dev *dev, 1632 + u32 port_num, u64 *speed) 1633 + { 1634 + struct mlx5_core_dev *mdev = dev->mdev; 1635 + u32 bond_speed; 1636 + int err; 1637 + 1638 + err = mlx5_lag_query_bond_speed(mdev, &bond_speed); 1639 + if (err) 1640 + return err; 1641 + 1642 + *speed = bond_speed / MLX5_MAX_TX_SPEED_UNIT; 1643 + 1644 + return 0; 1645 + } 1646 + 1647 + static int mlx5_ib_query_port_speed_non_rep(struct mlx5_ib_dev *dev, 1648 + u32 port_num, u64 *speed) 1649 + { 1650 + u16 op_mod = MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT; 1651 + 1652 + if (mlx5_lag_is_roce(dev->mdev)) 1653 + return mlx5_ib_query_port_speed_from_bond(dev, port_num, 1654 + speed); 1655 + 1656 + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 0, false, 1657 + speed, dev, port_num); 1658 + } 1659 + 1660 + static int mlx5_ib_query_port_speed_rep(struct mlx5_ib_dev *dev, u32 port_num, 1661 + u64 *speed) 1662 + { 1663 + struct mlx5_eswitch_rep *rep; 1664 + struct mlx5_core_dev *mdev; 1665 + u16 op_mod; 1666 + 1667 + if (!dev->port[port_num - 1].rep) { 1668 + mlx5_ib_warn(dev, "Representor doesn't exist for port %u\n", 1669 + port_num); 1670 + return -EINVAL; 1671 + } 1672 + 1673 + rep = dev->port[port_num - 1].rep; 1674 + mdev = mlx5_eswitch_get_core_dev(rep->esw); 1675 + if (!mdev) 1676 + return -ENODEV; 1677 + 1678 + if (rep->vport == MLX5_VPORT_UPLINK) { 1679 + if (mlx5_lag_is_sriov(mdev)) 1680 + return mlx5_ib_query_port_speed_from_bond(dev, 1681 + port_num, 1682 + speed); 1683 + 1684 + return mlx5_ib_query_port_speed_from_port(dev, port_num, 1685 + speed); 1686 + } 1687 + 1688 + op_mod = MLX5_VPORT_STATE_OP_MOD_ESW_VPORT; 1689 + return mlx5_ib_query_port_speed_from_vport(dev->mdev, op_mod, 1690 + rep->vport, true, speed, dev, 1691 + port_num); 1692 + } 1693 + 1694 + int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, u64 *speed) 1695 + { 1696 + struct 
mlx5_ib_dev *dev = to_mdev(ibdev); 1697 + 1698 + if (mlx5_ib_port_link_layer(ibdev, port_num) == 1699 + IB_LINK_LAYER_INFINIBAND || mlx5_core_mp_enabled(dev->mdev)) 1700 + return mlx5_ib_query_port_speed_from_port(dev, port_num, speed); 1701 + else if (!dev->is_rep) 1702 + return mlx5_ib_query_port_speed_non_rep(dev, port_num, speed); 1703 + else 1704 + return mlx5_ib_query_port_speed_rep(dev, port_num, speed); 1705 + } 1706 + 1592 1707 static int mlx5_ib_query_gid(struct ib_device *ibdev, u32 port, int index, 1593 1708 union ib_gid *gid) 1594 1709 { ··· 2454 2323 virt_to_page(dev->mdev->clock_info)); 2455 2324 } 2456 2325 2326 + static int phys_addr_to_bar(struct pci_dev *pdev, phys_addr_t pa) 2327 + { 2328 + resource_size_t start, end; 2329 + int bar; 2330 + 2331 + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { 2332 + /* Skip BARs not present or not memory-mapped */ 2333 + if (!(pci_resource_flags(pdev, bar) & IORESOURCE_MEM)) 2334 + continue; 2335 + 2336 + start = pci_resource_start(pdev, bar); 2337 + end = pci_resource_end(pdev, bar); 2338 + 2339 + if (!start || !end) 2340 + continue; 2341 + 2342 + if (pa >= start && pa <= end) 2343 + return bar; 2344 + } 2345 + 2346 + return -1; 2347 + } 2348 + 2349 + static int mlx5_ib_mmap_get_pfns(struct rdma_user_mmap_entry *entry, 2350 + struct phys_vec *phys_vec, 2351 + struct p2pdma_provider **provider) 2352 + { 2353 + struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); 2354 + struct pci_dev *pdev = to_mdev(entry->ucontext->device)->mdev->pdev; 2355 + int bar; 2356 + 2357 + phys_vec->paddr = mentry->address; 2358 + phys_vec->len = entry->npages * PAGE_SIZE; 2359 + 2360 + bar = phys_addr_to_bar(pdev, phys_vec->paddr); 2361 + if (bar < 0) 2362 + return -EINVAL; 2363 + 2364 + *provider = pcim_p2pdma_provider(pdev, bar); 2365 + /* If the kernel was not compiled with CONFIG_PCI_P2PDMA the 2366 + * functionality is not supported. 
2367 + */ 2368 + if (!*provider) 2369 + return -EOPNOTSUPP; 2370 + 2371 + return 0; 2372 + } 2373 + 2374 + static struct rdma_user_mmap_entry * 2375 + mlx5_ib_pgoff_to_mmap_entry(struct ib_ucontext *ucontext, off_t pg_off) 2376 + { 2377 + unsigned long entry_pgoff; 2378 + unsigned long idx; 2379 + u8 command; 2380 + 2381 + pg_off = pg_off >> PAGE_SHIFT; 2382 + command = get_command(pg_off); 2383 + idx = get_extended_index(pg_off); 2384 + 2385 + entry_pgoff = command << 16 | idx; 2386 + 2387 + return rdma_user_mmap_entry_get_pgoff(ucontext, entry_pgoff); 2388 + } 2389 + 2457 2390 static void mlx5_ib_mmap_free(struct rdma_user_mmap_entry *entry) 2458 2391 { 2459 2392 struct mlx5_user_mmap_entry *mentry = to_mmmap(entry); ··· 3033 2838 case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE: 3034 2839 case MLX5_PORT_CHANGE_SUBTYPE_DOWN: 3035 2840 case MLX5_PORT_CHANGE_SUBTYPE_INITIALIZED: 2841 + if (ibdev->ib_active) { 2842 + struct ib_event speed_event = {}; 2843 + 2844 + speed_event.device = &ibdev->ib_dev; 2845 + speed_event.event = IB_EVENT_DEVICE_SPEED_CHANGE; 2846 + ib_dispatch_event(&speed_event); 2847 + } 2848 + 3036 2849 /* In RoCE, port up/down events are handled in 3037 2850 * mlx5_netdev_event(). 
3038 2851 */ ··· 3081 2878 container_of(_work, struct mlx5_ib_event_work, work); 3082 2879 struct mlx5_ib_dev *ibdev; 3083 2880 struct ib_event ibev; 3084 - bool fatal = false; 3085 2881 3086 2882 if (work->is_slave) { 3087 2883 ibdev = mlx5_ib_get_ibdev_from_mpi(work->mpi); ··· 3091 2889 } 3092 2890 3093 2891 switch (work->event) { 3094 - case MLX5_DEV_EVENT_SYS_ERROR: 3095 - ibev.event = IB_EVENT_DEVICE_FATAL; 3096 - mlx5_ib_handle_internal_error(ibdev); 3097 - ibev.element.port_num = (u8)(unsigned long)work->param; 3098 - fatal = true; 3099 - break; 3100 2892 case MLX5_EVENT_TYPE_PORT_CHANGE: 3101 2893 if (handle_port_change(ibdev, work->param, &ibev)) 3102 2894 goto out; ··· 3112 2916 if (ibdev->ib_active) 3113 2917 ib_dispatch_event(&ibev); 3114 2918 3115 - if (fatal) 3116 - ibdev->ib_active = false; 3117 2919 out: 3118 2920 kfree(work); 3119 2921 } ··· 3153 2959 queue_work(mlx5_ib_event_wq, &work->work); 3154 2960 3155 2961 return NOTIFY_OK; 2962 + } 2963 + 2964 + static void mlx5_ib_handle_sys_error_event(struct work_struct *_work) 2965 + { 2966 + struct mlx5_ib_event_work *work = 2967 + container_of(_work, struct mlx5_ib_event_work, work); 2968 + struct mlx5_ib_dev *ibdev = work->dev; 2969 + struct ib_event ibev; 2970 + 2971 + ibev.event = IB_EVENT_DEVICE_FATAL; 2972 + mlx5_ib_handle_internal_error(ibdev); 2973 + ibev.element.port_num = (u8)(unsigned long)work->param; 2974 + ibev.device = &ibdev->ib_dev; 2975 + 2976 + if (!rdma_is_port_valid(&ibdev->ib_dev, ibev.element.port_num)) { 2977 + mlx5_ib_warn(ibdev, "warning: event on port %d\n", ibev.element.port_num); 2978 + goto out; 2979 + } 2980 + 2981 + if (ibdev->ib_active) 2982 + ib_dispatch_event(&ibev); 2983 + 2984 + ibdev->ib_active = false; 2985 + out: 2986 + kfree(work); 2987 + } 2988 + 2989 + static int mlx5_ib_sys_error_event(struct notifier_block *nb, 2990 + unsigned long event, void *param) 2991 + { 2992 + struct mlx5_ib_event_work *work; 2993 + 2994 + if (event != MLX5_DEV_EVENT_SYS_ERROR) 2995 + 
return NOTIFY_DONE; 2996 + 2997 + work = kmalloc(sizeof(*work), GFP_ATOMIC); 2998 + if (!work) 2999 + return NOTIFY_DONE; 3000 + 3001 + INIT_WORK(&work->work, mlx5_ib_handle_sys_error_event); 3002 + work->dev = container_of(nb, struct mlx5_ib_dev, sys_error_events); 3003 + work->is_slave = false; 3004 + work->param = param; 3005 + work->event = event; 3006 + 3007 + queue_work(mlx5_ib_event_wq, &work->work); 3008 + 3009 + return NOTIFY_OK; 3010 + } 3011 + 3012 + static int mlx5_ib_stage_sys_error_notifier_init(struct mlx5_ib_dev *dev) 3013 + { 3014 + dev->sys_error_events.notifier_call = mlx5_ib_sys_error_event; 3015 + mlx5_notifier_register(dev->mdev, &dev->sys_error_events); 3016 + return 0; 3017 + } 3018 + 3019 + static void mlx5_ib_stage_sys_error_notifier_cleanup(struct mlx5_ib_dev *dev) 3020 + { 3021 + mlx5_notifier_unregister(dev->mdev, &dev->sys_error_events); 3156 3022 } 3157 3023 3158 3024 static int mlx5_ib_get_plane_num(struct mlx5_core_dev *mdev, u8 *num_plane) ··· 4483 4229 if (err) 4484 4230 goto err_mp; 4485 4231 4232 + err = pcim_p2pdma_init(mdev->pdev); 4233 + if (err && err != -EOPNOTSUPP) 4234 + goto err_dd; 4235 + 4486 4236 return 0; 4237 + err_dd: 4238 + mlx5_ib_data_direct_cleanup(dev); 4487 4239 err_mp: 4488 4240 mlx5_ib_cleanup_multiport_master(dev); 4489 4241 err: ··· 4541 4281 .map_mr_sg_pi = mlx5_ib_map_mr_sg_pi, 4542 4282 .mmap = mlx5_ib_mmap, 4543 4283 .mmap_free = mlx5_ib_mmap_free, 4284 + .mmap_get_pfns = mlx5_ib_mmap_get_pfns, 4544 4285 .modify_cq = mlx5_ib_modify_cq, 4545 4286 .modify_device = mlx5_ib_modify_device, 4546 4287 .modify_port = mlx5_ib_modify_port, 4547 4288 .modify_qp = mlx5_ib_modify_qp, 4548 4289 .modify_srq = mlx5_ib_modify_srq, 4290 + .pgoff_to_mmap_entry = mlx5_ib_pgoff_to_mmap_entry, 4549 4291 .pre_destroy_cq = mlx5_ib_pre_destroy_cq, 4550 4292 .poll_cq = mlx5_ib_poll_cq, 4551 4293 .post_destroy_cq = mlx5_ib_post_destroy_cq, ··· 4559 4297 .query_device = mlx5_ib_query_device, 4560 4298 .query_gid = 
mlx5_ib_query_gid, 4561 4299 .query_pkey = mlx5_ib_query_pkey, 4300 + .query_port_speed = mlx5_ib_query_port_speed, 4562 4301 .query_qp = mlx5_ib_query_qp, 4563 4302 .query_srq = mlx5_ib_query_srq, 4564 4303 .query_ucontext = mlx5_ib_query_ucontext, ··· 4729 4466 MLX5_HCA_CAP_2_GENERAL_OBJECT_TYPES_RDMA_CTRL) { 4730 4467 err = mlx5_ib_init_ucaps(dev); 4731 4468 if (err) 4732 - return err; 4469 + goto err_ucaps; 4733 4470 } 4734 4471 4735 4472 dev->ib_dev.use_cq_dim = true; 4736 4473 4737 4474 return 0; 4475 + 4476 + err_ucaps: 4477 + bitmap_free(dev->var_table.bitmap); 4478 + return err; 4738 4479 } 4739 4480 4740 4481 static const struct ib_device_ops mlx5_ib_dev_port_ops = { ··· 5074 4807 STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, 5075 4808 mlx5_ib_devx_init, 5076 4809 mlx5_ib_devx_cleanup), 4810 + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, 4811 + mlx5_ib_stage_sys_error_notifier_init, 4812 + mlx5_ib_stage_sys_error_notifier_cleanup), 5077 4813 STAGE_CREATE(MLX5_IB_STAGE_IB_REG, 5078 4814 mlx5_ib_stage_ib_reg_init, 5079 4815 mlx5_ib_stage_ib_reg_cleanup), ··· 5134 4864 STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID, 5135 4865 mlx5_ib_devx_init, 5136 4866 mlx5_ib_devx_cleanup), 4867 + STAGE_CREATE(MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, 4868 + mlx5_ib_stage_sys_error_notifier_init, 4869 + mlx5_ib_stage_sys_error_notifier_cleanup), 5137 4870 STAGE_CREATE(MLX5_IB_STAGE_IB_REG, 5138 4871 mlx5_ib_stage_ib_reg_init, 5139 4872 mlx5_ib_stage_ib_reg_cleanup),
+4
drivers/infiniband/hw/mlx5/mlx5_ib.h
··· 1007 1007 MLX5_IB_STAGE_BFREG, 1008 1008 MLX5_IB_STAGE_PRE_IB_REG_UMR, 1009 1009 MLX5_IB_STAGE_WHITELIST_UID, 1010 + MLX5_IB_STAGE_SYS_ERROR_NOTIFIER, 1010 1011 MLX5_IB_STAGE_IB_REG, 1011 1012 MLX5_IB_STAGE_DEVICE_NOTIFIER, 1012 1013 MLX5_IB_STAGE_POST_IB_REG_UMR, ··· 1166 1165 /* protect accessing data_direct_dev */ 1167 1166 struct mutex data_direct_lock; 1168 1167 struct notifier_block mdev_events; 1168 + struct notifier_block sys_error_events; 1169 1169 struct notifier_block lag_events; 1170 1170 int num_ports; 1171 1171 /* serialize update of capability mask ··· 1437 1435 struct ib_port_attr *props); 1438 1436 int mlx5_ib_query_port(struct ib_device *ibdev, u32 port, 1439 1437 struct ib_port_attr *props); 1438 + int mlx5_ib_query_port_speed(struct ib_device *ibdev, u32 port_num, 1439 + u64 *speed); 1440 1440 void mlx5_ib_populate_pas(struct ib_umem *umem, size_t page_size, __be64 *pas, 1441 1441 u64 access_flags); 1442 1442 int mlx5_ib_get_cqe_size(struct ib_cq *ibcq);
+6 -5
drivers/infiniband/hw/mlx5/mr.c
··· 1646 1646 offset, length, fd, 1647 1647 access_flags, 1648 1648 &mlx5_ib_dmabuf_attach_ops); 1649 - else 1649 + else if (dma_device) 1650 1650 umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, 1651 1651 dma_device, offset, length, 1652 1652 fd, access_flags); 1653 + else 1654 + umem_dmabuf = ib_umem_dmabuf_get_pinned( 1655 + &dev->ib_dev, offset, length, fd, access_flags); 1653 1656 1654 1657 if (IS_ERR(umem_dmabuf)) { 1655 1658 mlx5_ib_dbg(dev, "umem_dmabuf get failed (%pe)\n", umem_dmabuf); ··· 1785 1782 return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, 1786 1783 fd, access_flags); 1787 1784 1788 - return reg_user_mr_dmabuf(pd, pd->device->dma_device, 1789 - offset, length, virt_addr, 1790 - fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT, 1791 - dmah); 1785 + return reg_user_mr_dmabuf(pd, NULL, offset, length, virt_addr, fd, 1786 + access_flags, MLX5_MKC_ACCESS_MODE_MTT, dmah); 1792 1787 } 1793 1788 1794 1789 /*
+5
drivers/infiniband/hw/mlx5/qp.c
··· 4362 4362 optpar |= ib_mask_to_mlx5_opt(attr_mask); 4363 4363 optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; 4364 4364 4365 + if (attr_mask & IB_QP_RATE_LIMIT && qp->type != IB_QPT_RAW_PACKET) { 4366 + err = -EOPNOTSUPP; 4367 + goto out; 4368 + } 4369 + 4365 4370 if (qp->type == IB_QPT_RAW_PACKET || 4366 4371 qp->flags & IB_QP_CREATE_SOURCE_QPN) { 4367 4372 struct mlx5_modify_raw_qp_param raw_qp_param = {};
+2 -2
drivers/infiniband/hw/mlx5/std_types.c
··· 195 195 int out_len = uverbs_attr_get_len(attrs, 196 196 MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); 197 197 u32 dev_path_len; 198 - char *dev_path; 198 + char *dev_path = NULL; 199 199 int ret; 200 200 201 201 c = to_mucontext(ib_uverbs_get_ucontext(attrs)); ··· 223 223 224 224 ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, 225 225 dev_path_len); 226 - kfree(dev_path); 227 226 228 227 end: 228 + kfree(dev_path); 229 229 mutex_unlock(&dev->data_direct_lock); 230 230 return ret; 231 231 }
-2
drivers/infiniband/hw/ocrdma/ocrdma.h
··· 67 67 #define OC_SKH_DEVICE_VF 0x728 68 68 #define OCRDMA_MAX_AH 512 69 69 70 - #define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) 71 - 72 70 #define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo) 73 71 #define EQ_INTR_PER_SEC_THRSH_HI 150000 74 72 #define EQ_INTR_PER_SEC_THRSH_LOW 100000
-20
drivers/infiniband/hw/qedr/qedr.h
··· 53 53 DP_NAME(dev) ? DP_NAME(dev) : "", ## __VA_ARGS__) 54 54 55 55 #define QEDR_MSG_INIT "INIT" 56 - #define QEDR_MSG_MISC "MISC" 57 56 #define QEDR_MSG_CQ " CQ" 58 57 #define QEDR_MSG_MR " MR" 59 - #define QEDR_MSG_RQ " RQ" 60 - #define QEDR_MSG_SQ " SQ" 61 58 #define QEDR_MSG_QP " QP" 62 59 #define QEDR_MSG_SRQ " SRQ" 63 60 #define QEDR_MSG_GSI " GSI" ··· 62 65 63 66 #define QEDR_CQ_MAGIC_NUMBER (0x11223344) 64 67 65 - #define FW_PAGE_SIZE (RDMA_RING_PAGE_SIZE) 66 68 #define FW_PAGE_SHIFT (12) 67 69 68 70 struct qedr_dev; ··· 174 178 u8 user_dpm_enabled; 175 179 }; 176 180 177 - #define QEDR_MAX_SQ_PBL (0x8000) 178 181 #define QEDR_MAX_SQ_PBL_ENTRIES (0x10000 / sizeof(void *)) 179 182 #define QEDR_SQE_ELEMENT_SIZE (sizeof(struct rdma_sq_sge)) 180 183 #define QEDR_MAX_SQE_ELEMENTS_PER_SQE (ROCE_REQ_MAX_SINGLE_SQ_WQE_SIZE / \ 181 - QEDR_SQE_ELEMENT_SIZE) 182 - #define QEDR_MAX_SQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ 183 184 QEDR_SQE_ELEMENT_SIZE) 184 185 #define QEDR_MAX_SQE ((QEDR_MAX_SQ_PBL_ENTRIES) *\ 185 186 (RDMA_RING_PAGE_SIZE) / \ 186 187 (QEDR_SQE_ELEMENT_SIZE) /\ 187 188 (QEDR_MAX_SQE_ELEMENTS_PER_SQE)) 188 189 /* RQ */ 189 - #define QEDR_MAX_RQ_PBL (0x2000) 190 190 #define QEDR_MAX_RQ_PBL_ENTRIES (0x10000 / sizeof(void *)) 191 191 #define QEDR_RQE_ELEMENT_SIZE (sizeof(struct rdma_rq_sge)) 192 192 #define QEDR_MAX_RQE_ELEMENTS_PER_RQE (RDMA_MAX_SGE_PER_RQ_WQE) 193 - #define QEDR_MAX_RQE_ELEMENTS_PER_PAGE ((RDMA_RING_PAGE_SIZE) / \ 194 - QEDR_RQE_ELEMENT_SIZE) 195 193 #define QEDR_MAX_RQE ((QEDR_MAX_RQ_PBL_ENTRIES) *\ 196 194 (RDMA_RING_PAGE_SIZE) / \ 197 195 (QEDR_RQE_ELEMENT_SIZE) /\ ··· 200 210 201 211 #define QEDR_ROCE_MAX_CNQ_SIZE (0x4000) 202 212 203 - #define QEDR_MAX_PORT (1) 204 213 #define QEDR_PORT (1) 205 214 206 - #define QEDR_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME) 207 - 208 - #define QEDR_ROCE_PKEY_MAX 1 209 215 #define QEDR_ROCE_PKEY_TABLE_LEN 1 210 216 #define QEDR_ROCE_PKEY_DEFAULT 0xffff 211 217 ··· 321 335 
void __iomem *iwarp_db2; 322 336 union db_prod32 iwarp_db2_data; 323 337 }; 324 - 325 - #define QEDR_INC_SW_IDX(p_info, index) \ 326 - do { \ 327 - p_info->index = (p_info->index + 1) & \ 328 - qed_chain_get_capacity(p_info->pbl) \ 329 - } while (0) 330 338 331 339 struct qedr_srq_hwq_info { 332 340 u32 max_sges;
+3
drivers/infiniband/sw/rxe/rxe_comp.c
··· 119 119 120 120 rxe_dbg_qp(qp, "retransmit timer fired\n"); 121 121 122 + if (!rxe_get(qp)) 123 + return; 122 124 spin_lock_irqsave(&qp->state_lock, flags); 123 125 if (qp->valid) { 124 126 qp->comp.timeout = 1; 125 127 rxe_sched_task(&qp->send_task); 126 128 } 127 129 spin_unlock_irqrestore(&qp->state_lock, flags); 130 + rxe_put(qp); 128 131 } 129 132 130 133 void rxe_comp_queue_pkt(struct rxe_qp *qp, struct sk_buff *skb)
+187 -99
drivers/infiniband/sw/rxe/rxe_mr.c
··· 72 72 mr->ibmr.type = IB_MR_TYPE_DMA; 73 73 } 74 74 75 + /* 76 + * Convert iova to page_info index. The page_info stores pages of size 77 + * PAGE_SIZE, but MRs can have different page sizes. This function 78 + * handles the conversion for all cases: 79 + * 80 + * 1. mr->page_size > PAGE_SIZE: 81 + * The MR's iova may not be aligned to mr->page_size. We use the 82 + * aligned base (iova & page_mask) as reference, then calculate 83 + * which PAGE_SIZE sub-page the iova falls into. 84 + * 85 + * 2. mr->page_size <= PAGE_SIZE: 86 + * Use simple shift arithmetic since each page_info entry corresponds 87 + * to one or more MR pages. 88 + */ 75 89 static unsigned long rxe_mr_iova_to_index(struct rxe_mr *mr, u64 iova) 76 90 { 77 - return (iova >> mr->page_shift) - (mr->ibmr.iova >> mr->page_shift); 91 + int idx; 92 + 93 + if (mr_page_size(mr) > PAGE_SIZE) 94 + idx = (iova - (mr->ibmr.iova & mr->page_mask)) >> PAGE_SHIFT; 95 + else 96 + idx = (iova >> mr->page_shift) - 97 + (mr->ibmr.iova >> mr->page_shift); 98 + 99 + WARN_ON(idx >= mr->nbuf); 100 + return idx; 78 101 } 79 102 103 + /* 104 + * Convert iova to offset within the page_info entry. 105 + * 106 + * For mr_page_size > PAGE_SIZE, the offset is within the system page. 107 + * For mr_page_size <= PAGE_SIZE, the offset is within the MR page size. 
108 + */ 80 109 static unsigned long rxe_mr_iova_to_page_offset(struct rxe_mr *mr, u64 iova) 81 110 { 82 - return iova & (mr_page_size(mr) - 1); 111 + if (mr_page_size(mr) > PAGE_SIZE) 112 + return iova & (PAGE_SIZE - 1); 113 + else 114 + return iova & (mr_page_size(mr) - 1); 83 115 } 84 116 85 117 static bool is_pmem_page(struct page *pg) ··· 125 93 126 94 static int rxe_mr_fill_pages_from_sgt(struct rxe_mr *mr, struct sg_table *sgt) 127 95 { 128 - XA_STATE(xas, &mr->page_list, 0); 129 96 struct sg_page_iter sg_iter; 130 97 struct page *page; 131 98 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 99 + 100 + WARN_ON(mr_page_size(mr) != PAGE_SIZE); 132 101 133 102 __sg_page_iter_start(&sg_iter, sgt->sgl, sgt->orig_nents, 0); 134 103 if (!__sg_page_iter_next(&sg_iter)) 135 104 return 0; 136 105 137 - do { 138 - xas_lock(&xas); 139 - while (true) { 140 - page = sg_page_iter_page(&sg_iter); 106 + while (true) { 107 + page = sg_page_iter_page(&sg_iter); 141 108 142 - if (persistent && !is_pmem_page(page)) { 143 - rxe_dbg_mr(mr, "Page can't be persistent\n"); 144 - xas_set_err(&xas, -EINVAL); 145 - break; 146 - } 147 - 148 - xas_store(&xas, page); 149 - if (xas_error(&xas)) 150 - break; 151 - xas_next(&xas); 152 - if (!__sg_page_iter_next(&sg_iter)) 153 - break; 109 + if (persistent && !is_pmem_page(page)) { 110 + rxe_dbg_mr(mr, "Page can't be persistent\n"); 111 + return -EINVAL; 154 112 } 155 - xas_unlock(&xas); 156 - } while (xas_nomem(&xas, GFP_KERNEL)); 157 113 158 - return xas_error(&xas); 114 + mr->page_info[mr->nbuf].page = page; 115 + mr->page_info[mr->nbuf].offset = 0; 116 + mr->nbuf++; 117 + 118 + if (!__sg_page_iter_next(&sg_iter)) 119 + break; 120 + } 121 + 122 + return 0; 123 + } 124 + 125 + static int __alloc_mr_page_info(struct rxe_mr *mr, int num_pages) 126 + { 127 + mr->page_info = kcalloc(num_pages, sizeof(struct rxe_mr_page), 128 + GFP_KERNEL); 129 + if (!mr->page_info) 130 + return -ENOMEM; 131 + 132 + mr->max_allowed_buf = num_pages; 
133 + mr->nbuf = 0; 134 + 135 + return 0; 136 + } 137 + 138 + static int alloc_mr_page_info(struct rxe_mr *mr, int num_pages) 139 + { 140 + int ret; 141 + 142 + WARN_ON(mr->num_buf); 143 + ret = __alloc_mr_page_info(mr, num_pages); 144 + if (ret) 145 + return ret; 146 + 147 + mr->num_buf = num_pages; 148 + 149 + return 0; 150 + } 151 + 152 + static void free_mr_page_info(struct rxe_mr *mr) 153 + { 154 + if (!mr->page_info) 155 + return; 156 + 157 + kfree(mr->page_info); 158 + mr->page_info = NULL; 159 159 } 160 160 161 161 int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, ··· 198 134 199 135 rxe_mr_init(access, mr); 200 136 201 - xa_init(&mr->page_list); 202 - 203 137 umem = ib_umem_get(&rxe->ib_dev, start, length, access); 204 138 if (IS_ERR(umem)) { 205 139 rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n", ··· 205 143 return PTR_ERR(umem); 206 144 } 207 145 146 + err = alloc_mr_page_info(mr, ib_umem_num_pages(umem)); 147 + if (err) 148 + goto err2; 149 + 208 150 err = rxe_mr_fill_pages_from_sgt(mr, &umem->sgt_append.sgt); 209 - if (err) { 210 - ib_umem_release(umem); 211 - return err; 212 - } 151 + if (err) 152 + goto err1; 213 153 214 154 mr->umem = umem; 215 155 mr->ibmr.type = IB_MR_TYPE_USER; 216 156 mr->state = RXE_MR_STATE_VALID; 217 157 218 158 return 0; 219 - } 220 - 221 - static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf) 222 - { 223 - XA_STATE(xas, &mr->page_list, 0); 224 - int i = 0; 225 - int err; 226 - 227 - xa_init(&mr->page_list); 228 - 229 - do { 230 - xas_lock(&xas); 231 - while (i != num_buf) { 232 - xas_store(&xas, XA_ZERO_ENTRY); 233 - if (xas_error(&xas)) 234 - break; 235 - xas_next(&xas); 236 - i++; 237 - } 238 - xas_unlock(&xas); 239 - } while (xas_nomem(&xas, GFP_KERNEL)); 240 - 241 - err = xas_error(&xas); 242 - if (err) 243 - return err; 244 - 245 - mr->num_buf = num_buf; 246 - 247 - return 0; 159 + err1: 160 + free_mr_page_info(mr); 161 + err2: 162 + ib_umem_release(umem); 163 + return err; 248 164 } 249 165 
250 166 int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr) ··· 232 192 /* always allow remote access for FMRs */ 233 193 rxe_mr_init(RXE_ACCESS_REMOTE, mr); 234 194 235 - err = rxe_mr_alloc(mr, max_pages); 195 + err = alloc_mr_page_info(mr, max_pages); 236 196 if (err) 237 197 goto err1; 238 198 ··· 245 205 return err; 246 206 } 247 207 208 + /* 209 + * I) MRs with page_size >= PAGE_SIZE, 210 + * Split a large MR page (mr->page_size) into multiple PAGE_SIZE 211 + * sub-pages and store them in page_info, offset is always 0. 212 + * 213 + * Called when mr->page_size > PAGE_SIZE. Each call to rxe_set_page() 214 + * represents one mr->page_size region, which we must split into 215 + * (mr->page_size >> PAGE_SHIFT) individual pages. 216 + * 217 + * II) MRs with page_size < PAGE_SIZE, 218 + * Save each PAGE_SIZE page and its offset within the system page in page_info. 219 + */ 248 220 static int rxe_set_page(struct ib_mr *ibmr, u64 dma_addr) 249 221 { 250 222 struct rxe_mr *mr = to_rmr(ibmr); 251 - struct page *page = ib_virt_dma_to_page(dma_addr); 252 223 bool persistent = !!(mr->access & IB_ACCESS_FLUSH_PERSISTENT); 253 - int err; 224 + u32 i, pages_per_mr = mr_page_size(mr) >> PAGE_SHIFT; 254 225 255 - if (persistent && !is_pmem_page(page)) { 256 - rxe_dbg_mr(mr, "Page cannot be persistent\n"); 257 - return -EINVAL; 226 + pages_per_mr = MAX(1, pages_per_mr); 227 + 228 + for (i = 0; i < pages_per_mr; i++) { 229 + u64 addr = dma_addr + i * PAGE_SIZE; 230 + struct page *sub_page = ib_virt_dma_to_page(addr); 231 + 232 + if (unlikely(mr->nbuf >= mr->max_allowed_buf)) 233 + return -ENOMEM; 234 + 235 + if (persistent && !is_pmem_page(sub_page)) { 236 + rxe_dbg_mr(mr, "Page cannot be persistent\n"); 237 + return -EINVAL; 238 + } 239 + 240 + mr->page_info[mr->nbuf].page = sub_page; 241 + mr->page_info[mr->nbuf].offset = addr & (PAGE_SIZE - 1); 242 + mr->nbuf++; 258 243 } 259 244 260 - if (unlikely(mr->nbuf == mr->num_buf)) 261 - return -ENOMEM; 262 - 263 - err = 
xa_err(xa_store(&mr->page_list, mr->nbuf, page, GFP_KERNEL)); 264 - if (err) 265 - return err; 266 - 267 - mr->nbuf++; 268 245 return 0; 269 246 } 270 247 ··· 291 234 struct rxe_mr *mr = to_rmr(ibmr); 292 235 unsigned int page_size = mr_page_size(mr); 293 236 237 + /* 238 + * Ensure page_size and PAGE_SIZE are compatible for mapping. 239 + * We require one to be a multiple of the other for correct 240 + * iova-to-page conversion. 241 + */ 242 + if (!IS_ALIGNED(page_size, PAGE_SIZE) && 243 + !IS_ALIGNED(PAGE_SIZE, page_size)) { 244 + rxe_dbg_mr(mr, "MR page size %u must be compatible with PAGE_SIZE %lu\n", 245 + page_size, PAGE_SIZE); 246 + return -EINVAL; 247 + } 248 + 249 + if (mr_page_size(mr) > PAGE_SIZE) { 250 + /* resize page_info if needed */ 251 + u32 map_mr_pages = (page_size >> PAGE_SHIFT) * mr->num_buf; 252 + 253 + if (map_mr_pages > mr->max_allowed_buf) { 254 + rxe_dbg_mr(mr, "requested pages %u exceed max %u\n", 255 + map_mr_pages, mr->max_allowed_buf); 256 + free_mr_page_info(mr); 257 + if (__alloc_mr_page_info(mr, map_mr_pages)) 258 + return -ENOMEM; 259 + } 260 + } 261 + 294 262 mr->nbuf = 0; 295 263 mr->page_shift = ilog2(page_size); 296 264 mr->page_mask = ~((u64)page_size - 1); 297 - mr->page_offset = mr->ibmr.iova & (page_size - 1); 298 265 299 266 return ib_sg_to_pages(ibmr, sgl, sg_nents, sg_offset, rxe_set_page); 300 267 } ··· 326 245 static int rxe_mr_copy_xarray(struct rxe_mr *mr, u64 iova, void *addr, 327 246 unsigned int length, enum rxe_mr_copy_dir dir) 328 247 { 329 - unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); 330 - unsigned long index = rxe_mr_iova_to_index(mr, iova); 331 248 unsigned int bytes; 332 - struct page *page; 333 - void *va; 249 + u8 *va; 334 250 335 251 while (length) { 336 - page = xa_load(&mr->page_list, index); 337 - if (!page) 252 + unsigned long index = rxe_mr_iova_to_index(mr, iova); 253 + struct rxe_mr_page *info = &mr->page_info[index]; 254 + unsigned int page_offset = 
rxe_mr_iova_to_page_offset(mr, iova); 255 + 256 + if (!info->page) 338 257 return -EFAULT; 339 258 340 - bytes = min_t(unsigned int, length, 341 - mr_page_size(mr) - page_offset); 342 - va = kmap_local_page(page); 259 + page_offset += info->offset; 260 + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); 261 + va = kmap_local_page(info->page); 262 + 343 263 if (dir == RXE_FROM_MR_OBJ) 344 264 memcpy(addr, va + page_offset, bytes); 345 265 else 346 266 memcpy(va + page_offset, addr, bytes); 347 267 kunmap_local(va); 348 268 349 - page_offset = 0; 350 269 addr += bytes; 270 + iova += bytes; 351 271 length -= bytes; 352 - index++; 353 272 } 354 273 355 274 return 0; ··· 507 426 508 427 static int rxe_mr_flush_pmem_iova(struct rxe_mr *mr, u64 iova, unsigned int length) 509 428 { 510 - unsigned int page_offset; 511 - unsigned long index; 512 - struct page *page; 513 429 unsigned int bytes; 514 430 int err; 515 431 u8 *va; ··· 516 438 return err; 517 439 518 440 while (length > 0) { 519 - index = rxe_mr_iova_to_index(mr, iova); 520 - page = xa_load(&mr->page_list, index); 521 - page_offset = rxe_mr_iova_to_page_offset(mr, iova); 522 - if (!page) 523 - return -EFAULT; 524 - bytes = min_t(unsigned int, length, 525 - mr_page_size(mr) - page_offset); 441 + unsigned long index = rxe_mr_iova_to_index(mr, iova); 442 + struct rxe_mr_page *info = &mr->page_info[index]; 443 + unsigned int page_offset = rxe_mr_iova_to_page_offset(mr, iova); 526 444 527 - va = kmap_local_page(page); 445 + if (!info->page) 446 + return -EFAULT; 447 + 448 + page_offset += info->offset; 449 + bytes = min_t(unsigned int, length, PAGE_SIZE - page_offset); 450 + 451 + va = kmap_local_page(info->page); 528 452 arch_wb_cache_pmem(va + page_offset, bytes); 529 453 kunmap_local(va); 530 454 ··· 581 501 } else { 582 502 unsigned long index; 583 503 int err; 504 + struct rxe_mr_page *info; 584 505 585 506 err = mr_check_range(mr, iova, sizeof(value)); 586 507 if (err) { ··· 590 509 } 591 510 
page_offset = rxe_mr_iova_to_page_offset(mr, iova); 592 511 index = rxe_mr_iova_to_index(mr, iova); 593 - page = xa_load(&mr->page_list, index); 594 - if (!page) 512 + info = &mr->page_info[index]; 513 + if (!info->page) 595 514 return RESPST_ERR_RKEY_VIOLATION; 515 + 516 + page_offset += info->offset; 517 + page = info->page; 596 518 } 597 519 598 520 if (unlikely(page_offset & 0x7)) { ··· 634 550 } else { 635 551 unsigned long index; 636 552 int err; 553 + struct rxe_mr_page *info; 637 554 638 555 /* See IBA oA19-28 */ 639 556 err = mr_check_range(mr, iova, sizeof(value)); ··· 644 559 } 645 560 page_offset = rxe_mr_iova_to_page_offset(mr, iova); 646 561 index = rxe_mr_iova_to_index(mr, iova); 647 - page = xa_load(&mr->page_list, index); 648 - if (!page) 562 + info = &mr->page_info[index]; 563 + if (!info->page) 649 564 return RESPST_ERR_RKEY_VIOLATION; 565 + 566 + page_offset += info->offset; 567 + page = info->page; 650 568 } 651 569 652 570 /* See IBA A19.4.2 */ ··· 813 725 ib_umem_release(mr->umem); 814 726 815 727 if (mr->ibmr.type != IB_MR_TYPE_DMA) 816 - xa_destroy(&mr->page_list); 728 + free_mr_page_info(mr); 817 729 }
-1
drivers/infiniband/sw/rxe/rxe_odp.c
··· 110 110 mr->access = access_flags; 111 111 mr->ibmr.length = length; 112 112 mr->ibmr.iova = iova; 113 - mr->page_offset = ib_umem_offset(&umem_odp->umem); 114 113 115 114 err = rxe_odp_init_pages(mr); 116 115 if (err) {
+3
drivers/infiniband/sw/rxe/rxe_req.c
··· 102 102 103 103 rxe_dbg_qp(qp, "nak timer fired\n"); 104 104 105 + if (!rxe_get(qp)) 106 + return; 105 107 spin_lock_irqsave(&qp->state_lock, flags); 106 108 if (qp->valid) { 107 109 /* request a send queue retry */ ··· 112 110 rxe_sched_task(&qp->send_task); 113 111 } 114 112 spin_unlock_irqrestore(&qp->state_lock, flags); 113 + rxe_put(qp); 115 114 } 116 115 117 116 static void req_check_sq_drain_done(struct rxe_qp *qp)
+3 -3
drivers/infiniband/sw/rxe/rxe_srq.c
··· 77 77 goto err_free; 78 78 } 79 79 80 - srq->rq.queue = q; 81 - init->attr.max_wr = srq->rq.max_wr; 82 - 83 80 if (uresp) { 84 81 if (copy_to_user(&uresp->srq_num, &srq->srq_num, 85 82 sizeof(uresp->srq_num))) { ··· 84 87 return -EFAULT; 85 88 } 86 89 } 90 + 91 + srq->rq.queue = q; 92 + init->attr.max_wr = srq->rq.max_wr; 87 93 88 94 return 0; 89 95
+9 -2
drivers/infiniband/sw/rxe/rxe_verbs.h
··· 335 335 return (index >= RXE_MIN_MW_INDEX) && (index <= RXE_MAX_MW_INDEX); 336 336 } 337 337 338 + struct rxe_mr_page { 339 + struct page *page; 340 + unsigned int offset; /* offset in system page */ 341 + }; 342 + 338 343 struct rxe_mr { 339 344 struct rxe_pool_elem elem; 340 345 struct ib_mr ibmr; ··· 352 347 int access; 353 348 atomic_t num_mw; 354 349 355 - unsigned int page_offset; 356 350 unsigned int page_shift; 357 351 u64 page_mask; 358 352 353 + /* size of page_info when mr allocated */ 359 354 u32 num_buf; 355 + /* real size of page_info */ 356 + u32 max_allowed_buf; 360 357 u32 nbuf; 361 358 362 - struct xarray page_list; 359 + struct rxe_mr_page *page_info; 363 360 }; 364 361 365 362 static inline unsigned int mr_page_size(struct rxe_mr *mr)
+2 -1
drivers/infiniband/sw/siw/siw_qp_rx.c
··· 1435 1435 } 1436 1436 if (unlikely(rv != 0 && rv != -EAGAIN)) { 1437 1437 if ((srx->state > SIW_GET_HDR || 1438 - qp->rx_fpdu->more_ddp_segs) && run_completion) 1438 + (qp->rx_fpdu && qp->rx_fpdu->more_ddp_segs)) && 1439 + run_completion) 1439 1440 siw_rdmap_complete(qp, rv); 1440 1441 1441 1442 siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+4 -4
drivers/infiniband/ulp/rtrs/rtrs-clt-sysfs.c
··· 439 439 clt->kobj_paths, 440 440 "%s", str); 441 441 if (err) { 442 - pr_err("kobject_init_and_add: %d\n", err); 442 + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); 443 443 kobject_put(&clt_path->kobj); 444 444 return err; 445 445 } 446 446 err = sysfs_create_group(&clt_path->kobj, &rtrs_clt_path_attr_group); 447 447 if (err) { 448 - pr_err("sysfs_create_group(): %d\n", err); 448 + pr_err("sysfs_create_group(): %pe\n", ERR_PTR(err)); 449 449 goto put_kobj; 450 450 } 451 451 err = kobject_init_and_add(&clt_path->stats->kobj_stats, &ktype_stats, 452 452 &clt_path->kobj, "stats"); 453 453 if (err) { 454 - pr_err("kobject_init_and_add: %d\n", err); 454 + pr_err("kobject_init_and_add: %pe\n", ERR_PTR(err)); 455 455 kobject_put(&clt_path->stats->kobj_stats); 456 456 goto remove_group; 457 457 } ··· 459 459 err = sysfs_create_group(&clt_path->stats->kobj_stats, 460 460 &rtrs_clt_stats_attr_group); 461 461 if (err) { 462 - pr_err("failed to create stats sysfs group, err: %d\n", err); 462 + pr_err("failed to create stats sysfs group, err: %pe\n", ERR_PTR(err)); 463 463 goto put_kobj_stats; 464 464 } 465 465
+83 -48
drivers/infiniband/ulp/rtrs/rtrs-clt.c
··· 422 422 refcount_inc(&req->ref); 423 423 err = rtrs_inv_rkey(req); 424 424 if (err) { 425 - rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %d\n", 426 - req->mr->rkey, err); 425 + rtrs_err_rl(con->c.path, "Send INV WR key=%#x: %pe\n", 426 + req->mr->rkey, ERR_PTR(err)); 427 427 } else if (can_wait) { 428 428 wait_for_completion(&req->inv_comp); 429 429 } ··· 443 443 444 444 if (errno) { 445 445 rtrs_err_rl(con->c.path, 446 - "IO %s request failed: error=%d path=%s [%s:%u] notify=%d\n", 447 - req->dir == DMA_TO_DEVICE ? "write" : "read", errno, 446 + "IO %s request failed: error=%pe path=%s [%s:%u] notify=%d\n", 447 + req->dir == DMA_TO_DEVICE ? "write" : "read", ERR_PTR(errno), 448 448 kobject_name(&clt_path->kobj), clt_path->hca_name, 449 449 clt_path->hca_port, notify); 450 450 } ··· 514 514 cqe); 515 515 err = rtrs_iu_post_recv(&con->c, iu); 516 516 if (err) { 517 - rtrs_err(con->c.path, "post iu failed %d\n", err); 517 + rtrs_err(con->c.path, "post iu failed %pe\n", ERR_PTR(err)); 518 518 rtrs_rdma_error_recovery(con); 519 519 } 520 520 } ··· 659 659 else 660 660 err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); 661 661 if (err) { 662 - rtrs_err(con->c.path, "rtrs_post_recv_empty(): %d\n", 663 - err); 662 + rtrs_err(con->c.path, "rtrs_post_recv_empty(): %pe\n", 663 + ERR_PTR(err)); 664 664 rtrs_rdma_error_recovery(con); 665 665 } 666 666 break; ··· 731 731 732 732 err = post_recv_io(to_clt_con(clt_path->s.con[cid]), q_size); 733 733 if (err) { 734 - rtrs_err(clt_path->clt, "post_recv_io(), err: %d\n", 735 - err); 734 + rtrs_err(clt_path->clt, "post_recv_io(), err: %pe\n", 735 + ERR_PTR(err)); 736 736 return err; 737 737 } 738 738 } ··· 1122 1122 ret = rtrs_map_sg_fr(req, count); 1123 1123 if (ret < 0) { 1124 1124 rtrs_err_rl(s, 1125 - "Write request failed, failed to map fast reg. data, err: %d\n", 1126 - ret); 1125 + "Write request failed, failed to map fast reg. 
data, err: %pe\n", 1126 + ERR_PTR(ret)); 1127 1127 ib_dma_unmap_sg(clt_path->s.dev->ib_dev, req->sglist, 1128 1128 req->sg_cnt, req->dir); 1129 1129 return ret; ··· 1150 1150 imm, wr, NULL); 1151 1151 if (ret) { 1152 1152 rtrs_err_rl(s, 1153 - "Write request failed: error=%d path=%s [%s:%u]\n", 1154 - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, 1155 - clt_path->hca_port); 1153 + "Write request failed: error=%pe path=%s [%s:%u]\n", 1154 + ERR_PTR(ret), kobject_name(&clt_path->kobj), 1155 + clt_path->hca_name, clt_path->hca_port); 1156 1156 if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) 1157 1157 atomic_dec(&clt_path->stats->inflight); 1158 1158 if (req->mr->need_inval) { ··· 1208 1208 ret = rtrs_map_sg_fr(req, count); 1209 1209 if (ret < 0) { 1210 1210 rtrs_err_rl(s, 1211 - "Read request failed, failed to map fast reg. data, err: %d\n", 1212 - ret); 1211 + "Read request failed, failed to map fast reg. data, err: %pe\n", 1212 + ERR_PTR(ret)); 1213 1213 ib_dma_unmap_sg(dev->ib_dev, req->sglist, req->sg_cnt, 1214 1214 req->dir); 1215 1215 return ret; ··· 1260 1260 req->data_len, imm, wr); 1261 1261 if (ret) { 1262 1262 rtrs_err_rl(s, 1263 - "Read request failed: error=%d path=%s [%s:%u]\n", 1264 - ret, kobject_name(&clt_path->kobj), clt_path->hca_name, 1265 - clt_path->hca_port); 1263 + "Read request failed: error=%pe path=%s [%s:%u]\n", 1264 + ERR_PTR(ret), kobject_name(&clt_path->kobj), 1265 + clt_path->hca_name, clt_path->hca_port); 1266 1266 if (req->mp_policy == MP_POLICY_MIN_INFLIGHT) 1267 1267 atomic_dec(&clt_path->stats->inflight); 1268 1268 req->mr->need_inval = false; ··· 1359 1359 1360 1360 static int alloc_path_reqs(struct rtrs_clt_path *clt_path) 1361 1361 { 1362 + struct ib_device *ib_dev = clt_path->s.dev->ib_dev; 1362 1363 struct rtrs_clt_io_req *req; 1364 + enum ib_mr_type mr_type; 1363 1365 int i, err = -ENOMEM; 1364 1366 1365 1367 clt_path->reqs = kcalloc(clt_path->queue_depth, ··· 1369 1367 GFP_KERNEL); 1370 1368 if (!clt_path->reqs) 1371 
1369 return -ENOMEM; 1370 + 1371 + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) 1372 + mr_type = IB_MR_TYPE_SG_GAPS; 1373 + else 1374 + mr_type = IB_MR_TYPE_MEM_REG; 1372 1375 1373 1376 for (i = 0; i < clt_path->queue_depth; ++i) { 1374 1377 req = &clt_path->reqs[i]; ··· 1388 1381 if (!req->sge) 1389 1382 goto out; 1390 1383 1391 - req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, 1392 - IB_MR_TYPE_MEM_REG, 1384 + req->mr = ib_alloc_mr(clt_path->s.dev->ib_pd, mr_type, 1393 1385 clt_path->max_pages_per_mr); 1394 1386 if (IS_ERR(req->mr)) { 1395 1387 err = PTR_ERR(req->mr); ··· 1781 1775 err = create_con_cq_qp(con); 1782 1776 mutex_unlock(&con->con_mutex); 1783 1777 if (err) { 1784 - rtrs_err(s, "create_con_cq_qp(), err: %d\n", err); 1778 + rtrs_err(s, "create_con_cq_qp(), err: %pe\n", ERR_PTR(err)); 1785 1779 return err; 1786 1780 } 1787 1781 err = rdma_resolve_route(con->c.cm_id, RTRS_CONNECT_TIMEOUT_MS); 1788 1782 if (err) 1789 - rtrs_err(s, "Resolving route failed, err: %d\n", err); 1783 + rtrs_err(s, "Resolving route failed, err: %pe\n", ERR_PTR(err)); 1790 1784 1791 1785 return err; 1792 1786 } ··· 1820 1814 1821 1815 err = rdma_connect_locked(con->c.cm_id, &param); 1822 1816 if (err) 1823 - rtrs_err(clt, "rdma_connect_locked(): %d\n", err); 1817 + rtrs_err(clt, "rdma_connect_locked(): %pe\n", ERR_PTR(err)); 1824 1818 1825 1819 return err; 1826 1820 } ··· 1853 1847 } 1854 1848 errno = le16_to_cpu(msg->errno); 1855 1849 if (errno) { 1856 - rtrs_err(clt, "Invalid RTRS message: errno %d\n", 1857 - errno); 1850 + rtrs_err(clt, "Invalid RTRS message: errno %pe\n", 1851 + ERR_PTR(errno)); 1858 1852 return -ECONNRESET; 1859 1853 } 1860 1854 if (con->c.cid == 0) { ··· 1929 1923 struct rtrs_path *s = con->c.path; 1930 1924 const struct rtrs_msg_conn_rsp *msg; 1931 1925 const char *rej_msg; 1932 - int status, errno; 1926 + int status, errno = -ECONNRESET; 1933 1927 u8 data_len; 1934 1928 1935 1929 status = ev->status; ··· 1943 1937 "Previous session is still exists 
on the server, please reconnect later\n"); 1944 1938 else 1945 1939 rtrs_err(s, 1946 - "Connect rejected: status %d (%s), rtrs errno %d\n", 1947 - status, rej_msg, errno); 1940 + "Connect rejected: status %d (%s), rtrs errno %pe\n", 1941 + status, rej_msg, ERR_PTR(errno)); 1948 1942 } else { 1949 1943 rtrs_err(s, 1950 1944 "Connect rejected but with malformed message: status %d (%s)\n", 1951 1945 status, rej_msg); 1952 1946 } 1953 1947 1954 - return -ECONNRESET; 1948 + return errno; 1955 1949 } 1956 1950 1957 1951 void rtrs_clt_close_conns(struct rtrs_clt_path *clt_path, bool wait) ··· 2015 2009 case RDMA_CM_EVENT_UNREACHABLE: 2016 2010 case RDMA_CM_EVENT_ADDR_CHANGE: 2017 2011 case RDMA_CM_EVENT_TIMEWAIT_EXIT: 2018 - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", 2019 - rdma_event_msg(ev->event), ev->status); 2012 + if (ev->status < 0) { 2013 + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", 2014 + rdma_event_msg(ev->event), ERR_PTR(ev->status)); 2015 + } else if (ev->status > 0) { 2016 + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", 2017 + rdma_event_msg(ev->event), 2018 + rdma_reject_msg(cm_id, ev->status)); 2019 + } 2020 2020 cm_err = -ECONNRESET; 2021 2021 break; 2022 2022 case RDMA_CM_EVENT_ADDR_ERROR: 2023 2023 case RDMA_CM_EVENT_ROUTE_ERROR: 2024 - rtrs_wrn(s, "CM error (CM event: %s, err: %d)\n", 2025 - rdma_event_msg(ev->event), ev->status); 2024 + if (ev->status < 0) { 2025 + rtrs_wrn(s, "CM error (CM event: %s, err: %pe)\n", 2026 + rdma_event_msg(ev->event), 2027 + ERR_PTR(ev->status)); 2028 + } else if (ev->status > 0) { 2029 + rtrs_wrn(s, "CM error (CM event: %s, err: %s)\n", 2030 + rdma_event_msg(ev->event), 2031 + rdma_reject_msg(cm_id, ev->status)); 2032 + } 2026 2033 cm_err = -EHOSTUNREACH; 2027 2034 break; 2028 2035 case RDMA_CM_EVENT_DEVICE_REMOVAL: 2029 2036 /* 2030 2037 * Device removal is a special case. Queue close and return 0. 
2031 2038 */ 2032 - rtrs_wrn_rl(s, "CM event: %s, status: %d\n", rdma_event_msg(ev->event), 2033 - ev->status); 2039 + if (ev->status < 0) { 2040 + rtrs_wrn_rl(s, "CM event: %s, status: %pe\n", 2041 + rdma_event_msg(ev->event), 2042 + ERR_PTR(ev->status)); 2043 + } else if (ev->status > 0) { 2044 + rtrs_wrn_rl(s, "CM event: %s, status: %s\n", 2045 + rdma_event_msg(ev->event), 2046 + rdma_reject_msg(cm_id, ev->status)); 2047 + } 2034 2048 rtrs_clt_close_conns(clt_path, false); 2035 2049 return 0; 2036 2050 default: 2037 - rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %d)\n", 2038 - rdma_event_msg(ev->event), ev->status); 2051 + if (ev->status < 0) { 2052 + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %pe)\n", 2053 + rdma_event_msg(ev->event), ERR_PTR(ev->status)); 2054 + } else if (ev->status > 0) { 2055 + rtrs_err(s, "Unexpected RDMA CM error (CM event: %s, err: %s)\n", 2056 + rdma_event_msg(ev->event), 2057 + rdma_reject_msg(cm_id, ev->status)); 2058 + } 2039 2059 cm_err = -ECONNRESET; 2040 2060 break; 2041 2061 } ··· 2098 2066 /* allow the port to be reused */ 2099 2067 err = rdma_set_reuseaddr(cm_id, 1); 2100 2068 if (err != 0) { 2101 - rtrs_err(s, "Set address reuse failed, err: %d\n", err); 2069 + rtrs_err(s, "Set address reuse failed, err: %pe\n", ERR_PTR(err)); 2102 2070 return err; 2103 2071 } 2104 2072 err = rdma_resolve_addr(cm_id, (struct sockaddr *)&clt_path->s.src_addr, 2105 2073 (struct sockaddr *)&clt_path->s.dst_addr, 2106 2074 RTRS_CONNECT_TIMEOUT_MS); 2107 2075 if (err) { 2108 - rtrs_err(s, "Failed to resolve address, err: %d\n", err); 2076 + rtrs_err(s, "Failed to resolve address, err: %pe\n", ERR_PTR(err)); 2109 2077 return err; 2110 2078 } 2111 2079 /* ··· 2580 2548 /* Prepare for getting info response */ 2581 2549 err = rtrs_iu_post_recv(&usr_con->c, rx_iu); 2582 2550 if (err) { 2583 - rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %d\n", err); 2551 + rtrs_err(clt_path->clt, "rtrs_iu_post_recv(), err: %pe\n", 
ERR_PTR(err)); 2584 2552 goto out; 2585 2553 } 2586 2554 rx_iu = NULL; ··· 2596 2564 /* Send info request */ 2597 2565 err = rtrs_iu_post_send(&usr_con->c, tx_iu, sizeof(*msg), NULL); 2598 2566 if (err) { 2599 - rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %d\n", err); 2567 + rtrs_err(clt_path->clt, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); 2600 2568 goto out; 2601 2569 } 2602 2570 tx_iu = NULL; ··· 2647 2615 err = init_conns(clt_path); 2648 2616 if (err) { 2649 2617 rtrs_err(clt_path->clt, 2650 - "init_conns() failed: err=%d path=%s [%s:%u]\n", err, 2651 - str, clt_path->hca_name, clt_path->hca_port); 2618 + "init_conns() failed: err=%pe path=%s [%s:%u]\n", 2619 + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); 2652 2620 goto out; 2653 2621 } 2654 2622 err = rtrs_send_path_info(clt_path); 2655 2623 if (err) { 2656 2624 rtrs_err(clt_path->clt, 2657 - "rtrs_send_path_info() failed: err=%d path=%s [%s:%u]\n", 2658 - err, str, clt_path->hca_name, clt_path->hca_port); 2625 + "rtrs_send_path_info() failed: err=%pe path=%s [%s:%u]\n", 2626 + ERR_PTR(err), str, clt_path->hca_name, clt_path->hca_port); 2659 2627 goto out; 2660 2628 } 2661 2629 rtrs_clt_path_up(clt_path); ··· 3179 3147 void rtrs_clt_ib_event_handler(struct ib_event_handler *handler, 3180 3148 struct ib_event *ibevent) 3181 3149 { 3182 - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), 3183 - ibevent->event); 3150 + struct ib_device *idev = ibevent->device; 3151 + u32 port_num = ibevent->element.port_num; 3152 + 3153 + pr_info("Handling event: %s (%d). HCA name: %s, port num: %u\n", 3154 + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); 3184 3155 } 3185 3156 3186 3157
-3
drivers/infiniband/ulp/rtrs/rtrs-clt.h
··· 92 92 * rtrs_clt_io_req - describes one inflight IO request 93 93 */ 94 94 struct rtrs_clt_io_req { 95 - struct list_head list; 96 95 struct rtrs_iu *iu; 97 96 struct scatterlist *sglist; /* list holding user data */ 98 97 unsigned int sg_cnt; ··· 102 103 bool in_use; 103 104 enum rtrs_mp_policy mp_policy; 104 105 struct rtrs_clt_con *con; 105 - struct rtrs_sg_desc *desc; 106 106 struct ib_sge *sge; 107 107 struct rtrs_permit *permit; 108 108 enum dma_data_direction dir; 109 109 void (*conf)(void *priv, int errno); 110 - unsigned long start_jiffies; 111 110 112 111 struct ib_mr *mr; 113 112 struct ib_cqe inv_cqe;
+6 -6
drivers/infiniband/ulp/rtrs/rtrs-srv-sysfs.c
··· 176 176 dev_set_uevent_suppress(&srv->dev, true); 177 177 err = device_add(&srv->dev); 178 178 if (err) { 179 - pr_err("device_add(): %d\n", err); 179 + pr_err("device_add(): %pe\n", ERR_PTR(err)); 180 180 put_device(&srv->dev); 181 181 goto unlock; 182 182 } 183 183 srv->kobj_paths = kobject_create_and_add("paths", &srv->dev.kobj); 184 184 if (!srv->kobj_paths) { 185 185 err = -ENOMEM; 186 - pr_err("kobject_create_and_add(): %d\n", err); 186 + pr_err("kobject_create_and_add(): %pe\n", ERR_PTR(err)); 187 187 device_del(&srv->dev); 188 188 put_device(&srv->dev); 189 189 goto unlock; ··· 237 237 err = kobject_init_and_add(&srv_path->stats->kobj_stats, &ktype_stats, 238 238 &srv_path->kobj, "stats"); 239 239 if (err) { 240 - rtrs_err(s, "kobject_init_and_add(): %d\n", err); 240 + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); 241 241 kobject_put(&srv_path->stats->kobj_stats); 242 242 return err; 243 243 } 244 244 err = sysfs_create_group(&srv_path->stats->kobj_stats, 245 245 &rtrs_srv_stats_attr_group); 246 246 if (err) { 247 - rtrs_err(s, "sysfs_create_group(): %d\n", err); 247 + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); 248 248 goto err; 249 249 } 250 250 ··· 276 276 err = kobject_init_and_add(&srv_path->kobj, &ktype, srv->kobj_paths, 277 277 "%s", str); 278 278 if (err) { 279 - rtrs_err(s, "kobject_init_and_add(): %d\n", err); 279 + rtrs_err(s, "kobject_init_and_add(): %pe\n", ERR_PTR(err)); 280 280 goto destroy_root; 281 281 } 282 282 err = sysfs_create_group(&srv_path->kobj, &rtrs_srv_path_attr_group); 283 283 if (err) { 284 - rtrs_err(s, "sysfs_create_group(): %d\n", err); 284 + rtrs_err(s, "sysfs_create_group(): %pe\n", ERR_PTR(err)); 285 285 goto put_kobj; 286 286 } 287 287 err = rtrs_srv_create_stats_files(srv_path);
+131 -61
drivers/infiniband/ulp/rtrs/rtrs-srv.c
··· 184 184 struct rtrs_srv_path *srv_path = to_srv_path(s); 185 185 186 186 if (wc->status != IB_WC_SUCCESS) { 187 - rtrs_err(s, "REG MR failed: %s\n", 187 + rtrs_err_rl(s, "REG MR failed: %s\n", 188 188 ib_wc_status_msg(wc->status)); 189 189 close_path(srv_path); 190 190 return; ··· 208 208 size_t sg_cnt; 209 209 int err, offset; 210 210 bool need_inval; 211 - u32 rkey = 0; 212 211 struct ib_reg_wr rwr; 213 212 struct ib_sge *plist; 214 213 struct ib_sge list; ··· 239 240 wr->wr.num_sge = 1; 240 241 wr->remote_addr = le64_to_cpu(id->rd_msg->desc[0].addr); 241 242 wr->rkey = le32_to_cpu(id->rd_msg->desc[0].key); 242 - if (rkey == 0) 243 - rkey = wr->rkey; 244 - else 245 - /* Only one key is actually used */ 246 - WARN_ON_ONCE(rkey != wr->rkey); 247 243 248 244 wr->wr.opcode = IB_WR_RDMA_WRITE; 249 245 wr->wr.wr_cqe = &io_comp_cqe; ··· 271 277 inv_wr.opcode = IB_WR_SEND_WITH_INV; 272 278 inv_wr.wr_cqe = &io_comp_cqe; 273 279 inv_wr.send_flags = 0; 274 - inv_wr.ex.invalidate_rkey = rkey; 280 + inv_wr.ex.invalidate_rkey = wr->rkey; 275 281 } 276 282 277 283 imm_wr.wr.next = NULL; ··· 317 323 err = ib_post_send(id->con->c.qp, &id->tx_wr.wr, NULL); 318 324 if (err) 319 325 rtrs_err(s, 320 - "Posting RDMA-Write-Request to QP failed, err: %d\n", 321 - err); 326 + "Posting RDMA-Write-Request to QP failed, err: %pe\n", 327 + ERR_PTR(err)); 322 328 323 329 return err; 324 330 } ··· 434 440 435 441 err = ib_post_send(id->con->c.qp, wr, NULL); 436 442 if (err) 437 - rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %d\n", 438 - err); 443 + rtrs_err_rl(s, "Posting RDMA-Reply to QP failed, err: %pe\n", 444 + ERR_PTR(err)); 439 445 440 446 return err; 441 447 } ··· 519 525 err = rdma_write_sg(id); 520 526 521 527 if (err) { 522 - rtrs_err_rl(s, "IO response failed: %d: srv_path=%s\n", err, 523 - kobject_name(&srv_path->kobj)); 528 + rtrs_err_rl(s, "IO response failed: %pe: srv_path=%s\n", 529 + ERR_PTR(err), kobject_name(&srv_path->kobj)); 524 530 close_path(srv_path); 525 
531 } 526 532 out: ··· 562 568 563 569 static int map_cont_bufs(struct rtrs_srv_path *srv_path) 564 570 { 571 + struct ib_device *ib_dev = srv_path->s.dev->ib_dev; 565 572 struct rtrs_srv_sess *srv = srv_path->srv; 566 573 struct rtrs_path *ss = &srv_path->s; 567 574 int i, err, mrs_num; 568 575 unsigned int chunk_bits; 576 + enum ib_mr_type mr_type; 569 577 int chunks_per_mr = 1; 570 - struct ib_mr *mr; 571 578 struct sg_table *sgt; 579 + struct ib_mr *mr; 572 580 573 581 /* 574 582 * Here we map queue_depth chunks to MR. Firstly we have to ··· 597 601 srv_path->mrs_num++) { 598 602 struct rtrs_srv_mr *srv_mr = &srv_path->mrs[srv_path->mrs_num]; 599 603 struct scatterlist *s; 600 - int nr, nr_sgt, chunks; 604 + int nr, nr_sgt, chunks, ind; 601 605 602 606 sgt = &srv_mr->sgt; 603 607 chunks = chunks_per_mr * srv_path->mrs_num; ··· 619 623 err = -EINVAL; 620 624 goto free_sg; 621 625 } 622 - mr = ib_alloc_mr(srv_path->s.dev->ib_pd, IB_MR_TYPE_MEM_REG, 623 - nr_sgt); 626 + 627 + if (ib_dev->attrs.kernel_cap_flags & IBK_SG_GAPS_REG) 628 + mr_type = IB_MR_TYPE_SG_GAPS; 629 + else 630 + mr_type = IB_MR_TYPE_MEM_REG; 631 + 632 + mr = ib_alloc_mr(srv_path->s.dev->ib_pd, mr_type, nr_sgt); 624 633 if (IS_ERR(mr)) { 625 634 err = PTR_ERR(mr); 626 635 goto unmap_sg; 627 636 } 628 637 nr = ib_map_mr_sg(mr, sgt->sgl, nr_sgt, 629 638 NULL, max_chunk_size); 630 - if (nr != nr_sgt) { 639 + if (nr < nr_sgt) { 631 640 err = nr < 0 ? nr : -EINVAL; 632 641 goto dereg_mr; 633 642 } ··· 644 643 DMA_TO_DEVICE, rtrs_srv_rdma_done); 645 644 if (!srv_mr->iu) { 646 645 err = -ENOMEM; 647 - rtrs_err(ss, "rtrs_iu_alloc(), err: %d\n", err); 646 + rtrs_err(ss, "rtrs_iu_alloc(), err: %pe\n", ERR_PTR(err)); 648 647 goto dereg_mr; 649 648 } 650 649 } 651 - /* Eventually dma addr for each chunk can be cached */ 652 - for_each_sg(sgt->sgl, s, nr_sgt, i) 653 - srv_path->dma_addr[chunks + i] = sg_dma_address(s); 650 + 651 + /* 652 + * Cache DMA addresses by traversing sg entries. 
If 653 + * regions were merged, an inner loop is required to 654 + * populate the DMA address array by traversing larger 655 + * regions. 656 + */ 657 + ind = chunks; 658 + for_each_sg(sgt->sgl, s, nr_sgt, i) { 659 + unsigned int dma_len = sg_dma_len(s); 660 + u64 dma_addr = sg_dma_address(s); 661 + u64 dma_addr_end = dma_addr + dma_len; 662 + 663 + do { 664 + srv_path->dma_addr[ind++] = dma_addr; 665 + dma_addr += max_chunk_size; 666 + } while (dma_addr < dma_addr_end); 667 + } 654 668 655 669 ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); 656 670 srv_mr->mr = mr; ··· 820 804 821 805 err = post_recv_path(srv_path); 822 806 if (err) { 823 - rtrs_err(s, "post_recv_path(), err: %d\n", err); 807 + rtrs_err(s, "post_recv_path(), err: %pe\n", ERR_PTR(err)); 824 808 return err; 825 809 } 826 810 ··· 883 867 get_device(&srv_path->srv->dev); 884 868 err = rtrs_srv_change_state(srv_path, RTRS_SRV_CONNECTED); 885 869 if (!err) { 886 - rtrs_err(s, "rtrs_srv_change_state(), err: %d\n", err); 870 + rtrs_err(s, "rtrs_srv_change_state() failed\n"); 887 871 goto iu_free; 888 872 } 889 873 ··· 897 881 */ 898 882 err = rtrs_srv_path_up(srv_path); 899 883 if (err) { 900 - rtrs_err(s, "rtrs_srv_path_up(), err: %d\n", err); 884 + rtrs_err(s, "rtrs_srv_path_up(), err: %pe\n", ERR_PTR(err)); 901 885 goto iu_free; 902 886 } 903 887 ··· 905 889 tx_iu->dma_addr, 906 890 tx_iu->size, DMA_TO_DEVICE); 907 891 892 + /* 893 + * Now disable zombie connection closing. Since from the logs and code, 894 + * we know that it can never be in CONNECTED state. 
895 + */ 896 + srv_path->connection_timeout = 0; 897 + 908 898 /* Send info response */ 909 899 err = rtrs_iu_post_send(&con->c, tx_iu, tx_sz, reg_wr); 910 900 if (err) { 911 - rtrs_err(s, "rtrs_iu_post_send(), err: %d\n", err); 901 + rtrs_err(s, "rtrs_iu_post_send(), err: %pe\n", ERR_PTR(err)); 912 902 iu_free: 913 903 rtrs_iu_free(tx_iu, srv_path->s.dev->ib_dev, 1); 914 904 } ··· 982 960 /* Prepare for getting info response */ 983 961 err = rtrs_iu_post_recv(&con->c, rx_iu); 984 962 if (err) { 985 - rtrs_err(s, "rtrs_iu_post_recv(), err: %d\n", err); 963 + rtrs_err(s, "rtrs_iu_post_recv(), err: %pe\n", ERR_PTR(err)); 986 964 rtrs_iu_free(rx_iu, srv_path->s.dev->ib_dev, 1); 987 965 return err; 988 966 } ··· 1028 1006 1029 1007 err = post_recv_io(to_srv_con(srv_path->s.con[cid]), q_size); 1030 1008 if (err) { 1031 - rtrs_err(s, "post_recv_io(), err: %d\n", err); 1009 + rtrs_err(s, "post_recv_io(), err: %pe\n", ERR_PTR(err)); 1032 1010 return err; 1033 1011 } 1034 1012 } ··· 1076 1054 1077 1055 if (ret) { 1078 1056 rtrs_err_rl(s, 1079 - "Processing read request failed, user module cb reported for msg_id %d, err: %d\n", 1080 - buf_id, ret); 1057 + "Processing read request failed, user module cb reported for msg_id %d, err: %pe\n", 1058 + buf_id, ERR_PTR(ret)); 1081 1059 goto send_err_msg; 1082 1060 } 1083 1061 ··· 1087 1065 ret = send_io_resp_imm(con, id, ret); 1088 1066 if (ret < 0) { 1089 1067 rtrs_err_rl(s, 1090 - "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %d\n", 1091 - buf_id, ret); 1068 + "Sending err msg for failed RDMA-Write-Req failed, msg_id %d, err: %pe\n", 1069 + buf_id, ERR_PTR(ret)); 1092 1070 close_path(srv_path); 1093 1071 } 1094 1072 rtrs_srv_put_ops_ids(srv_path); ··· 1128 1106 data + data_len, usr_len); 1129 1107 if (ret) { 1130 1108 rtrs_err_rl(s, 1131 - "Processing write request failed, user module callback reports err: %d\n", 1132 - ret); 1109 + "Processing write request failed, user module callback reports err: %pe\n", 
1110 + ERR_PTR(ret)); 1133 1111 goto send_err_msg; 1134 1112 } 1135 1113 ··· 1139 1117 ret = send_io_resp_imm(con, id, ret); 1140 1118 if (ret < 0) { 1141 1119 rtrs_err_rl(s, 1142 - "Processing write request failed, sending I/O response failed, msg_id %d, err: %d\n", 1143 - buf_id, ret); 1120 + "Processing write request failed, sending I/O response failed, msg_id %d, err: %pe\n", 1121 + buf_id, ERR_PTR(ret)); 1144 1122 close_path(srv_path); 1145 1123 } 1146 1124 rtrs_srv_put_ops_ids(srv_path); ··· 1270 1248 srv_path->s.hb_missed_cnt = 0; 1271 1249 err = rtrs_post_recv_empty(&con->c, &io_comp_cqe); 1272 1250 if (err) { 1273 - rtrs_err(s, "rtrs_post_recv(), err: %d\n", err); 1251 + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", 1252 + ERR_PTR(err)); 1274 1253 close_path(srv_path); 1275 1254 break; 1276 1255 } ··· 1296 1273 mr->msg_id = msg_id; 1297 1274 err = rtrs_srv_inv_rkey(con, mr); 1298 1275 if (err) { 1299 - rtrs_err(s, "rtrs_post_recv(), err: %d\n", 1300 - err); 1276 + rtrs_err(s, "rtrs_post_recv(), err: %pe\n", 1277 + ERR_PTR(err)); 1301 1278 close_path(srv_path); 1302 1279 break; 1303 1280 } ··· 1537 1514 } 1538 1515 } 1539 1516 1517 + /* Let's close connections which have been waiting for more than 30 seconds */ 1518 + #define RTRS_MAX_CONN_TIMEOUT 30000 1519 + 1520 + static void rtrs_srv_check_close_path(struct rtrs_srv_path *srv_path) 1521 + { 1522 + struct rtrs_path *s = &srv_path->s; 1523 + 1524 + if (srv_path->state == RTRS_SRV_CONNECTING && srv_path->connection_timeout && 1525 + (jiffies_to_msecs(jiffies - srv_path->connection_timeout) > RTRS_MAX_CONN_TIMEOUT)) { 1526 + rtrs_err(s, "Closing zombie path\n"); 1527 + close_path(srv_path); 1528 + } 1529 + } 1530 + 1540 1531 static bool __is_path_w_addr_exists(struct rtrs_srv_sess *srv, 1541 1532 struct rdma_addr *addr) 1542 1533 { 1543 1534 struct rtrs_srv_path *srv_path; 1544 1535 1545 - list_for_each_entry(srv_path, &srv->paths_list, s.entry) 1536 + list_for_each_entry(srv_path, &srv->paths_list, s.entry) 
{ 1546 1537 if (!sockaddr_cmp((struct sockaddr *)&srv_path->s.dst_addr, 1547 1538 (struct sockaddr *)&addr->dst_addr) && 1548 1539 !sockaddr_cmp((struct sockaddr *)&srv_path->s.src_addr, 1549 - (struct sockaddr *)&addr->src_addr)) 1540 + (struct sockaddr *)&addr->src_addr)) { 1541 + rtrs_err((&srv_path->s), 1542 + "Path (%s) with same addr exists (lifetime %u)\n", 1543 + rtrs_srv_state_str(srv_path->state), 1544 + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); 1545 + rtrs_srv_check_close_path(srv_path); 1550 1546 return true; 1547 + } 1548 + } 1551 1549 1552 1550 return false; 1553 1551 } ··· 1667 1623 1668 1624 err = rdma_accept(cm_id, &param); 1669 1625 if (err) 1670 - pr_err("rdma_accept(), err: %d\n", err); 1626 + pr_err("rdma_accept(), err: %pe\n", ERR_PTR(err)); 1671 1627 1672 1628 return err; 1673 1629 } ··· 1685 1641 1686 1642 err = rdma_reject(cm_id, &msg, sizeof(msg), IB_CM_REJ_CONSUMER_DEFINED); 1687 1643 if (err) 1688 - pr_err("rdma_reject(), err: %d\n", err); 1644 + pr_err("rdma_reject(), err: %pe\n", ERR_PTR(err)); 1689 1645 1690 1646 /* Bounce errno back */ 1691 1647 return errno; ··· 1761 1717 max_send_wr, max_recv_wr, 1762 1718 IB_POLL_WORKQUEUE); 1763 1719 if (err) { 1764 - rtrs_err(s, "rtrs_cq_qp_create(), err: %d\n", err); 1720 + rtrs_err(s, "rtrs_cq_qp_create(), err: %pe\n", ERR_PTR(err)); 1765 1721 goto free_con; 1766 1722 } 1767 1723 if (con->c.cid == 0) { ··· 1806 1762 } 1807 1763 if (__is_path_w_addr_exists(srv, &cm_id->route.addr)) { 1808 1764 err = -EEXIST; 1809 - pr_err("Path with same addr exists\n"); 1810 1765 goto err; 1811 1766 } 1812 1767 srv_path = kzalloc(sizeof(*srv_path), GFP_KERNEL); ··· 1852 1809 spin_lock_init(&srv_path->state_lock); 1853 1810 INIT_WORK(&srv_path->close_work, rtrs_srv_close_work); 1854 1811 rtrs_srv_init_hb(srv_path); 1812 + srv_path->connection_timeout = 0; 1855 1813 1856 1814 srv_path->s.dev = rtrs_ib_dev_find_or_add(cm_id->device, &dev_pd); 1857 1815 if (!srv_path->s.dev) { ··· 1958 1914 goto 
reject_w_err; 1959 1915 } 1960 1916 if (s->con[cid]) { 1961 - rtrs_err(s, "Connection already exists: %d\n", 1962 - cid); 1917 + rtrs_err(s, "Connection (%s) already exists: %d (lifetime %u)\n", 1918 + rtrs_srv_state_str(srv_path->state), cid, 1919 + (jiffies_to_msecs(jiffies - srv_path->connection_timeout))); 1920 + rtrs_srv_check_close_path(srv_path); 1963 1921 mutex_unlock(&srv->paths_mutex); 1964 1922 goto reject_w_err; 1965 1923 } ··· 1976 1930 goto reject_w_err; 1977 1931 } 1978 1932 } 1933 + 1934 + /* 1935 + * Start of any connection creation resets the timeout for the path. 1936 + */ 1937 + srv_path->connection_timeout = jiffies; 1938 + 1979 1939 err = create_con(srv_path, cm_id, cid); 1980 1940 if (err) { 1981 - rtrs_err((&srv_path->s), "create_con(), error %d\n", err); 1941 + rtrs_err((&srv_path->s), "create_con(), error %pe\n", ERR_PTR(err)); 1982 1942 rtrs_rdma_do_reject(cm_id, err); 1983 1943 /* 1984 1944 * Since session has other connections we follow normal way ··· 1995 1943 } 1996 1944 err = rtrs_rdma_do_accept(srv_path, cm_id); 1997 1945 if (err) { 1998 - rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %d\n", err); 1946 + rtrs_err((&srv_path->s), "rtrs_rdma_do_accept(), error %pe\n", 1947 + ERR_PTR(err)); 1999 1948 rtrs_rdma_do_reject(cm_id, err); 2000 1949 /* 2001 1950 * Since current connection was successfully added to the ··· 2047 1994 case RDMA_CM_EVENT_REJECTED: 2048 1995 case RDMA_CM_EVENT_CONNECT_ERROR: 2049 1996 case RDMA_CM_EVENT_UNREACHABLE: 2050 - rtrs_err(s, "CM error (CM event: %s, err: %d)\n", 2051 - rdma_event_msg(ev->event), ev->status); 1997 + if (ev->status < 0) { 1998 + rtrs_err(s, "CM error (CM event: %s, err: %pe)\n", 1999 + rdma_event_msg(ev->event), 2000 + ERR_PTR(ev->status)); 2001 + } else if (ev->status > 0) { 2002 + rtrs_err(s, "CM error (CM event: %s, err: %s)\n", 2003 + rdma_event_msg(ev->event), 2004 + rdma_reject_msg(cm_id, ev->status)); 2005 + } 2052 2006 fallthrough; 2053 2007 case 
RDMA_CM_EVENT_DISCONNECTED: 2054 2008 case RDMA_CM_EVENT_ADDR_CHANGE: ··· 2064 2004 close_path(srv_path); 2065 2005 break; 2066 2006 default: 2067 - pr_err("Ignoring unexpected CM event %s, err %d\n", 2068 - rdma_event_msg(ev->event), ev->status); 2007 + if (ev->status < 0) { 2008 + pr_err("Ignoring unexpected CM event %s, err %pe\n", 2009 + rdma_event_msg(ev->event), 2010 + ERR_PTR(ev->status)); 2011 + } else if (ev->status > 0) { 2012 + pr_err("Ignoring unexpected CM event %s, err %s\n", 2013 + rdma_event_msg(ev->event), 2014 + rdma_reject_msg(cm_id, ev->status)); 2015 + } 2069 2016 break; 2070 2017 } 2071 2018 ··· 2096 2029 } 2097 2030 ret = rdma_bind_addr(cm_id, addr); 2098 2031 if (ret) { 2099 - pr_err("Binding RDMA address failed, err: %d\n", ret); 2032 + pr_err("Binding RDMA address failed, err: %pe\n", ERR_PTR(ret)); 2100 2033 goto err_cm; 2101 2034 } 2102 2035 ret = rdma_listen(cm_id, 64); 2103 2036 if (ret) { 2104 - pr_err("Listening on RDMA connection failed, err: %d\n", 2105 - ret); 2037 + pr_err("Listening on RDMA connection failed, err: %pe\n", 2038 + ERR_PTR(ret)); 2106 2039 goto err_cm; 2107 2040 } 2108 2041 ··· 2342 2275 void rtrs_srv_ib_event_handler(struct ib_event_handler *handler, 2343 2276 struct ib_event *ibevent) 2344 2277 { 2345 - pr_info("Handling event: %s (%d).\n", ib_event_msg(ibevent->event), 2346 - ibevent->event); 2278 + struct ib_device *idev = ibevent->device; 2279 + u32 port_num = ibevent->element.port_num; 2280 + 2281 + pr_info("Handling event: %s (%d). 
HCA name: %s, port num: %u\n", 2282 + ib_event_msg(ibevent->event), ibevent->event, idev->name, port_num); 2347 2283 } 2348 2284 2349 2285 static int rtrs_srv_ib_dev_init(struct rtrs_ib_dev *dev) ··· 2383 2313 2384 2314 err = check_module_params(); 2385 2315 if (err) { 2386 - pr_err("Failed to load module, invalid module parameters, err: %d\n", 2387 - err); 2316 + pr_err("Failed to load module, invalid module parameters, err: %pe\n", 2317 + ERR_PTR(err)); 2388 2318 return err; 2389 2319 } 2390 2320 err = class_register(&rtrs_dev_class);
+1
drivers/infiniband/ulp/rtrs/rtrs-srv.h
··· 89 89 unsigned int mem_bits; 90 90 struct kobject kobj; 91 91 struct rtrs_srv_stats *stats; 92 + unsigned long connection_timeout; 92 93 }; 93 94 94 95 static inline struct rtrs_srv_path *to_srv_path(struct rtrs_path *s)
+6 -3
drivers/infiniband/ulp/rtrs/rtrs.c
··· 273 273 274 274 ret = rdma_create_qp(cm_id, pd, &init_attr); 275 275 if (ret) { 276 - rtrs_err(con->path, "Creating QP failed, err: %d\n", ret); 276 + rtrs_err(con->path, "Creating QP failed, err: %pe\n", 277 + ERR_PTR(ret)); 277 278 return ret; 278 279 } 279 280 con->qp = cm_id->qp; ··· 342 341 err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, 343 342 NULL); 344 343 if (err) { 345 - rtrs_err(path, "send HB ACK failed, errno: %d\n", err); 344 + rtrs_err(path, "send HB ACK failed, errno: %pe\n", 345 + ERR_PTR(err)); 346 346 path->hb_err_handler(usr_con); 347 347 return; 348 348 } ··· 377 375 err = rtrs_post_rdma_write_imm_empty(usr_con, path->hb_cqe, imm, 378 376 NULL); 379 377 if (err) { 380 - rtrs_err(path, "HB send failed, errno: %d\n", err); 378 + rtrs_err(path, "HB send failed, errno: %pe\n", 379 + ERR_PTR(err)); 381 380 path->hb_err_handler(usr_con); 382 381 return; 383 382 }
+44 -3
include/net/mana/gdma.h
··· 35 35 GDMA_CREATE_MR = 31, 36 36 GDMA_DESTROY_MR = 32, 37 37 GDMA_QUERY_HWC_TIMEOUT = 84, /* 0x54 */ 38 + GDMA_ALLOC_DM = 96, /* 0x60 */ 39 + GDMA_DESTROY_DM = 97, /* 0x61 */ 38 40 }; 39 41 40 42 #define GDMA_RESOURCE_DOORBELL_PAGE 27 ··· 868 866 GDMA_MR_TYPE_GVA = 2, 869 867 /* Guest zero-based address MRs */ 870 868 GDMA_MR_TYPE_ZBVA = 4, 869 + /* Device address MRs */ 870 + GDMA_MR_TYPE_DM = 5, 871 871 }; 872 872 873 873 struct gdma_create_mr_params { ··· 885 881 u64 dma_region_handle; 886 882 enum gdma_mr_access_flags access_flags; 887 883 } zbva; 884 + struct { 885 + u64 dm_handle; 886 + u64 offset; 887 + u64 length; 888 + enum gdma_mr_access_flags access_flags; 889 + } da; 888 890 }; 889 891 }; 890 892 ··· 905 895 u64 dma_region_handle; 906 896 u64 virtual_address; 907 897 enum gdma_mr_access_flags access_flags; 908 - } gva; 898 + } __packed gva; 909 899 struct { 910 900 u64 dma_region_handle; 911 901 enum gdma_mr_access_flags access_flags; 912 - } zbva; 913 - }; 902 + } __packed zbva; 903 + struct { 904 + u64 dm_handle; 905 + u64 offset; 906 + enum gdma_mr_access_flags access_flags; 907 + } __packed da; 908 + } __packed; 914 909 u32 reserved_2; 910 + union { 911 + struct { 912 + u64 length; 913 + } da_ext; 914 + }; 915 915 };/* HW DATA */ 916 916 917 917 struct gdma_create_mr_response { ··· 939 919 struct gdma_destroy_mr_response { 940 920 struct gdma_resp_hdr hdr; 941 921 };/* HW DATA */ 922 + 923 + struct gdma_alloc_dm_req { 924 + struct gdma_req_hdr hdr; 925 + u64 length; 926 + u32 alignment; 927 + u32 flags; 928 + }; /* HW Data */ 929 + 930 + struct gdma_alloc_dm_resp { 931 + struct gdma_resp_hdr hdr; 932 + u64 dm_handle; 933 + }; /* HW Data */ 934 + 935 + struct gdma_destroy_dm_req { 936 + struct gdma_req_hdr hdr; 937 + u64 dm_handle; 938 + }; /* HW Data */ 939 + 940 + struct gdma_destroy_dm_resp { 941 + struct gdma_resp_hdr hdr; 942 + }; /* HW Data */ 942 943 943 944 int mana_gd_verify_vf_version(struct pci_dev *pdev); 944 945
+68 -2
include/rdma/ib_verbs.h
··· 15 15 #include <linux/ethtool.h> 16 16 #include <linux/types.h> 17 17 #include <linux/device.h> 18 + #include <linux/bvec.h> 18 19 #include <linux/dma-mapping.h> 19 20 #include <linux/kref.h> 20 21 #include <linux/list.h> ··· 44 43 #include <uapi/rdma/rdma_user_ioctl.h> 45 44 #include <uapi/rdma/ib_user_ioctl_verbs.h> 46 45 #include <linux/pci-tph.h> 46 + #include <linux/dma-buf.h> 47 47 48 48 #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN 49 49 ··· 766 764 IB_EVENT_CLIENT_REREGISTER, 767 765 IB_EVENT_GID_CHANGE, 768 766 IB_EVENT_WQ_FATAL, 767 + IB_EVENT_DEVICE_SPEED_CHANGE, 769 768 }; 770 769 771 770 const char *__attribute_const__ ib_event_msg(enum ib_event_type event); ··· 880 877 */ 881 878 __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate); 882 879 880 + struct ib_port_speed_info { 881 + const char *str; 882 + int rate; /* in deci-Gb/sec (100 MBps units) */ 883 + }; 884 + 885 + /** 886 + * ib_port_attr_to_speed_info - Convert port attributes to speed information 887 + * @attr: Port attributes containing active_speed and active_width 888 + * @speed_info: Speed information to return 889 + * 890 + * Returns 0 on success, -EINVAL on error. 
891 + */ 892 + int ib_port_attr_to_speed_info(struct ib_port_attr *attr, 893 + struct ib_port_speed_info *speed_info); 883 894 884 895 /** 885 896 * enum ib_mr_type - memory region type ··· 2365 2348 unsigned long start_pgoff; 2366 2349 size_t npages; 2367 2350 bool driver_removed; 2351 + /* protects access to dmabufs */ 2352 + struct mutex dmabufs_lock; 2353 + struct list_head dmabufs; 2368 2354 }; 2369 2355 2370 2356 /* Return the offset (in bytes) the user should pass to libc's mmap() */ ··· 2423 2403 int comp_vector); 2424 2404 int (*query_port)(struct ib_device *device, u32 port_num, 2425 2405 struct ib_port_attr *port_attr); 2406 + int (*query_port_speed)(struct ib_device *device, u32 port_num, 2407 + u64 *speed); 2426 2408 int (*modify_port)(struct ib_device *device, u32 port_num, 2427 2409 int port_modify_mask, 2428 2410 struct ib_port_modify *port_modify); ··· 2505 2483 * Therefore needs to be implemented by the driver in mmap_free. 2506 2484 */ 2507 2485 void (*mmap_free)(struct rdma_user_mmap_entry *entry); 2486 + int (*mmap_get_pfns)(struct rdma_user_mmap_entry *entry, 2487 + struct phys_vec *phys_vec, 2488 + struct p2pdma_provider **provider); 2489 + struct rdma_user_mmap_entry *(*pgoff_to_mmap_entry)(struct ib_ucontext *ucontext, 2490 + off_t pg_off); 2508 2491 void (*disassociate_ucontext)(struct ib_ucontext *ibcontext); 2509 2492 int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata); 2510 2493 int (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata); ··· 4276 4249 dma_unmap_page(dev->dma_device, addr, size, direction); 4277 4250 } 4278 4251 4252 + /** 4253 + * ib_dma_map_bvec - Map a bio_vec to DMA address 4254 + * @dev: The device for which the dma_addr is to be created 4255 + * @bvec: The bio_vec to map 4256 + * @direction: The direction of the DMA 4257 + * 4258 + * Returns a DMA address for the bio_vec. 
The caller must check the 4259 + * result with ib_dma_mapping_error() before use; a failed mapping 4260 + * must not be passed to ib_dma_unmap_bvec(). 4261 + * 4262 + * For software RDMA devices (rxe, siw), returns a virtual address 4263 + * and no actual DMA mapping occurs. 4264 + */ 4265 + static inline u64 ib_dma_map_bvec(struct ib_device *dev, 4266 + struct bio_vec *bvec, 4267 + enum dma_data_direction direction) 4268 + { 4269 + if (ib_uses_virt_dma(dev)) 4270 + return (uintptr_t)bvec_virt(bvec); 4271 + return dma_map_phys(dev->dma_device, bvec_phys(bvec), 4272 + bvec->bv_len, direction, 0); 4273 + } 4274 + 4275 + /** 4276 + * ib_dma_unmap_bvec - Unmap a bio_vec DMA mapping 4277 + * @dev: The device for which the DMA address was created 4278 + * @addr: The DMA address returned by ib_dma_map_bvec() 4279 + * @size: The size of the region in bytes 4280 + * @direction: The direction of the DMA 4281 + * 4282 + * Releases a DMA mapping created by ib_dma_map_bvec(). For software 4283 + * RDMA devices this is a no-op since no actual mapping occurred. 4284 + */ 4285 + static inline void ib_dma_unmap_bvec(struct ib_device *dev, 4286 + u64 addr, size_t size, 4287 + enum dma_data_direction direction) 4288 + { 4289 + if (!ib_uses_virt_dma(dev)) 4290 + dma_unmap_phys(dev->dma_device, addr, size, direction, 0); 4291 + } 4292 + 4279 4293 int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int nents); 4280 4294 static inline int ib_dma_map_sg_attrs(struct ib_device *dev, 4281 4295 struct scatterlist *sg, int nents, ··· 4613 4545 void ib_device_put(struct ib_device *device); 4614 4546 struct ib_device *ib_device_get_by_netdev(struct net_device *ndev, 4615 4547 enum rdma_driver_id driver_id); 4616 - struct ib_device *ib_device_get_by_name(const char *name, 4617 - enum rdma_driver_id driver_id); 4618 4548 struct net_device *ib_get_net_dev_by_params(struct ib_device *dev, u32 port, 4619 4549 u16 pkey, const union ib_gid *gid, 4620 4550 const struct sockaddr *addr);
+22
include/rdma/rw.h
··· 5 5 #ifndef _RDMA_RW_H 6 6 #define _RDMA_RW_H 7 7 8 + #include <linux/bvec.h> 8 9 #include <linux/dma-mapping.h> 9 10 #include <linux/scatterlist.h> 10 11 #include <rdma/ib_verbs.h> ··· 32 31 struct ib_rdma_wr *wrs; 33 32 } map; 34 33 34 + /* for IOVA-based mapping of bvecs into contiguous DMA range: */ 35 + struct { 36 + struct dma_iova_state state; 37 + struct ib_sge sge; 38 + struct ib_rdma_wr wr; 39 + size_t mapped_len; 40 + } iova; 41 + 35 42 /* for registering multiple WRs: */ 36 43 struct rdma_rw_reg_ctx { 37 44 struct ib_sge sge; ··· 47 38 struct ib_reg_wr reg_wr; 48 39 struct ib_send_wr inv_wr; 49 40 struct ib_mr *mr; 41 + struct sg_table sgt; 50 42 } *reg; 51 43 }; 52 44 }; ··· 58 48 void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 59 49 u32 port_num, struct scatterlist *sg, u32 sg_cnt, 60 50 enum dma_data_direction dir); 51 + 52 + struct bio_vec; 53 + 54 + int rdma_rw_ctx_init_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 55 + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, 56 + struct bvec_iter iter, u64 remote_addr, u32 rkey, 57 + enum dma_data_direction dir); 58 + void rdma_rw_ctx_destroy_bvec(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 59 + u32 port_num, const struct bio_vec *bvecs, u32 nr_bvec, 60 + enum dma_data_direction dir); 61 61 62 62 int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, 63 63 u32 port_num, struct scatterlist *sg, u32 sg_cnt, ··· 86 66 87 67 unsigned int rdma_rw_mr_factor(struct ib_device *device, u32 port_num, 88 68 unsigned int maxpages); 69 + unsigned int rdma_rw_max_send_wr(struct ib_device *dev, u32 port_num, 70 + unsigned int max_rdma_ctxs, u32 create_flags); 89 71 void rdma_rw_init_qp(struct ib_device *dev, struct ib_qp_init_attr *attr); 90 72 int rdma_rw_init_mrs(struct ib_qp *qp, struct ib_qp_init_attr *attr); 91 73 void rdma_rw_cleanup_mrs(struct ib_qp *qp);
+1
include/rdma/uverbs_types.h
··· 186 186 extern const struct uverbs_obj_type_class uverbs_idr_class; 187 187 extern const struct uverbs_obj_type_class uverbs_fd_class; 188 188 int uverbs_uobject_fd_release(struct inode *inode, struct file *filp); 189 + int uverbs_uobject_release(struct ib_uobject *uobj); 189 190 190 191 #define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ 191 192 sizeof(char))
+16
include/uapi/rdma/bnxt_re-abi.h
··· 56 56 BNXT_RE_UCNTX_CMASK_DBR_PACING_ENABLED = 0x08ULL, 57 57 BNXT_RE_UCNTX_CMASK_POW2_DISABLED = 0x10ULL, 58 58 BNXT_RE_UCNTX_CMASK_MSN_TABLE_ENABLED = 0x40, 59 + BNXT_RE_UCNTX_CMASK_QP_RATE_LIMIT_ENABLED = 0x80ULL, 59 60 }; 60 61 61 62 enum bnxt_re_wqe_mode { ··· 215 214 enum bnxt_re_toggle_mem_methods { 216 215 BNXT_RE_METHOD_GET_TOGGLE_MEM = (1U << UVERBS_ID_NS_SHIFT), 217 216 BNXT_RE_METHOD_RELEASE_TOGGLE_MEM, 217 + }; 218 + 219 + struct bnxt_re_packet_pacing_caps { 220 + __u32 qp_rate_limit_min; 221 + __u32 qp_rate_limit_max; /* In kbps */ 222 + /* Corresponding bit will be set if qp type from 223 + * 'enum ib_qp_type' is supported, e.g. 224 + * supported_qpts |= 1 << IB_QPT_RC 225 + */ 226 + __u32 supported_qpts; 227 + __u32 reserved; 228 + }; 229 + 230 + struct bnxt_re_query_device_ex_resp { 231 + struct bnxt_re_packet_pacing_caps packet_pacing_caps; 218 232 }; 219 233 #endif /* __BNXT_RE_UVERBS_ABI_H__*/
+16
include/uapi/rdma/ib_user_ioctl_cmds.h
··· 56 56 UVERBS_OBJECT_COUNTERS, 57 57 UVERBS_OBJECT_ASYNC_EVENT, 58 58 UVERBS_OBJECT_DMAH, 59 + UVERBS_OBJECT_DMABUF, 59 60 }; 60 61 61 62 enum { ··· 74 73 UVERBS_METHOD_QUERY_CONTEXT, 75 74 UVERBS_METHOD_QUERY_GID_TABLE, 76 75 UVERBS_METHOD_QUERY_GID_ENTRY, 76 + UVERBS_METHOD_QUERY_PORT_SPEED, 77 77 }; 78 78 79 79 enum uverbs_attrs_invoke_write_cmd_attr_ids { ··· 86 84 enum uverbs_attrs_query_port_cmd_attr_ids { 87 85 UVERBS_ATTR_QUERY_PORT_PORT_NUM, 88 86 UVERBS_ATTR_QUERY_PORT_RESP, 87 + }; 88 + 89 + enum uverbs_attrs_query_port_speed_cmd_attr_ids { 90 + UVERBS_ATTR_QUERY_PORT_SPEED_PORT_NUM, 91 + UVERBS_ATTR_QUERY_PORT_SPEED_RESP, 89 92 }; 90 93 91 94 enum uverbs_attrs_get_context_attr_ids { ··· 262 255 enum uverbs_methods_dmah { 263 256 UVERBS_METHOD_DMAH_ALLOC, 264 257 UVERBS_METHOD_DMAH_FREE, 258 + }; 259 + 260 + enum uverbs_attrs_alloc_dmabuf_cmd_attr_ids { 261 + UVERBS_ATTR_ALLOC_DMABUF_HANDLE, 262 + UVERBS_ATTR_ALLOC_DMABUF_PGOFF, 263 + }; 264 + 265 + enum uverbs_methods_dmabuf { 266 + UVERBS_METHOD_DMABUF_ALLOC, 265 267 }; 266 268 267 269 enum uverbs_attrs_reg_dm_mr_cmd_attr_ids {
+3
include/uapi/rdma/mana-abi.h
··· 17 17 #define MANA_IB_UVERBS_ABI_VERSION 1 18 18 19 19 enum mana_ib_create_cq_flags { 20 + /* Reserved for backward compatibility. Legacy 21 + * kernel versions use it to create CQs in RNIC 22 + */ 20 23 MANA_IB_CREATE_RNIC_CQ = 1 << 0, 21 24 }; 22 25
+86 -69
net/sunrpc/xprtrdma/svc_rdma_rw.c
··· 5 5 * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. 6 6 */ 7 7 8 + #include <linux/bvec.h> 9 + #include <linux/overflow.h> 8 10 #include <rdma/rw.h> 9 11 10 12 #include <linux/sunrpc/xdr.h> ··· 22 20 /* Each R/W context contains state for one chain of RDMA Read or 23 21 * Write Work Requests. 24 22 * 25 - * Each WR chain handles a single contiguous server-side buffer, 26 - * because scatterlist entries after the first have to start on 27 - * page alignment. xdr_buf iovecs cannot guarantee alignment. 23 + * Each WR chain handles a single contiguous server-side buffer. 24 + * - each xdr_buf iovec is a single contiguous buffer 25 + * - the xdr_buf pages array is a single contiguous buffer because the 26 + * second through the last element always start on a page boundary 28 27 * 29 28 * Each WR chain handles only one R_key. Each RPC-over-RDMA segment 30 29 * from a client may contain a unique R_key, so each WR chain moves 31 30 * up to one segment at a time. 32 31 * 33 - * The scatterlist makes this data structure over 4KB in size. To 34 - * make it less likely to fail, and to handle the allocation for 35 - * smaller I/O requests without disabling bottom-halves, these 36 - * contexts are created on demand, but cached and reused until the 37 - * controlling svcxprt_rdma is destroyed. 32 + * The inline bvec array is sized to handle most I/O requests without 33 + * additional allocation. Larger requests fall back to dynamic allocation. 34 + * These contexts are created on demand, but cached and reused until 35 + * the controlling svcxprt_rdma is destroyed. 
38 36 */ 39 37 struct svc_rdma_rw_ctxt { 40 38 struct llist_node rw_node; 41 39 struct list_head rw_list; 42 40 struct rdma_rw_ctx rw_ctx; 43 41 unsigned int rw_nents; 44 - unsigned int rw_first_sgl_nents; 45 - struct sg_table rw_sg_table; 46 - struct scatterlist rw_first_sgl[]; 42 + unsigned int rw_first_bvec_nents; 43 + struct bio_vec *rw_bvec; 44 + struct bio_vec rw_first_bvec[]; 47 45 }; 46 + 47 + static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, 48 + struct svc_rdma_rw_ctxt *ctxt); 48 49 49 50 static inline struct svc_rdma_rw_ctxt * 50 51 svc_rdma_next_ctxt(struct list_head *list) ··· 57 52 } 58 53 59 54 static struct svc_rdma_rw_ctxt * 60 - svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) 55 + svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int nr_bvec) 61 56 { 62 57 struct ib_device *dev = rdma->sc_cm_id->device; 63 - unsigned int first_sgl_nents = dev->attrs.max_send_sge; 58 + unsigned int first_bvec_nents = dev->attrs.max_send_sge; 64 59 struct svc_rdma_rw_ctxt *ctxt; 65 60 struct llist_node *node; 66 61 ··· 70 65 if (node) { 71 66 ctxt = llist_entry(node, struct svc_rdma_rw_ctxt, rw_node); 72 67 } else { 73 - ctxt = kmalloc_node(struct_size(ctxt, rw_first_sgl, first_sgl_nents), 68 + ctxt = kmalloc_node(struct_size(ctxt, rw_first_bvec, 69 + first_bvec_nents), 74 70 GFP_KERNEL, ibdev_to_node(dev)); 75 71 if (!ctxt) 76 72 goto out_noctx; 77 73 78 74 INIT_LIST_HEAD(&ctxt->rw_list); 79 - ctxt->rw_first_sgl_nents = first_sgl_nents; 75 + ctxt->rw_first_bvec_nents = first_bvec_nents; 80 76 } 81 77 82 - ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; 83 - if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, 84 - ctxt->rw_sg_table.sgl, 85 - first_sgl_nents)) 86 - goto out_free; 78 + if (nr_bvec <= ctxt->rw_first_bvec_nents) { 79 + ctxt->rw_bvec = ctxt->rw_first_bvec; 80 + } else { 81 + ctxt->rw_bvec = kmalloc_array_node(nr_bvec, 82 + sizeof(*ctxt->rw_bvec), 83 + GFP_KERNEL, 84 + ibdev_to_node(dev)); 85 + if (!ctxt->rw_bvec) 86 + 
goto out_free; 87 + } 87 88 return ctxt; 88 89 89 90 out_free: 90 - kfree(ctxt); 91 + /* Return cached contexts to cache; free freshly allocated ones */ 92 + if (node) 93 + svc_rdma_put_rw_ctxt(rdma, ctxt); 94 + else 95 + kfree(ctxt); 91 96 out_noctx: 92 - trace_svcrdma_rwctx_empty(rdma, sges); 97 + trace_svcrdma_rwctx_empty(rdma, nr_bvec); 93 98 return NULL; 94 99 } 95 100 96 101 static void __svc_rdma_put_rw_ctxt(struct svc_rdma_rw_ctxt *ctxt, 97 102 struct llist_head *list) 98 103 { 99 - sg_free_table_chained(&ctxt->rw_sg_table, ctxt->rw_first_sgl_nents); 104 + if (ctxt->rw_bvec != ctxt->rw_first_bvec) 105 + kfree(ctxt->rw_bvec); 100 106 llist_add(&ctxt->rw_node, list); 101 107 } 102 108 ··· 139 123 * @ctxt: R/W context to prepare 140 124 * @offset: RDMA offset 141 125 * @handle: RDMA tag/handle 126 + * @length: total number of bytes in the bvec array 142 127 * @direction: I/O direction 143 128 * 144 129 * Returns on success, the number of WQEs that will be needed ··· 147 130 */ 148 131 static int svc_rdma_rw_ctx_init(struct svcxprt_rdma *rdma, 149 132 struct svc_rdma_rw_ctxt *ctxt, 150 - u64 offset, u32 handle, 133 + u64 offset, u32 handle, unsigned int length, 151 134 enum dma_data_direction direction) 152 135 { 136 + struct bvec_iter iter = { 137 + .bi_size = length, 138 + }; 153 139 int ret; 154 140 155 - ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, rdma->sc_port_num, 156 - ctxt->rw_sg_table.sgl, ctxt->rw_nents, 157 - 0, offset, handle, direction); 141 + ret = rdma_rw_ctx_init_bvec(&ctxt->rw_ctx, rdma->sc_qp, 142 + rdma->sc_port_num, 143 + ctxt->rw_bvec, ctxt->rw_nents, 144 + iter, offset, handle, direction); 158 145 if (unlikely(ret < 0)) { 159 146 trace_svcrdma_dma_map_rw_err(rdma, offset, handle, 160 147 ctxt->rw_nents, ret); ··· 196 175 { 197 176 struct llist_node *first, *last; 198 177 struct svc_rdma_rw_ctxt *ctxt; 199 - LLIST_HEAD(free); 200 178 201 179 trace_svcrdma_cc_release(&cc->cc_cid, cc->cc_sqecount); 202 180 ··· 203 183 while ((ctxt = 
svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { 204 184 list_del(&ctxt->rw_list); 205 185 206 - rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, 207 - rdma->sc_port_num, ctxt->rw_sg_table.sgl, 208 - ctxt->rw_nents, dir); 209 - __svc_rdma_put_rw_ctxt(ctxt, &free); 186 + rdma_rw_ctx_destroy_bvec(&ctxt->rw_ctx, rdma->sc_qp, 187 + rdma->sc_port_num, 188 + ctxt->rw_bvec, ctxt->rw_nents, dir); 189 + if (ctxt->rw_bvec != ctxt->rw_first_bvec) 190 + kfree(ctxt->rw_bvec); 210 191 211 192 ctxt->rw_node.next = first; 212 193 first = &ctxt->rw_node; ··· 435 414 return -ENOTCONN; 436 415 } 437 416 438 - /* Build and DMA-map an SGL that covers one kvec in an xdr_buf 417 + /* Build a bvec that covers one kvec in an xdr_buf. 439 418 */ 440 - static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, 441 - unsigned int len, 442 - struct svc_rdma_rw_ctxt *ctxt) 419 + static void svc_rdma_vec_to_bvec(struct svc_rdma_write_info *info, 420 + unsigned int len, 421 + struct svc_rdma_rw_ctxt *ctxt) 443 422 { 444 - struct scatterlist *sg = ctxt->rw_sg_table.sgl; 445 - 446 - sg_set_buf(&sg[0], info->wi_base, len); 423 + bvec_set_virt(&ctxt->rw_bvec[0], info->wi_base, len); 447 424 info->wi_base += len; 448 425 449 426 ctxt->rw_nents = 1; 450 427 } 451 428 452 - /* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. 429 + /* Build a bvec array that covers part of an xdr_buf's pagelist. 
453 430 */ 454 - static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, 455 - unsigned int remaining, 456 - struct svc_rdma_rw_ctxt *ctxt) 431 + static void svc_rdma_pagelist_to_bvec(struct svc_rdma_write_info *info, 432 + unsigned int remaining, 433 + struct svc_rdma_rw_ctxt *ctxt) 457 434 { 458 - unsigned int sge_no, sge_bytes, page_off, page_no; 435 + unsigned int bvec_idx, bvec_len, page_off, page_no; 459 436 const struct xdr_buf *xdr = info->wi_xdr; 460 - struct scatterlist *sg; 461 437 struct page **page; 462 438 463 439 page_off = info->wi_next_off + xdr->page_base; ··· 462 444 page_off = offset_in_page(page_off); 463 445 page = xdr->pages + page_no; 464 446 info->wi_next_off += remaining; 465 - sg = ctxt->rw_sg_table.sgl; 466 - sge_no = 0; 447 + bvec_idx = 0; 467 448 do { 468 - sge_bytes = min_t(unsigned int, remaining, 469 - PAGE_SIZE - page_off); 470 - sg_set_page(sg, *page, sge_bytes, page_off); 471 - 472 - remaining -= sge_bytes; 473 - sg = sg_next(sg); 449 + bvec_len = min_t(unsigned int, remaining, 450 + PAGE_SIZE - page_off); 451 + bvec_set_page(&ctxt->rw_bvec[bvec_idx], *page, bvec_len, 452 + page_off); 453 + remaining -= bvec_len; 474 454 page_off = 0; 475 - sge_no++; 455 + bvec_idx++; 476 456 page++; 477 457 } while (remaining); 478 458 479 - ctxt->rw_nents = sge_no; 459 + ctxt->rw_nents = bvec_idx; 480 460 } 481 461 482 462 /* Construct RDMA Write WRs to send a portion of an xdr_buf containing ··· 512 496 constructor(info, write_len, ctxt); 513 497 offset = seg->rs_offset + info->wi_seg_off; 514 498 ret = svc_rdma_rw_ctx_init(rdma, ctxt, offset, seg->rs_handle, 515 - DMA_TO_DEVICE); 499 + write_len, DMA_TO_DEVICE); 516 500 if (ret < 0) 517 501 return -EIO; 518 502 percpu_counter_inc(&svcrdma_stat_write); ··· 551 535 const struct kvec *iov) 552 536 { 553 537 info->wi_base = iov->iov_base; 554 - return svc_rdma_build_writes(info, svc_rdma_vec_to_sg, 538 + return svc_rdma_build_writes(info, svc_rdma_vec_to_bvec, 555 539 iov->iov_len); 
556 540 } 557 541 ··· 575 559 { 576 560 info->wi_xdr = xdr; 577 561 info->wi_next_off = offset - xdr->head[0].iov_len; 578 - return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg, 562 + return svc_rdma_build_writes(info, svc_rdma_pagelist_to_bvec, 579 563 length); 580 564 } 581 565 ··· 750 734 { 751 735 struct svcxprt_rdma *rdma = svc_rdma_rqst_rdma(rqstp); 752 736 struct svc_rdma_chunk_ctxt *cc = &head->rc_cc; 753 - unsigned int sge_no, seg_len, len; 737 + unsigned int bvec_idx, nr_bvec, seg_len, len, total; 754 738 struct svc_rdma_rw_ctxt *ctxt; 755 - struct scatterlist *sg; 756 739 int ret; 757 740 758 741 len = segment->rs_length; 759 - sge_no = PAGE_ALIGN(head->rc_pageoff + len) >> PAGE_SHIFT; 760 - ctxt = svc_rdma_get_rw_ctxt(rdma, sge_no); 742 + if (check_add_overflow(head->rc_pageoff, len, &total)) 743 + return -EINVAL; 744 + nr_bvec = PAGE_ALIGN(total) >> PAGE_SHIFT; 745 + ctxt = svc_rdma_get_rw_ctxt(rdma, nr_bvec); 761 746 if (!ctxt) 762 747 return -ENOMEM; 763 - ctxt->rw_nents = sge_no; 748 + ctxt->rw_nents = nr_bvec; 764 749 765 - sg = ctxt->rw_sg_table.sgl; 766 - for (sge_no = 0; sge_no < ctxt->rw_nents; sge_no++) { 750 + for (bvec_idx = 0; bvec_idx < ctxt->rw_nents; bvec_idx++) { 767 751 seg_len = min_t(unsigned int, len, 768 752 PAGE_SIZE - head->rc_pageoff); 769 753 770 754 if (!head->rc_pageoff) 771 755 head->rc_page_count++; 772 756 773 - sg_set_page(sg, rqstp->rq_pages[head->rc_curpage], 774 - seg_len, head->rc_pageoff); 775 - sg = sg_next(sg); 757 + bvec_set_page(&ctxt->rw_bvec[bvec_idx], 758 + rqstp->rq_pages[head->rc_curpage], 759 + seg_len, head->rc_pageoff); 776 760 777 761 head->rc_pageoff += seg_len; 778 762 if (head->rc_pageoff == PAGE_SIZE) { ··· 786 770 } 787 771 788 772 ret = svc_rdma_rw_ctx_init(rdma, ctxt, segment->rs_offset, 789 - segment->rs_handle, DMA_FROM_DEVICE); 773 + segment->rs_handle, segment->rs_length, 774 + DMA_FROM_DEVICE); 790 775 if (ret < 0) 791 776 return -EIO; 792 777 percpu_counter_inc(&svcrdma_stat_read);
+6 -2
net/sunrpc/xprtrdma/svc_rdma_transport.c
··· 462 462 newxprt->sc_max_bc_requests = 2; 463 463 } 464 464 465 - /* Arbitrary estimate of the needed number of rdma_rw contexts. 465 + /* Estimate the needed number of rdma_rw contexts. The maximum 466 + * Read and Write chunks have one segment each. Each request 467 + * can involve one Read chunk and either a Write chunk or Reply 468 + * chunk; thus a factor of three. 466 469 */ 467 470 maxpayload = min(xprt->xpt_server->sv_max_payload, 468 471 RPCSVC_MAXPAYLOAD_RDMA); ··· 473 470 rdma_rw_mr_factor(dev, newxprt->sc_port_num, 474 471 maxpayload >> PAGE_SHIFT); 475 472 476 - newxprt->sc_sq_depth = rq_depth + ctxts; 473 + newxprt->sc_sq_depth = rq_depth + 474 + rdma_rw_max_send_wr(dev, newxprt->sc_port_num, ctxts, 0); 477 475 if (newxprt->sc_sq_depth > dev->attrs.max_qp_wr) 478 476 newxprt->sc_sq_depth = dev->attrs.max_qp_wr; 479 477 atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);