Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/irdma: Add support for revocable pinned dmabuf import

Use the new API to support importing pinned dmabufs from exporters
that require revocation, such as VFIO. The revoke semantic is
achieved by issuing a HW invalidation command but not freeing
the key. This prevents further accesses to the region (they will
result in an invalid key AE), but also keeps the key reserved
until the region is actually deregistered (i.e., ibv_dereg_mr)
so that a new MR registration cannot acquire the same key.

Tested with lockdep+kasan and a memfd backed dmabuf.

The rereg_mr path is explicitly blocked in libibverbs for dmabuf MRs
(more specifically, any MR not of type IBV_MR_TYPE_MR), so the rereg_mr
path for dmabufs was tested with a modified libibverbs.

Signed-off-by: Jacob Moroni <jmoroni@google.com>
Link: https://patch.msgid.link/20260305170826.3803155-6-jmoroni@google.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>

Authored by Jacob Moroni and committed by Leon Romanovsky
4707bf5f 3a0b1713

+93 -12
+93 -12
drivers/infiniband/hw/irdma/verbs.c
··· 3590 3590 return ERR_PTR(err); 3591 3591 } 3592 3592 3593 + static int irdma_hwdereg_mr(struct ib_mr *ib_mr); 3594 + 3595 + static void irdma_umem_dmabuf_revoke(void *priv) 3596 + { 3597 + /* priv is guaranteed to be valid any time this callback is invoked 3598 + * because we do not set the callback until after successful iwmr 3599 + * allocation and initialization. 3600 + */ 3601 + struct irdma_mr *iwmr = priv; 3602 + int err; 3603 + 3604 + /* Invalidate the key in hardware. This does not actually release the 3605 + * key for potential reuse - that only occurs when the region is fully 3606 + * deregistered. 3607 + * 3608 + * The irdma_hwdereg_mr call is a no-op if the region is not currently 3609 + * registered with hardware. 3610 + */ 3611 + err = irdma_hwdereg_mr(&iwmr->ibmr); 3612 + if (err) { 3613 + struct irdma_device *iwdev = to_iwdev(iwmr->ibmr.device); 3614 + 3615 + ibdev_err(&iwdev->ibdev, "dmabuf mr revoke failed %d", err); 3616 + if (!iwdev->rf->reset) { 3617 + iwdev->rf->reset = true; 3618 + iwdev->rf->gen_ops.request_reset(iwdev->rf); 3619 + } 3620 + } 3621 + } 3622 + 3593 3623 static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, 3594 3624 u64 len, u64 virt, 3595 3625 int fd, int access, ··· 3637 3607 if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) 3638 3608 return ERR_PTR(-EINVAL); 3639 3609 3640 - umem_dmabuf = ib_umem_dmabuf_get_pinned(pd->device, start, len, fd, access); 3610 + umem_dmabuf = 3611 + ib_umem_dmabuf_get_pinned_revocable_and_lock(pd->device, start, 3612 + len, fd, access); 3641 3613 if (IS_ERR(umem_dmabuf)) { 3642 3614 ibdev_dbg(&iwdev->ibdev, "Failed to get dmabuf umem[%pe]\n", 3643 3615 umem_dmabuf); ··· 3656 3624 if (err) 3657 3625 goto err_iwmr; 3658 3626 3627 + ib_umem_dmabuf_set_revoke_locked(umem_dmabuf, irdma_umem_dmabuf_revoke, 3628 + iwmr); 3629 + ib_umem_dmabuf_revoke_unlock(umem_dmabuf); 3659 3630 return &iwmr->ibmr; 3660 3631 3661 3632 err_iwmr: 3662 3633 irdma_free_iwmr(iwmr); 3663 3634 3664 
3635 err_release: 3636 + ib_umem_dmabuf_revoke_unlock(umem_dmabuf); 3637 + 3638 + /* Will result in a call to revoke, but driver callback is not set and 3639 + * is therefore skipped. 3640 + */ 3665 3641 ib_umem_release(&umem_dmabuf->umem); 3666 3642 3667 3643 return ERR_PTR(err); ··· 3789 3749 struct irdma_device *iwdev = to_iwdev(ib_mr->device); 3790 3750 struct irdma_mr *iwmr = to_iwmr(ib_mr); 3791 3751 struct irdma_pbl *iwpbl = &iwmr->iwpbl; 3752 + bool dmabuf_revocable = iwmr->region && iwmr->region->is_dmabuf; 3753 + struct ib_umem_dmabuf *umem_dmabuf; 3792 3754 int ret; 3793 3755 3794 3756 if (len > iwdev->rf->sc_dev.hw_attrs.max_mr_size) ··· 3799 3757 if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) 3800 3758 return ERR_PTR(-EOPNOTSUPP); 3801 3759 3760 + if (dmabuf_revocable) { 3761 + umem_dmabuf = to_ib_umem_dmabuf(iwmr->region); 3762 + 3763 + ib_umem_dmabuf_revoke_lock(umem_dmabuf); 3764 + 3765 + /* If the dmabuf has been revoked, it means that the region has 3766 + * been invalidated in HW. We must not allow it to become valid 3767 + * again unless the user is requesting a change in translation 3768 + * which will end up dropping the umem dmabuf and allocating an 3769 + * entirely new umem anyway. 
3770 + */ 3771 + if (umem_dmabuf->revoked && !(flags & IB_MR_REREG_TRANS)) { 3772 + ret = -EINVAL; 3773 + goto err_unlock; 3774 + } 3775 + } 3776 + 3802 3777 ret = irdma_hwdereg_mr(ib_mr); 3803 3778 if (ret) 3804 - return ERR_PTR(ret); 3779 + goto err_unlock; 3805 3780 3806 3781 if (flags & IB_MR_REREG_ACCESS) 3807 3782 iwmr->access = new_access; ··· 3834 3775 &iwpbl->pble_alloc); 3835 3776 iwpbl->pbl_allocated = false; 3836 3777 } 3778 + 3779 + if (dmabuf_revocable) { 3780 + /* Must unlock before release to prevent deadlock */ 3781 + ib_umem_dmabuf_revoke_unlock(umem_dmabuf); 3782 + dmabuf_revocable = false; 3783 + } 3784 + 3837 3785 if (iwmr->region) { 3838 3786 ib_umem_release(iwmr->region); 3839 3787 iwmr->region = NULL; 3840 3788 } 3841 3789 3842 3790 ret = irdma_rereg_mr_trans(iwmr, start, len, virt); 3843 - } else 3791 + } else { 3844 3792 ret = irdma_hwreg_mr(iwdev, iwmr, iwmr->access); 3845 - if (ret) 3846 - return ERR_PTR(ret); 3793 + } 3847 3794 3848 - return NULL; 3795 + err_unlock: 3796 + if (dmabuf_revocable) 3797 + ib_umem_dmabuf_revoke_unlock(umem_dmabuf); 3798 + 3799 + return ret ? 
ERR_PTR(ret) : NULL; 3849 3800 } 3850 3801 3851 3802 /** ··· 3978 3909 struct irdma_mr *iwmr = to_iwmr(ib_mr); 3979 3910 struct irdma_device *iwdev = to_iwdev(ib_mr->device); 3980 3911 struct irdma_pbl *iwpbl = &iwmr->iwpbl; 3912 + bool dmabuf_revocable = iwmr->region && iwmr->region->is_dmabuf; 3981 3913 int ret; 3982 3914 3983 3915 if (iwmr->type != IRDMA_MEMREG_TYPE_MEM) { ··· 3993 3923 goto done; 3994 3924 } 3995 3925 3996 - ret = irdma_hwdereg_mr(ib_mr); 3997 - if (ret) 3998 - return ret; 3926 + if (!dmabuf_revocable) { 3927 + ret = irdma_hwdereg_mr(ib_mr); 3928 + if (ret) 3929 + return ret; 3999 3930 4000 - irdma_free_stag(iwdev, iwmr->stag); 3931 + irdma_free_stag(iwdev, iwmr->stag); 3932 + } 4001 3933 done: 3934 + if (iwmr->region) 3935 + /* For dmabuf MRs, ib_umem_release will trigger a synchronous 3936 + * call to the revoke callback which will perform the actual HW 3937 + * invalidation via irdma_hwdereg_mr. We rely on this for its 3938 + * implicit serialization w.r.t. concurrent revocations. This 3939 + * must be done before freeing the PBLEs. 3940 + */ 3941 + ib_umem_release(iwmr->region); 3942 + 4002 3943 if (iwpbl->pbl_allocated) 4003 3944 irdma_free_pble(iwdev->rf->pble_rsrc, &iwpbl->pble_alloc); 4004 3945 4005 - if (iwmr->region) 4006 - ib_umem_release(iwmr->region); 3946 + if (dmabuf_revocable) 3947 + irdma_free_stag(iwdev, iwmr->stag); 4007 3948 4008 3949 kfree(iwmr); 4009 3950