Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

RDMA/mlx5: Consolidate MR destruction to mlx5_ib_dereg_mr()

Now that the SRCU stuff has been removed, the entire MR destroy logic can
be made a lot simpler. Currently there are many different ways to destroy an
MR, which makes it really hard to do this task correctly. Route all
destruction through mlx5_ib_dereg_mr() and make it work for all
situations.

Since it turns out all the different MR types do basically the same thing,
this removes a lot of knowledge of MR internals from ODP and leaves ODP
just exporting an operation to clean up children.

This fixes a few weird corner-case bugs and firmly establishes the correct
ordering of the MR destruction:
- Stop parallel access to the mkey via the ODP xarray
- Stop DMA
- Release the umem
- Clean up ODP children
- Free/Recycle the MR

Link: https://lore.kernel.org/r/20210304120745.1090751-4-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

+92 -205
+4
drivers/infiniband/core/umem_dmabuf.c
··· 168 168 { 169 169 struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; 170 170 171 + dma_resv_lock(dmabuf->resv, NULL); 172 + ib_umem_dmabuf_unmap_pages(umem_dmabuf); 173 + dma_resv_unlock(dmabuf->resv); 174 + 171 175 dma_buf_detach(dmabuf, umem_dmabuf->attach); 172 176 dma_buf_put(dmabuf); 173 177 kfree(umem_dmabuf);
+1 -4
drivers/infiniband/hw/mlx5/mlx5_ib.h
··· 1285 1285 struct ib_udata *udata, 1286 1286 int access_flags); 1287 1287 void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr); 1288 - void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr); 1289 - void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr); 1288 + void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr); 1290 1289 struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, 1291 1290 u64 length, u64 virt_addr, int access_flags, 1292 1291 struct ib_pd *pd, struct ib_udata *udata); ··· 1333 1334 1334 1335 struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev, 1335 1336 unsigned int entry, int access_flags); 1336 - void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); 1337 - int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr); 1338 1337 1339 1338 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask, 1340 1339 struct ib_mr_status *mr_status);
+62 -71
drivers/infiniband/hw/mlx5/mr.c
··· 119 119 create_mkey_callback, context); 120 120 } 121 121 122 - static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); 123 - static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr); 124 122 static int mr_cache_max_order(struct mlx5_ib_dev *dev); 125 123 static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent); 126 124 ··· 625 627 return NULL; 626 628 } 627 629 628 - static void detach_mr_from_cache(struct mlx5_ib_mr *mr) 630 + static void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 629 631 { 630 632 struct mlx5_cache_ent *ent = mr->cache_ent; 631 - 632 - mr->cache_ent = NULL; 633 - spin_lock_irq(&ent->lock); 634 - ent->total_mrs--; 635 - spin_unlock_irq(&ent->lock); 636 - } 637 - 638 - void mlx5_mr_cache_free(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 639 - { 640 - struct mlx5_cache_ent *ent = mr->cache_ent; 641 - 642 - if (!ent) 643 - return; 644 - 645 - if (mlx5_mr_cache_invalidate(mr)) { 646 - detach_mr_from_cache(mr); 647 - destroy_mkey(dev, mr); 648 - kfree(mr); 649 - return; 650 - } 651 633 652 634 spin_lock_irq(&ent->lock); 653 635 list_add_tail(&mr->list, &ent->head); ··· 1481 1503 */ 1482 1504 err = mlx5_ib_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE); 1483 1505 if (err) { 1484 - dereg_mr(dev, mr); 1506 + mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1485 1507 return ERR_PTR(err); 1486 1508 } 1487 1509 } ··· 1538 1560 return &mr->ibmr; 1539 1561 1540 1562 err_dereg_mr: 1541 - dereg_mr(dev, mr); 1563 + mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1542 1564 return ERR_PTR(err); 1543 1565 } 1544 1566 ··· 1635 1657 return &mr->ibmr; 1636 1658 1637 1659 err_dereg_mr: 1638 - dereg_mr(dev, mr); 1660 + mlx5_ib_dereg_mr(&mr->ibmr, NULL); 1639 1661 return ERR_PTR(err); 1640 1662 } 1641 1663 ··· 1647 1669 * and any DMA inprogress will be completed. Failure of this function 1648 1670 * indicates the HW has failed catastrophically. 
1649 1671 */ 1650 - int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) 1672 + static int mlx5_mr_cache_invalidate(struct mlx5_ib_mr *mr) 1651 1673 { 1652 1674 struct mlx5_umr_wr umrwr = {}; 1653 1675 ··· 1919 1941 } 1920 1942 } 1921 1943 1922 - static void clean_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 1944 + int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1923 1945 { 1924 - if (mr->ibmr.type == IB_MR_TYPE_INTEGRITY) { 1946 + struct mlx5_ib_mr *mr = to_mmr(ibmr); 1947 + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); 1948 + int rc; 1949 + 1950 + /* 1951 + * Any async use of the mr must hold the refcount, once the refcount 1952 + * goes to zero no other thread, such as ODP page faults, prefetch, any 1953 + * UMR activity, etc can touch the mkey. Thus it is safe to destroy it. 1954 + */ 1955 + if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && 1956 + refcount_read(&mr->mmkey.usecount) != 0 && 1957 + xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key))) 1958 + mlx5r_deref_wait_odp_mkey(&mr->mmkey); 1959 + 1960 + if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1961 + xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key), ibmr, 1962 + NULL, GFP_KERNEL); 1963 + 1964 + if (mr->mtt_mr) { 1965 + rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 1966 + if (rc) 1967 + return rc; 1968 + mr->mtt_mr = NULL; 1969 + } 1970 + if (mr->klm_mr) { 1971 + mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 1972 + if (rc) 1973 + return rc; 1974 + mr->klm_mr = NULL; 1975 + } 1976 + 1925 1977 if (mlx5_core_destroy_psv(dev->mdev, 1926 1978 mr->sig->psv_memory.psv_idx)) 1927 1979 mlx5_ib_warn(dev, "failed to destroy mem psv %d\n", 1928 1980 mr->sig->psv_memory.psv_idx); 1929 - if (mlx5_core_destroy_psv(dev->mdev, 1930 - mr->sig->psv_wire.psv_idx)) 1981 + if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx)) 1931 1982 mlx5_ib_warn(dev, "failed to destroy wire psv %d\n", 1932 1983 mr->sig->psv_wire.psv_idx); 1933 - xa_erase(&dev->sig_mrs, 
mlx5_base_mkey(mr->mmkey.key)); 1934 1984 kfree(mr->sig); 1935 1985 mr->sig = NULL; 1936 1986 } 1937 1987 1988 + /* Stop DMA */ 1989 + if (mr->cache_ent) { 1990 + if (mlx5_mr_cache_invalidate(mr)) { 1991 + spin_lock_irq(&mr->cache_ent->lock); 1992 + mr->cache_ent->total_mrs--; 1993 + spin_unlock_irq(&mr->cache_ent->lock); 1994 + mr->cache_ent = NULL; 1995 + } 1996 + } 1938 1997 if (!mr->cache_ent) { 1939 - destroy_mkey(dev, mr); 1940 - mlx5_free_priv_descs(mr); 1998 + rc = destroy_mkey(to_mdev(mr->ibmr.device), mr); 1999 + if (rc) 2000 + return rc; 1941 2001 } 1942 - } 1943 2002 1944 - static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) 1945 - { 1946 - struct ib_umem *umem = mr->umem; 2003 + if (mr->umem) { 2004 + bool is_odp = is_odp_mr(mr); 1947 2005 1948 - /* Stop all DMA */ 1949 - if (is_odp_mr(mr)) 1950 - mlx5_ib_fence_odp_mr(mr); 1951 - else if (is_dmabuf_mr(mr)) 1952 - mlx5_ib_fence_dmabuf_mr(mr); 1953 - else 1954 - clean_mr(dev, mr); 1955 - 1956 - if (umem) { 1957 - if (!is_odp_mr(mr)) 1958 - atomic_sub(ib_umem_num_pages(umem), 2006 + if (!is_odp) 2007 + atomic_sub(ib_umem_num_pages(mr->umem), 1959 2008 &dev->mdev->priv.reg_pages); 1960 - ib_umem_release(umem); 2009 + ib_umem_release(mr->umem); 2010 + if (is_odp) 2011 + mlx5_ib_free_odp_mr(mr); 1961 2012 } 1962 2013 1963 - if (mr->cache_ent) 2014 + if (mr->cache_ent) { 1964 2015 mlx5_mr_cache_free(dev, mr); 1965 - else 2016 + } else { 2017 + mlx5_free_priv_descs(mr); 1966 2018 kfree(mr); 1967 - } 1968 - 1969 - int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) 1970 - { 1971 - struct mlx5_ib_mr *mmr = to_mmr(ibmr); 1972 - 1973 - if (ibmr->type == IB_MR_TYPE_INTEGRITY) { 1974 - dereg_mr(to_mdev(mmr->mtt_mr->ibmr.device), mmr->mtt_mr); 1975 - dereg_mr(to_mdev(mmr->klm_mr->ibmr.device), mmr->klm_mr); 1976 2019 } 1977 - 1978 - if (is_odp_mr(mmr) && to_ib_umem_odp(mmr->umem)->is_implicit_odp) { 1979 - mlx5_ib_free_implicit_mr(mmr); 1980 - return 0; 1981 - } 1982 - 1983 - 
dereg_mr(to_mdev(ibmr->device), mmr); 1984 - 1985 2020 return 0; 1986 2021 } 1987 2022 ··· 2166 2175 destroy_mkey(dev, mr); 2167 2176 mlx5_free_priv_descs(mr); 2168 2177 err_free_mtt_mr: 2169 - dereg_mr(to_mdev(mr->mtt_mr->ibmr.device), mr->mtt_mr); 2178 + mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL); 2170 2179 mr->mtt_mr = NULL; 2171 2180 err_free_klm_mr: 2172 - dereg_mr(to_mdev(mr->klm_mr->ibmr.device), mr->klm_mr); 2181 + mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL); 2173 2182 mr->klm_mr = NULL; 2174 2183 err_destroy_psv: 2175 2184 if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
+25 -130
drivers/infiniband/hw/mlx5/odp.c
··· 181 181 } 182 182 } 183 183 184 - static void dma_fence_odp_mr(struct mlx5_ib_mr *mr) 185 - { 186 - struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 187 - 188 - /* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */ 189 - mutex_lock(&odp->umem_mutex); 190 - if (odp->npages) { 191 - mlx5_mr_cache_invalidate(mr); 192 - ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp), 193 - ib_umem_end(odp)); 194 - WARN_ON(odp->npages); 195 - } 196 - odp->private = NULL; 197 - mutex_unlock(&odp->umem_mutex); 198 - 199 - if (!mr->cache_ent) { 200 - mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey); 201 - WARN_ON(mr->descs); 202 - } 203 - } 204 - 205 184 /* 206 185 * This must be called after the mr has been removed from implicit_children. 207 186 * NOTE: The MR does not necessarily have to be 208 187 * empty here, parallel page faults could have raced with the free process and 209 188 * added pages to it. 210 189 */ 211 - static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt) 212 - { 213 - struct mlx5_ib_mr *imr = mr->parent; 214 - struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 215 - struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem); 216 - unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT; 217 - 218 - mlx5r_deref_wait_odp_mkey(&mr->mmkey); 219 - 220 - if (need_imr_xlt) { 221 - mutex_lock(&odp_imr->umem_mutex); 222 - mlx5_ib_update_xlt(mr->parent, idx, 1, 0, 223 - MLX5_IB_UPD_XLT_INDIRECT | 224 - MLX5_IB_UPD_XLT_ATOMIC); 225 - mutex_unlock(&odp_imr->umem_mutex); 226 - } 227 - 228 - dma_fence_odp_mr(mr); 229 - 230 - mlx5_mr_cache_free(mr_to_mdev(mr), mr); 231 - ib_umem_odp_release(odp); 232 - } 233 - 234 190 static void free_implicit_child_mr_work(struct work_struct *work) 235 191 { 236 192 struct mlx5_ib_mr *mr = 237 193 container_of(work, struct mlx5_ib_mr, odp_destroy.work); 238 194 struct mlx5_ib_mr *imr = mr->parent; 195 + struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 196 + struct ib_umem_odp 
*odp = to_ib_umem_odp(mr->umem); 239 197 240 - free_implicit_child_mr(mr, true); 198 + mlx5r_deref_wait_odp_mkey(&mr->mmkey); 199 + 200 + mutex_lock(&odp_imr->umem_mutex); 201 + mlx5_ib_update_xlt(mr->parent, ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 202 + 1, 0, 203 + MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC); 204 + mutex_unlock(&odp_imr->umem_mutex); 205 + mlx5_ib_dereg_mr(&mr->ibmr, NULL); 206 + 241 207 mlx5r_deref_odp_mkey(&imr->mmkey); 242 208 } 243 209 ··· 420 454 421 455 ret = mr = mlx5_mr_cache_alloc( 422 456 mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags); 423 - if (IS_ERR(mr)) 424 - goto out_umem; 457 + if (IS_ERR(mr)) { 458 + ib_umem_odp_release(odp); 459 + return mr; 460 + } 425 461 426 462 mr->ibmr.pd = imr->ibmr.pd; 427 463 mr->ibmr.device = &mr_to_mdev(imr)->ib_dev; ··· 473 505 out_lock: 474 506 xa_unlock(&imr->implicit_children); 475 507 out_mr: 476 - mlx5_mr_cache_free(mr_to_mdev(imr), mr); 477 - out_umem: 478 - ib_umem_odp_release(odp); 508 + mlx5_ib_dereg_mr(&mr->ibmr, NULL); 479 509 return ret; 480 510 } 481 511 ··· 496 530 497 531 imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags); 498 532 if (IS_ERR(imr)) { 499 - err = PTR_ERR(imr); 500 - goto out_umem; 533 + ib_umem_odp_release(umem_odp); 534 + return imr; 501 535 } 502 536 503 537 imr->ibmr.pd = &pd->ibpd; ··· 527 561 return imr; 528 562 out_mr: 529 563 mlx5_ib_err(dev, "Failed to register MKEY %d\n", err); 530 - mlx5_mr_cache_free(dev, imr); 531 - out_umem: 532 - ib_umem_odp_release(umem_odp); 564 + mlx5_ib_dereg_mr(&imr->ibmr, NULL); 533 565 return ERR_PTR(err); 534 566 } 535 567 536 - void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) 568 + void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr) 537 569 { 538 - struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem); 539 - struct mlx5_ib_dev *dev = mr_to_mdev(imr); 540 570 struct mlx5_ib_mr *mtt; 541 571 unsigned long idx; 542 572 543 - xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key)); 544 573 
/* 545 - * All work on the prefetch list must be completed, xa_erase() prevented 546 - * new work from being created. 574 + * If this is an implicit MR it is already invalidated so we can just 575 + * delete the children mkeys. 547 576 */ 548 - mlx5r_deref_wait_odp_mkey(&imr->mmkey); 549 - /* 550 - * At this point it is forbidden for any other thread to enter 551 - * pagefault_mr() on this imr. It is already forbidden to call 552 - * pagefault_mr() on an implicit child. Due to this additions to 553 - * implicit_children are prevented. 554 - * In addition, any new call to destroy_unused_implicit_child_mr() 555 - * may return immediately. 556 - */ 557 - 558 - /* 559 - * Fence the imr before we destroy the children. This allows us to 560 - * skip updating the XLT of the imr during destroy of the child mkey 561 - * the imr points to. 562 - */ 563 - mlx5_mr_cache_invalidate(imr); 564 - 565 - xa_for_each(&imr->implicit_children, idx, mtt) { 566 - xa_erase(&imr->implicit_children, idx); 567 - free_implicit_child_mr(mtt, false); 568 - } 569 - 570 - mlx5_mr_cache_free(dev, imr); 571 - ib_umem_odp_release(odp_imr); 572 - } 573 - 574 - /** 575 - * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR 576 - * @mr: to fence 577 - * 578 - * On return no parallel threads will be touching this MR and no DMA will be 579 - * active. 580 - */ 581 - void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr) 582 - { 583 - /* Prevent new page faults and prefetch requests from succeeding */ 584 - xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); 585 - 586 - /* Wait for all running page-fault handlers to finish. */ 587 - mlx5r_deref_wait_odp_mkey(&mr->mmkey); 588 - 589 - dma_fence_odp_mr(mr); 590 - } 591 - 592 - /** 593 - * mlx5_ib_fence_dmabuf_mr - Stop all access to the dmabuf MR 594 - * @mr: to fence 595 - * 596 - * On return no parallel threads will be touching this MR and no DMA will be 597 - * active. 
598 - */ 599 - void mlx5_ib_fence_dmabuf_mr(struct mlx5_ib_mr *mr) 600 - { 601 - struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); 602 - 603 - /* Prevent new page faults and prefetch requests from succeeding */ 604 - xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)); 605 - 606 - mlx5r_deref_wait_odp_mkey(&mr->mmkey); 607 - 608 - dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL); 609 - mlx5_mr_cache_invalidate(mr); 610 - umem_dmabuf->private = NULL; 611 - ib_umem_dmabuf_unmap_pages(umem_dmabuf); 612 - dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); 613 - 614 - if (!mr->cache_ent) { 615 - mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey); 616 - WARN_ON(mr->descs); 577 + xa_for_each(&mr->implicit_children, idx, mtt) { 578 + xa_erase(&mr->implicit_children, idx); 579 + mlx5_ib_dereg_mr(&mtt->ibmr, NULL); 617 580 } 618 581 } 619 582