Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma

Pull rdma fixes from Jason Gunthorpe:

- Several hfi1 patches fixing some long-standing driver bugs

- An overflow when working with sg lists with elements greater than 4G

- An rxe regression with object numbering after the MRs reach their
limit

- A theoretical problem with the scatterlist merging code

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma:
lib/scatterlist: Fix to calculate the last_pg properly
IB/hfi1: Remove user expected buffer invalidate race
IB/hfi1: Immediately remove invalid memory from hardware
IB/hfi1: Fix expected receive setup error exit issues
IB/hfi1: Reserve user expected TIDs
IB/hfi1: Reject a zero-length user expected buffer
RDMA/core: Fix ib block iterator counter overflow
RDMA/rxe: Prevent faulty rkey generation
RDMA/rxe: Fix inaccurate constants in rxe_type_info

+180 -89
+5 -2
drivers/infiniband/core/verbs.c
··· 2957 2957 bool __rdma_block_iter_next(struct ib_block_iter *biter) 2958 2958 { 2959 2959 unsigned int block_offset; 2960 + unsigned int sg_delta; 2960 2961 2961 2962 if (!biter->__sg_nents || !biter->__sg) 2962 2963 return false; 2963 2964 2964 2965 biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; 2965 2966 block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); 2966 - biter->__sg_advance += BIT_ULL(biter->__pg_bit) - block_offset; 2967 + sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; 2967 2968 2968 - if (biter->__sg_advance >= sg_dma_len(biter->__sg)) { 2969 + if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { 2970 + biter->__sg_advance += sg_delta; 2971 + } else { 2969 2972 biter->__sg_advance = 0; 2970 2973 biter->__sg = sg_next(biter->__sg); 2971 2974 biter->__sg_nents--;
+141 -61
drivers/infiniband/hw/hfi1/user_exp_rcv.c
··· 23 23 static bool tid_rb_invalidate(struct mmu_interval_notifier *mni, 24 24 const struct mmu_notifier_range *range, 25 25 unsigned long cur_seq); 26 + static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, 27 + const struct mmu_notifier_range *range, 28 + unsigned long cur_seq); 26 29 static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *, 27 30 struct tid_group *grp, 28 31 unsigned int start, u16 count, 29 32 u32 *tidlist, unsigned int *tididx, 30 33 unsigned int *pmapped); 31 - static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, 32 - struct tid_group **grp); 34 + static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo); 35 + static void __clear_tid_node(struct hfi1_filedata *fd, 36 + struct tid_rb_node *node); 33 37 static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node); 34 38 35 39 static const struct mmu_interval_notifier_ops tid_mn_ops = { 36 40 .invalidate = tid_rb_invalidate, 41 + }; 42 + static const struct mmu_interval_notifier_ops tid_cover_ops = { 43 + .invalidate = tid_cover_invalidate, 37 44 }; 38 45 39 46 /* ··· 260 253 tididx = 0, mapped, mapped_pages = 0; 261 254 u32 *tidlist = NULL; 262 255 struct tid_user_buf *tidbuf; 256 + unsigned long mmu_seq = 0; 263 257 264 258 if (!PAGE_ALIGNED(tinfo->vaddr)) 259 + return -EINVAL; 260 + if (tinfo->length == 0) 265 261 return -EINVAL; 266 262 267 263 tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL); 268 264 if (!tidbuf) 269 265 return -ENOMEM; 270 266 267 + mutex_init(&tidbuf->cover_mutex); 271 268 tidbuf->vaddr = tinfo->vaddr; 272 269 tidbuf->length = tinfo->length; 273 270 tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets), 274 271 GFP_KERNEL); 275 272 if (!tidbuf->psets) { 276 - kfree(tidbuf); 277 - return -ENOMEM; 273 + ret = -ENOMEM; 274 + goto fail_release_mem; 275 + } 276 + 277 + if (fd->use_mn) { 278 + ret = mmu_interval_notifier_insert( 279 + &tidbuf->notifier, current->mm, 280 + tidbuf->vaddr, 
tidbuf->npages * PAGE_SIZE, 281 + &tid_cover_ops); 282 + if (ret) 283 + goto fail_release_mem; 284 + mmu_seq = mmu_interval_read_begin(&tidbuf->notifier); 278 285 } 279 286 280 287 pinned = pin_rcv_pages(fd, tidbuf); 281 288 if (pinned <= 0) { 282 - kfree(tidbuf->psets); 283 - kfree(tidbuf); 284 - return pinned; 289 + ret = (pinned < 0) ? pinned : -ENOSPC; 290 + goto fail_unpin; 285 291 } 286 292 287 293 /* Find sets of physically contiguous pages */ 288 294 tidbuf->n_psets = find_phys_blocks(tidbuf, pinned); 289 295 290 - /* 291 - * We don't need to access this under a lock since tid_used is per 292 - * process and the same process cannot be in hfi1_user_exp_rcv_clear() 293 - * and hfi1_user_exp_rcv_setup() at the same time. 294 - */ 296 + /* Reserve the number of expected tids to be used. */ 295 297 spin_lock(&fd->tid_lock); 296 298 if (fd->tid_used + tidbuf->n_psets > fd->tid_limit) 297 299 pageset_count = fd->tid_limit - fd->tid_used; 298 300 else 299 301 pageset_count = tidbuf->n_psets; 302 + fd->tid_used += pageset_count; 300 303 spin_unlock(&fd->tid_lock); 301 304 302 - if (!pageset_count) 303 - goto bail; 305 + if (!pageset_count) { 306 + ret = -ENOSPC; 307 + goto fail_unreserve; 308 + } 304 309 305 310 ngroups = pageset_count / dd->rcv_entries.group_size; 306 311 tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL); 307 312 if (!tidlist) { 308 313 ret = -ENOMEM; 309 - goto nomem; 314 + goto fail_unreserve; 310 315 } 311 316 312 317 tididx = 0; ··· 414 395 } 415 396 unlock: 416 397 mutex_unlock(&uctxt->exp_mutex); 417 - nomem: 418 398 hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx, 419 399 mapped_pages, ret); 420 - if (tididx) { 421 - spin_lock(&fd->tid_lock); 422 - fd->tid_used += tididx; 423 - spin_unlock(&fd->tid_lock); 424 - tinfo->tidcnt = tididx; 425 - tinfo->length = mapped_pages * PAGE_SIZE; 426 400 427 - if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), 428 - tidlist, sizeof(tidlist[0]) * tididx)) { 429 - /* 430 - * On 
failure to copy to the user level, we need to undo 431 - * everything done so far so we don't leak resources. 432 - */ 433 - tinfo->tidlist = (unsigned long)&tidlist; 434 - hfi1_user_exp_rcv_clear(fd, tinfo); 435 - tinfo->tidlist = 0; 436 - ret = -EFAULT; 437 - goto bail; 401 + /* fail if nothing was programmed, set error if none provided */ 402 + if (tididx == 0) { 403 + if (ret >= 0) 404 + ret = -ENOSPC; 405 + goto fail_unreserve; 406 + } 407 + 408 + /* adjust reserved tid_used to actual count */ 409 + spin_lock(&fd->tid_lock); 410 + fd->tid_used -= pageset_count - tididx; 411 + spin_unlock(&fd->tid_lock); 412 + 413 + /* unpin all pages not covered by a TID */ 414 + unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages, 415 + false); 416 + 417 + if (fd->use_mn) { 418 + /* check for an invalidate during setup */ 419 + bool fail = false; 420 + 421 + mutex_lock(&tidbuf->cover_mutex); 422 + fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq); 423 + mutex_unlock(&tidbuf->cover_mutex); 424 + 425 + if (fail) { 426 + ret = -EBUSY; 427 + goto fail_unprogram; 438 428 } 439 429 } 440 430 441 - /* 442 - * If not everything was mapped (due to insufficient RcvArray entries, 443 - * for example), unpin all unmapped pages so we can pin them nex time. 444 - */ 445 - if (mapped_pages != pinned) 446 - unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, 447 - (pinned - mapped_pages), false); 448 - bail: 449 - kfree(tidbuf->psets); 450 - kfree(tidlist); 431 + tinfo->tidcnt = tididx; 432 + tinfo->length = mapped_pages * PAGE_SIZE; 433 + 434 + if (copy_to_user(u64_to_user_ptr(tinfo->tidlist), 435 + tidlist, sizeof(tidlist[0]) * tididx)) { 436 + ret = -EFAULT; 437 + goto fail_unprogram; 438 + } 439 + 440 + if (fd->use_mn) 441 + mmu_interval_notifier_remove(&tidbuf->notifier); 451 442 kfree(tidbuf->pages); 443 + kfree(tidbuf->psets); 452 444 kfree(tidbuf); 453 - return ret > 0 ? 
0 : ret; 445 + kfree(tidlist); 446 + return 0; 447 + 448 + fail_unprogram: 449 + /* unprogram, unmap, and unpin all allocated TIDs */ 450 + tinfo->tidlist = (unsigned long)tidlist; 451 + hfi1_user_exp_rcv_clear(fd, tinfo); 452 + tinfo->tidlist = 0; 453 + pinned = 0; /* nothing left to unpin */ 454 + pageset_count = 0; /* nothing left reserved */ 455 + fail_unreserve: 456 + spin_lock(&fd->tid_lock); 457 + fd->tid_used -= pageset_count; 458 + spin_unlock(&fd->tid_lock); 459 + fail_unpin: 460 + if (fd->use_mn) 461 + mmu_interval_notifier_remove(&tidbuf->notifier); 462 + if (pinned > 0) 463 + unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false); 464 + fail_release_mem: 465 + kfree(tidbuf->pages); 466 + kfree(tidbuf->psets); 467 + kfree(tidbuf); 468 + kfree(tidlist); 469 + return ret; 454 470 } 455 471 456 472 int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd, ··· 506 452 507 453 mutex_lock(&uctxt->exp_mutex); 508 454 for (tididx = 0; tididx < tinfo->tidcnt; tididx++) { 509 - ret = unprogram_rcvarray(fd, tidinfo[tididx], NULL); 455 + ret = unprogram_rcvarray(fd, tidinfo[tididx]); 510 456 if (ret) { 511 457 hfi1_cdbg(TID, "Failed to unprogram rcv array %d", 512 458 ret); ··· 760 706 } 761 707 762 708 node->fdata = fd; 709 + mutex_init(&node->invalidate_mutex); 763 710 node->phys = page_to_phys(pages[0]); 764 711 node->npages = npages; 765 712 node->rcventry = rcventry; ··· 776 721 &tid_mn_ops); 777 722 if (ret) 778 723 goto out_unmap; 779 - /* 780 - * FIXME: This is in the wrong order, the notifier should be 781 - * established before the pages are pinned by pin_rcv_pages. 
782 - */ 783 - mmu_interval_read_begin(&node->notifier); 784 724 } 785 725 fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node; 786 726 ··· 795 745 return -EFAULT; 796 746 } 797 747 798 - static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo, 799 - struct tid_group **grp) 748 + static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo) 800 749 { 801 750 struct hfi1_ctxtdata *uctxt = fd->uctxt; 802 751 struct hfi1_devdata *dd = uctxt->dd; ··· 818 769 if (!node || node->rcventry != (uctxt->expected_base + rcventry)) 819 770 return -EBADF; 820 771 821 - if (grp) 822 - *grp = node->grp; 823 - 824 772 if (fd->use_mn) 825 773 mmu_interval_notifier_remove(&node->notifier); 826 774 cacheless_tid_rb_remove(fd, node); ··· 825 779 return 0; 826 780 } 827 781 828 - static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) 782 + static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) 829 783 { 830 784 struct hfi1_ctxtdata *uctxt = fd->uctxt; 831 785 struct hfi1_devdata *dd = uctxt->dd; 786 + 787 + mutex_lock(&node->invalidate_mutex); 788 + if (node->freed) 789 + goto done; 790 + node->freed = true; 832 791 833 792 trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry, 834 793 node->npages, 835 794 node->notifier.interval_tree.start, node->phys, 836 795 node->dma_addr); 837 796 838 - /* 839 - * Make sure device has seen the write before we unpin the 840 - * pages. 
841 - */ 797 + /* Make sure device has seen the write before pages are unpinned */ 842 798 hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0); 843 799 844 800 unpin_rcv_pages(fd, NULL, node, 0, node->npages, true); 801 + done: 802 + mutex_unlock(&node->invalidate_mutex); 803 + } 804 + 805 + static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node) 806 + { 807 + struct hfi1_ctxtdata *uctxt = fd->uctxt; 808 + 809 + __clear_tid_node(fd, node); 845 810 846 811 node->grp->used--; 847 812 node->grp->map &= ~(1 << (node->rcventry - node->grp->base)); ··· 911 854 if (node->freed) 912 855 return true; 913 856 857 + /* take action only if unmapping */ 858 + if (range->event != MMU_NOTIFY_UNMAP) 859 + return true; 860 + 914 861 trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt, 915 862 node->notifier.interval_tree.start, 916 863 node->rcventry, node->npages, node->dma_addr); 917 - node->freed = true; 864 + 865 + /* clear the hardware rcvarray entry */ 866 + __clear_tid_node(fdata, node); 918 867 919 868 spin_lock(&fdata->invalid_lock); 920 869 if (fdata->invalid_tid_idx < uctxt->expected_count) { ··· 947 884 fdata->invalid_tid_idx++; 948 885 } 949 886 spin_unlock(&fdata->invalid_lock); 887 + return true; 888 + } 889 + 890 + static bool tid_cover_invalidate(struct mmu_interval_notifier *mni, 891 + const struct mmu_notifier_range *range, 892 + unsigned long cur_seq) 893 + { 894 + struct tid_user_buf *tidbuf = 895 + container_of(mni, struct tid_user_buf, notifier); 896 + 897 + /* take action only if unmapping */ 898 + if (range->event == MMU_NOTIFY_UNMAP) { 899 + mutex_lock(&tidbuf->cover_mutex); 900 + mmu_interval_set_seq(mni, cur_seq); 901 + mutex_unlock(&tidbuf->cover_mutex); 902 + } 903 + 950 904 return true; 951 905 } 952 906
+3
drivers/infiniband/hw/hfi1/user_exp_rcv.h
··· 16 16 }; 17 17 18 18 struct tid_user_buf { 19 + struct mmu_interval_notifier notifier; 20 + struct mutex cover_mutex; 19 21 unsigned long vaddr; 20 22 unsigned long length; 21 23 unsigned int npages; ··· 29 27 struct tid_rb_node { 30 28 struct mmu_interval_notifier notifier; 31 29 struct hfi1_filedata *fdata; 30 + struct mutex invalidate_mutex; /* covers hw removal */ 32 31 unsigned long phys; 33 32 struct tid_group *grp; 34 33 u32 rcventry;
+5 -5
drivers/infiniband/sw/rxe/rxe_param.h
··· 98 98 RXE_MAX_SRQ = DEFAULT_MAX_VALUE - RXE_MIN_SRQ_INDEX, 99 99 100 100 RXE_MIN_MR_INDEX = 0x00000001, 101 - RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE, 102 - RXE_MAX_MR = DEFAULT_MAX_VALUE - RXE_MIN_MR_INDEX, 103 - RXE_MIN_MW_INDEX = 0x00010001, 104 - RXE_MAX_MW_INDEX = 0x00020000, 105 - RXE_MAX_MW = 0x00001000, 101 + RXE_MAX_MR_INDEX = DEFAULT_MAX_VALUE >> 1, 102 + RXE_MAX_MR = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX, 103 + RXE_MIN_MW_INDEX = RXE_MAX_MR_INDEX + 1, 104 + RXE_MAX_MW_INDEX = DEFAULT_MAX_VALUE, 105 + RXE_MAX_MW = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX, 106 106 107 107 RXE_MAX_PKT_PER_ACK = 64, 108 108
+11 -11
drivers/infiniband/sw/rxe/rxe_pool.c
··· 23 23 .size = sizeof(struct rxe_ucontext), 24 24 .elem_offset = offsetof(struct rxe_ucontext, elem), 25 25 .min_index = 1, 26 - .max_index = UINT_MAX, 27 - .max_elem = UINT_MAX, 26 + .max_index = RXE_MAX_UCONTEXT, 27 + .max_elem = RXE_MAX_UCONTEXT, 28 28 }, 29 29 [RXE_TYPE_PD] = { 30 30 .name = "pd", 31 31 .size = sizeof(struct rxe_pd), 32 32 .elem_offset = offsetof(struct rxe_pd, elem), 33 33 .min_index = 1, 34 - .max_index = UINT_MAX, 35 - .max_elem = UINT_MAX, 34 + .max_index = RXE_MAX_PD, 35 + .max_elem = RXE_MAX_PD, 36 36 }, 37 37 [RXE_TYPE_AH] = { 38 38 .name = "ah", ··· 40 40 .elem_offset = offsetof(struct rxe_ah, elem), 41 41 .min_index = RXE_MIN_AH_INDEX, 42 42 .max_index = RXE_MAX_AH_INDEX, 43 - .max_elem = RXE_MAX_AH_INDEX - RXE_MIN_AH_INDEX + 1, 43 + .max_elem = RXE_MAX_AH, 44 44 }, 45 45 [RXE_TYPE_SRQ] = { 46 46 .name = "srq", ··· 49 49 .cleanup = rxe_srq_cleanup, 50 50 .min_index = RXE_MIN_SRQ_INDEX, 51 51 .max_index = RXE_MAX_SRQ_INDEX, 52 - .max_elem = RXE_MAX_SRQ_INDEX - RXE_MIN_SRQ_INDEX + 1, 52 + .max_elem = RXE_MAX_SRQ, 53 53 }, 54 54 [RXE_TYPE_QP] = { 55 55 .name = "qp", ··· 58 58 .cleanup = rxe_qp_cleanup, 59 59 .min_index = RXE_MIN_QP_INDEX, 60 60 .max_index = RXE_MAX_QP_INDEX, 61 - .max_elem = RXE_MAX_QP_INDEX - RXE_MIN_QP_INDEX + 1, 61 + .max_elem = RXE_MAX_QP, 62 62 }, 63 63 [RXE_TYPE_CQ] = { 64 64 .name = "cq", ··· 66 66 .elem_offset = offsetof(struct rxe_cq, elem), 67 67 .cleanup = rxe_cq_cleanup, 68 68 .min_index = 1, 69 - .max_index = UINT_MAX, 70 - .max_elem = UINT_MAX, 69 + .max_index = RXE_MAX_CQ, 70 + .max_elem = RXE_MAX_CQ, 71 71 }, 72 72 [RXE_TYPE_MR] = { 73 73 .name = "mr", ··· 76 76 .cleanup = rxe_mr_cleanup, 77 77 .min_index = RXE_MIN_MR_INDEX, 78 78 .max_index = RXE_MAX_MR_INDEX, 79 - .max_elem = RXE_MAX_MR_INDEX - RXE_MIN_MR_INDEX + 1, 79 + .max_elem = RXE_MAX_MR, 80 80 }, 81 81 [RXE_TYPE_MW] = { 82 82 .name = "mw", ··· 85 85 .cleanup = rxe_mw_cleanup, 86 86 .min_index = RXE_MIN_MW_INDEX, 87 87 .max_index = 
RXE_MAX_MW_INDEX, 88 - .max_elem = RXE_MAX_MW_INDEX - RXE_MIN_MW_INDEX + 1, 88 + .max_elem = RXE_MAX_MW, 89 89 }, 90 90 }; 91 91
+15 -10
lib/scatterlist.c
··· 470 470 return -EOPNOTSUPP; 471 471 472 472 if (sgt_append->prv) { 473 + unsigned long next_pfn = (page_to_phys(sg_page(sgt_append->prv)) + 474 + sgt_append->prv->offset + sgt_append->prv->length) / PAGE_SIZE; 475 + 473 476 if (WARN_ON(offset)) 474 477 return -EINVAL; 475 478 476 479 /* Merge contiguous pages into the last SG */ 477 480 prv_len = sgt_append->prv->length; 478 - last_pg = sg_page(sgt_append->prv); 479 - while (n_pages && pages_are_mergeable(pages[0], last_pg)) { 480 - if (sgt_append->prv->length + PAGE_SIZE > max_segment) 481 - break; 482 - sgt_append->prv->length += PAGE_SIZE; 483 - last_pg = pages[0]; 484 - pages++; 485 - n_pages--; 481 + if (page_to_pfn(pages[0]) == next_pfn) { 482 + last_pg = pfn_to_page(next_pfn - 1); 483 + while (n_pages && pages_are_mergeable(pages[0], last_pg)) { 484 + if (sgt_append->prv->length + PAGE_SIZE > max_segment) 485 + break; 486 + sgt_append->prv->length += PAGE_SIZE; 487 + last_pg = pages[0]; 488 + pages++; 489 + n_pages--; 490 + } 491 + if (!n_pages) 492 + goto out; 486 493 } 487 - if (!n_pages) 488 - goto out; 489 494 } 490 495 491 496 /* compute number of contiguous chunks */