Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

Pull vhost fixes from Michael Tsirkin:
"Some last-minute vhost and vdpa fixes.

The last two of them haven't been in linux-next, but they do seem
obvious, very small, and safe; they fix bugs reported in the field, and
they are both in the new mlx5 vdpa driver, so it's not like we can
introduce regressions"

* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost:
vdpa/mlx5: Fix dependency on MLX5_CORE
vdpa/mlx5: should keep avail_index despite device status
vhost-vdpa: fix page pinning leakage in error path
vhost-vdpa: fix vhost_vdpa_map() on error condition
vhost: Don't call log_access_ok() when using IOTLB
vhost: Use vhost_get_used_size() in vhost_vring_set_addr()
vhost: Don't call access_ok() when using IOTLB
vhost vdpa: fix vhost_vdpa_open error handling

+117 -70
+3 -4
drivers/vdpa/Kconfig
··· 30 30 be called ifcvf. 31 31 32 32 config MLX5_VDPA 33 - bool "MLX5 VDPA support library for ConnectX devices" 34 - depends on MLX5_CORE 35 - default n 33 + bool 36 34 help 37 35 Support library for Mellanox VDPA drivers. Provides code that is 38 36 common for all types of VDPA drivers. The following drivers are planned: ··· 38 40 39 41 config MLX5_VDPA_NET 40 42 tristate "vDPA driver for ConnectX devices" 41 - depends on MLX5_VDPA 43 + select MLX5_VDPA 44 + depends on MLX5_CORE 42 45 default n 43 46 help 44 47 VDPA network driver for ConnectX6 and newer. Provides offloading
+14 -6
drivers/vdpa/mlx5/net/mlx5_vnet.c
··· 1133 1133 if (!mvq->initialized) 1134 1134 return; 1135 1135 1136 - if (query_virtqueue(ndev, mvq, &attr)) { 1137 - mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n"); 1138 - return; 1139 - } 1140 1136 if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) 1141 1137 return; 1142 1138 1143 1139 if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) 1144 1140 mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n"); 1141 + 1142 + if (query_virtqueue(ndev, mvq, &attr)) { 1143 + mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n"); 1144 + return; 1145 + } 1146 + mvq->avail_idx = attr.available_index; 1145 1147 } 1146 1148 1147 1149 static void suspend_vqs(struct mlx5_vdpa_net *ndev) ··· 1413 1411 struct mlx5_virtq_attr attr; 1414 1412 int err; 1415 1413 1416 - if (!mvq->initialized) 1417 - return -EAGAIN; 1414 + /* If the virtq object was destroyed, use the value saved at 1415 + * the last minute of suspend_vq. This caters for userspace 1416 + * that cares about emulating the index after vq is stopped. 1417 + */ 1418 + if (!mvq->initialized) { 1419 + state->avail_index = mvq->avail_idx; 1420 + return 0; 1421 + } 1418 1422 1419 1423 err = query_virtqueue(ndev, mvq, &attr); 1420 1424 if (err) {
+77 -50
drivers/vhost/vdpa.c
··· 565 565 perm_to_iommu_flags(perm)); 566 566 } 567 567 568 + if (r) 569 + vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); 570 + 568 571 return r; 569 572 } 570 573 ··· 595 592 struct vhost_dev *dev = &v->vdev; 596 593 struct vhost_iotlb *iotlb = dev->iotlb; 597 594 struct page **page_list; 598 - unsigned long list_size = PAGE_SIZE / sizeof(struct page *); 595 + struct vm_area_struct **vmas; 599 596 unsigned int gup_flags = FOLL_LONGTERM; 600 - unsigned long npages, cur_base, map_pfn, last_pfn = 0; 601 - unsigned long locked, lock_limit, pinned, i; 597 + unsigned long map_pfn, last_pfn = 0; 598 + unsigned long npages, lock_limit; 599 + unsigned long i, nmap = 0; 602 600 u64 iova = msg->iova; 601 + long pinned; 603 602 int ret = 0; 604 603 605 604 if (vhost_iotlb_itree_first(iotlb, msg->iova, 606 605 msg->iova + msg->size - 1)) 607 606 return -EEXIST; 608 - 609 - page_list = (struct page **) __get_free_page(GFP_KERNEL); 610 - if (!page_list) 611 - return -ENOMEM; 612 607 613 608 if (msg->perm & VHOST_ACCESS_WO) 614 609 gup_flags |= FOLL_WRITE; ··· 615 614 if (!npages) 616 615 return -EINVAL; 617 616 617 + page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); 618 + vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *), 619 + GFP_KERNEL); 620 + if (!page_list || !vmas) { 621 + ret = -ENOMEM; 622 + goto free; 623 + } 624 + 618 625 mmap_read_lock(dev->mm); 619 626 620 - locked = atomic64_add_return(npages, &dev->mm->pinned_vm); 621 627 lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; 622 - 623 - if (locked > lock_limit) { 628 + if (npages + atomic64_read(&dev->mm->pinned_vm) > lock_limit) { 624 629 ret = -ENOMEM; 625 - goto out; 630 + goto unlock; 626 631 } 627 632 628 - cur_base = msg->uaddr & PAGE_MASK; 629 - iova &= PAGE_MASK; 630 - 631 - while (npages) { 632 - pinned = min_t(unsigned long, npages, list_size); 633 - ret = pin_user_pages(cur_base, pinned, 634 - gup_flags, page_list, NULL); 635 - if (ret != pinned) 636 - goto out; 
637 - 638 - if (!last_pfn) 639 - map_pfn = page_to_pfn(page_list[0]); 640 - 641 - for (i = 0; i < ret; i++) { 642 - unsigned long this_pfn = page_to_pfn(page_list[i]); 643 - u64 csize; 644 - 645 - if (last_pfn && (this_pfn != last_pfn + 1)) { 646 - /* Pin a contiguous chunk of memory */ 647 - csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; 648 - if (vhost_vdpa_map(v, iova, csize, 649 - map_pfn << PAGE_SHIFT, 650 - msg->perm)) 651 - goto out; 652 - map_pfn = this_pfn; 653 - iova += csize; 654 - } 655 - 656 - last_pfn = this_pfn; 633 + pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags, 634 + page_list, vmas); 635 + if (npages != pinned) { 636 + if (pinned < 0) { 637 + ret = pinned; 638 + } else { 639 + unpin_user_pages(page_list, pinned); 640 + ret = -ENOMEM; 657 641 } 658 - 659 - cur_base += ret << PAGE_SHIFT; 660 - npages -= ret; 642 + goto unlock; 661 643 } 662 644 663 - /* Pin the rest chunk */ 664 - ret = vhost_vdpa_map(v, iova, (last_pfn - map_pfn + 1) << PAGE_SHIFT, 665 - map_pfn << PAGE_SHIFT, msg->perm); 645 + iova &= PAGE_MASK; 646 + map_pfn = page_to_pfn(page_list[0]); 647 + 648 + /* One more iteration to avoid extra vdpa_map() call out of loop. */ 649 + for (i = 0; i <= npages; i++) { 650 + unsigned long this_pfn; 651 + u64 csize; 652 + 653 + /* The last chunk may have no valid PFN next to it */ 654 + this_pfn = i < npages ? page_to_pfn(page_list[i]) : -1UL; 655 + 656 + if (last_pfn && (this_pfn == -1UL || 657 + this_pfn != last_pfn + 1)) { 658 + /* Pin a contiguous chunk of memory */ 659 + csize = last_pfn - map_pfn + 1; 660 + ret = vhost_vdpa_map(v, iova, csize << PAGE_SHIFT, 661 + map_pfn << PAGE_SHIFT, 662 + msg->perm); 663 + if (ret) { 664 + /* 665 + * Unpin the rest chunks of memory on the 666 + * flight with no corresponding vdpa_map() 667 + * calls having been made yet. On the other 668 + * hand, vdpa_unmap() in the failure path 669 + * is in charge of accounting the number of 670 + * pinned pages for its own. 
671 + * This asymmetrical pattern of accounting 672 + * is for efficiency to pin all pages at 673 + * once, while there is no other callsite 674 + * of vdpa_map() than here above. 675 + */ 676 + unpin_user_pages(&page_list[nmap], 677 + npages - nmap); 678 + goto out; 679 + } 680 + atomic64_add(csize, &dev->mm->pinned_vm); 681 + nmap += csize; 682 + iova += csize << PAGE_SHIFT; 683 + map_pfn = this_pfn; 684 + } 685 + last_pfn = this_pfn; 686 + } 687 + 688 + WARN_ON(nmap != npages); 666 689 out: 667 - if (ret) { 690 + if (ret) 668 691 vhost_vdpa_unmap(v, msg->iova, msg->size); 669 - atomic64_sub(npages, &dev->mm->pinned_vm); 670 - } 692 + unlock: 671 693 mmap_read_unlock(dev->mm); 672 - free_page((unsigned long)page_list); 694 + free: 695 + kvfree(vmas); 696 + kvfree(page_list); 673 697 return ret; 674 698 } 675 699 ··· 836 810 837 811 err_init_iotlb: 838 812 vhost_dev_cleanup(&v->vdev); 813 + kfree(vqs); 839 814 err: 840 815 atomic_dec(&v->opened); 841 816 return r;
+23 -10
drivers/vhost/vhost.c
··· 1290 1290 vring_used_t __user *used) 1291 1291 1292 1292 { 1293 + /* If an IOTLB device is present, the vring addresses are 1294 + * GIOVAs. Access validation occurs at prefetch time. */ 1295 + if (vq->iotlb) 1296 + return true; 1297 + 1293 1298 return access_ok(desc, vhost_get_desc_size(vq, num)) && 1294 1299 access_ok(avail, vhost_get_avail_size(vq, num)) && 1295 1300 access_ok(used, vhost_get_used_size(vq, num)); ··· 1370 1365 } 1371 1366 EXPORT_SYMBOL_GPL(vhost_log_access_ok); 1372 1367 1368 + static bool vq_log_used_access_ok(struct vhost_virtqueue *vq, 1369 + void __user *log_base, 1370 + bool log_used, 1371 + u64 log_addr) 1372 + { 1373 + /* If an IOTLB device is present, log_addr is a GIOVA that 1374 + * will never be logged by log_used(). */ 1375 + if (vq->iotlb) 1376 + return true; 1377 + 1378 + return !log_used || log_access_ok(log_base, log_addr, 1379 + vhost_get_used_size(vq, vq->num)); 1380 + } 1381 + 1373 1382 /* Verify access for write logging. */ 1374 1383 /* Caller should have vq mutex and device mutex */ 1375 1384 static bool vq_log_access_ok(struct vhost_virtqueue *vq, ··· 1391 1372 { 1392 1373 return vq_memory_access_ok(log_base, vq->umem, 1393 1374 vhost_has_feature(vq, VHOST_F_LOG_ALL)) && 1394 - (!vq->log_used || log_access_ok(log_base, vq->log_addr, 1395 - vhost_get_used_size(vq, vq->num))); 1375 + vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr); 1396 1376 } 1397 1377 1398 1378 /* Can we start vq? */ ··· 1400 1382 { 1401 1383 if (!vq_log_access_ok(vq, vq->log_base)) 1402 1384 return false; 1403 - 1404 - /* Access validation occurs at prefetch time with IOTLB */ 1405 - if (vq->iotlb) 1406 - return true; 1407 1385 1408 1386 return vq_access_ok(vq, vq->num, vq->desc, vq->avail, vq->used); 1409 1387 } ··· 1530 1516 return -EINVAL; 1531 1517 1532 1518 /* Also validate log access for used ring if enabled. 
*/ 1533 - if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && 1534 - !log_access_ok(vq->log_base, a.log_guest_addr, 1535 - sizeof *vq->used + 1536 - vq->num * sizeof *vq->used->ring)) 1519 + if (!vq_log_used_access_ok(vq, vq->log_base, 1520 + a.flags & (0x1 << VHOST_VRING_F_LOG), 1521 + a.log_guest_addr)) 1537 1522 return -EINVAL; 1538 1523 } 1539 1524