Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost

+4 -4

drivers/gpu/drm/virtio/virtgpu_drv.c

··· 130 130 131 131 static void virtio_gpu_shutdown(struct virtio_device *vdev) 132 132 { 133 - /* 134 - * drm does its own synchronization on shutdown. 135 - * Do nothing here, opt out of device reset. 136 - */ 133 + struct drm_device *dev = vdev->priv; 134 + 135 + /* stop talking to the device */ 136 + drm_dev_unplug(dev); 137 137 } 138 138 139 139 static void virtio_gpu_config_changed(struct virtio_device *vdev)

+3

drivers/vdpa/mlx5/core/mr.c

··· 908 908 { 909 909 struct mlx5_vdpa_mr_resources *mres = &mvdev->mres; 910 910 911 + if (!mres->wq_gc) 912 + return; 913 + 911 914 atomic_set(&mres->shutdown, 1); 912 915 913 916 flush_delayed_work(&mres->gc_dwork_ent);

+7 -5

drivers/vdpa/mlx5/net/mlx5_vnet.c

··· 2491 2491 } 2492 2492 2493 2493 mvq = &ndev->vqs[idx]; 2494 - ndev->needs_teardown = num != mvq->num_ent; 2494 + ndev->needs_teardown |= num != mvq->num_ent; 2495 2495 mvq->num_ent = num; 2496 2496 } 2497 2497 ··· 3432 3432 3433 3433 ndev = to_mlx5_vdpa_ndev(mvdev); 3434 3434 3435 + /* Functions called here should be able to work with 3436 + * uninitialized resources. 3437 + */ 3435 3438 free_fixed_resources(ndev); 3436 3439 mlx5_vdpa_clean_mrs(mvdev); 3437 3440 mlx5_vdpa_destroy_mr_resources(&ndev->mvdev); 3438 - mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); 3439 - 3440 3441 if (!is_zero_ether_addr(ndev->config.mac)) { 3441 3442 pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev)); 3442 3443 mlx5_mpfs_del_mac(pfmdev, ndev->config.mac); 3443 3444 } 3445 + mlx5_cmd_cleanup_async_ctx(&mvdev->async_ctx); 3444 3446 mlx5_vdpa_free_resources(&ndev->mvdev); 3445 3447 free_irqs(ndev); 3446 3448 kfree(ndev->event_cbs); ··· 3890 3888 mvdev->actual_features = 3891 3889 (device_features & BIT_ULL(VIRTIO_F_VERSION_1)); 3892 3890 3891 + mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); 3892 + 3893 3893 ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL); 3894 3894 ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL); 3895 3895 if (!ndev->vqs || !ndev->event_cbs) { ··· 3963 3959 } else { 3964 3960 ndev->rqt_size = 1; 3965 3961 } 3966 - 3967 - mlx5_cmd_init_async_ctx(mdev, &mvdev->async_ctx); 3968 3962 3969 3963 ndev->mvdev.mlx_features = device_features; 3970 3964 mvdev->vdev.dma_dev = &mdev->pdev->dev;

+1

drivers/vdpa/vdpa_user/vduse_dev.c

··· 2216 2216 cdev_del(&vduse_ctrl_cdev); 2217 2217 unregister_chrdev_region(vduse_major, VDUSE_DEV_MAX); 2218 2218 class_unregister(&vduse_class); 2219 + idr_destroy(&vduse_idr); 2219 2220 } 2220 2221 module_exit(vduse_exit); 2221 2222

+18

drivers/vhost/Kconfig

··· 95 95 96 96 If unsure, say "N". 97 97 98 + config VHOST_ENABLE_FORK_OWNER_CONTROL 99 + bool "Enable VHOST_ENABLE_FORK_OWNER_CONTROL" 100 + default y 101 + help 102 + This option enables two IOCTLs: VHOST_SET_FORK_FROM_OWNER and 103 + VHOST_GET_FORK_FROM_OWNER. These allow userspace applications 104 + to modify the vhost worker mode for vhost devices. 105 + 106 + Also exposes module parameter 'fork_from_owner_default' to allow users 107 + to configure the default mode for vhost workers. 108 + 109 + By default, `VHOST_ENABLE_FORK_OWNER_CONTROL` is set to `y`, 110 + users can change the worker thread mode as needed. 111 + If this config is disabled (n), the related IOCTLs and parameters will 112 + be unavailable. 113 + 114 + If unsure, say "Y". 115 + 98 116 endif

+63 -25

drivers/vhost/net.c

··· 74 74 (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | 75 75 (1ULL << VIRTIO_NET_F_MRG_RXBUF) | 76 76 (1ULL << VIRTIO_F_ACCESS_PLATFORM) | 77 - (1ULL << VIRTIO_F_RING_RESET), 77 + (1ULL << VIRTIO_F_RING_RESET) | 78 + (1ULL << VIRTIO_F_IN_ORDER), 78 79 VIRTIO_BIT(VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO) | 79 80 VIRTIO_BIT(VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO), 80 81 }; ··· 377 376 while (j) { 378 377 add = min(UIO_MAXIOV - nvq->done_idx, j); 379 378 vhost_add_used_and_signal_n(vq->dev, vq, 380 - &vq->heads[nvq->done_idx], add); 379 + &vq->heads[nvq->done_idx], 380 + NULL, add); 381 381 nvq->done_idx = (nvq->done_idx + add) % UIO_MAXIOV; 382 382 j -= add; 383 383 } ··· 453 451 return vhost_poll_start(poll, sock->file); 454 452 } 455 453 456 - static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq) 454 + static void vhost_net_signal_used(struct vhost_net_virtqueue *nvq, 455 + unsigned int count) 457 456 { 458 457 struct vhost_virtqueue *vq = &nvq->vq; 459 458 struct vhost_dev *dev = vq->dev; ··· 462 459 if (!nvq->done_idx) 463 460 return; 464 461 465 - vhost_add_used_and_signal_n(dev, vq, vq->heads, nvq->done_idx); 462 + vhost_add_used_and_signal_n(dev, vq, vq->heads, 463 + vq->nheads, count); 466 464 nvq->done_idx = 0; 467 465 } 468 466 ··· 472 468 struct socket *sock, 473 469 struct msghdr *msghdr) 474 470 { 471 + struct vhost_virtqueue *vq = &nvq->vq; 472 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 475 473 struct tun_msg_ctl ctl = { 476 474 .type = TUN_MSG_PTR, 477 475 .num = nvq->batched_xdp, 478 476 .ptr = nvq->xdp, 479 477 }; 480 478 int i, err; 479 + 480 + if (in_order) { 481 + vq->heads[0].len = 0; 482 + vq->nheads[0] = nvq->done_idx; 483 + } 481 484 482 485 if (nvq->batched_xdp == 0) 483 486 goto signal_used; ··· 507 496 } 508 497 509 498 signal_used: 510 - vhost_net_signal_used(nvq); 499 + vhost_net_signal_used(nvq, in_order ? 
1 : nvq->done_idx); 511 500 nvq->batched_xdp = 0; 512 501 } 513 502 ··· 761 750 int sent_pkts = 0; 762 751 bool sock_can_batch = (sock->sk->sk_sndbuf == INT_MAX); 763 752 bool busyloop_intr; 753 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 764 754 765 755 do { 766 756 busyloop_intr = false; ··· 798 786 break; 799 787 } 800 788 801 - /* We can't build XDP buff, go for single 802 - * packet path but let's flush batched 803 - * packets. 804 - */ 805 - vhost_tx_batch(net, nvq, sock, &msg); 789 + if (nvq->batched_xdp) { 790 + /* We can't build XDP buff, go for single 791 + * packet path but let's flush batched 792 + * packets. 793 + */ 794 + vhost_tx_batch(net, nvq, sock, &msg); 795 + } 806 796 msg.msg_control = NULL; 807 797 } else { 808 798 if (tx_can_batch(vq, total_len)) ··· 825 811 pr_debug("Truncated TX packet: len %d != %zd\n", 826 812 err, len); 827 813 done: 828 - vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); 829 - vq->heads[nvq->done_idx].len = 0; 814 + if (in_order) { 815 + vq->heads[0].id = cpu_to_vhost32(vq, head); 816 + } else { 817 + vq->heads[nvq->done_idx].id = cpu_to_vhost32(vq, head); 818 + vq->heads[nvq->done_idx].len = 0; 819 + } 830 820 ++nvq->done_idx; 831 821 } while (likely(!vhost_exceeds_weight(vq, ++sent_pkts, total_len))); 832 822 ··· 1009 991 } 1010 992 1011 993 static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk, 1012 - bool *busyloop_intr) 994 + bool *busyloop_intr, unsigned int count) 1013 995 { 1014 996 struct vhost_net_virtqueue *rnvq = &net->vqs[VHOST_NET_VQ_RX]; 1015 997 struct vhost_net_virtqueue *tnvq = &net->vqs[VHOST_NET_VQ_TX]; ··· 1019 1001 1020 1002 if (!len && rvq->busyloop_timeout) { 1021 1003 /* Flush batched heads first */ 1022 - vhost_net_signal_used(rnvq); 1004 + vhost_net_signal_used(rnvq, count); 1023 1005 /* Both tx vq and rx socket were polled here */ 1024 1006 vhost_net_busy_poll(net, rvq, tvq, busyloop_intr, true); 1025 1007 ··· 1031 1013 1032 1014 /* This is a 
multi-buffer version of vhost_get_desc, that works if 1033 1015 * vq has read descriptors only. 1034 - * @vq - the relevant virtqueue 1016 + * @nvq - the relevant vhost_net virtqueue 1035 1017 * @datalen - data length we'll be reading 1036 1018 * @iovcount - returned count of io vectors we fill 1037 1019 * @log - vhost log ··· 1039 1021 * @quota - headcount quota, 1 for big buffer 1040 1022 * returns number of buffer heads allocated, negative on error 1041 1023 */ 1042 - static int get_rx_bufs(struct vhost_virtqueue *vq, 1024 + static int get_rx_bufs(struct vhost_net_virtqueue *nvq, 1043 1025 struct vring_used_elem *heads, 1026 + u16 *nheads, 1044 1027 int datalen, 1045 1028 unsigned *iovcount, 1046 1029 struct vhost_log *log, 1047 1030 unsigned *log_num, 1048 1031 unsigned int quota) 1049 1032 { 1033 + struct vhost_virtqueue *vq = &nvq->vq; 1034 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 1050 1035 unsigned int out, in; 1051 1036 int seg = 0; 1052 1037 int headcount = 0; ··· 1086 1065 nlogs += *log_num; 1087 1066 log += *log_num; 1088 1067 } 1089 - heads[headcount].id = cpu_to_vhost32(vq, d); 1090 1068 len = iov_length(vq->iov + seg, in); 1091 - heads[headcount].len = cpu_to_vhost32(vq, len); 1092 - datalen -= len; 1069 + if (!in_order) { 1070 + heads[headcount].id = cpu_to_vhost32(vq, d); 1071 + heads[headcount].len = cpu_to_vhost32(vq, len); 1072 + } 1093 1073 ++headcount; 1074 + datalen -= len; 1094 1075 seg += in; 1095 1076 } 1096 - heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); 1077 + 1097 1078 *iovcount = seg; 1098 1079 if (unlikely(log)) 1099 1080 *log_num = nlogs; ··· 1105 1082 r = UIO_MAXIOV + 1; 1106 1083 goto err; 1107 1084 } 1085 + 1086 + if (!in_order) 1087 + heads[headcount - 1].len = cpu_to_vhost32(vq, len + datalen); 1088 + else { 1089 + heads[0].len = cpu_to_vhost32(vq, len + datalen); 1090 + heads[0].id = cpu_to_vhost32(vq, d); 1091 + nheads[0] = headcount; 1092 + } 1093 + 1108 1094 return headcount; 1109 1095 err: 
1110 1096 vhost_discard_vq_desc(vq, headcount); ··· 1126 1094 { 1127 1095 struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX]; 1128 1096 struct vhost_virtqueue *vq = &nvq->vq; 1097 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 1098 + unsigned int count = 0; 1129 1099 unsigned in, log; 1130 1100 struct vhost_log *vq_log; 1131 1101 struct msghdr msg = { ··· 1175 1141 1176 1142 do { 1177 1143 sock_len = vhost_net_rx_peek_head_len(net, sock->sk, 1178 - &busyloop_intr); 1144 + &busyloop_intr, count); 1179 1145 if (!sock_len) 1180 1146 break; 1181 1147 sock_len += sock_hlen; 1182 1148 vhost_len = sock_len + vhost_hlen; 1183 - headcount = get_rx_bufs(vq, vq->heads + nvq->done_idx, 1149 + headcount = get_rx_bufs(nvq, vq->heads + count, 1150 + vq->nheads + count, 1184 1151 vhost_len, &in, vq_log, &log, 1185 1152 likely(mergeable) ? UIO_MAXIOV : 1); 1186 1153 /* On error, stop handling until the next kick. */ ··· 1257 1222 goto out; 1258 1223 } 1259 1224 nvq->done_idx += headcount; 1260 - if (nvq->done_idx > VHOST_NET_BATCH) 1261 - vhost_net_signal_used(nvq); 1225 + count += in_order ? 1 : headcount; 1226 + if (nvq->done_idx > VHOST_NET_BATCH) { 1227 + vhost_net_signal_used(nvq, count); 1228 + count = 0; 1229 + } 1262 1230 if (unlikely(vq_log)) 1263 1231 vhost_log_write(vq, vq_log, log, vhost_len, 1264 1232 vq->iov, in); ··· 1273 1235 else if (!sock_len) 1274 1236 vhost_net_enable_vq(net, vq); 1275 1237 out: 1276 - vhost_net_signal_used(nvq); 1238 + vhost_net_signal_used(nvq, count); 1277 1239 mutex_unlock(&vq->mutex); 1278 1240 } 1279 1241

+11 -13

drivers/vhost/scsi.c

··· 71 71 if (ret) 72 72 return ret; 73 73 74 - if (ret > VHOST_SCSI_PREALLOC_SGLS) { 74 + if (cnt > VHOST_SCSI_PREALLOC_SGLS) { 75 75 pr_err("Max inline_sg_cnt is %u\n", VHOST_SCSI_PREALLOC_SGLS); 76 76 return -EINVAL; 77 77 } ··· 152 152 struct vhost_scsi_tpg { 153 153 /* Vhost port target portal group tag for TCM */ 154 154 u16 tport_tpgt; 155 - /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ 155 + /* Used to track number of TPG Port/Lun Links wrt to explicit I_T Nexus shutdown */ 156 156 int tv_tpg_port_count; 157 157 /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ 158 158 int tv_tpg_vhost_count; ··· 311 311 312 312 mutex_lock(&vq->mutex); 313 313 314 - /* store old infight */ 314 + /* store old inflight */ 315 315 idx = vs->vqs[i].inflight_idx; 316 316 if (old_inflight) 317 317 old_inflight[i] = &vs->vqs[i].inflights[idx]; 318 318 319 - /* setup new infight */ 319 + /* setup new inflight */ 320 320 vs->vqs[i].inflight_idx = idx ^ 1; 321 321 new_inflight = &vs->vqs[i].inflights[idx ^ 1]; 322 322 kref_init(&new_inflight->kref); ··· 1226 1226 /* validated at handler entry */ 1227 1227 vs_tpg = vhost_vq_get_backend(vq); 1228 1228 tpg = READ_ONCE(vs_tpg[*vc->target]); 1229 - if (unlikely(!tpg)) { 1230 - vq_err(vq, "Target 0x%x does not exist\n", *vc->target); 1229 + if (unlikely(!tpg)) 1231 1230 goto out; 1232 - } 1233 1231 } 1234 1232 1235 1233 if (tpgp) ··· 1247 1249 if (!in_iovs_cnt) 1248 1250 return 0; 1249 1251 /* 1250 - * Initiator's normally just put the virtio_scsi_cmd_resp in the first 1252 + * Initiators normally just put the virtio_scsi_cmd_resp in the first 1251 1253 * iov, but just in case they wedged in some data with it we check for 1252 1254 * greater than or equal to the response struct. 
1253 1255 */ ··· 1455 1457 cmd = vhost_scsi_get_cmd(vq, tag); 1456 1458 if (IS_ERR(cmd)) { 1457 1459 ret = PTR_ERR(cmd); 1458 - vq_err(vq, "vhost_scsi_get_tag failed %dd\n", ret); 1460 + vq_err(vq, "vhost_scsi_get_tag failed %d\n", ret); 1459 1461 goto err; 1460 1462 } 1461 1463 cmd->tvc_vq = vq; ··· 2607 2609 return -ENOMEM; 2608 2610 } 2609 2611 /* 2610 - * Since we are running in 'demo mode' this call with generate a 2612 + * Since we are running in 'demo mode' this call will generate a 2611 2613 * struct se_node_acl for the vhost_scsi struct se_portal_group with 2612 2614 * the SCSI Initiator port name of the passed configfs group 'name'. 2613 2615 */ ··· 2913 2915 vhost_scsi_wwn_version_show(struct config_item *item, char *page) 2914 2916 { 2915 2917 return sysfs_emit(page, "TCM_VHOST fabric module %s on %s/%s" 2916 - "on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, 2918 + " on "UTS_RELEASE"\n", VHOST_SCSI_VERSION, utsname()->sysname, 2917 2919 utsname()->machine); 2918 2920 } 2919 2921 ··· 2981 2983 vhost_scsi_deregister(); 2982 2984 out: 2983 2985 return ret; 2984 - }; 2986 + } 2985 2987 2986 2988 static void vhost_scsi_exit(void) 2987 2989 { 2988 2990 target_unregister_template(&vhost_scsi_ops); 2989 2991 vhost_scsi_deregister(); 2990 - }; 2992 + } 2991 2993 2992 2994 MODULE_DESCRIPTION("VHOST_SCSI series fabric driver"); 2993 2995 MODULE_ALIAS("tcm_vhost");

+333 -44

drivers/vhost/vhost.c

··· 22 22 #include <linux/slab.h> 23 23 #include <linux/vmalloc.h> 24 24 #include <linux/kthread.h> 25 + #include <linux/cgroup.h> 25 26 #include <linux/module.h> 26 27 #include <linux/sort.h> 27 28 #include <linux/sched/mm.h> ··· 42 41 module_param(max_iotlb_entries, int, 0444); 43 42 MODULE_PARM_DESC(max_iotlb_entries, 44 43 "Maximum number of iotlb entries. (default: 2048)"); 44 + static bool fork_from_owner_default = VHOST_FORK_OWNER_TASK; 45 + 46 + #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL 47 + module_param(fork_from_owner_default, bool, 0444); 48 + MODULE_PARM_DESC(fork_from_owner_default, 49 + "Set task mode as the default(default: Y)"); 50 + #endif 45 51 46 52 enum { 47 53 VHOST_MEMORY_F_LOG = 0x1, ··· 250 242 * test_and_set_bit() implies a memory barrier. 251 243 */ 252 244 llist_add(&work->node, &worker->work_list); 253 - vhost_task_wake(worker->vtsk); 245 + worker->ops->wakeup(worker); 254 246 } 255 247 } 256 248 ··· 372 364 vq->avail = NULL; 373 365 vq->used = NULL; 374 366 vq->last_avail_idx = 0; 367 + vq->next_avail_head = 0; 375 368 vq->avail_idx = 0; 376 369 vq->last_used_idx = 0; 377 370 vq->signalled_used = 0; ··· 395 386 rcu_assign_pointer(vq->worker, NULL); 396 387 vhost_vring_call_reset(&vq->call_ctx); 397 388 __vhost_vq_meta_reset(vq); 389 + } 390 + 391 + static int vhost_run_work_kthread_list(void *data) 392 + { 393 + struct vhost_worker *worker = data; 394 + struct vhost_work *work, *work_next; 395 + struct vhost_dev *dev = worker->dev; 396 + struct llist_node *node; 397 + 398 + kthread_use_mm(dev->mm); 399 + 400 + for (;;) { 401 + /* mb paired w/ kthread_stop */ 402 + set_current_state(TASK_INTERRUPTIBLE); 403 + 404 + if (kthread_should_stop()) { 405 + __set_current_state(TASK_RUNNING); 406 + break; 407 + } 408 + node = llist_del_all(&worker->work_list); 409 + if (!node) 410 + schedule(); 411 + 412 + node = llist_reverse_order(node); 413 + /* make sure flag is seen after deletion */ 414 + smp_wmb(); 415 + llist_for_each_entry_safe(work, 
work_next, node, node) { 416 + clear_bit(VHOST_WORK_QUEUED, &work->flags); 417 + __set_current_state(TASK_RUNNING); 418 + kcov_remote_start_common(worker->kcov_handle); 419 + work->fn(work); 420 + kcov_remote_stop(); 421 + cond_resched(); 422 + } 423 + } 424 + kthread_unuse_mm(dev->mm); 425 + 426 + return 0; 398 427 } 399 428 400 429 static bool vhost_run_work_list(void *data) ··· 502 455 vq->log = NULL; 503 456 kfree(vq->heads); 504 457 vq->heads = NULL; 458 + kfree(vq->nheads); 459 + vq->nheads = NULL; 505 460 } 506 461 507 462 /* Helper to allocate iovec buffers for all vqs. */ ··· 521 472 GFP_KERNEL); 522 473 vq->heads = kmalloc_array(dev->iov_limit, sizeof(*vq->heads), 523 474 GFP_KERNEL); 524 - if (!vq->indirect || !vq->log || !vq->heads) 475 + vq->nheads = kmalloc_array(dev->iov_limit, sizeof(*vq->nheads), 476 + GFP_KERNEL); 477 + if (!vq->indirect || !vq->log || !vq->heads || !vq->nheads) 525 478 goto err_nomem; 526 479 } 527 480 return 0; ··· 603 552 dev->byte_weight = byte_weight; 604 553 dev->use_worker = use_worker; 605 554 dev->msg_handler = msg_handler; 555 + dev->fork_owner = fork_from_owner_default; 606 556 init_waitqueue_head(&dev->wait); 607 557 INIT_LIST_HEAD(&dev->read_list); 608 558 INIT_LIST_HEAD(&dev->pending_list); ··· 633 581 } 634 582 EXPORT_SYMBOL_GPL(vhost_dev_check_owner); 635 583 584 + struct vhost_attach_cgroups_struct { 585 + struct vhost_work work; 586 + struct task_struct *owner; 587 + int ret; 588 + }; 589 + 590 + static void vhost_attach_cgroups_work(struct vhost_work *work) 591 + { 592 + struct vhost_attach_cgroups_struct *s; 593 + 594 + s = container_of(work, struct vhost_attach_cgroups_struct, work); 595 + s->ret = cgroup_attach_task_all(s->owner, current); 596 + } 597 + 598 + static int vhost_attach_task_to_cgroups(struct vhost_worker *worker) 599 + { 600 + struct vhost_attach_cgroups_struct attach; 601 + int saved_cnt; 602 + 603 + attach.owner = current; 604 + 605 + vhost_work_init(&attach.work, vhost_attach_cgroups_work); 
606 + vhost_worker_queue(worker, &attach.work); 607 + 608 + mutex_lock(&worker->mutex); 609 + 610 + /* 611 + * Bypass attachment_cnt check in __vhost_worker_flush: 612 + * Temporarily change it to INT_MAX to bypass the check 613 + */ 614 + saved_cnt = worker->attachment_cnt; 615 + worker->attachment_cnt = INT_MAX; 616 + __vhost_worker_flush(worker); 617 + worker->attachment_cnt = saved_cnt; 618 + 619 + mutex_unlock(&worker->mutex); 620 + 621 + return attach.ret; 622 + } 623 + 636 624 /* Caller should have device mutex */ 637 625 bool vhost_dev_has_owner(struct vhost_dev *dev) 638 626 { ··· 686 594 if (dev->use_worker) { 687 595 dev->mm = get_task_mm(current); 688 596 } else { 689 - /* vDPA device does not use worker thead, so there's 690 - * no need to hold the address space for mm. This help 597 + /* vDPA device does not use worker thread, so there's 598 + * no need to hold the address space for mm. This helps 691 599 * to avoid deadlock in the case of mmap() which may 692 - * held the refcnt of the file and depends on release 600 + * hold the refcnt of the file and depends on release 693 601 * method to remove vma. 
694 602 */ 695 603 dev->mm = current->mm; ··· 718 626 719 627 WARN_ON(!llist_empty(&worker->work_list)); 720 628 xa_erase(&dev->worker_xa, worker->id); 721 - vhost_task_stop(worker->vtsk); 629 + worker->ops->stop(worker); 722 630 kfree(worker); 723 631 } 724 632 ··· 741 649 xa_destroy(&dev->worker_xa); 742 650 } 743 651 652 + static void vhost_task_wakeup(struct vhost_worker *worker) 653 + { 654 + return vhost_task_wake(worker->vtsk); 655 + } 656 + 657 + static void vhost_kthread_wakeup(struct vhost_worker *worker) 658 + { 659 + wake_up_process(worker->kthread_task); 660 + } 661 + 662 + static void vhost_task_do_stop(struct vhost_worker *worker) 663 + { 664 + return vhost_task_stop(worker->vtsk); 665 + } 666 + 667 + static void vhost_kthread_do_stop(struct vhost_worker *worker) 668 + { 669 + kthread_stop(worker->kthread_task); 670 + } 671 + 672 + static int vhost_task_worker_create(struct vhost_worker *worker, 673 + struct vhost_dev *dev, const char *name) 674 + { 675 + struct vhost_task *vtsk; 676 + u32 id; 677 + int ret; 678 + 679 + vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, 680 + worker, name); 681 + if (IS_ERR(vtsk)) 682 + return PTR_ERR(vtsk); 683 + 684 + worker->vtsk = vtsk; 685 + vhost_task_start(vtsk); 686 + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); 687 + if (ret < 0) { 688 + vhost_task_do_stop(worker); 689 + return ret; 690 + } 691 + worker->id = id; 692 + return 0; 693 + } 694 + 695 + static int vhost_kthread_worker_create(struct vhost_worker *worker, 696 + struct vhost_dev *dev, const char *name) 697 + { 698 + struct task_struct *task; 699 + u32 id; 700 + int ret; 701 + 702 + task = kthread_create(vhost_run_work_kthread_list, worker, "%s", name); 703 + if (IS_ERR(task)) 704 + return PTR_ERR(task); 705 + 706 + worker->kthread_task = task; 707 + wake_up_process(task); 708 + ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); 709 + if (ret < 0) 710 + goto stop_worker; 711 + 712 + ret 
= vhost_attach_task_to_cgroups(worker); 713 + if (ret) 714 + goto stop_worker; 715 + 716 + worker->id = id; 717 + return 0; 718 + 719 + stop_worker: 720 + vhost_kthread_do_stop(worker); 721 + return ret; 722 + } 723 + 724 + static const struct vhost_worker_ops kthread_ops = { 725 + .create = vhost_kthread_worker_create, 726 + .stop = vhost_kthread_do_stop, 727 + .wakeup = vhost_kthread_wakeup, 728 + }; 729 + 730 + static const struct vhost_worker_ops vhost_task_ops = { 731 + .create = vhost_task_worker_create, 732 + .stop = vhost_task_do_stop, 733 + .wakeup = vhost_task_wakeup, 734 + }; 735 + 744 736 static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) 745 737 { 746 738 struct vhost_worker *worker; 747 - struct vhost_task *vtsk; 748 739 char name[TASK_COMM_LEN]; 749 740 int ret; 750 - u32 id; 741 + const struct vhost_worker_ops *ops = dev->fork_owner ? &vhost_task_ops : 742 + &kthread_ops; 751 743 752 744 worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT); 753 745 if (!worker) 754 746 return NULL; 755 747 756 748 worker->dev = dev; 749 + worker->ops = ops; 757 750 snprintf(name, sizeof(name), "vhost-%d", current->pid); 758 - 759 - vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, 760 - worker, name); 761 - if (IS_ERR(vtsk)) 762 - goto free_worker; 763 751 764 752 mutex_init(&worker->mutex); 765 753 init_llist_head(&worker->work_list); 766 754 worker->kcov_handle = kcov_common_handle(); 767 - worker->vtsk = vtsk; 768 - 769 - vhost_task_start(vtsk); 770 - 771 - ret = xa_alloc(&dev->worker_xa, &id, worker, xa_limit_32b, GFP_KERNEL); 755 + ret = ops->create(worker, dev, name); 772 756 if (ret < 0) 773 - goto stop_worker; 774 - worker->id = id; 757 + goto free_worker; 775 758 776 759 return worker; 777 760 778 - stop_worker: 779 - vhost_task_stop(vtsk); 780 761 free_worker: 781 762 kfree(worker); 782 763 return NULL; ··· 896 731 * We don't want to call synchronize_rcu for every vq during setup 897 732 * because it will slow down VM 
startup. If we haven't done 898 733 * VHOST_SET_VRING_KICK and not done the driver specific 899 - * SET_ENDPOINT/RUNNUNG then we can skip the sync since there will 734 + * SET_ENDPOINT/RUNNING then we can skip the sync since there will 900 735 * not be any works queued for scsi and net. 901 736 */ 902 737 mutex_lock(&vq->mutex); ··· 1030 865 switch (ioctl) { 1031 866 /* dev worker ioctls */ 1032 867 case VHOST_NEW_WORKER: 868 + /* 869 + * vhost_tasks will account for worker threads under the parent's 870 + * NPROC value but kthreads do not. To avoid userspace overflowing 871 + * the system with worker threads fork_owner must be true. 872 + */ 873 + if (!dev->fork_owner) 874 + return -EFAULT; 875 + 1033 876 ret = vhost_new_worker(dev, &state); 1034 877 if (!ret && copy_to_user(argp, &state, sizeof(state))) 1035 878 ret = -EFAULT; ··· 1155 982 1156 983 vhost_dev_cleanup(dev); 1157 984 985 + dev->fork_owner = fork_from_owner_default; 1158 986 dev->umem = umem; 1159 987 /* We don't need VQ locks below since vhost_dev_cleanup makes sure 1160 988 * VQs aren't running. ··· 2164 1990 break; 2165 1991 } 2166 1992 if (vhost_has_feature(vq, VIRTIO_F_RING_PACKED)) { 2167 - vq->last_avail_idx = s.num & 0xffff; 1993 + vq->next_avail_head = vq->last_avail_idx = 1994 + s.num & 0xffff; 2168 1995 vq->last_used_idx = (s.num >> 16) & 0xffff; 2169 1996 } else { 2170 1997 if (s.num > 0xffff) { 2171 1998 r = -EINVAL; 2172 1999 break; 2173 2000 } 2174 - vq->last_avail_idx = s.num; 2001 + vq->next_avail_head = vq->last_avail_idx = s.num; 2175 2002 } 2176 2003 /* Forget the cached index value. 
*/ 2177 2004 vq->avail_idx = vq->last_avail_idx; ··· 2309 2134 r = vhost_dev_set_owner(d); 2310 2135 goto done; 2311 2136 } 2137 + 2138 + #ifdef CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL 2139 + if (ioctl == VHOST_SET_FORK_FROM_OWNER) { 2140 + /* Only allow modification before owner is set */ 2141 + if (vhost_dev_has_owner(d)) { 2142 + r = -EBUSY; 2143 + goto done; 2144 + } 2145 + u8 fork_owner_val; 2146 + 2147 + if (get_user(fork_owner_val, (u8 __user *)argp)) { 2148 + r = -EFAULT; 2149 + goto done; 2150 + } 2151 + if (fork_owner_val != VHOST_FORK_OWNER_TASK && 2152 + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { 2153 + r = -EINVAL; 2154 + goto done; 2155 + } 2156 + d->fork_owner = !!fork_owner_val; 2157 + r = 0; 2158 + goto done; 2159 + } 2160 + if (ioctl == VHOST_GET_FORK_FROM_OWNER) { 2161 + u8 fork_owner_val = d->fork_owner; 2162 + 2163 + if (fork_owner_val != VHOST_FORK_OWNER_TASK && 2164 + fork_owner_val != VHOST_FORK_OWNER_KTHREAD) { 2165 + r = -EINVAL; 2166 + goto done; 2167 + } 2168 + if (put_user(fork_owner_val, (u8 __user *)argp)) { 2169 + r = -EFAULT; 2170 + goto done; 2171 + } 2172 + r = 0; 2173 + goto done; 2174 + } 2175 + #endif 2312 2176 2313 2177 /* You must be the owner to do anything else */ 2314 2178 r = vhost_dev_check_owner(d); ··· 2804 2590 unsigned int *out_num, unsigned int *in_num, 2805 2591 struct vhost_log *log, unsigned int *log_num) 2806 2592 { 2593 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 2807 2594 struct vring_desc desc; 2808 2595 unsigned int i, head, found = 0; 2809 2596 u16 last_avail_idx = vq->last_avail_idx; 2810 2597 __virtio16 ring_head; 2811 - int ret, access; 2598 + int ret, access, c = 0; 2812 2599 2813 2600 if (vq->avail_idx == vq->last_avail_idx) { 2814 2601 ret = vhost_get_avail_idx(vq); ··· 2820 2605 return vq->num; 2821 2606 } 2822 2607 2823 - /* Grab the next descriptor number they're advertising, and increment 2824 - * the index we've seen. 
*/ 2825 - if (unlikely(vhost_get_avail_head(vq, &ring_head, last_avail_idx))) { 2826 - vq_err(vq, "Failed to read head: idx %d address %p\n", 2827 - last_avail_idx, 2828 - &vq->avail->ring[last_avail_idx % vq->num]); 2829 - return -EFAULT; 2608 + if (in_order) 2609 + head = vq->next_avail_head & (vq->num - 1); 2610 + else { 2611 + /* Grab the next descriptor number they're 2612 + * advertising, and increment the index we've seen. */ 2613 + if (unlikely(vhost_get_avail_head(vq, &ring_head, 2614 + last_avail_idx))) { 2615 + vq_err(vq, "Failed to read head: idx %d address %p\n", 2616 + last_avail_idx, 2617 + &vq->avail->ring[last_avail_idx % vq->num]); 2618 + return -EFAULT; 2619 + } 2620 + head = vhost16_to_cpu(vq, ring_head); 2830 2621 } 2831 - 2832 - head = vhost16_to_cpu(vq, ring_head); 2833 2622 2834 2623 /* If their number is silly, that's an error. */ 2835 2624 if (unlikely(head >= vq->num)) { ··· 2877 2658 "in indirect descriptor at idx %d\n", i); 2878 2659 return ret; 2879 2660 } 2661 + ++c; 2880 2662 continue; 2881 2663 } 2882 2664 ··· 2913 2693 } 2914 2694 *out_num += ret; 2915 2695 } 2696 + ++c; 2916 2697 } while ((i = next_desc(vq, &desc)) != -1); 2917 2698 2918 2699 /* On success, increment avail index. */ 2919 2700 vq->last_avail_idx++; 2701 + vq->next_avail_head += c; 2920 2702 2921 2703 /* Assume notifications from guest are disabled at this point, 2922 2704 * if they aren't we would need to update avail_event index. */ ··· 2942 2720 cpu_to_vhost32(vq, head), 2943 2721 cpu_to_vhost32(vq, len) 2944 2722 }; 2723 + u16 nheads = 1; 2945 2724 2946 - return vhost_add_used_n(vq, &heads, 1); 2725 + return vhost_add_used_n(vq, &heads, &nheads, 1); 2947 2726 } 2948 2727 EXPORT_SYMBOL_GPL(vhost_add_used); 2949 2728 ··· 2980 2757 return 0; 2981 2758 } 2982 2759 2983 - /* After we've used one of their buffers, we tell them about it. We'll then 2984 - * want to notify the guest, using eventfd. 
*/ 2985 - int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, 2986 - unsigned count) 2760 + static int vhost_add_used_n_ooo(struct vhost_virtqueue *vq, 2761 + struct vring_used_elem *heads, 2762 + unsigned count) 2987 2763 { 2988 2764 int start, n, r; 2989 2765 ··· 2995 2773 heads += n; 2996 2774 count -= n; 2997 2775 } 2998 - r = __vhost_add_used_n(vq, heads, count); 2776 + return __vhost_add_used_n(vq, heads, count); 2777 + } 2778 + 2779 + static int vhost_add_used_n_in_order(struct vhost_virtqueue *vq, 2780 + struct vring_used_elem *heads, 2781 + const u16 *nheads, 2782 + unsigned count) 2783 + { 2784 + vring_used_elem_t __user *used; 2785 + u16 old, new = vq->last_used_idx; 2786 + int start, i; 2787 + 2788 + if (!nheads) 2789 + return -EINVAL; 2790 + 2791 + start = vq->last_used_idx & (vq->num - 1); 2792 + used = vq->used->ring + start; 2793 + 2794 + for (i = 0; i < count; i++) { 2795 + if (vhost_put_used(vq, &heads[i], start, 1)) { 2796 + vq_err(vq, "Failed to write used"); 2797 + return -EFAULT; 2798 + } 2799 + start += nheads[i]; 2800 + new += nheads[i]; 2801 + if (start >= vq->num) 2802 + start -= vq->num; 2803 + } 2804 + 2805 + if (unlikely(vq->log_used)) { 2806 + /* Make sure data is seen before log. */ 2807 + smp_wmb(); 2808 + /* Log used ring entry write. */ 2809 + log_used(vq, ((void __user *)used - (void __user *)vq->used), 2810 + (vq->num - start) * sizeof *used); 2811 + if (start + count > vq->num) 2812 + log_used(vq, 0, 2813 + (start + count - vq->num) * sizeof *used); 2814 + } 2815 + 2816 + old = vq->last_used_idx; 2817 + vq->last_used_idx = new; 2818 + /* If the driver never bothers to signal in a very long while, 2819 + * used index might wrap around. If that happens, invalidate 2820 + * signalled_used index we stored. TODO: make sure driver 2821 + * signals at least once in 2^16 and remove this. 
*/ 2822 + if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) 2823 + vq->signalled_used_valid = false; 2824 + return 0; 2825 + } 2826 + 2827 + /* After we've used one of their buffers, we tell them about it. We'll then 2828 + * want to notify the guest, using eventfd. */ 2829 + int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, 2830 + u16 *nheads, unsigned count) 2831 + { 2832 + bool in_order = vhost_has_feature(vq, VIRTIO_F_IN_ORDER); 2833 + int r; 2834 + 2835 + if (!in_order || !nheads) 2836 + r = vhost_add_used_n_ooo(vq, heads, count); 2837 + else 2838 + r = vhost_add_used_n_in_order(vq, heads, nheads, count); 2839 + 2840 + if (r < 0) 2841 + return r; 2999 2842 3000 2843 /* Make sure buffer is written before we update index. */ 3001 2844 smp_wmb(); ··· 3140 2853 /* multi-buffer version of vhost_add_used_and_signal */ 3141 2854 void vhost_add_used_and_signal_n(struct vhost_dev *dev, 3142 2855 struct vhost_virtqueue *vq, 3143 - struct vring_used_elem *heads, unsigned count) 2856 + struct vring_used_elem *heads, 2857 + u16 *nheads, 2858 + unsigned count) 3144 2859 { 3145 - vhost_add_used_n(vq, heads, count); 2860 + vhost_add_used_n(vq, heads, nheads, count); 3146 2861 vhost_signal(dev, vq); 3147 2862 } 3148 2863 EXPORT_SYMBOL_GPL(vhost_add_used_and_signal_n); 3149 2864 3150 - /* return true if we're sure that avaiable ring is empty */ 2865 + /* return true if we're sure that available ring is empty */ 3151 2866 bool vhost_vq_avail_empty(struct vhost_dev *dev, struct vhost_virtqueue *vq) 3152 2867 { 3153 2868 int r;

+28 -2

drivers/vhost/vhost.h

··· 26 26 unsigned long flags; 27 27 }; 28 28 29 + struct vhost_worker; 30 + struct vhost_dev; 31 + 32 + struct vhost_worker_ops { 33 + int (*create)(struct vhost_worker *worker, struct vhost_dev *dev, 34 + const char *name); 35 + void (*stop)(struct vhost_worker *worker); 36 + void (*wakeup)(struct vhost_worker *worker); 37 + }; 38 + 29 39 struct vhost_worker { 40 + struct task_struct *kthread_task; 30 41 struct vhost_task *vtsk; 31 42 struct vhost_dev *dev; 32 43 /* Used to serialize device wide flushing with worker swapping. */ ··· 47 36 u32 id; 48 37 int attachment_cnt; 49 38 bool killed; 39 + const struct vhost_worker_ops *ops; 50 40 }; 51 41 52 42 /* Poll a file (eventfd or socket) */ ··· 115 103 * Values are limited to 0x7fff, and the high bit is used as 116 104 * a wrap counter when using VIRTIO_F_RING_PACKED. */ 117 105 u16 last_avail_idx; 106 + /* Next avail ring head when VIRTIO_F_IN_ORDER is negotiated */ 107 + u16 next_avail_head; 118 108 119 109 /* Caches available index value from user. */ 120 110 u16 avail_idx; ··· 143 129 struct iovec iotlb_iov[64]; 144 130 struct iovec *indirect; 145 131 struct vring_used_elem *heads; 132 + u16 *nheads; 146 133 /* Protected by virtqueue mutex. */ 147 134 struct vhost_iotlb *umem; 148 135 struct vhost_iotlb *iotlb; ··· 191 176 int byte_weight; 192 177 struct xarray worker_xa; 193 178 bool use_worker; 179 + /* 180 + * If fork_owner is true we use vhost_tasks to create 181 + * the worker so all settings/limits like cgroups, NPROC, 182 + * scheduler, etc are inherited from the owner. If false, 183 + * we use kthreads and only attach to the same cgroups 184 + * as the owner for compat with older kernels. 185 + * here we use true as default value. 
186 + * The default value is set by fork_from_owner_default 187 + */ 188 + bool fork_owner; 194 189 int (*msg_handler)(struct vhost_dev *dev, u32 asid, 195 190 struct vhost_iotlb_msg *msg); 196 191 }; ··· 238 213 int vhost_vq_init_access(struct vhost_virtqueue *); 239 214 int vhost_add_used(struct vhost_virtqueue *, unsigned int head, int len); 240 215 int vhost_add_used_n(struct vhost_virtqueue *, struct vring_used_elem *heads, 241 - unsigned count); 216 + u16 *nheads, unsigned count); 242 217 void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, 243 218 unsigned int id, int len); 244 219 void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, 245 - struct vring_used_elem *heads, unsigned count); 220 + struct vring_used_elem *heads, u16 *nheads, 221 + unsigned count); 246 222 void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); 247 223 void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); 248 224 bool vhost_vq_avail_empty(struct vhost_dev *, struct vhost_virtqueue *);

-118

drivers/vhost/vringh.c

··· 780 780 EXPORT_SYMBOL(vringh_iov_push_user); 781 781 782 782 /** 783 - * vringh_abandon_user - we've decided not to handle the descriptor(s). 784 - * @vrh: the vring. 785 - * @num: the number of descriptors to put back (ie. num 786 - * vringh_get_user() to undo). 787 - * 788 - * The next vringh_get_user() will return the old descriptor(s) again. 789 - */ 790 - void vringh_abandon_user(struct vringh *vrh, unsigned int num) 791 - { 792 - /* We only update vring_avail_event(vr) when we want to be notified, 793 - * so we haven't changed that yet. */ 794 - vrh->last_avail_idx -= num; 795 - } 796 - EXPORT_SYMBOL(vringh_abandon_user); 797 - 798 - /** 799 783 * vringh_complete_user - we've finished with descriptor, publish it. 800 784 * @vrh: the vring. 801 785 * @head: the head as filled in by vringh_getdesc_user. ··· 884 900 return 0; 885 901 } 886 902 887 - static inline int xfer_kern(const struct vringh *vrh, void *src, 888 - void *dst, size_t len) 889 - { 890 - memcpy(dst, src, len); 891 - return 0; 892 - } 893 - 894 - static inline int kern_xfer(const struct vringh *vrh, void *dst, 895 - void *src, size_t len) 896 - { 897 - memcpy(dst, src, len); 898 - return 0; 899 - } 900 - 901 903 /** 902 904 * vringh_init_kern - initialize a vringh for a kernelspace vring. 903 905 * @vrh: the vringh to initialize. ··· 967 997 return 1; 968 998 } 969 999 EXPORT_SYMBOL(vringh_getdesc_kern); 970 - 971 - /** 972 - * vringh_iov_pull_kern - copy bytes from vring_iov. 973 - * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) 974 - * @dst: the place to copy. 975 - * @len: the maximum length to copy. 976 - * 977 - * Returns the bytes copied <= len or a negative errno. 978 - */ 979 - ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) 980 - { 981 - return vringh_iov_xfer(NULL, riov, dst, len, xfer_kern); 982 - } 983 - EXPORT_SYMBOL(vringh_iov_pull_kern); 984 - 985 - /** 986 - * vringh_iov_push_kern - copy bytes into vring_iov. 
987 - * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) 988 - * @src: the place to copy from. 989 - * @len: the maximum length to copy. 990 - * 991 - * Returns the bytes copied <= len or a negative errno. 992 - */ 993 - ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, 994 - const void *src, size_t len) 995 - { 996 - return vringh_iov_xfer(NULL, wiov, (void *)src, len, kern_xfer); 997 - } 998 - EXPORT_SYMBOL(vringh_iov_push_kern); 999 - 1000 - /** 1001 - * vringh_abandon_kern - we've decided not to handle the descriptor(s). 1002 - * @vrh: the vring. 1003 - * @num: the number of descriptors to put back (ie. num 1004 - * vringh_get_kern() to undo). 1005 - * 1006 - * The next vringh_get_kern() will return the old descriptor(s) again. 1007 - */ 1008 - void vringh_abandon_kern(struct vringh *vrh, unsigned int num) 1009 - { 1010 - /* We only update vring_avail_event(vr) when we want to be notified, 1011 - * so we haven't changed that yet. */ 1012 - vrh->last_avail_idx -= num; 1013 - } 1014 - EXPORT_SYMBOL(vringh_abandon_kern); 1015 1000 1016 1001 /** 1017 1002 * vringh_complete_kern - we've finished with descriptor, publish it. ··· 1460 1535 EXPORT_SYMBOL(vringh_iov_push_iotlb); 1461 1536 1462 1537 /** 1463 - * vringh_abandon_iotlb - we've decided not to handle the descriptor(s). 1464 - * @vrh: the vring. 1465 - * @num: the number of descriptors to put back (ie. num 1466 - * vringh_get_iotlb() to undo). 1467 - * 1468 - * The next vringh_get_iotlb() will return the old descriptor(s) again. 1469 - */ 1470 - void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num) 1471 - { 1472 - /* We only update vring_avail_event(vr) when we want to be notified, 1473 - * so we haven't changed that yet. 1474 - */ 1475 - vrh->last_avail_idx -= num; 1476 - } 1477 - EXPORT_SYMBOL(vringh_abandon_iotlb); 1478 - 1479 - /** 1480 1538 * vringh_complete_iotlb - we've finished with descriptor, publish it. 1481 1539 * @vrh: the vring. 
1482 1540 * @head: the head as filled in by vringh_getdesc_iotlb. ··· 1478 1570 return __vringh_complete(vrh, &used, 1, putu16_iotlb, putused_iotlb); 1479 1571 } 1480 1572 EXPORT_SYMBOL(vringh_complete_iotlb); 1481 - 1482 - /** 1483 - * vringh_notify_enable_iotlb - we want to know if something changes. 1484 - * @vrh: the vring. 1485 - * 1486 - * This always enables notifications, but returns false if there are 1487 - * now more buffers available in the vring. 1488 - */ 1489 - bool vringh_notify_enable_iotlb(struct vringh *vrh) 1490 - { 1491 - return __vringh_notify_enable(vrh, getu16_iotlb, putu16_iotlb); 1492 - } 1493 - EXPORT_SYMBOL(vringh_notify_enable_iotlb); 1494 - 1495 - /** 1496 - * vringh_notify_disable_iotlb - don't tell us if something changes. 1497 - * @vrh: the vring. 1498 - * 1499 - * This is our normal running state: we disable and then only enable when 1500 - * we're going to sleep. 1501 - */ 1502 - void vringh_notify_disable_iotlb(struct vringh *vrh) 1503 - { 1504 - __vringh_notify_disable(vrh, putu16_iotlb); 1505 - } 1506 - EXPORT_SYMBOL(vringh_notify_disable_iotlb); 1507 1573 1508 1574 /** 1509 1575 * vringh_need_notify_iotlb - must we tell the other side about used buffers?

+8 -7

drivers/vhost/vsock.c

··· 344 344 345 345 len = iov_length(vq->iov, out); 346 346 347 + if (len < VIRTIO_VSOCK_SKB_HEADROOM || 348 + len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM) 349 + return NULL; 350 + 347 351 /* len contains both payload and hdr */ 348 352 skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); 349 353 if (!skb) ··· 371 367 return skb; 372 368 373 369 /* The pkt is too big or the length in the header is invalid */ 374 - if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || 375 - payload_len + sizeof(*hdr) > len) { 370 + if (payload_len + sizeof(*hdr) > len) { 376 371 kfree_skb(skb); 377 372 return NULL; 378 373 } 379 374 380 - virtio_vsock_skb_rx_put(skb); 375 + virtio_vsock_skb_put(skb, payload_len); 381 376 382 - nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); 383 - if (nbytes != payload_len) { 384 - vq_err(vq, "Expected %zu byte payload, got %zu bytes\n", 385 - payload_len, nbytes); 377 + if (skb_copy_datagram_from_iter(skb, 0, &iov_iter, payload_len)) { 378 + vq_err(vq, "Failed to copy %zu byte payload\n", payload_len); 386 379 kfree_skb(skb); 387 380 return NULL; 388 381 }

+4 -3

drivers/virtio/virtio.c

··· 147 147 148 148 /** 149 149 * virtio_config_driver_disable - disable config change reporting by drivers 150 - * @dev: the device to reset 150 + * @dev: the device to disable 151 151 * 152 152 * This is only allowed to be called by a driver and disabling can't 153 153 * be nested. ··· 162 162 163 163 /** 164 164 * virtio_config_driver_enable - enable config change reporting by drivers 165 - * @dev: the device to reset 165 + * @dev: the device to enable 166 166 * 167 167 * This is only allowed to be called by a driver and enabling can't 168 168 * be nested. ··· 512 512 * On error, the caller must call put_device on &@dev->dev (and not kfree), 513 513 * as another code path may have obtained a reference to @dev. 514 514 * 515 - * Returns: 0 on suceess, -error on failure 515 + * Returns: 0 on success, -error on failure 516 516 */ 517 517 int register_virtio_device(struct virtio_device *dev) 518 518 { ··· 536 536 goto out_ida_remove; 537 537 538 538 spin_lock_init(&dev->config_lock); 539 + dev->config_driver_disabled = false; 539 540 dev->config_core_enabled = false; 540 541 dev->config_change_pending = false; 541 542

+2

drivers/virtio/virtio_dma_buf.c

··· 36 36 37 37 /** 38 38 * virtio_dma_buf_attach - mandatory attach callback for virtio dma-bufs 39 + * @dma_buf: [in] buffer to attach 40 + * @attach: [in] attachment structure 39 41 */ 40 42 int virtio_dma_buf_attach(struct dma_buf *dma_buf, 41 43 struct dma_buf_attachment *attach)

+3 -49

drivers/virtio/virtio_mmio.c

··· 65 65 #include <linux/platform_device.h> 66 66 #include <linux/pm.h> 67 67 #include <linux/slab.h> 68 - #include <linux/spinlock.h> 69 68 #include <linux/virtio.h> 70 69 #include <linux/virtio_config.h> 71 70 #include <uapi/linux/virtio_mmio.h> ··· 87 88 88 89 void __iomem *base; 89 90 unsigned long version; 90 - 91 - /* a list of queues so we can dispatch IRQs */ 92 - spinlock_t lock; 93 - struct list_head virtqueues; 94 91 }; 95 - 96 - struct virtio_mmio_vq_info { 97 - /* the actual virtqueue */ 98 - struct virtqueue *vq; 99 - 100 - /* the list node for the virtqueues list */ 101 - struct list_head node; 102 - }; 103 - 104 - 105 92 106 93 /* Configuration interface */ 107 94 ··· 285 300 static irqreturn_t vm_interrupt(int irq, void *opaque) 286 301 { 287 302 struct virtio_mmio_device *vm_dev = opaque; 288 - struct virtio_mmio_vq_info *info; 303 + struct virtqueue *vq; 289 304 unsigned long status; 290 - unsigned long flags; 291 305 irqreturn_t ret = IRQ_NONE; 292 306 293 307 /* Read and acknowledge interrupts */ ··· 299 315 } 300 316 301 317 if (likely(status & VIRTIO_MMIO_INT_VRING)) { 302 - spin_lock_irqsave(&vm_dev->lock, flags); 303 - list_for_each_entry(info, &vm_dev->virtqueues, node) 304 - ret |= vring_interrupt(irq, info->vq); 305 - spin_unlock_irqrestore(&vm_dev->lock, flags); 318 + virtio_device_for_each_vq(&vm_dev->vdev, vq) 319 + ret |= vring_interrupt(irq, vq); 306 320 } 307 321 308 322 return ret; ··· 311 329 static void vm_del_vq(struct virtqueue *vq) 312 330 { 313 331 struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); 314 - struct virtio_mmio_vq_info *info = vq->priv; 315 - unsigned long flags; 316 332 unsigned int index = vq->index; 317 - 318 - spin_lock_irqsave(&vm_dev->lock, flags); 319 - list_del(&info->node); 320 - spin_unlock_irqrestore(&vm_dev->lock, flags); 321 333 322 334 /* Select and deactivate the queue */ 323 335 writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); ··· 323 347 } 324 348 325 349 
vring_del_virtqueue(vq); 326 - 327 - kfree(info); 328 350 } 329 351 330 352 static void vm_del_vqs(struct virtio_device *vdev) ··· 349 375 { 350 376 struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); 351 377 bool (*notify)(struct virtqueue *vq); 352 - struct virtio_mmio_vq_info *info; 353 378 struct virtqueue *vq; 354 - unsigned long flags; 355 379 unsigned int num; 356 380 int err; 357 381 ··· 369 397 VIRTIO_MMIO_QUEUE_PFN : VIRTIO_MMIO_QUEUE_READY))) { 370 398 err = -ENOENT; 371 399 goto error_available; 372 - } 373 - 374 - /* Allocate and fill out our active queue description */ 375 - info = kmalloc(sizeof(*info), GFP_KERNEL); 376 - if (!info) { 377 - err = -ENOMEM; 378 - goto error_kmalloc; 379 400 } 380 401 381 402 num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); ··· 428 463 writel(1, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); 429 464 } 430 465 431 - vq->priv = info; 432 - info->vq = vq; 433 - 434 - spin_lock_irqsave(&vm_dev->lock, flags); 435 - list_add(&info->node, &vm_dev->virtqueues); 436 - spin_unlock_irqrestore(&vm_dev->lock, flags); 437 - 438 466 return vq; 439 467 440 468 error_bad_pfn: ··· 439 481 writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); 440 482 WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); 441 483 } 442 - kfree(info); 443 - error_kmalloc: 444 484 error_available: 445 485 return ERR_PTR(err); 446 486 } ··· 583 627 vm_dev->vdev.dev.release = virtio_mmio_release_dev; 584 628 vm_dev->vdev.config = &virtio_mmio_config_ops; 585 629 vm_dev->pdev = pdev; 586 - INIT_LIST_HEAD(&vm_dev->virtqueues); 587 - spin_lock_init(&vm_dev->lock); 588 630 589 631 vm_dev->base = devm_platform_ioremap_resource(pdev, 0); 590 632 if (IS_ERR(vm_dev->base)) {

+4

drivers/virtio/virtio_ring.c

··· 2296 2296 * at the same time (except where noted). 2297 2297 * 2298 2298 * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). 2299 + * 2300 + * NB: ENOSPC is a special code that is only returned on an attempt to add a 2301 + * buffer to a full VQ. It indicates that some buffers are outstanding and that 2302 + * the operation can be retried after some buffers have been used. 2299 2303 */ 2300 2304 int virtqueue_add_sgs(struct virtqueue *_vq, 2301 2305 struct scatterlist *sgs[],

+3 -41

drivers/virtio/virtio_vdpa.c

··· 28 28 struct virtio_device vdev; 29 29 struct vdpa_device *vdpa; 30 30 u64 features; 31 - 32 - /* The lock to protect virtqueue list */ 33 - spinlock_t lock; 34 - /* List of virtio_vdpa_vq_info */ 35 - struct list_head virtqueues; 36 - }; 37 - 38 - struct virtio_vdpa_vq_info { 39 - /* the actual virtqueue */ 40 - struct virtqueue *vq; 41 - 42 - /* the list node for the virtqueues list */ 43 - struct list_head node; 44 31 }; 45 32 46 33 static inline struct virtio_vdpa_device * ··· 122 135 123 136 static irqreturn_t virtio_vdpa_virtqueue_cb(void *private) 124 137 { 125 - struct virtio_vdpa_vq_info *info = private; 138 + struct virtqueue *vq = private; 126 139 127 - return vring_interrupt(0, info->vq); 140 + return vring_interrupt(0, vq); 128 141 } 129 142 130 143 static struct virtqueue * ··· 132 145 void (*callback)(struct virtqueue *vq), 133 146 const char *name, bool ctx) 134 147 { 135 - struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vdev); 136 148 struct vdpa_device *vdpa = vd_get_vdpa(vdev); 137 149 struct device *dma_dev; 138 150 const struct vdpa_config_ops *ops = vdpa->config; 139 - struct virtio_vdpa_vq_info *info; 140 151 bool (*notify)(struct virtqueue *vq) = virtio_vdpa_notify; 141 152 struct vdpa_callback cb; 142 153 struct virtqueue *vq; 143 154 u64 desc_addr, driver_addr, device_addr; 144 155 /* Assume split virtqueue, switch to packed if necessary */ 145 156 struct vdpa_vq_state state = {0}; 146 - unsigned long flags; 147 157 u32 align, max_num, min_num = 1; 148 158 bool may_reduce_num = true; 149 159 int err; ··· 163 179 if (ops->get_vq_ready(vdpa, index)) 164 180 return ERR_PTR(-ENOENT); 165 181 166 - /* Allocate and fill out our active queue description */ 167 - info = kmalloc(sizeof(*info), GFP_KERNEL); 168 - if (!info) 169 - return ERR_PTR(-ENOMEM); 170 182 if (ops->get_vq_size) 171 183 max_num = ops->get_vq_size(vdpa, index); 172 184 else ··· 197 217 198 218 /* Setup virtqueue callback */ 199 219 cb.callback = callback ? 
virtio_vdpa_virtqueue_cb : NULL; 200 - cb.private = info; 220 + cb.private = vq; 201 221 cb.trigger = NULL; 202 222 ops->set_vq_cb(vdpa, index, &cb); 203 223 ops->set_vq_num(vdpa, index, virtqueue_get_vring_size(vq)); ··· 228 248 229 249 ops->set_vq_ready(vdpa, index, 1); 230 250 231 - vq->priv = info; 232 - info->vq = vq; 233 - 234 - spin_lock_irqsave(&vd_dev->lock, flags); 235 - list_add(&info->node, &vd_dev->virtqueues); 236 - spin_unlock_irqrestore(&vd_dev->lock, flags); 237 - 238 251 return vq; 239 252 240 253 err_vq: ··· 236 263 ops->set_vq_ready(vdpa, index, 0); 237 264 /* VDPA driver should make sure vq is stopped here */ 238 265 WARN_ON(ops->get_vq_ready(vdpa, index)); 239 - kfree(info); 240 266 return ERR_PTR(err); 241 267 } 242 268 ··· 244 272 struct virtio_vdpa_device *vd_dev = to_virtio_vdpa_device(vq->vdev); 245 273 struct vdpa_device *vdpa = vd_dev->vdpa; 246 274 const struct vdpa_config_ops *ops = vdpa->config; 247 - struct virtio_vdpa_vq_info *info = vq->priv; 248 275 unsigned int index = vq->index; 249 - unsigned long flags; 250 - 251 - spin_lock_irqsave(&vd_dev->lock, flags); 252 - list_del(&info->node); 253 - spin_unlock_irqrestore(&vd_dev->lock, flags); 254 276 255 277 /* Select and deactivate the queue (best effort) */ 256 278 ops->set_vq_ready(vdpa, index, 0); 257 279 258 280 vring_del_virtqueue(vq); 259 - 260 - kfree(info); 261 281 } 262 282 263 283 static void virtio_vdpa_del_vqs(struct virtio_device *vdev) ··· 466 502 vd_dev->vdev.dev.release = virtio_vdpa_release_dev; 467 503 vd_dev->vdev.config = &virtio_vdpa_config_ops; 468 504 vd_dev->vdpa = vdpa; 469 - INIT_LIST_HEAD(&vd_dev->virtqueues); 470 - spin_lock_init(&vd_dev->lock); 471 505 472 506 vd_dev->vdev.id.device = ops->get_device_id(vdpa); 473 507 if (vd_dev->vdev.id.device == 0)

+1 -1

include/linux/virtio.h

··· 199 199 size_t virtio_max_dma_size(const struct virtio_device *vdev); 200 200 201 201 #define virtio_device_for_each_vq(vdev, vq) \ 202 - list_for_each_entry(vq, &vdev->vqs, list) 202 + list_for_each_entry(vq, &(vdev)->vqs, list) 203 203 204 204 /** 205 205 * struct virtio_driver - operations for a virtio I/O driver

+35 -11

include/linux/virtio_vsock.h

··· 47 47 VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false; 48 48 } 49 49 50 - static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb) 50 + static inline void virtio_vsock_skb_put(struct sk_buff *skb, u32 len) 51 51 { 52 - u32 len; 52 + DEBUG_NET_WARN_ON_ONCE(skb->len); 53 53 54 - len = le32_to_cpu(virtio_vsock_hdr(skb)->len); 55 - 56 - if (len > 0) 54 + if (skb_is_nonlinear(skb)) 55 + skb->len = len; 56 + else 57 57 skb_put(skb, len); 58 58 } 59 59 60 - static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) 60 + static inline struct sk_buff * 61 + __virtio_vsock_alloc_skb_with_frags(unsigned int header_len, 62 + unsigned int data_len, 63 + gfp_t mask) 61 64 { 62 65 struct sk_buff *skb; 66 + int err; 63 67 64 - if (size < VIRTIO_VSOCK_SKB_HEADROOM) 65 - return NULL; 66 - 67 - skb = alloc_skb(size, mask); 68 + skb = alloc_skb_with_frags(header_len, data_len, 69 + PAGE_ALLOC_COSTLY_ORDER, &err, mask); 68 70 if (!skb) 69 71 return NULL; 70 72 71 73 skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM); 74 + skb->data_len = data_len; 72 75 return skb; 76 + } 77 + 78 + static inline struct sk_buff * 79 + virtio_vsock_alloc_linear_skb(unsigned int size, gfp_t mask) 80 + { 81 + return __virtio_vsock_alloc_skb_with_frags(size, 0, mask); 82 + } 83 + 84 + static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask) 85 + { 86 + if (size <= SKB_WITH_OVERHEAD(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) 87 + return virtio_vsock_alloc_linear_skb(size, mask); 88 + 89 + size -= VIRTIO_VSOCK_SKB_HEADROOM; 90 + return __virtio_vsock_alloc_skb_with_frags(VIRTIO_VSOCK_SKB_HEADROOM, 91 + size, mask); 73 92 } 74 93 75 94 static inline void ··· 130 111 return (size_t)(skb_end_pointer(skb) - skb->head); 131 112 } 132 113 133 - #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) 114 + /* Dimension the RX SKB so that the entire thing fits exactly into 115 + * a single 4KiB page. 
This avoids wasting memory due to alloc_skb() 116 + * rounding up to the next page order and also means that we 117 + * don't leave higher-order pages sitting around in the RX queue. 118 + */ 119 + #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE SKB_WITH_OVERHEAD(1024 * 4) 134 120 #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL 135 121 #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) 136 122

-12

include/linux/vringh.h

··· 175 175 const struct vring_used_elem used[], 176 176 unsigned num_used); 177 177 178 - /* Pretend we've never seen descriptor (for easy error handling). */ 179 - void vringh_abandon_user(struct vringh *vrh, unsigned int num); 180 - 181 178 /* Do we need to fire the eventfd to notify the other side? */ 182 179 int vringh_need_notify_user(struct vringh *vrh); 183 180 ··· 232 235 u16 *head, 233 236 gfp_t gfp); 234 237 235 - ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); 236 - ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, 237 - const void *src, size_t len); 238 - void vringh_abandon_kern(struct vringh *vrh, unsigned int num); 239 238 int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); 240 239 241 240 bool vringh_notify_enable_kern(struct vringh *vrh); ··· 312 319 struct vringh_kiov *wiov, 313 320 const void *src, size_t len); 314 321 315 - void vringh_abandon_iotlb(struct vringh *vrh, unsigned int num); 316 - 317 322 int vringh_complete_iotlb(struct vringh *vrh, u16 head, u32 len); 318 - 319 - bool vringh_notify_enable_iotlb(struct vringh *vrh); 320 - void vringh_notify_disable_iotlb(struct vringh *vrh); 321 323 322 324 int vringh_need_notify_iotlb(struct vringh *vrh); 323 325

+28

include/uapi/linux/vhost.h

··· 242 242 #define VHOST_SET_FEATURES_ARRAY _IOW(VHOST_VIRTIO, 0x83, \ 243 243 struct vhost_features_array) 244 244 245 + /* fork_owner values for vhost */ 246 + #define VHOST_FORK_OWNER_KTHREAD 0 247 + #define VHOST_FORK_OWNER_TASK 1 248 + 249 + /** 250 + * VHOST_SET_FORK_FROM_OWNER - Set the fork_owner flag for the vhost device, 251 + * This ioctl must be called before VHOST_SET_OWNER. 252 + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y 253 + * 254 + * @param fork_owner: An 8-bit value that determines the vhost thread mode 255 + * 256 + * When fork_owner is set to VHOST_FORK_OWNER_TASK(default value): 257 + * - Vhost will create vhost workers as tasks forked from the owner, 258 + * inheriting all of the owner's attributes. 259 + * 260 + * When fork_owner is set to VHOST_FORK_OWNER_KTHREAD: 261 + * - Vhost will create vhost workers as kernel threads. 262 + */ 263 + #define VHOST_SET_FORK_FROM_OWNER _IOW(VHOST_VIRTIO, 0x83, __u8) 264 + 265 + /** 266 + * VHOST_GET_FORK_FROM_OWNER - Get the current fork_owner flag for the vhost device. 267 + * Only available when CONFIG_VHOST_ENABLE_FORK_OWNER_CONTROL=y 268 + * 269 + * @return: An 8-bit value indicating the current thread mode. 270 + */ 271 + #define VHOST_GET_FORK_FROM_OWNER _IOR(VHOST_VIRTIO, 0x84, __u8) 272 + 245 273 #endif

+1 -1

kernel/vhost_task.c

··· 145 145 tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); 146 146 if (IS_ERR(tsk)) { 147 147 kfree(vtsk); 148 - return ERR_PTR(PTR_ERR(tsk)); 148 + return ERR_CAST(tsk); 149 149 } 150 150 151 151 vtsk->task = tsk;

+15 -5

net/vmw_vsock/virtio_transport.c

··· 307 307 308 308 static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) 309 309 { 310 - int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; 310 + int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; 311 311 struct scatterlist pkt, *p; 312 312 struct virtqueue *vq; 313 313 struct sk_buff *skb; ··· 316 316 vq = vsock->vqs[VSOCK_VQ_RX]; 317 317 318 318 do { 319 - skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); 319 + skb = virtio_vsock_alloc_linear_skb(total_len, GFP_KERNEL); 320 320 if (!skb) 321 321 break; 322 322 ··· 624 624 do { 625 625 virtqueue_disable_cb(vq); 626 626 for (;;) { 627 + unsigned int len, payload_len; 628 + struct virtio_vsock_hdr *hdr; 627 629 struct sk_buff *skb; 628 - unsigned int len; 629 630 630 631 if (!virtio_transport_more_replies(vsock)) { 631 632 /* Stop rx until the device processes already ··· 643 642 vsock->rx_buf_nr--; 644 643 645 644 /* Drop short/long packets */ 646 - if (unlikely(len < sizeof(struct virtio_vsock_hdr) || 645 + if (unlikely(len < sizeof(*hdr) || 647 646 len > virtio_vsock_skb_len(skb))) { 648 647 kfree_skb(skb); 649 648 continue; 650 649 } 651 650 652 - virtio_vsock_skb_rx_put(skb); 651 + hdr = virtio_vsock_hdr(skb); 652 + payload_len = le32_to_cpu(hdr->len); 653 + if (unlikely(payload_len > len - sizeof(*hdr))) { 654 + kfree_skb(skb); 655 + continue; 656 + } 657 + 658 + if (payload_len) 659 + virtio_vsock_skb_put(skb, payload_len); 660 + 653 661 virtio_transport_deliver_tap_pkt(skb); 654 662 virtio_transport_recv_pkt(&virtio_transport, skb); 655 663 }

+2 -1

net/vmw_vsock/virtio_transport_common.c

··· 109 109 return __zerocopy_sg_from_iter(info->msg, NULL, skb, 110 110 &info->msg->msg_iter, len, NULL); 111 111 112 - return memcpy_from_msg(skb_put(skb, len), info->msg, len); 112 + virtio_vsock_skb_put(skb, len); 113 + return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); 113 114 } 114 115 115 116 static void virtio_transport_init_hdr(struct sk_buff *skb,

Configure Feed

Configure Feed