Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'vsock-virtio' into main

Luigi Leonardi says:

====================
vsock: avoid queuing on intermediate queue if possible

This series introduces an optimization for vsock/virtio to reduce latency
and increase the throughput: When the guest sends a packet to the host,
and the intermediate queue (send_pkt_queue) is empty, if there is enough
space, the packet is put directly in the virtqueue.

v3->v4
While running experiments on fio with 64B payload, I realized that there
was a mistake in my fio configuration, so I re-ran all the experiments
and now the latency numbers are indeed lower with the patch applied.
I also noticed that I was kicking the host without the lock.

- Fixed a configuration mistake on fio and re-ran all experiments.
- Fio latency measurement using 64B payload.
- virtio_transport_send_skb_fast_path sends kick with the tx_lock acquired
- Addressed all minor style changes requested by maintainer.
- Rebased on latest net-next
- Link to v3: https://lore.kernel.org/r/20240711-pinna-v3-0-697d4164fe80@outlook.com

v2->v3
- Performed more experiments using iperf3 with multiple streams
- Handling of reply packets removed from virtio_transport_send_skb,
as it is needed just by the worker.
- Removed atomic_inc/atomic_sub when queuing directly to the vq.
- Introduced virtio_transport_send_skb_fast_path that handles the
steps for sending on the vq.
- Fixed a missing mutex_unlock in error path.
- Changed authorship of the second commit
- Rebased on latest net-next

v1->v2
In this v2 I replaced a mutex_lock with a mutex_trylock because it was
inside an RCU critical section. I also added a check on tx_run, so if the
module is being removed the packet is not queued. I'd like to thank Stefano
for reporting the tx_run issue.

Applied all Stefano's suggestions:
- Minor code style changes
- Minor commit text rewrite
Performed more experiments:
- Check if all the packets go directly to the vq (Matias' suggestion)
- Used iperf3 to see if there is any improvement in overall throughput
from guest to host
- Pinned the vhost process to a pCPU.
- Ran fio using 512B payload
Rebased on latest net-next
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+202 -8
+3 -1
drivers/vhost/vsock.c
··· 244 244 restart_tx = true; 245 245 } 246 246 247 - consume_skb(skb); 247 + virtio_transport_consume_skb_sent(skb, true); 248 248 } 249 249 } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); 250 250 if (added) ··· 450 450 .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, 451 451 .notify_buffer_size = virtio_transport_notify_buffer_size, 452 452 .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, 453 + 454 + .unsent_bytes = virtio_transport_unsent_bytes, 453 455 454 456 .read_skb = virtio_transport_read_skb, 455 457 },
+6
include/linux/virtio_vsock.h
··· 133 133 u32 tx_cnt; 134 134 u32 peer_fwd_cnt; 135 135 u32 peer_buf_alloc; 136 + size_t bytes_unsent; 136 137 137 138 /* Protected by rx_lock */ 138 139 u32 fwd_cnt; ··· 193 192 s64 virtio_transport_stream_has_data(struct vsock_sock *vsk); 194 193 s64 virtio_transport_stream_has_space(struct vsock_sock *vsk); 195 194 u32 virtio_transport_seqpacket_has_data(struct vsock_sock *vsk); 195 + 196 + ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk); 197 + 198 + void virtio_transport_consume_skb_sent(struct sk_buff *skb, 199 + bool consume); 196 200 197 201 int virtio_transport_do_socket_init(struct vsock_sock *vsk, 198 202 struct vsock_sock *psk);
+3
include/net/af_vsock.h
··· 169 169 void (*notify_buffer_size)(struct vsock_sock *, u64 *); 170 170 int (*notify_set_rcvlowat)(struct vsock_sock *vsk, int val); 171 171 172 + /* SIOCOUTQ ioctl */ 173 + ssize_t (*unsent_bytes)(struct vsock_sock *vsk); 174 + 172 175 /* Shutdown. */ 173 176 int (*shutdown)(struct vsock_sock *, int); 174 177
+55 -3
net/vmw_vsock/af_vsock.c
··· 112 112 #include <net/sock.h> 113 113 #include <net/af_vsock.h> 114 114 #include <uapi/linux/vm_sockets.h> 115 + #include <uapi/asm-generic/ioctls.h> 115 116 116 117 static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); 117 118 static void vsock_sk_destruct(struct sock *sk); ··· 1293 1292 } 1294 1293 EXPORT_SYMBOL_GPL(vsock_dgram_recvmsg); 1295 1294 1295 + static int vsock_do_ioctl(struct socket *sock, unsigned int cmd, 1296 + int __user *arg) 1297 + { 1298 + struct sock *sk = sock->sk; 1299 + struct vsock_sock *vsk; 1300 + int ret; 1301 + 1302 + vsk = vsock_sk(sk); 1303 + 1304 + switch (cmd) { 1305 + case SIOCOUTQ: { 1306 + ssize_t n_bytes; 1307 + 1308 + if (!vsk->transport || !vsk->transport->unsent_bytes) { 1309 + ret = -EOPNOTSUPP; 1310 + break; 1311 + } 1312 + 1313 + if (sock_type_connectible(sk->sk_type) && sk->sk_state == TCP_LISTEN) { 1314 + ret = -EINVAL; 1315 + break; 1316 + } 1317 + 1318 + n_bytes = vsk->transport->unsent_bytes(vsk); 1319 + if (n_bytes < 0) { 1320 + ret = n_bytes; 1321 + break; 1322 + } 1323 + 1324 + ret = put_user(n_bytes, arg); 1325 + break; 1326 + } 1327 + default: 1328 + ret = -ENOIOCTLCMD; 1329 + } 1330 + 1331 + return ret; 1332 + } 1333 + 1334 + static int vsock_ioctl(struct socket *sock, unsigned int cmd, 1335 + unsigned long arg) 1336 + { 1337 + int ret; 1338 + 1339 + lock_sock(sock->sk); 1340 + ret = vsock_do_ioctl(sock, cmd, (int __user *)arg); 1341 + release_sock(sock->sk); 1342 + 1343 + return ret; 1344 + } 1345 + 1296 1346 static const struct proto_ops vsock_dgram_ops = { 1297 1347 .family = PF_VSOCK, 1298 1348 .owner = THIS_MODULE, ··· 1354 1302 .accept = sock_no_accept, 1355 1303 .getname = vsock_getname, 1356 1304 .poll = vsock_poll, 1357 - .ioctl = sock_no_ioctl, 1305 + .ioctl = vsock_ioctl, 1358 1306 .listen = sock_no_listen, 1359 1307 .shutdown = vsock_shutdown, 1360 1308 .sendmsg = vsock_dgram_sendmsg, ··· 2338 2286 .accept = vsock_accept, 2339 2287 .getname = vsock_getname, 2340 2288 .poll = 
vsock_poll, 2341 - .ioctl = sock_no_ioctl, 2289 + .ioctl = vsock_ioctl, 2342 2290 .listen = vsock_listen, 2343 2291 .shutdown = vsock_shutdown, 2344 2292 .setsockopt = vsock_connectible_setsockopt, ··· 2360 2308 .accept = vsock_accept, 2361 2309 .getname = vsock_getname, 2362 2310 .poll = vsock_poll, 2363 - .ioctl = sock_no_ioctl, 2311 + .ioctl = vsock_ioctl, 2364 2312 .listen = vsock_listen, 2365 2313 .shutdown = vsock_shutdown, 2366 2314 .setsockopt = vsock_connectible_setsockopt,
+3 -1
net/vmw_vsock/virtio_transport.c
··· 311 311 312 312 virtqueue_disable_cb(vq); 313 313 while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { 314 - consume_skb(skb); 314 + virtio_transport_consume_skb_sent(skb, true); 315 315 added = true; 316 316 } 317 317 } while (!virtqueue_enable_cb(vq)); ··· 539 539 .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, 540 540 .notify_buffer_size = virtio_transport_notify_buffer_size, 541 541 .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, 542 + 543 + .unsent_bytes = virtio_transport_unsent_bytes, 542 544 543 545 .read_skb = virtio_transport_read_skb, 544 546 },
+35
net/vmw_vsock/virtio_transport_common.c
··· 463 463 } 464 464 EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); 465 465 466 + void virtio_transport_consume_skb_sent(struct sk_buff *skb, bool consume) 467 + { 468 + struct sock *s = skb->sk; 469 + 470 + if (s && skb->len) { 471 + struct vsock_sock *vs = vsock_sk(s); 472 + struct virtio_vsock_sock *vvs; 473 + 474 + vvs = vs->trans; 475 + 476 + spin_lock_bh(&vvs->tx_lock); 477 + vvs->bytes_unsent -= skb->len; 478 + spin_unlock_bh(&vvs->tx_lock); 479 + } 480 + 481 + if (consume) 482 + consume_skb(skb); 483 + } 484 + EXPORT_SYMBOL_GPL(virtio_transport_consume_skb_sent); 485 + 466 486 u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 credit) 467 487 { 468 488 u32 ret; ··· 495 475 if (ret > credit) 496 476 ret = credit; 497 477 vvs->tx_cnt += ret; 478 + vvs->bytes_unsent += ret; 498 479 spin_unlock_bh(&vvs->tx_lock); 499 480 500 481 return ret; ··· 509 488 510 489 spin_lock_bh(&vvs->tx_lock); 511 490 vvs->tx_cnt -= credit; 491 + vvs->bytes_unsent -= credit; 512 492 spin_unlock_bh(&vvs->tx_lock); 513 493 } 514 494 EXPORT_SYMBOL_GPL(virtio_transport_put_credit); ··· 1111 1089 kfree(vvs); 1112 1090 } 1113 1091 EXPORT_SYMBOL_GPL(virtio_transport_destruct); 1092 + 1093 + ssize_t virtio_transport_unsent_bytes(struct vsock_sock *vsk) 1094 + { 1095 + struct virtio_vsock_sock *vvs = vsk->trans; 1096 + size_t ret; 1097 + 1098 + spin_lock_bh(&vvs->tx_lock); 1099 + ret = vvs->bytes_unsent; 1100 + spin_unlock_bh(&vvs->tx_lock); 1101 + 1102 + return ret; 1103 + } 1104 + EXPORT_SYMBOL_GPL(virtio_transport_unsent_bytes); 1114 1105 1115 1106 static int virtio_transport_reset(struct vsock_sock *vsk, 1116 1107 struct sk_buff *skb)
+6
net/vmw_vsock/vsock_loopback.c
··· 98 98 .notify_buffer_size = virtio_transport_notify_buffer_size, 99 99 .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, 100 100 101 + .unsent_bytes = virtio_transport_unsent_bytes, 102 + 101 103 .read_skb = virtio_transport_read_skb, 102 104 }, 103 105 ··· 125 123 spin_unlock_bh(&vsock->pkt_queue.lock); 126 124 127 125 while ((skb = __skb_dequeue(&pkts))) { 126 + /* Decrement the bytes_unsent counter without deallocating skb 127 + * It is freed by the receiver. 128 + */ 129 + virtio_transport_consume_skb_sent(skb, false); 128 130 virtio_transport_deliver_tap_pkt(skb); 129 131 virtio_transport_recv_pkt(&loopback_transport, skb); 130 132 }
+3 -3
tools/testing/vsock/util.c
··· 139 139 } 140 140 141 141 /* Connect to <cid, port> and return the file descriptor. */ 142 - static int vsock_connect(unsigned int cid, unsigned int port, int type) 142 + int vsock_connect(unsigned int cid, unsigned int port, int type) 143 143 { 144 144 union { 145 145 struct sockaddr sa; ··· 226 226 /* Listen on <cid, port> and return the first incoming connection. The remote 227 227 * address is stored to clientaddrp. clientaddrp may be NULL. 228 228 */ 229 - static int vsock_accept(unsigned int cid, unsigned int port, 230 - struct sockaddr_vm *clientaddrp, int type) 229 + int vsock_accept(unsigned int cid, unsigned int port, 230 + struct sockaddr_vm *clientaddrp, int type) 231 231 { 232 232 union { 233 233 struct sockaddr sa;
+3
tools/testing/vsock/util.h
··· 39 39 void init_signals(void); 40 40 unsigned int parse_cid(const char *str); 41 41 unsigned int parse_port(const char *str); 42 + int vsock_connect(unsigned int cid, unsigned int port, int type); 43 + int vsock_accept(unsigned int cid, unsigned int port, 44 + struct sockaddr_vm *clientaddrp, int type); 42 45 int vsock_stream_connect(unsigned int cid, unsigned int port); 43 46 int vsock_bind_connect(unsigned int cid, unsigned int port, 44 47 unsigned int bind_port, int type);
+85
tools/testing/vsock/vsock_test.c
··· 20 20 #include <sys/mman.h> 21 21 #include <poll.h> 22 22 #include <signal.h> 23 + #include <sys/ioctl.h> 24 + #include <linux/sockios.h> 23 25 24 26 #include "vsock_test_zerocopy.h" 25 27 #include "timeout.h" ··· 1240 1238 } 1241 1239 } 1242 1240 1241 + #define MSG_BUF_IOCTL_LEN 64 1242 + static void test_unsent_bytes_server(const struct test_opts *opts, int type) 1243 + { 1244 + unsigned char buf[MSG_BUF_IOCTL_LEN]; 1245 + int client_fd; 1246 + 1247 + client_fd = vsock_accept(VMADDR_CID_ANY, opts->peer_port, NULL, type); 1248 + if (client_fd < 0) { 1249 + perror("accept"); 1250 + exit(EXIT_FAILURE); 1251 + } 1252 + 1253 + recv_buf(client_fd, buf, sizeof(buf), 0, sizeof(buf)); 1254 + control_writeln("RECEIVED"); 1255 + 1256 + close(client_fd); 1257 + } 1258 + 1259 + static void test_unsent_bytes_client(const struct test_opts *opts, int type) 1260 + { 1261 + unsigned char buf[MSG_BUF_IOCTL_LEN]; 1262 + int ret, fd, sock_bytes_unsent; 1263 + 1264 + fd = vsock_connect(opts->peer_cid, opts->peer_port, type); 1265 + if (fd < 0) { 1266 + perror("connect"); 1267 + exit(EXIT_FAILURE); 1268 + } 1269 + 1270 + for (int i = 0; i < sizeof(buf); i++) 1271 + buf[i] = rand() & 0xFF; 1272 + 1273 + send_buf(fd, buf, sizeof(buf), 0, sizeof(buf)); 1274 + control_expectln("RECEIVED"); 1275 + 1276 + ret = ioctl(fd, SIOCOUTQ, &sock_bytes_unsent); 1277 + if (ret < 0) { 1278 + if (errno == EOPNOTSUPP) { 1279 + fprintf(stderr, "Test skipped, SIOCOUTQ not supported.\n"); 1280 + } else { 1281 + perror("ioctl"); 1282 + exit(EXIT_FAILURE); 1283 + } 1284 + } else if (ret == 0 && sock_bytes_unsent != 0) { 1285 + fprintf(stderr, 1286 + "Unexpected 'SIOCOUTQ' value, expected 0, got %i\n", 1287 + sock_bytes_unsent); 1288 + exit(EXIT_FAILURE); 1289 + } 1290 + 1291 + close(fd); 1292 + } 1293 + 1294 + static void test_stream_unsent_bytes_client(const struct test_opts *opts) 1295 + { 1296 + test_unsent_bytes_client(opts, SOCK_STREAM); 1297 + } 1298 + 1299 + static void 
test_stream_unsent_bytes_server(const struct test_opts *opts) 1300 + { 1301 + test_unsent_bytes_server(opts, SOCK_STREAM); 1302 + } 1303 + 1304 + static void test_seqpacket_unsent_bytes_client(const struct test_opts *opts) 1305 + { 1306 + test_unsent_bytes_client(opts, SOCK_SEQPACKET); 1307 + } 1308 + 1309 + static void test_seqpacket_unsent_bytes_server(const struct test_opts *opts) 1310 + { 1311 + test_unsent_bytes_server(opts, SOCK_SEQPACKET); 1312 + } 1313 + 1243 1314 #define RCVLOWAT_CREDIT_UPD_BUF_SIZE (1024 * 128) 1244 1315 /* This define is the same as in 'include/linux/virtio_vsock.h': 1245 1316 * it is used to decide when to send credit update message during ··· 1597 1522 .name = "SOCK_STREAM virtio credit update + low rx_bytes", 1598 1523 .run_client = test_stream_rcvlowat_def_cred_upd_client, 1599 1524 .run_server = test_stream_cred_upd_on_low_rx_bytes, 1525 + }, 1526 + { 1527 + .name = "SOCK_STREAM ioctl(SIOCOUTQ) 0 unsent bytes", 1528 + .run_client = test_stream_unsent_bytes_client, 1529 + .run_server = test_stream_unsent_bytes_server, 1530 + }, 1531 + { 1532 + .name = "SOCK_SEQPACKET ioctl(SIOCOUTQ) 0 unsent bytes", 1533 + .run_client = test_seqpacket_unsent_bytes_client, 1534 + .run_server = test_seqpacket_unsent_bytes_server, 1600 1535 }, 1601 1536 {}, 1602 1537 };