Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'udp-small-changes-on-receive-path'

Eric Dumazet says:

====================
udp: small changes on receive path

This series is based on an observation I made in the UDP receive path.

The sock_def_readable() costs are pretty high, especially when
epoll is used to generate EPOLLIN events.

First patch annotates races on sk->sk_rcvbuf reads.

Second patch replaces an atomic_add_return()
with a less expensive atomic_add().

Third patch avoids calling sock_def_readable() when possible.

Fourth patch adds sk_wake_async_rcu() to get better inlining
and code generation.
====================

Link: https://lore.kernel.org/r/20240328144032.1864988-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+38 -28
+2 -2
crypto/af_alg.c
··· 847 847 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | 848 848 EPOLLRDNORM | 849 849 EPOLLRDBAND); 850 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 850 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 851 851 rcu_read_unlock(); 852 852 } 853 853 EXPORT_SYMBOL_GPL(af_alg_wmem_wakeup); ··· 914 914 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 915 915 EPOLLRDNORM | 916 916 EPOLLRDBAND); 917 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 917 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 918 918 rcu_read_unlock(); 919 919 } 920 920
+6
include/net/sock.h
··· 2513 2513 } 2514 2514 } 2515 2515 2516 + static inline void sk_wake_async_rcu(const struct sock *sk, int how, int band) 2517 + { 2518 + if (unlikely(sock_flag(sk, SOCK_FASYNC))) 2519 + sock_wake_async(rcu_dereference(sk->sk_wq), how, band); 2520 + } 2521 + 2516 2522 /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might 2517 2523 * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. 2518 2524 * Note: for send buffers, TCP works better if we can build two skbs at
+1 -1
net/atm/common.c
··· 116 116 if (skwq_has_sleeper(wq)) 117 117 wake_up_interruptible(&wq->wait); 118 118 119 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 119 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 120 120 } 121 121 122 122 rcu_read_unlock();
+4 -4
net/core/sock.c
··· 3338 3338 wq = rcu_dereference(sk->sk_wq); 3339 3339 if (skwq_has_sleeper(wq)) 3340 3340 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3341 - sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 3341 + sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3342 3342 rcu_read_unlock(); 3343 3343 } 3344 3344 ··· 3353 3353 if (skwq_has_sleeper(wq)) 3354 3354 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3355 3355 EPOLLRDNORM | EPOLLRDBAND); 3356 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 3356 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3357 3357 rcu_read_unlock(); 3358 3358 } 3359 3359 ··· 3373 3373 EPOLLWRNORM | EPOLLWRBAND); 3374 3374 3375 3375 /* Should agree with poll, otherwise some programs break */ 3376 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3376 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3377 3377 } 3378 3378 3379 3379 rcu_read_unlock(); ··· 3398 3398 EPOLLWRNORM | EPOLLWRBAND); 3399 3399 3400 3400 /* Should agree with poll, otherwise some programs break */ 3401 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 3401 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3402 3402 } 3403 3403 } 3404 3404
+1 -1
net/dccp/output.c
··· 204 204 wake_up_interruptible(&wq->wait); 205 205 /* Should agree with poll, otherwise some programs break */ 206 206 if (sock_writeable(sk)) 207 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 207 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 208 208 209 209 rcu_read_unlock(); 210 210 }
+18 -14
net/ipv4/udp.c
··· 1492 1492 struct sk_buff_head *list = &sk->sk_receive_queue; 1493 1493 int rmem, err = -ENOMEM; 1494 1494 spinlock_t *busy = NULL; 1495 - int size; 1495 + bool becomes_readable; 1496 + int size, rcvbuf; 1496 1497 1497 - /* try to avoid the costly atomic add/sub pair when the receive 1498 - * queue is full; always allow at least a packet 1498 + /* Immediately drop when the receive queue is full. 1499 + * Always allow at least one packet. 1499 1500 */ 1500 1501 rmem = atomic_read(&sk->sk_rmem_alloc); 1501 - if (rmem > sk->sk_rcvbuf) 1502 + rcvbuf = READ_ONCE(sk->sk_rcvbuf); 1503 + if (rmem > rcvbuf) 1502 1504 goto drop; 1503 1505 1504 1506 /* Under mem pressure, it might be helpful to help udp_recvmsg() ··· 1509 1507 * - Less cache line misses at copyout() time 1510 1508 * - Less work at consume_skb() (less alien page frag freeing) 1511 1509 */ 1512 - if (rmem > (sk->sk_rcvbuf >> 1)) { 1510 + if (rmem > (rcvbuf >> 1)) { 1513 1511 skb_condense(skb); 1514 1512 1515 1513 busy = busylock_acquire(sk); ··· 1517 1515 size = skb->truesize; 1518 1516 udp_set_dev_scratch(skb); 1519 1517 1520 - /* we drop only if the receive buf is full and the receive 1521 - * queue contains some other skb 1522 - */ 1523 - rmem = atomic_add_return(size, &sk->sk_rmem_alloc); 1524 - if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) 1525 - goto uncharge_drop; 1518 + atomic_add(size, &sk->sk_rmem_alloc); 1526 1519 1527 1520 spin_lock(&list->lock); 1528 1521 err = udp_rmem_schedule(sk, size); ··· 1533 1536 */ 1534 1537 sock_skb_set_dropcount(sk, skb); 1535 1538 1539 + becomes_readable = skb_queue_empty(list); 1536 1540 __skb_queue_tail(list, skb); 1537 1541 spin_unlock(&list->lock); 1538 1542 1539 - if (!sock_flag(sk, SOCK_DEAD)) 1540 - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); 1541 - 1543 + if (!sock_flag(sk, SOCK_DEAD)) { 1544 + if (becomes_readable || 1545 + sk->sk_data_ready != sock_def_readable || 1546 + READ_ONCE(sk->sk_peek_off) >= 0) 1547 + 
INDIRECT_CALL_1(sk->sk_data_ready, 1548 + sock_def_readable, sk); 1549 + else 1550 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 1551 + } 1542 1552 busylock_release(busy); 1543 1553 return 0; 1544 1554
+1 -1
net/iucv/af_iucv.c
··· 184 184 wq = rcu_dereference(sk->sk_wq); 185 185 if (skwq_has_sleeper(wq)) 186 186 wake_up_interruptible_all(&wq->wait); 187 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 187 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 188 188 rcu_read_unlock(); 189 189 } 190 190
+1 -1
net/rxrpc/af_rxrpc.c
··· 65 65 66 66 if (skwq_has_sleeper(wq)) 67 67 wake_up_interruptible(&wq->wait); 68 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 68 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 69 69 } 70 70 rcu_read_unlock(); 71 71 }
+1 -1
net/sctp/socket.c
··· 9276 9276 if (skwq_has_sleeper(wq)) 9277 9277 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | 9278 9278 EPOLLRDNORM | EPOLLRDBAND); 9279 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 9279 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 9280 9280 rcu_read_unlock(); 9281 9281 } 9282 9282
+2 -2
net/smc/smc_rx.c
··· 42 42 if (skwq_has_sleeper(wq)) 43 43 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 44 44 EPOLLRDNORM | EPOLLRDBAND); 45 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 45 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 46 46 if ((sk->sk_shutdown == SHUTDOWN_MASK) || 47 47 (sk->sk_state == SMC_CLOSED)) 48 - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); 48 + sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_HUP); 49 49 rcu_read_unlock(); 50 50 } 51 51
+1 -1
net/unix/af_unix.c
··· 546 546 if (skwq_has_sleeper(wq)) 547 547 wake_up_interruptible_sync_poll(&wq->wait, 548 548 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND); 549 - sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 549 + sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 550 550 } 551 551 rcu_read_unlock(); 552 552 }