Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

tcp: Use per-vma locking for receive zerocopy

Per-VMA locking allows us to lock a struct vm_area_struct without
taking the process-wide mmap lock in read mode.

Consider a process workload where the mmap lock is taken constantly in
write mode. In this scenario, all zerocopy receives are blocked
whenever the write lock is held - even though, in principle, the memory
ranges being used by TCP are not touched by the operations that need
the mmap write lock. This results in performance degradation.

Now consider another workload where the mmap lock is never taken in
write mode, but there are many TCP connections using receive zerocopy
that are concurrently receiving. These connections all take the mmap
lock only in read mode, but this still induces a lot of contention and
atomic ops on this process-wide lock. This results in additional CPU
overhead caused by contending on the cache line for this lock.

However, with per-vma locking, both of these problems can be avoided.

As a test, I ran an RPC-style request/response workload with 4KB
payloads and receive zerocopy enabled, with 100 simultaneous TCP
connections. I measured perf cycles within the
find_tcp_vma/mmap_read_lock/mmap_read_unlock codepath, with and
without per-vma locking enabled.

When using process-wide mmap semaphore read locking, about 1% of
measured perf cycles were within this path. With per-VMA locking, this
value dropped to about 0.45%.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Arjun Roy and committed by
David S. Miller
7a7f0946 b650d953

+60 -11
+1
MAINTAINERS
··· 14743 14743 M: Eric Dumazet <edumazet@google.com> 14744 14744 L: netdev@vger.kernel.org 14745 14745 S: Maintained 14746 + F: include/linux/net_mm.h 14746 14747 F: include/linux/tcp.h 14747 14748 F: include/net/tcp.h 14748 14749 F: include/trace/events/tcp.h
+17
include/linux/net_mm.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + #ifdef CONFIG_MMU 3 + 4 + #ifdef CONFIG_INET 5 + extern const struct vm_operations_struct tcp_vm_ops; 6 + static inline bool vma_is_tcp(const struct vm_area_struct *vma) 7 + { 8 + return vma->vm_ops == &tcp_vm_ops; 9 + } 10 + #else 11 + static inline bool vma_is_tcp(const struct vm_area_struct *vma) 12 + { 13 + return false; 14 + } 15 + #endif /* CONFIG_INET*/ 16 + 17 + #endif /* CONFIG_MMU */
+1
include/net/tcp.h
··· 45 45 #include <linux/memcontrol.h> 46 46 #include <linux/bpf-cgroup.h> 47 47 #include <linux/siphash.h> 48 + #include <linux/net_mm.h> 48 49 49 50 extern struct inet_hashinfo tcp_hashinfo; 50 51
+4 -3
mm/memory.c
··· 77 77 #include <linux/ptrace.h> 78 78 #include <linux/vmalloc.h> 79 79 #include <linux/sched/sysctl.h> 80 + #include <linux/net_mm.h> 80 81 81 82 #include <trace/events/kmem.h> 82 83 ··· 5281 5280 if (!vma) 5282 5281 goto inval; 5283 5282 5284 - /* Only anonymous vmas are supported for now */ 5285 - if (!vma_is_anonymous(vma)) 5283 + /* Only anonymous and tcp vmas are supported for now */ 5284 + if (!vma_is_anonymous(vma) && !vma_is_tcp(vma)) 5286 5285 goto inval; 5287 5286 5288 5287 /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ 5289 - if (!vma->anon_vma) 5288 + if (!vma->anon_vma && !vma_is_tcp(vma)) 5290 5289 goto inval; 5291 5290 5292 5291 if (!vma_start_read(vma))
+37 -8
net/ipv4/tcp.c
··· 1774 1774 } 1775 1775 1776 1776 #ifdef CONFIG_MMU 1777 - static const struct vm_operations_struct tcp_vm_ops = { 1777 + const struct vm_operations_struct tcp_vm_ops = { 1778 1778 }; 1779 1779 1780 1780 int tcp_mmap(struct file *file, struct socket *sock, ··· 2073 2073 } 2074 2074 } 2075 2075 2076 + static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, 2077 + unsigned long address, 2078 + bool *mmap_locked) 2079 + { 2080 + struct vm_area_struct *vma = NULL; 2081 + 2082 + #ifdef CONFIG_PER_VMA_LOCK 2083 + vma = lock_vma_under_rcu(mm, address); 2084 + #endif 2085 + if (vma) { 2086 + if (!vma_is_tcp(vma)) { 2087 + vma_end_read(vma); 2088 + return NULL; 2089 + } 2090 + *mmap_locked = false; 2091 + return vma; 2092 + } 2093 + 2094 + mmap_read_lock(mm); 2095 + vma = vma_lookup(mm, address); 2096 + if (!vma || !vma_is_tcp(vma)) { 2097 + mmap_read_unlock(mm); 2098 + return NULL; 2099 + } 2100 + *mmap_locked = true; 2101 + return vma; 2102 + } 2103 + 2076 2104 #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 2077 2105 static int tcp_zerocopy_receive(struct sock *sk, 2078 2106 struct tcp_zerocopy_receive *zc, ··· 2118 2090 u32 seq = tp->copied_seq; 2119 2091 u32 total_bytes_to_map; 2120 2092 int inq = tcp_inq(sk); 2093 + bool mmap_locked; 2121 2094 int ret; 2122 2095 2123 2096 zc->copybuf_len = 0; ··· 2143 2114 return 0; 2144 2115 } 2145 2116 2146 - mmap_read_lock(current->mm); 2147 - 2148 - vma = vma_lookup(current->mm, address); 2149 - if (!vma || vma->vm_ops != &tcp_vm_ops) { 2150 - mmap_read_unlock(current->mm); 2117 + vma = find_tcp_vma(current->mm, address, &mmap_locked); 2118 + if (!vma) 2151 2119 return -EINVAL; 2152 - } 2120 + 2153 2121 vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); 2154 2122 avail_len = min_t(u32, vma_len, inq); 2155 2123 total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1); ··· 2220 2194 zc, total_bytes_to_map); 2221 2195 } 2222 2196 out: 2223 - mmap_read_unlock(current->mm); 2197 + if (mmap_locked) 2198 + 
mmap_read_unlock(current->mm); 2199 + else 2200 + vma_end_read(vma); 2224 2201 /* Try to copy straggler data. */ 2225 2202 if (!ret) 2226 2203 copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss);