Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'af_unix-random-improvements-for-gc'

Kuniyuki Iwashima says:

====================
af_unix: Random improvements for GC.

If more than 16000 inflight AF_UNIX sockets exist on a host, each
sendmsg() will be forced to wait for unix_gc() even if a process
is not sending any FD.

This series tries not to impose such a penalty on sane users who
do not send AF_UNIX FDs or do not have more than SCM_MAX_FD * 8
inflight sockets.

The first patch can be backported to -stable.

Cleanup patches for commit 69db702c8387 ("io_uring/af_unix: disable
sending io_uring over sockets") and a large refactoring of GC will
follow later.

v4: https://lore.kernel.org/netdev/20231219030102.27509-1-kuniyu@amazon.com/
v3: https://lore.kernel.org/netdev/20231218075020.60826-1-kuniyu@amazon.com/
v2: https://lore.kernel.org/netdev/20231123014747.66063-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20231122013629.28554-1-kuniyu@amazon.com/
====================

Link: https://lore.kernel.org/r/20240123170856.41348-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+85 -70
+11 -3
include/net/af_unix.h
··· 8 8 #include <linux/refcount.h> 9 9 #include <net/sock.h> 10 10 11 + #if IS_ENABLED(CONFIG_UNIX) 12 + struct unix_sock *unix_get_socket(struct file *filp); 13 + #else 14 + static inline struct unix_sock *unix_get_socket(struct file *filp) 15 + { 16 + return NULL; 17 + } 18 + #endif 19 + 11 20 void unix_inflight(struct user_struct *user, struct file *fp); 12 21 void unix_notinflight(struct user_struct *user, struct file *fp); 13 22 void unix_destruct_scm(struct sk_buff *skb); 14 23 void io_uring_destruct_scm(struct sk_buff *skb); 15 24 void unix_gc(void); 16 - void wait_for_unix_gc(void); 17 - struct sock *unix_get_socket(struct file *filp); 25 + void wait_for_unix_gc(struct scm_fp_list *fpl); 18 26 struct sock *unix_peer_get(struct sock *sk); 19 27 20 28 #define UNIX_HASH_MOD (256 - 1) ··· 69 61 struct mutex iolock, bindlock; 70 62 struct sock *peer; 71 63 struct list_head link; 72 - atomic_long_t inflight; 64 + unsigned long inflight; 73 65 spinlock_t lock; 74 66 unsigned long gc_flags; 75 67 #define UNIX_GC_CANDIDATE 0
+1
include/net/scm.h
··· 25 25 26 26 struct scm_fp_list { 27 27 short count; 28 + short count_unix; 28 29 short max; 29 30 struct user_struct *user; 30 31 struct file *fp[SCM_MAX_FD];
+5
net/core/scm.c
··· 36 36 #include <net/compat.h> 37 37 #include <net/scm.h> 38 38 #include <net/cls_cgroup.h> 39 + #include <net/af_unix.h> 39 40 40 41 41 42 /* ··· 86 85 return -ENOMEM; 87 86 *fplp = fpl; 88 87 fpl->count = 0; 88 + fpl->count_unix = 0; 89 89 fpl->max = SCM_MAX_FD; 90 90 fpl->user = NULL; 91 91 } ··· 111 109 fput(file); 112 110 return -EINVAL; 113 111 } 112 + if (unix_get_socket(file)) 113 + fpl->count_unix++; 114 + 114 115 *fpp++ = file; 115 116 fpl->count++; 116 117 }
+6 -4
net/unix/af_unix.c
··· 993 993 sk->sk_write_space = unix_write_space; 994 994 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; 995 995 sk->sk_destruct = unix_sock_destructor; 996 - u = unix_sk(sk); 996 + u = unix_sk(sk); 997 + u->inflight = 0; 997 998 u->path.dentry = NULL; 998 999 u->path.mnt = NULL; 999 1000 spin_lock_init(&u->lock); 1000 - atomic_long_set(&u->inflight, 0); 1001 1001 INIT_LIST_HEAD(&u->link); 1002 1002 mutex_init(&u->iolock); /* single task reading lock */ 1003 1003 mutex_init(&u->bindlock); /* single task binding lock */ ··· 1923 1923 long timeo; 1924 1924 int err; 1925 1925 1926 - wait_for_unix_gc(); 1927 1926 err = scm_send(sock, msg, &scm, false); 1928 1927 if (err < 0) 1929 1928 return err; 1929 + 1930 + wait_for_unix_gc(scm.fp); 1930 1931 1931 1932 err = -EOPNOTSUPP; 1932 1933 if (msg->msg_flags&MSG_OOB) ··· 2200 2199 bool fds_sent = false; 2201 2200 int data_len; 2202 2201 2203 - wait_for_unix_gc(); 2204 2202 err = scm_send(sock, msg, &scm, false); 2205 2203 if (err < 0) 2206 2204 return err; 2205 + 2206 + wait_for_unix_gc(scm.fp); 2207 2207 2208 2208 err = -EOPNOTSUPP; 2209 2209 if (msg->msg_flags & MSG_OOB) {
+50 -48
net/unix/garbage.c
··· 86 86 /* Internal data structures and random procedures: */ 87 87 88 88 static LIST_HEAD(gc_candidates); 89 - static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); 90 89 91 90 static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), 92 91 struct sk_buff_head *hitlist) ··· 104 105 105 106 while (nfd--) { 106 107 /* Get the socket the fd matches if it indeed does so */ 107 - struct sock *sk = unix_get_socket(*fp++); 108 + struct unix_sock *u = unix_get_socket(*fp++); 108 109 109 - if (sk) { 110 - struct unix_sock *u = unix_sk(sk); 110 + /* Ignore non-candidates, they could have been added 111 + * to the queues after starting the garbage collection 112 + */ 113 + if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { 114 + hit = true; 111 115 112 - /* Ignore non-candidates, they could 113 - * have been added to the queues after 114 - * starting the garbage collection 115 - */ 116 - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { 117 - hit = true; 118 - 119 - func(u); 120 - } 116 + func(u); 121 117 } 122 118 } 123 119 if (hit && hitlist != NULL) { ··· 160 166 161 167 static void dec_inflight(struct unix_sock *usk) 162 168 { 163 - atomic_long_dec(&usk->inflight); 169 + usk->inflight--; 164 170 } 165 171 166 172 static void inc_inflight(struct unix_sock *usk) 167 173 { 168 - atomic_long_inc(&usk->inflight); 174 + usk->inflight++; 169 175 } 170 176 171 177 static void inc_inflight_move_tail(struct unix_sock *u) 172 178 { 173 - atomic_long_inc(&u->inflight); 179 + u->inflight++; 180 + 174 181 /* If this still might be part of a cycle, move it to the end 175 182 * of the list, so that it's checked even if it was already 176 183 * passed over ··· 181 186 } 182 187 183 188 static bool gc_in_progress; 184 - #define UNIX_INFLIGHT_TRIGGER_GC 16000 185 189 186 - void wait_for_unix_gc(void) 187 - { 188 - /* If number of inflight sockets is insane, 189 - * force a garbage collect right now. 
190 - * Paired with the WRITE_ONCE() in unix_inflight(), 191 - * unix_notinflight() and gc_in_progress(). 192 - */ 193 - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && 194 - !READ_ONCE(gc_in_progress)) 195 - unix_gc(); 196 - wait_event(unix_gc_wait, gc_in_progress == false); 197 - } 198 - 199 - /* The external entry point: unix_gc() */ 200 - void unix_gc(void) 190 + static void __unix_gc(struct work_struct *work) 201 191 { 202 192 struct sk_buff *next_skb, *skb; 203 193 struct unix_sock *u; ··· 192 212 LIST_HEAD(not_cycle_list); 193 213 194 214 spin_lock(&unix_gc_lock); 195 - 196 - /* Avoid a recursive GC. */ 197 - if (gc_in_progress) 198 - goto out; 199 - 200 - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ 201 - WRITE_ONCE(gc_in_progress, true); 202 215 203 216 /* First, select candidates for garbage collection. Only 204 217 * in-flight sockets are considered, and from those only ones ··· 210 237 */ 211 238 list_for_each_entry_safe(u, next, &gc_inflight_list, link) { 212 239 long total_refs; 213 - long inflight_refs; 214 240 215 241 total_refs = file_count(u->sk.sk_socket->file); 216 - inflight_refs = atomic_long_read(&u->inflight); 217 242 218 - BUG_ON(inflight_refs < 1); 219 - BUG_ON(total_refs < inflight_refs); 220 - if (total_refs == inflight_refs) { 243 + BUG_ON(!u->inflight); 244 + BUG_ON(total_refs < u->inflight); 245 + if (total_refs == u->inflight) { 221 246 list_move_tail(&u->link, &gc_candidates); 222 247 __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); 223 248 __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); ··· 242 271 /* Move cursor to after the current position. */ 243 272 list_move(&cursor, &u->link); 244 273 245 - if (atomic_long_read(&u->inflight) > 0) { 274 + if (u->inflight) { 246 275 list_move_tail(&u->link, &not_cycle_list); 247 276 __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); 248 277 scan_children(&u->sk, inc_inflight_move_tail, NULL); ··· 299 328 /* Paired with READ_ONCE() in wait_for_unix_gc(). 
*/ 300 329 WRITE_ONCE(gc_in_progress, false); 301 330 302 - wake_up(&unix_gc_wait); 303 - 304 - out: 305 331 spin_unlock(&unix_gc_lock); 332 + } 333 + 334 + static DECLARE_WORK(unix_gc_work, __unix_gc); 335 + 336 + void unix_gc(void) 337 + { 338 + WRITE_ONCE(gc_in_progress, true); 339 + queue_work(system_unbound_wq, &unix_gc_work); 340 + } 341 + 342 + #define UNIX_INFLIGHT_TRIGGER_GC 16000 343 + #define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) 344 + 345 + void wait_for_unix_gc(struct scm_fp_list *fpl) 346 + { 347 + /* If number of inflight sockets is insane, 348 + * force a garbage collect right now. 349 + * 350 + * Paired with the WRITE_ONCE() in unix_inflight(), 351 + * unix_notinflight(), and __unix_gc(). 352 + */ 353 + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && 354 + !READ_ONCE(gc_in_progress)) 355 + unix_gc(); 356 + 357 + /* Penalise users who want to send AF_UNIX sockets 358 + * but whose sockets have not been received yet. 359 + */ 360 + if (!fpl || !fpl->count_unix || 361 + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) 362 + return; 363 + 364 + if (READ_ONCE(gc_in_progress)) 365 + flush_work(&unix_gc_work); 306 366 }
+12 -15
net/unix/scm.c
··· 21 21 DEFINE_SPINLOCK(unix_gc_lock); 22 22 EXPORT_SYMBOL(unix_gc_lock); 23 23 24 - struct sock *unix_get_socket(struct file *filp) 24 + struct unix_sock *unix_get_socket(struct file *filp) 25 25 { 26 - struct sock *u_sock = NULL; 27 26 struct inode *inode = file_inode(filp); 28 27 29 28 /* Socket ? */ ··· 33 34 34 35 /* PF_UNIX ? */ 35 36 if (s && ops && ops->family == PF_UNIX) 36 - u_sock = s; 37 + return unix_sk(s); 37 38 } 38 39 39 - return u_sock; 40 + return NULL; 40 41 } 41 42 EXPORT_SYMBOL(unix_get_socket); 42 43 ··· 45 46 */ 46 47 void unix_inflight(struct user_struct *user, struct file *fp) 47 48 { 48 - struct sock *s = unix_get_socket(fp); 49 + struct unix_sock *u = unix_get_socket(fp); 49 50 50 51 spin_lock(&unix_gc_lock); 51 52 52 - if (s) { 53 - struct unix_sock *u = unix_sk(s); 54 - 55 - if (atomic_long_inc_return(&u->inflight) == 1) { 53 + if (u) { 54 + if (!u->inflight) { 56 55 BUG_ON(!list_empty(&u->link)); 57 56 list_add_tail(&u->link, &gc_inflight_list); 58 57 } else { 59 58 BUG_ON(list_empty(&u->link)); 60 59 } 60 + u->inflight++; 61 61 /* Paired with READ_ONCE() in wait_for_unix_gc() */ 62 62 WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); 63 63 } ··· 66 68 67 69 void unix_notinflight(struct user_struct *user, struct file *fp) 68 70 { 69 - struct sock *s = unix_get_socket(fp); 71 + struct unix_sock *u = unix_get_socket(fp); 70 72 71 73 spin_lock(&unix_gc_lock); 72 74 73 - if (s) { 74 - struct unix_sock *u = unix_sk(s); 75 - 76 - BUG_ON(!atomic_long_read(&u->inflight)); 75 + if (u) { 76 + BUG_ON(!u->inflight); 77 77 BUG_ON(list_empty(&u->link)); 78 78 79 - if (atomic_long_dec_and_test(&u->inflight)) 79 + u->inflight--; 80 + if (!u->inflight) 80 81 list_del_init(&u->link); 81 82 /* Paired with READ_ONCE() in wait_for_unix_gc() */ 82 83 WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);