Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'inet-frags-fully-use-rcu'

Eric Dumazet says:

====================
inet: frags: fully use RCU

While inet reassembly uses RCU, it still acquires and releases
a refcount on struct inet_frag_queue in the fast path,
for no good reason.

This was mentioned in one patch changelog seven years ago :/

This series removes these refcount changes by extending
the RCU read-side sections.
====================

Link: https://patch.msgid.link/20250312082250.1803501-1-edumazet@google.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

+89 -84
+3 -3
include/net/inet_frag.h
··· 137 137 } 138 138 void fqdir_exit(struct fqdir *fqdir); 139 139 140 - void inet_frag_kill(struct inet_frag_queue *q); 140 + void inet_frag_kill(struct inet_frag_queue *q, int *refs); 141 141 void inet_frag_destroy(struct inet_frag_queue *q); 142 142 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); 143 143 ··· 145 145 unsigned int inet_frag_rbtree_purge(struct rb_root *root, 146 146 enum skb_drop_reason reason); 147 147 148 - static inline void inet_frag_put(struct inet_frag_queue *q) 148 + static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) 149 149 { 150 - if (refcount_dec_and_test(&q->refcnt)) 150 + if (refs && refcount_sub_and_test(refs, &q->refcnt)) 151 151 inet_frag_destroy(q); 152 152 } 153 153
+3 -2
include/net/ipv6_frag.h
··· 66 66 { 67 67 struct net_device *dev = NULL; 68 68 struct sk_buff *head; 69 + int refs = 1; 69 70 70 71 rcu_read_lock(); 71 72 /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ ··· 78 77 goto out; 79 78 80 79 fq->q.flags |= INET_FRAG_DROP; 81 - inet_frag_kill(&fq->q); 80 + inet_frag_kill(&fq->q, &refs); 82 81 83 82 dev = dev_get_by_index_rcu(net, fq->iif); 84 83 if (!dev) ··· 110 109 spin_unlock(&fq->q.lock); 111 110 out_rcu_unlock: 112 111 rcu_read_unlock(); 113 - inet_frag_put(&fq->q); 112 + inet_frag_putn(&fq->q, refs); 114 113 } 115 114 116 115 /* Check if the upper layer header is truncated in the first fragment. */
+17 -10
net/ieee802154/6lowpan/reassembly.c
··· 31 31 static struct inet_frags lowpan_frags; 32 32 33 33 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, 34 - struct sk_buff *prev, struct net_device *ldev); 34 + struct sk_buff *prev, struct net_device *ldev, 35 + int *refs); 35 36 36 37 static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) 37 38 { ··· 46 45 { 47 46 struct inet_frag_queue *frag = from_timer(frag, t, timer); 48 47 struct frag_queue *fq; 48 + int refs = 1; 49 49 50 50 fq = container_of(frag, struct frag_queue, q); 51 51 ··· 55 53 if (fq->q.flags & INET_FRAG_COMPLETE) 56 54 goto out; 57 55 58 - inet_frag_kill(&fq->q); 56 + inet_frag_kill(&fq->q, &refs); 59 57 out: 60 58 spin_unlock(&fq->q.lock); 61 - inet_frag_put(&fq->q); 59 + inet_frag_putn(&fq->q, refs); 62 60 } 63 61 64 62 static inline struct lowpan_frag_queue * ··· 84 82 } 85 83 86 84 static int lowpan_frag_queue(struct lowpan_frag_queue *fq, 87 - struct sk_buff *skb, u8 frag_type) 85 + struct sk_buff *skb, u8 frag_type, 86 + int *refs) 88 87 { 89 88 struct sk_buff *prev_tail; 90 89 struct net_device *ldev; ··· 146 143 unsigned long orefdst = skb->_skb_refdst; 147 144 148 145 skb->_skb_refdst = 0UL; 149 - res = lowpan_frag_reasm(fq, skb, prev_tail, ldev); 146 + res = lowpan_frag_reasm(fq, skb, prev_tail, ldev, refs); 150 147 skb->_skb_refdst = orefdst; 151 148 return res; 152 149 } ··· 165 162 * the last and the first frames arrived and all the bits are here. 
166 163 */ 167 164 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, 168 - struct sk_buff *prev_tail, struct net_device *ldev) 165 + struct sk_buff *prev_tail, struct net_device *ldev, 166 + int *refs) 169 167 { 170 168 void *reasm_data; 171 169 172 - inet_frag_kill(&fq->q); 170 + inet_frag_kill(&fq->q, refs); 173 171 174 172 reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); 175 173 if (!reasm_data) ··· 304 300 goto err; 305 301 } 306 302 303 + rcu_read_lock(); 307 304 fq = fq_find(net, cb, &hdr.source, &hdr.dest); 308 305 if (fq != NULL) { 309 - int ret; 306 + int ret, refs = 0; 310 307 311 308 spin_lock(&fq->q.lock); 312 - ret = lowpan_frag_queue(fq, skb, frag_type); 309 + ret = lowpan_frag_queue(fq, skb, frag_type, &refs); 313 310 spin_unlock(&fq->q.lock); 314 311 315 - inet_frag_put(&fq->q); 312 + rcu_read_unlock(); 313 + inet_frag_putn(&fq->q, refs); 316 314 return ret; 317 315 } 316 + rcu_read_unlock(); 318 317 319 318 err: 320 319 kfree_skb(skb);
+15 -16
net/ipv4/inet_fragment.c
··· 145 145 } 146 146 spin_unlock_bh(&fq->lock); 147 147 148 - if (refcount_sub_and_test(count, &fq->refcnt)) 149 - inet_frag_destroy(fq); 148 + inet_frag_putn(fq, count); 150 149 } 151 150 152 151 static LLIST_HEAD(fqdir_free_list); ··· 225 226 } 226 227 EXPORT_SYMBOL(fqdir_exit); 227 228 228 - void inet_frag_kill(struct inet_frag_queue *fq) 229 + void inet_frag_kill(struct inet_frag_queue *fq, int *refs) 229 230 { 230 231 if (del_timer(&fq->timer)) 231 - refcount_dec(&fq->refcnt); 232 + (*refs)++; 232 233 233 234 if (!(fq->flags & INET_FRAG_COMPLETE)) { 234 235 struct fqdir *fqdir = fq->fqdir; ··· 243 244 if (!READ_ONCE(fqdir->dead)) { 244 245 rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, 245 246 fqdir->f->rhash_params); 246 - refcount_dec(&fq->refcnt); 247 + (*refs)++; 247 248 } else { 248 249 fq->flags |= INET_FRAG_HASH_DEAD; 249 250 } ··· 327 328 328 329 timer_setup(&q->timer, f->frag_expire, 0); 329 330 spin_lock_init(&q->lock); 330 - refcount_set(&q->refcnt, 3); 331 + /* One reference for the timer, one for the hash table. */ 332 + refcount_set(&q->refcnt, 2); 331 333 332 334 return q; 333 335 } ··· 350 350 *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, 351 351 &q->node, f->rhash_params); 352 352 if (*prev) { 353 + /* We could not insert in the hash table, 354 + * we need to cancel what inet_frag_alloc() 355 + * anticipated. 356 + */ 357 + int refs = 1; 358 + 353 359 q->flags |= INET_FRAG_COMPLETE; 354 - inet_frag_kill(q); 355 - inet_frag_destroy(q); 360 + inet_frag_kill(q, &refs); 361 + inet_frag_putn(q, refs); 356 362 return NULL; 357 363 } 358 364 return q; 359 365 } 360 366 361 - /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ 362 367 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) 363 368 { 364 369 /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). 
*/ ··· 373 368 if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) 374 369 return NULL; 375 370 376 - rcu_read_lock(); 377 - 378 371 prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); 379 372 if (!prev) 380 373 fq = inet_frag_create(fqdir, key, &prev); 381 - if (!IS_ERR_OR_NULL(prev)) { 374 + if (!IS_ERR_OR_NULL(prev)) 382 375 fq = prev; 383 - if (!refcount_inc_not_zero(&fq->refcnt)) 384 - fq = NULL; 385 - } 386 - rcu_read_unlock(); 387 376 return fq; 388 377 } 389 378 EXPORT_SYMBOL(inet_frag_find);
+19 -29
net/ipv4/ip_fragment.c
··· 76 76 static struct inet_frags ip4_frags; 77 77 78 78 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, 79 - struct sk_buff *prev_tail, struct net_device *dev); 79 + struct sk_buff *prev_tail, struct net_device *dev, 80 + int *refs); 80 81 81 82 82 83 static void ip4_frag_init(struct inet_frag_queue *q, const void *a) ··· 108 107 inet_putpeer(qp->peer); 109 108 } 110 109 111 - 112 - /* Destruction primitives. */ 113 - 114 - static void ipq_put(struct ipq *ipq) 115 - { 116 - inet_frag_put(&ipq->q); 117 - } 118 - 119 - /* Kill ipq entry. It is not destroyed immediately, 120 - * because caller (and someone more) holds reference count. 121 - */ 122 - static void ipq_kill(struct ipq *ipq) 123 - { 124 - inet_frag_kill(&ipq->q); 125 - } 126 - 127 110 static bool frag_expire_skip_icmp(u32 user) 128 111 { 129 112 return user == IP_DEFRAG_AF_PACKET || ··· 128 143 struct sk_buff *head = NULL; 129 144 struct net *net; 130 145 struct ipq *qp; 146 + int refs = 1; 131 147 132 148 qp = container_of(frag, struct ipq, q); 133 149 net = qp->q.fqdir->net; ··· 145 159 goto out; 146 160 147 161 qp->q.flags |= INET_FRAG_DROP; 148 - ipq_kill(qp); 162 + inet_frag_kill(&qp->q, &refs); 149 163 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 150 164 __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); 151 165 ··· 188 202 out_rcu_unlock: 189 203 rcu_read_unlock(); 190 204 kfree_skb_reason(head, reason); 191 - ipq_put(qp); 205 + inet_frag_putn(&qp->q, refs); 192 206 } 193 207 194 208 /* Find the correct entry in the "incomplete datagrams" queue for ··· 264 278 } 265 279 266 280 /* Add new segment to existing queue. 
*/ 267 - static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) 281 + static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb, int *refs) 268 282 { 269 283 struct net *net = qp->q.fqdir->net; 270 284 int ihl, end, flags, offset; ··· 284 298 if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && 285 299 unlikely(ip_frag_too_far(qp)) && 286 300 unlikely(err = ip_frag_reinit(qp))) { 287 - ipq_kill(qp); 301 + inet_frag_kill(&qp->q, refs); 288 302 goto err; 289 303 } 290 304 ··· 368 382 unsigned long orefdst = skb->_skb_refdst; 369 383 370 384 skb->_skb_refdst = 0UL; 371 - err = ip_frag_reasm(qp, skb, prev_tail, dev); 385 + err = ip_frag_reasm(qp, skb, prev_tail, dev, refs); 372 386 skb->_skb_refdst = orefdst; 373 387 if (err) 374 - inet_frag_kill(&qp->q); 388 + inet_frag_kill(&qp->q, refs); 375 389 return err; 376 390 } 377 391 ··· 388 402 err = -EINVAL; 389 403 __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); 390 404 discard_qp: 391 - inet_frag_kill(&qp->q); 405 + inet_frag_kill(&qp->q, refs); 392 406 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 393 407 err: 394 408 kfree_skb_reason(skb, reason); ··· 402 416 403 417 /* Build a new IP datagram from all its fragments. 
*/ 404 418 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, 405 - struct sk_buff *prev_tail, struct net_device *dev) 419 + struct sk_buff *prev_tail, struct net_device *dev, 420 + int *refs) 406 421 { 407 422 struct net *net = qp->q.fqdir->net; 408 423 struct iphdr *iph; ··· 411 424 int len, err; 412 425 u8 ecn; 413 426 414 - ipq_kill(qp); 427 + inet_frag_kill(&qp->q, refs); 415 428 416 429 ecn = ip_frag_ecn_table[qp->ecn]; 417 430 if (unlikely(ecn == 0xff)) { ··· 483 496 __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); 484 497 485 498 /* Lookup (or create) queue header */ 499 + rcu_read_lock(); 486 500 qp = ip_find(net, ip_hdr(skb), user, vif); 487 501 if (qp) { 488 - int ret; 502 + int ret, refs = 0; 489 503 490 504 spin_lock(&qp->q.lock); 491 505 492 - ret = ip_frag_queue(qp, skb); 506 + ret = ip_frag_queue(qp, skb, &refs); 493 507 494 508 spin_unlock(&qp->q.lock); 495 - ipq_put(qp); 509 + rcu_read_unlock(); 510 + inet_frag_putn(&qp->q, refs); 496 511 return ret; 497 512 } 513 + rcu_read_unlock(); 498 514 499 515 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 500 516 kfree_skb(skb);
+17 -10
net/ipv6/netfilter/nf_conntrack_reasm.c
··· 123 123 #endif 124 124 125 125 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, 126 - struct sk_buff *prev_tail, struct net_device *dev); 126 + struct sk_buff *prev_tail, struct net_device *dev, 127 + int *refs); 127 128 128 129 static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) 129 130 { ··· 168 167 169 168 170 169 static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, 171 - const struct frag_hdr *fhdr, int nhoff) 170 + const struct frag_hdr *fhdr, int nhoff, 171 + int *refs) 172 172 { 173 173 unsigned int payload_len; 174 174 struct net_device *dev; ··· 223 221 * this case. -DaveM 224 222 */ 225 223 pr_debug("end of fragment not rounded to 8 bytes.\n"); 226 - inet_frag_kill(&fq->q); 224 + inet_frag_kill(&fq->q, refs); 227 225 return -EPROTO; 228 226 } 229 227 if (end > fq->q.len) { ··· 289 287 unsigned long orefdst = skb->_skb_refdst; 290 288 291 289 skb->_skb_refdst = 0UL; 292 - err = nf_ct_frag6_reasm(fq, skb, prev, dev); 290 + err = nf_ct_frag6_reasm(fq, skb, prev, dev, refs); 293 291 skb->_skb_refdst = orefdst; 294 292 295 293 /* After queue has assumed skb ownership, only 0 or ··· 303 301 return -EINPROGRESS; 304 302 305 303 insert_error: 306 - inet_frag_kill(&fq->q); 304 + inet_frag_kill(&fq->q, refs); 307 305 err: 308 306 skb_dst_drop(skb); 309 307 return -EINVAL; ··· 317 315 * the last and the first frames arrived and all the bits are here. 
318 316 */ 319 317 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, 320 - struct sk_buff *prev_tail, struct net_device *dev) 318 + struct sk_buff *prev_tail, struct net_device *dev, 319 + int *refs) 321 320 { 322 321 void *reasm_data; 323 322 int payload_len; 324 323 u8 ecn; 325 324 326 - inet_frag_kill(&fq->q); 325 + inet_frag_kill(&fq->q, refs); 327 326 328 327 ecn = ip_frag_ecn_table[fq->ecn]; 329 328 if (unlikely(ecn == 0xff)) ··· 375 372 return 0; 376 373 377 374 err: 378 - inet_frag_kill(&fq->q); 375 + inet_frag_kill(&fq->q, refs); 379 376 return -EINVAL; 380 377 } 381 378 ··· 450 447 struct frag_hdr *fhdr; 451 448 struct frag_queue *fq; 452 449 struct ipv6hdr *hdr; 450 + int refs = 0; 453 451 u8 prevhdr; 454 452 455 453 /* Jumbo payload inhibits frag. header */ ··· 477 473 hdr = ipv6_hdr(skb); 478 474 fhdr = (struct frag_hdr *)skb_transport_header(skb); 479 475 476 + rcu_read_lock(); 480 477 fq = fq_find(net, fhdr->identification, user, hdr, 481 478 skb->dev ? skb->dev->ifindex : 0); 482 479 if (fq == NULL) { 480 + rcu_read_unlock(); 483 481 pr_debug("Can't find and can't create new queue\n"); 484 482 return -ENOMEM; 485 483 } 486 484 487 485 spin_lock_bh(&fq->q.lock); 488 486 489 - ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); 487 + ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, &refs); 490 488 if (ret == -EPROTO) { 491 489 skb->transport_header = savethdr; 492 490 ret = 0; 493 491 } 494 492 495 493 spin_unlock_bh(&fq->q.lock); 496 - inet_frag_put(&fq->q); 494 + rcu_read_unlock(); 495 + inet_frag_putn(&fq->q, refs); 497 496 return ret; 498 497 } 499 498 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
+15 -14
net/ipv6/reassembly.c
··· 68 68 static struct inet_frags ip6_frags; 69 69 70 70 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, 71 - struct sk_buff *prev_tail, struct net_device *dev); 71 + struct sk_buff *prev_tail, struct net_device *dev, 72 + int *refs); 72 73 73 74 static void ip6_frag_expire(struct timer_list *t) 74 75 { ··· 106 105 107 106 static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, 108 107 struct frag_hdr *fhdr, int nhoff, 109 - u32 *prob_offset) 108 + u32 *prob_offset, int *refs) 110 109 { 111 110 struct net *net = dev_net(skb_dst(skb)->dev); 112 111 int offset, end, fragsize; ··· 221 220 unsigned long orefdst = skb->_skb_refdst; 222 221 223 222 skb->_skb_refdst = 0UL; 224 - err = ip6_frag_reasm(fq, skb, prev_tail, dev); 223 + err = ip6_frag_reasm(fq, skb, prev_tail, dev, refs); 225 224 skb->_skb_refdst = orefdst; 226 225 return err; 227 226 } ··· 239 238 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 240 239 IPSTATS_MIB_REASM_OVERLAPS); 241 240 discard_fq: 242 - inet_frag_kill(&fq->q); 241 + inet_frag_kill(&fq->q, refs); 243 242 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 244 243 IPSTATS_MIB_REASMFAILS); 245 244 err: ··· 255 254 * the last and the first frames arrived and all the bits are here. 
256 255 */ 257 256 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, 258 - struct sk_buff *prev_tail, struct net_device *dev) 257 + struct sk_buff *prev_tail, struct net_device *dev, 258 + int *refs) 259 259 { 260 260 struct net *net = fq->q.fqdir->net; 261 261 unsigned int nhoff; ··· 264 262 int payload_len; 265 263 u8 ecn; 266 264 267 - inet_frag_kill(&fq->q); 265 + inet_frag_kill(&fq->q, refs); 268 266 269 267 ecn = ip_frag_ecn_table[fq->ecn]; 270 268 if (unlikely(ecn == 0xff)) ··· 305 303 skb_postpush_rcsum(skb, skb_network_header(skb), 306 304 skb_network_header_len(skb)); 307 305 308 - rcu_read_lock(); 309 306 __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMOKS); 310 - rcu_read_unlock(); 311 307 fq->q.rb_fragments = RB_ROOT; 312 308 fq->q.fragments_tail = NULL; 313 309 fq->q.last_run_head = NULL; ··· 317 317 out_oom: 318 318 net_dbg_ratelimited("ip6_frag_reasm: no memory for reassembly\n"); 319 319 out_fail: 320 - rcu_read_lock(); 321 320 __IP6_INC_STATS(net, __in6_dev_stats_get(dev, skb), IPSTATS_MIB_REASMFAILS); 322 - rcu_read_unlock(); 323 - inet_frag_kill(&fq->q); 321 + inet_frag_kill(&fq->q, refs); 324 322 return -1; 325 323 } 326 324 ··· 375 377 } 376 378 377 379 iif = skb->dev ? 
skb->dev->ifindex : 0; 380 + rcu_read_lock(); 378 381 fq = fq_find(net, fhdr->identification, hdr, iif); 379 382 if (fq) { 380 383 u32 prob_offset = 0; 381 - int ret; 384 + int ret, refs = 0; 382 385 383 386 spin_lock(&fq->q.lock); 384 387 385 388 fq->iif = iif; 386 389 ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, 387 - &prob_offset); 390 + &prob_offset, &refs); 388 391 389 392 spin_unlock(&fq->q.lock); 390 - inet_frag_put(&fq->q); 393 + rcu_read_unlock(); 394 + inet_frag_putn(&fq->q, refs); 391 395 if (prob_offset) { 392 396 __IP6_INC_STATS(net, __in6_dev_get_safely(skb->dev), 393 397 IPSTATS_MIB_INHDRERRORS); ··· 398 398 } 399 399 return ret; 400 400 } 401 + rcu_read_unlock(); 401 402 402 403 __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); 403 404 kfree_skb(skb);