Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'inet-frags-flush-pending-skbs-in-fqdir_pre_exit'

Jakub Kicinski says:

====================
inet: frags: flush pending skbs in fqdir_pre_exit()

Fix the issue reported by NIPA starting on Sep 18th [1], where
pernet_ops_rwsem is constantly held by a reader, preventing writers
from grabbing it (specifically, preventing driver modules from loading).

The fact that reports started around that time seems coincidental.
The issue seems to be skbs queued for defrag preventing conntrack
from exiting.

The first patch fixes another, theoretical issue; it's mostly a leftover
from an attempt to get rid of the inet_frag_queue refcnt, which
I gave up on (I still think it's doable, but it's a bit of a time sink).
The second patch is a minor refactor.

The real fix is in the third patch. It's the simplest fix I can
think of: flush the frag queues. Perhaps someone has
a better suggestion?

The last patch adds an explicit warning for conntrack getting stuck,
as this seems like something that can easily happen if bugs sneak in.
The warning will hopefully save us the first 20% of the investigation
effort.

Link: https://lore.kernel.org/20251001082036.0fc51440@kernel.org # [1]
====================

Link: https://patch.msgid.link/20251207010942.1672972-1-kuba@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+72 -35
+3 -15
include/net/inet_frag.h
··· 123 123 124 124 int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); 125 125 126 - static inline void fqdir_pre_exit(struct fqdir *fqdir) 127 - { 128 - /* Prevent creation of new frags. 129 - * Pairs with READ_ONCE() in inet_frag_find(). 130 - */ 131 - WRITE_ONCE(fqdir->high_thresh, 0); 132 - 133 - /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() 134 - * and ip6frag_expire_frag_queue(). 135 - */ 136 - WRITE_ONCE(fqdir->dead, true); 137 - } 126 + void fqdir_pre_exit(struct fqdir *fqdir); 138 127 void fqdir_exit(struct fqdir *fqdir); 139 128 140 129 void inet_frag_kill(struct inet_frag_queue *q, int *refs); 141 130 void inet_frag_destroy(struct inet_frag_queue *q); 142 131 struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); 143 132 144 - /* Free all skbs in the queue; return the sum of their truesizes. */ 145 - unsigned int inet_frag_rbtree_purge(struct rb_root *root, 146 - enum skb_drop_reason reason); 133 + void inet_frag_queue_flush(struct inet_frag_queue *q, 134 + enum skb_drop_reason reason); 147 135 148 136 static inline void inet_frag_putn(struct inet_frag_queue *q, int refs) 149 137 {
+6 -3
include/net/ipv6_frag.h
··· 69 69 int refs = 1; 70 70 71 71 rcu_read_lock(); 72 - /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ 73 - if (READ_ONCE(fq->q.fqdir->dead)) 74 - goto out_rcu_unlock; 75 72 spin_lock(&fq->q.lock); 76 73 77 74 if (fq->q.flags & INET_FRAG_COMPLETE) ··· 76 79 77 80 fq->q.flags |= INET_FRAG_DROP; 78 81 inet_frag_kill(&fq->q, &refs); 82 + 83 + /* Paired with the WRITE_ONCE() in fqdir_pre_exit(). */ 84 + if (READ_ONCE(fq->q.fqdir->dead)) { 85 + inet_frag_queue_flush(&fq->q, 0); 86 + goto out; 87 + } 79 88 80 89 dev = dev_get_by_index_rcu(net, fq->iif); 81 90 if (!dev)
+51 -4
net/ipv4/inet_fragment.c
··· 218 218 219 219 pure_initcall(inet_frag_wq_init); 220 220 221 + void fqdir_pre_exit(struct fqdir *fqdir) 222 + { 223 + struct inet_frag_queue *fq; 224 + struct rhashtable_iter hti; 225 + 226 + /* Prevent creation of new frags. 227 + * Pairs with READ_ONCE() in inet_frag_find(). 228 + */ 229 + WRITE_ONCE(fqdir->high_thresh, 0); 230 + 231 + /* Pairs with READ_ONCE() in inet_frag_kill(), ip_expire() 232 + * and ip6frag_expire_frag_queue(). 233 + */ 234 + WRITE_ONCE(fqdir->dead, true); 235 + 236 + rhashtable_walk_enter(&fqdir->rhashtable, &hti); 237 + rhashtable_walk_start(&hti); 238 + 239 + while ((fq = rhashtable_walk_next(&hti))) { 240 + if (IS_ERR(fq)) { 241 + if (PTR_ERR(fq) != -EAGAIN) 242 + break; 243 + continue; 244 + } 245 + spin_lock_bh(&fq->lock); 246 + if (!(fq->flags & INET_FRAG_COMPLETE)) 247 + inet_frag_queue_flush(fq, 0); 248 + spin_unlock_bh(&fq->lock); 249 + } 250 + 251 + rhashtable_walk_stop(&hti); 252 + rhashtable_walk_exit(&hti); 253 + } 254 + EXPORT_SYMBOL(fqdir_pre_exit); 255 + 221 256 void fqdir_exit(struct fqdir *fqdir) 222 257 { 223 258 INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); ··· 298 263 kmem_cache_free(f->frags_cachep, q); 299 264 } 300 265 301 - unsigned int inet_frag_rbtree_purge(struct rb_root *root, 302 - enum skb_drop_reason reason) 266 + static unsigned int 267 + inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason) 303 268 { 304 269 struct rb_node *p = rb_first(root); 305 270 unsigned int sum = 0; ··· 319 284 } 320 285 return sum; 321 286 } 322 - EXPORT_SYMBOL(inet_frag_rbtree_purge); 287 + 288 + void inet_frag_queue_flush(struct inet_frag_queue *q, 289 + enum skb_drop_reason reason) 290 + { 291 + unsigned int sum; 292 + 293 + reason = reason ?: SKB_DROP_REASON_FRAG_REASM_TIMEOUT; 294 + sum = inet_frag_rbtree_purge(&q->rb_fragments, reason); 295 + sub_frag_mem_limit(q->fqdir, sum); 296 + } 297 + EXPORT_SYMBOL(inet_frag_queue_flush); 323 298 324 299 void inet_frag_destroy(struct inet_frag_queue *q) 325 
300 { ··· 372 327 373 328 timer_setup(&q->timer, f->frag_expire, 0); 374 329 spin_lock_init(&q->lock); 375 - /* One reference for the timer, one for the hash table. */ 330 + /* One reference for the timer, one for the hash table. 331 + * We never take any extra references, only decrement this field. 332 + */ 376 333 refcount_set(&q->refcnt, 2); 377 334 378 335 return q;
+9 -13
net/ipv4/ip_fragment.c
··· 134 134 net = qp->q.fqdir->net; 135 135 136 136 rcu_read_lock(); 137 - 138 - /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ 139 - if (READ_ONCE(qp->q.fqdir->dead)) 140 - goto out_rcu_unlock; 141 - 142 137 spin_lock(&qp->q.lock); 143 138 144 139 if (qp->q.flags & INET_FRAG_COMPLETE) ··· 141 146 142 147 qp->q.flags |= INET_FRAG_DROP; 143 148 inet_frag_kill(&qp->q, &refs); 149 + 150 + /* Paired with WRITE_ONCE() in fqdir_pre_exit(). */ 151 + if (READ_ONCE(qp->q.fqdir->dead)) { 152 + inet_frag_queue_flush(&qp->q, 0); 153 + goto out; 154 + } 155 + 144 156 __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); 145 157 __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); 146 158 ··· 242 240 243 241 static int ip_frag_reinit(struct ipq *qp) 244 242 { 245 - unsigned int sum_truesize = 0; 246 - 247 - if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) { 248 - refcount_inc(&qp->q.refcnt); 243 + if (!mod_timer_pending(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) 249 244 return -ETIMEDOUT; 250 - } 251 245 252 - sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, 253 - SKB_DROP_REASON_FRAG_TOO_FAR); 254 - sub_frag_mem_limit(qp->q.fqdir, sum_truesize); 246 + inet_frag_queue_flush(&qp->q, SKB_DROP_REASON_FRAG_TOO_FAR); 255 247 256 248 qp->q.flags = 0; 257 249 qp->q.len = 0;
+3
net/netfilter/nf_conntrack_core.c
··· 2487 2487 void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) 2488 2488 { 2489 2489 struct nf_ct_iter_data iter_data = {}; 2490 + unsigned long start = jiffies; 2490 2491 struct net *net; 2491 2492 int busy; 2492 2493 ··· 2508 2507 busy = 1; 2509 2508 } 2510 2509 if (busy) { 2510 + DEBUG_NET_WARN_ONCE(time_after(jiffies, start + 60 * HZ), 2511 + "conntrack cleanup blocked for 60s"); 2511 2512 schedule(); 2512 2513 goto i_see_dead_people; 2513 2514 }