Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

rhashtable: Bounce deferred worker kick through irq_work

Inserts that push the table past 75% load call schedule_work(&ht->run_work)
to kick an async resize. If a caller holds a raw spinlock across the insert
(e.g. an insecure_elasticity user), calling schedule_work() under that lock
records the dependency chain

caller_lock -> pool->lock -> pi_lock -> rq->__lock

A cycle forms if any of these locks is acquired in the reverse
direction elsewhere. sched_ext, the only current insecure_elasticity
user, hits this: it holds scx_sched_lock across rhashtable inserts of
sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock.
Exercising the resize path produces:

Chain exists of:
&pool->lock --> &rq->__lock --> scx_sched_lock
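
To make the problematic pattern concrete, here is a minimal editor's sketch of
a caller inserting under a raw spinlock; my_lock, my_table, my_entry and
insert_under_raw_lock() are illustrative names, not taken from sched_ext:

#include <linux/rhashtable.h>
#include <linux/spinlock.h>

struct my_entry {
	int			key;
	struct rhash_head	node;
};

static const struct rhashtable_params my_params = {
	.key_len	= sizeof(int),
	.key_offset	= offsetof(struct my_entry, key),
	.head_offset	= offsetof(struct my_entry, node),
};

static DEFINE_RAW_SPINLOCK(my_lock);
static struct rhashtable my_table;	/* rhashtable_init() done elsewhere */

static int insert_under_raw_lock(struct my_entry *e)
{
	int ret;

	raw_spin_lock(&my_lock);
	/*
	 * Before this patch: if the insert pushes the table past 75% load,
	 * rhashtable_insert_fast() calls schedule_work(&ht->run_work) while
	 * my_lock is still held, recording
	 * my_lock -> pool->lock -> pi_lock -> rq->__lock.
	 */
	ret = rhashtable_insert_fast(&my_table, &e->node, my_params);
	raw_spin_unlock(&my_lock);
	return ret;
}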

Bounce the kick from the insert paths through irq_work so
schedule_work() runs from hard IRQ context with the caller's lock no
longer held. rht_deferred_worker()'s self-rearm on error stays on
schedule_work(&ht->run_work) - the worker runs in process context with
no caller lock held, and keeping the self-requeue on @run_work lets
cancel_work_sync() in rhashtable_free_and_destroy() drain it.
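
The shape of the fix, condensed into a standalone sketch; the struct and
function names here are illustrative, the real fields and handler are added
to struct rhashtable and lib/rhashtable.c in the hunks below:

#include <linux/container_of.h>
#include <linux/irq_work.h>
#include <linux/workqueue.h>

/* Stand-in for the two fields embedded in struct rhashtable. */
struct resize_kick {
	struct work_struct	run_work;	/* process-context resize */
	struct irq_work		run_irq_work;	/* hard-IRQ trampoline */
};

static void resize_kick_irq_work(struct irq_work *iw)
{
	struct resize_kick *rk = container_of(iw, struct resize_kick,
					      run_irq_work);

	/* Hard IRQ context: the insert caller's raw spinlock is not held. */
	schedule_work(&rk->run_work);
}

static void resize_kick_init(struct resize_kick *rk, work_func_t resize_fn)
{
	INIT_WORK(&rk->run_work, resize_fn);
	init_irq_work(&rk->run_irq_work, resize_kick_irq_work);
}

/* Insert paths call this instead of schedule_work() directly. */
static void resize_kick(struct resize_kick *rk)
{
	irq_work_queue(&rk->run_irq_work);
}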

v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work).
Routing it through irq_work in v2 broke cancel_work_sync()'s
self-requeue handling - an irq_work queued after irq_work_sync()
returned but while cancel_work_sync() was still waiting could fire
post-teardown.
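
For context, a sketch of the resulting teardown drain order, assuming the
patched struct rhashtable with @run_irq_work; the wrapper name
drain_async_resize() is illustrative, the real calls sit inline in
rhashtable_free_and_destroy() (see the lib/rhashtable.c hunk below):

#include <linux/irq_work.h>
#include <linux/rhashtable.h>
#include <linux/workqueue.h>

static void drain_async_resize(struct rhashtable *ht)
{
	/*
	 * Flush any insert-path kick still in flight; after this, only
	 * rht_deferred_worker() itself can requeue @run_work.
	 */
	irq_work_sync(&ht->run_irq_work);

	/*
	 * Works only because the worker's self-rearm uses schedule_work()
	 * on @run_work: cancel_work_sync() handles self-requeueing work
	 * items and waits until the item is truly idle.  A self-rearm
	 * through @run_irq_work could be queued after the irq_work_sync()
	 * above had returned and fire after teardown.
	 */
	cancel_work_sync(&ht->run_work);
}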

v2: Bounce unconditionally instead of gating on insecure_elasticity,
as suggested by Herbert.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>

+33 -4
+3
include/linux/rhashtable-types.h
···
 #include <linux/alloc_tag.h>
 #include <linux/atomic.h>
 #include <linux/compiler.h>
+#include <linux/irq_work_types.h>
 #include <linux/mutex.h>
 #include <linux/workqueue_types.h>
···
  * @p: Configuration parameters
  * @rhlist: True if this is an rhltable
  * @run_work: Deferred worker to expand/shrink asynchronously
+ * @run_irq_work: Bounces the @run_work kick through hard IRQ context.
  * @mutex: Mutex to protect current/future table swapping
  * @lock: Spin lock to protect walker list
  * @nelems: Number of elements in table
···
 	struct rhashtable_params	p;
 	bool				rhlist;
 	struct work_struct		run_work;
+	struct irq_work			run_irq_work;
 	struct mutex			mutex;
 	spinlock_t			lock;
 	atomic_t			nelems;
+2 -1
include/linux/rhashtable.h
···
 #include <linux/err.h>
 #include <linux/errno.h>
+#include <linux/irq_work.h>
 #include <linux/jhash.h>
 #include <linux/list_nulls.h>
 #include <linux/workqueue.h>
···
 	rht_assign_unlock(tbl, bkt, obj, flags);
 
 	if (rht_grow_above_75(ht, tbl))
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	data = NULL;
 out:
+28 -3
lib/rhashtable.c
···
 	mutex_unlock(&ht->mutex);
 
+	/*
+	 * Re-arm via @run_work, not @run_irq_work.
+	 * rhashtable_free_and_destroy() drains async work as irq_work_sync()
+	 * followed by cancel_work_sync(). If this site queued irq_work while
+	 * cancel_work_sync() was waiting for us, irq_work_sync() would already
+	 * have returned and the stale irq_work could fire post-teardown.
+	 * cancel_work_sync() natively handles self-requeue on @run_work.
+	 */
 	if (err)
 		schedule_work(&ht->run_work);
+}
+
+/*
+ * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity
+ * user). Calling schedule_work() under that lock records caller_lock ->
+ * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of
+ * these is acquired in the reverse direction elsewhere. Bounce through
+ * irq_work so the schedule_work() runs with the caller's lock no longer held.
+ */
+static void rht_deferred_irq_work(struct irq_work *irq_work)
+{
+	struct rhashtable *ht = container_of(irq_work, struct rhashtable,
+					     run_irq_work);
+
+	schedule_work(&ht->run_work);
 }
 
 static int rhashtable_insert_rehash(struct rhashtable *ht,
···
 		if (err == -EEXIST)
 			err = 0;
 	} else
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	return err;
 
···
 
 	/* Schedule async rehash to retry allocation in process context. */
 	if (err == -ENOMEM)
-		schedule_work(&ht->run_work);
+		irq_work_queue(&ht->run_irq_work);
 
 	return err;
 }
···
 			rht_unlock(tbl, bkt, flags);
 
 			if (inserted && rht_grow_above_75(ht, tbl))
-				schedule_work(&ht->run_work);
+				irq_work_queue(&ht->run_irq_work);
 		}
 	} while (!IS_ERR_OR_NULL(new_tbl));
 
···
 	RCU_INIT_POINTER(ht->tbl, tbl);
 
 	INIT_WORK(&ht->run_work, rht_deferred_worker);
+	init_irq_work(&ht->run_irq_work, rht_deferred_irq_work);
 
 	return 0;
 }
···
 	struct bucket_table *tbl, *next_tbl;
 	unsigned int i;
 
+	irq_work_sync(&ht->run_irq_work);
 	cancel_work_sync(&ht->run_work);
 
 	mutex_lock(&ht->mutex);