Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

rcu: Use an intermediate irq_work to start process_srcu()

Since commit c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms
of SRCU-fast") we switched to SRCU in BPF. However, as BPF instrumentation
can happen basically everywhere (including where a scheduler lock is held),
call_srcu() now needs to avoid acquiring the scheduler lock because
otherwise it could cause a deadlock [1]. Fix this by following what the
previous RCU Tasks Trace did: using an irq_work to delay the queuing of
the work to start process_srcu().

[boqun: Apply Joel's feedback]
[boqun: Apply Andrea's test feedback]

Reported-by: Andrea Righi <arighi@nvidia.com>
Closes: https://lore.kernel.org/all/abjzvz_tL_siV17s@gpd4/
Fixes: c27cea4416a3 ("rcu: Re-implement RCU Tasks Trace in terms of SRCU-fast")
Link: https://lore.kernel.org/rcu/3c4c5a29-24ea-492d-aeee-e0d9605b4183@nvidia.com/ [1]
Suggested-by: Zqiang <qiang.zhang@linux.dev>
Tested-by: Andrea Righi <arighi@nvidia.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Joel Fernandes <joelagnelf@nvidia.com>
Signed-off-by: Boqun Feng <boqun@kernel.org>

+29 -2
+1
include/linux/srcutree.h
··· 95 95 unsigned long reschedule_jiffies; 96 96 unsigned long reschedule_count; 97 97 struct delayed_work work; 98 + struct irq_work irq_work; 98 99 struct srcu_struct *srcu_ssp; 99 100 }; 100 101
+28 -2
kernel/rcu/srcutree.c
··· 19 19 #include <linux/mutex.h> 20 20 #include <linux/percpu.h> 21 21 #include <linux/preempt.h> 22 + #include <linux/irq_work.h> 22 23 #include <linux/rcupdate_wait.h> 23 24 #include <linux/sched.h> 24 25 #include <linux/smp.h> ··· 76 75 static void srcu_invoke_callbacks(struct work_struct *work); 77 76 static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay); 78 77 static void process_srcu(struct work_struct *work); 78 + static void srcu_irq_work(struct irq_work *work); 79 79 static void srcu_delay_timer(struct timer_list *t); 80 80 81 81 /* ··· 218 216 mutex_init(&ssp->srcu_sup->srcu_barrier_mutex); 219 217 atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0); 220 218 INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu); 219 + init_irq_work(&ssp->srcu_sup->irq_work, srcu_irq_work); 221 220 ssp->srcu_sup->sda_is_static = is_static; 222 221 if (!is_static) { 223 222 ssp->sda = alloc_percpu(struct srcu_data); ··· 719 716 return; /* Just leak it! */ 720 717 if (WARN_ON(srcu_readers_active(ssp))) 721 718 return; /* Just leak it! */ 719 + /* Wait for irq_work to finish first as it may queue a new work. */ 720 + irq_work_sync(&sup->irq_work); 722 721 flush_delayed_work(&sup->work); 723 722 for_each_possible_cpu(cpu) { 724 723 struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu); ··· 1126 1121 // it isn't. And it does not have to be. After all, it 1127 1122 // can only be executed during early boot when there is only 1128 1123 // the one boot CPU running with interrupts still disabled. 1124 + // 1125 + // Use an irq_work here to avoid acquiring runqueue lock with 1126 + // srcu rcu_node::lock held. BPF instrument could introduce the 1127 + // opposite dependency, hence we need to break the possible 1128 + // locking dependency here. 
1129 1129 if (likely(srcu_init_done)) 1130 - queue_delayed_work(rcu_gp_wq, &sup->work, 1131 - !!srcu_get_delay(ssp)); 1130 + irq_work_queue(&sup->irq_work); 1132 1131 else if (list_empty(&sup->work.work.entry)) 1133 1132 list_add(&sup->work.work.entry, &srcu_boot_list); 1134 1133 } ··· 1989 1980 } 1990 1981 } 1991 1982 srcu_reschedule(ssp, curdelay); 1983 + } 1984 + 1985 + static void srcu_irq_work(struct irq_work *work) 1986 + { 1987 + struct srcu_struct *ssp; 1988 + struct srcu_usage *sup; 1989 + unsigned long delay; 1990 + unsigned long flags; 1991 + 1992 + sup = container_of(work, struct srcu_usage, irq_work); 1993 + ssp = sup->srcu_ssp; 1994 + 1995 + raw_spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags); 1996 + delay = srcu_get_delay(ssp); 1997 + raw_spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags); 1998 + 1999 + queue_delayed_work(rcu_gp_wq, &sup->work, !!delay); 1992 2000 } 1993 2001 1994 2002 void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,