Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

kernfs: RCU protect kernfs_nodes and avoid kernfs_idr_lock in kernfs_find_and_get_node_by_id()

The BPF helper bpf_cgroup_from_id() calls kernfs_find_and_get_node_by_id()
which acquires kernfs_idr_lock, which is an non-raw non-IRQ-safe lock. This
can lead to deadlocks as bpf_cgroup_from_id() can be called from any BPF
programs including e.g. the ones that attach to functions which are holding
the scheduler rq lock.

Consider the following BPF program:

SEC("fentry/__set_cpus_allowed_ptr_locked")
int BPF_PROG(__set_cpus_allowed_ptr_locked, struct task_struct *p,
struct affinity_context *affn_ctx, struct rq *rq, struct rq_flags *rf)
{
struct cgroup *cgrp = bpf_cgroup_from_id(p->cgroups->dfl_cgrp->kn->id);

if (cgrp) {
bpf_printk("%d[%s] in %s", p->pid, p->comm, cgrp->kn->name);
bpf_cgroup_release(cgrp);
}
return 0;
}

__set_cpus_allowed_ptr_locked() is called with rq lock held and the above
BPF program calls bpf_cgroup_from_id() within leading to the following
lockdep warning:

=====================================================
WARNING: HARDIRQ-safe -> HARDIRQ-unsafe lock order detected
6.7.0-rc3-work-00053-g07124366a1d7-dirty #147 Not tainted
-----------------------------------------------------
repro/1620 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire:
ffffffff833b3688 (kernfs_idr_lock){+.+.}-{2:2}, at: kernfs_find_and_get_node_by_id+0x1e/0x70

and this task is already holding:
ffff888237ced698 (&rq->__lock){-.-.}-{2:2}, at: task_rq_lock+0x4e/0xf0
which would create a new lock dependency:
(&rq->__lock){-.-.}-{2:2} -> (kernfs_idr_lock){+.+.}-{2:2}
...
Possible interrupt unsafe locking scenario:

CPU0 CPU1
---- ----
lock(kernfs_idr_lock);
local_irq_disable();
lock(&rq->__lock);
lock(kernfs_idr_lock);
<Interrupt>
lock(&rq->__lock);

*** DEADLOCK ***
...
Call Trace:
dump_stack_lvl+0x55/0x70
dump_stack+0x10/0x20
__lock_acquire+0x781/0x2a40
lock_acquire+0xbf/0x1f0
_raw_spin_lock+0x2f/0x40
kernfs_find_and_get_node_by_id+0x1e/0x70
cgroup_get_from_id+0x21/0x240
bpf_cgroup_from_id+0xe/0x20
bpf_prog_98652316e9337a5a___set_cpus_allowed_ptr_locked+0x96/0x11a
bpf_trampoline_6442545632+0x4f/0x1000
__set_cpus_allowed_ptr_locked+0x5/0x5a0
sched_setaffinity+0x1b3/0x290
__x64_sys_sched_setaffinity+0x4f/0x60
do_syscall_64+0x40/0xe0
entry_SYSCALL_64_after_hwframe+0x46/0x4e

Let's fix it by protecting kernfs_node and kernfs_root with RCU and making
kernfs_find_and_get_node_by_id() acquire rcu_read_lock() instead of
kernfs_idr_lock.

This adds an rcu_head to kernfs_node making it larger by 16 bytes on 64bit.
Combined with the preceding rearrange patch, the net increase is 8 bytes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrea Righi <andrea.righi@canonical.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Link: https://lore.kernel.org/r/20240109214828.252092-4-tj@kernel.org
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

authored by

Tejun Heo and committed by
Greg Kroah-Hartman
4207b556 1c9f2c76

+24 -11
+20 -11
fs/kernfs/dir.c
··· 529 529 } 530 530 EXPORT_SYMBOL_GPL(kernfs_get); 531 531 532 + static void kernfs_free_rcu(struct rcu_head *rcu) 533 + { 534 + struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); 535 + 536 + kfree_const(kn->name); 537 + 538 + if (kn->iattr) { 539 + simple_xattrs_free(&kn->iattr->xattrs, NULL); 540 + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); 541 + } 542 + 543 + kmem_cache_free(kernfs_node_cache, kn); 544 + } 545 + 532 546 /** 533 547 * kernfs_put - put a reference count on a kernfs_node 534 548 * @kn: the target kernfs_node ··· 571 557 if (kernfs_type(kn) == KERNFS_LINK) 572 558 kernfs_put(kn->symlink.target_kn); 573 559 574 - kfree_const(kn->name); 575 - 576 - if (kn->iattr) { 577 - simple_xattrs_free(&kn->iattr->xattrs, NULL); 578 - kmem_cache_free(kernfs_iattrs_cache, kn->iattr); 579 - } 580 560 spin_lock(&kernfs_idr_lock); 581 561 idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); 582 562 spin_unlock(&kernfs_idr_lock); 583 - kmem_cache_free(kernfs_node_cache, kn); 563 + 564 + call_rcu(&kn->rcu, kernfs_free_rcu); 584 565 585 566 kn = parent; 586 567 if (kn) { ··· 584 575 } else { 585 576 /* just released the root kn, free @root too */ 586 577 idr_destroy(&root->ino_idr); 587 - kfree(root); 578 + kfree_rcu(root, rcu); 588 579 } 589 580 } 590 581 EXPORT_SYMBOL_GPL(kernfs_put); ··· 724 715 ino_t ino = kernfs_id_ino(id); 725 716 u32 gen = kernfs_id_gen(id); 726 717 727 - spin_lock(&kernfs_idr_lock); 718 + rcu_read_lock(); 728 719 729 720 kn = idr_find(&root->ino_idr, (u32)ino); 730 721 if (!kn) ··· 748 739 if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count))) 749 740 goto err_unlock; 750 741 751 - spin_unlock(&kernfs_idr_lock); 742 + rcu_read_unlock(); 752 743 return kn; 753 744 err_unlock: 754 - spin_unlock(&kernfs_idr_lock); 745 + rcu_read_unlock(); 755 746 return NULL; 756 747 } 757 748
+2
fs/kernfs/kernfs-internal.h
··· 49 49 struct rw_semaphore kernfs_rwsem; 50 50 struct rw_semaphore kernfs_iattr_rwsem; 51 51 struct rw_semaphore kernfs_supers_rwsem; 52 + 53 + struct rcu_head rcu; 52 54 }; 53 55 54 56 /* +1 to avoid triggering overflow warning when negating it */
+2
include/linux/kernfs.h
··· 223 223 224 224 void *priv; 225 225 struct kernfs_iattrs *iattr; 226 + 227 + struct rcu_head rcu; 226 228 }; 227 229 228 230 /*