Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

pidfs: convert rb-tree to rhashtable

Mateusz reported performance penalties [1] during task creation because
pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
rhashtable to have separate fine-grained locking and to decouple from
pidmap_lock moving all heavy manipulations outside of it.

Convert the pidfs inode-to-pid mapping from an rb-tree with seqcount
protection to an rhashtable. This removes the global pidmap_lock
contention from pidfs_ino_get_pid() lookups and allows the hashtable
insert to happen outside the pidmap_lock.

pidfs_add_pid() is split. pidfs_prepare_pid() allocates inode number and
initializes pid fields and is called inside pidmap_lock. pidfs_add_pid()
inserts pid into rhashtable and is called outside pidmap_lock. Insertion
into the rhashtable can fail and memory allocation may happen so we need
to drop the spinlock.

To guard against accidently opening an already reaped task
pidfs_ino_get_pid() uses additional checks beyond pid_vnr(). If
pid->attr is PIDFS_PID_DEAD or NULL the pid either never had a pidfd or
it already went through pidfs_exit() aka the process as already reaped.
If pid->attr is valid check PIDFS_ATTR_BIT_EXIT to figure out whether
the task has exited.

This slightly changes visibility semantics: pidfd creation is denied
after pidfs_exit() runs, which is just before the pid number is removed
from the via free_pid(). That should not be an issue though.

Link: https://lore.kernel.org/20251206131955.780557-1-mjguzik@gmail.com [1]
Link: https://patch.msgid.link/20260120-work-pidfs-rhashtable-v2-1-d593c4d0f576@kernel.org
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Christian Brauner <brauner@kernel.org>

+46 -55
+33 -48
fs/pidfs.c
··· 21 21 #include <linux/utsname.h> 22 22 #include <net/net_namespace.h> 23 23 #include <linux/coredump.h> 24 + #include <linux/rhashtable.h> 24 25 #include <linux/xattr.h> 25 26 26 27 #include "internal.h" ··· 56 55 __u32 coredump_signal; 57 56 }; 58 57 59 - static struct rb_root pidfs_ino_tree = RB_ROOT; 58 + static struct rhashtable pidfs_ino_ht; 59 + 60 + static const struct rhashtable_params pidfs_ino_ht_params = { 61 + .key_offset = offsetof(struct pid, ino), 62 + .key_len = sizeof(u64), 63 + .head_offset = offsetof(struct pid, pidfs_hash), 64 + .automatic_shrinking = true, 65 + }; 60 66 61 67 #if BITS_PER_LONG == 32 62 68 static inline unsigned long pidfs_ino(u64 ino) ··· 92 84 } 93 85 #endif 94 86 95 - static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) 96 - { 97 - struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); 98 - struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); 99 - u64 pid_ino_a = pid_a->ino; 100 - u64 pid_ino_b = pid_b->ino; 101 - 102 - if (pid_ino_a < pid_ino_b) 103 - return -1; 104 - if (pid_ino_a > pid_ino_b) 105 - return 1; 106 - return 0; 107 - } 108 - 109 - void pidfs_add_pid(struct pid *pid) 87 + /* 88 + * Allocate inode number and initialize pidfs fields. 89 + * Called with pidmap_lock held. 90 + */ 91 + void pidfs_prepare_pid(struct pid *pid) 110 92 { 111 93 static u64 pidfs_ino_nr = 2; 112 94 ··· 129 131 pidfs_ino_nr += 2; 130 132 131 133 pid->ino = pidfs_ino_nr; 134 + pid->pidfs_hash.next = NULL; 132 135 pid->stashed = NULL; 133 136 pid->attr = NULL; 134 137 pidfs_ino_nr++; 138 + } 135 139 136 - write_seqcount_begin(&pidmap_lock_seq); 137 - rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); 138 - write_seqcount_end(&pidmap_lock_seq); 140 + int pidfs_add_pid(struct pid *pid) 141 + { 142 + return rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, 143 + pidfs_ino_ht_params); 139 144 } 140 145 141 146 void pidfs_remove_pid(struct pid *pid) 142 147 { 143 - write_seqcount_begin(&pidmap_lock_seq); 144 - rb_erase(&pid->pidfs_node, &pidfs_ino_tree); 145 - write_seqcount_end(&pidmap_lock_seq); 148 + rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, 149 + pidfs_ino_ht_params); 146 150 } 147 151 148 152 void pidfs_free_pid(struct pid *pid) ··· 773 773 return FILEID_KERNFS; 774 774 } 775 775 776 - static int pidfs_ino_find(const void *key, const struct rb_node *node) 777 - { 778 - const u64 pid_ino = *(u64 *)key; 779 - const struct pid *pid = rb_entry(node, struct pid, pidfs_node); 780 - 781 - if (pid_ino < pid->ino) 782 - return -1; 783 - if (pid_ino > pid->ino) 784 - return 1; 785 - return 0; 786 - } 787 - 788 776 /* Find a struct pid based on the inode number. */ 789 777 static struct pid *pidfs_ino_get_pid(u64 ino) 790 778 { 791 779 struct pid *pid; 792 - struct rb_node *node; 793 - unsigned int seq; 780 + struct pidfs_attr *attr; 794 781 795 782 guard(rcu)(); 796 - do { 797 - seq = read_seqcount_begin(&pidmap_lock_seq); 798 - node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); 799 - if (node) 800 - break; 801 - } while (read_seqcount_retry(&pidmap_lock_seq, seq)); 802 - 803 - if (!node) 783 + pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); 784 + if (!pid) 804 785 return NULL; 805 - 806 - pid = rb_entry(node, struct pid, pidfs_node); 807 - 786 + attr = READ_ONCE(pid->attr); 787 + if (IS_ERR_OR_NULL(attr)) 788 + return NULL; 789 + if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) 790 + return NULL; 808 791 /* Within our pid namespace hierarchy? */ 809 792 if (pid_vnr(pid) == 0) 810 793 return NULL; 811 - 812 794 return get_pid(pid); 813 795 } 814 796 ··· 1068 1086 1069 1087 void __init pidfs_init(void) 1070 1088 { 1089 + if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params)) 1090 + panic("Failed to initialize pidfs hashtable"); 1091 + 1071 1092 pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, 1072 1093 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1073 1094 SLAB_ACCOUNT | SLAB_PANIC), NULL);
+2 -2
include/linux/pid.h
··· 6 6 #include <linux/rculist.h> 7 7 #include <linux/rcupdate.h> 8 8 #include <linux/refcount.h> 9 + #include <linux/rhashtable-types.h> 9 10 #include <linux/sched.h> 10 11 #include <linux/wait.h> 11 12 ··· 61 60 spinlock_t lock; 62 61 struct { 63 62 u64 ino; 64 - struct rb_node pidfs_node; 63 + struct rhash_head pidfs_hash; 65 64 struct dentry *stashed; 66 65 struct pidfs_attr *attr; 67 66 }; ··· 74 73 struct upid numbers[]; 75 74 }; 76 75 77 - extern seqcount_spinlock_t pidmap_lock_seq; 78 76 extern struct pid init_struct_pid; 79 77 80 78 struct file;
+2 -1
include/linux/pidfs.h
··· 6 6 7 7 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); 8 8 void __init pidfs_init(void); 9 - void pidfs_add_pid(struct pid *pid); 9 + void pidfs_prepare_pid(struct pid *pid); 10 + int pidfs_add_pid(struct pid *pid); 10 11 void pidfs_remove_pid(struct pid *pid); 11 12 void pidfs_exit(struct task_struct *tsk); 12 13 #ifdef CONFIG_COREDUMP
+9 -4
kernel/pid.c
··· 43 43 #include <linux/sched/task.h> 44 44 #include <linux/idr.h> 45 45 #include <linux/pidfs.h> 46 - #include <linux/seqlock.h> 47 46 #include <net/sock.h> 48 47 #include <uapi/linux/pidfd.h> 49 48 ··· 84 85 EXPORT_SYMBOL_GPL(init_pid_ns); 85 86 86 87 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 87 - seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); 88 88 89 89 void put_pid(struct pid *pid) 90 90 { ··· 139 141 140 142 idr_remove(&ns->idr, upid->nr); 141 143 } 142 - pidfs_remove_pid(pid); 143 144 spin_unlock(&pidmap_lock); 144 145 146 + pidfs_remove_pid(pid); 145 147 call_rcu(&pid->rcu, delayed_put_pid); 146 148 } 147 149 ··· 314 316 retval = -ENOMEM; 315 317 if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) 316 318 goto out_free; 317 - pidfs_add_pid(pid); 319 + pidfs_prepare_pid(pid); 320 + 318 321 for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { 319 322 /* Make the PID visible to find_pid_ns. */ 320 323 idr_replace(&upid->ns->idr, pid, upid->nr); ··· 324 325 spin_unlock(&pidmap_lock); 325 326 idr_preload_end(); 326 327 ns_ref_active_get(ns); 328 + 329 + retval = pidfs_add_pid(pid); 330 + if (unlikely(retval)) { 331 + free_pid(pid); 332 + pid = ERR_PTR(-ENOMEM); 333 + } 327 334 328 335 return pid; 329 336