Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pidfs updates from Christian Brauner:

- pid: introduce task_ppid_vnr() helper

- pidfs: convert rb-tree to rhashtable

Mateusz reported performance penalties during task creation because
pidfs uses pidmap_lock to add elements into the rbtree. Switch to an
rhashtable to have separate fine-grained locking and to decouple from
pidmap_lock, moving all heavy manipulations outside of it.

Also move inode allocation outside of pidmap_lock. With this, nothing
happens for pidfs under pidmap_lock anymore.

- pid: reorder fields in pid_namespace to reduce false sharing

- Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie
callers"

- ipc: Add SPDX license id to mqueue.c

* tag 'kernel-7.0-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
pid: introduce task_ppid_vnr() helper
pidfs: implement ino allocation without the pidmap lock
Revert "pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers"
pid: reorder fields in pid_namespace to reduce false sharing
pidfs: convert rb-tree to rhashtable
ipc: Add SPDX license id to mqueue.c

+120 -96
+94 -78
fs/pidfs.c
··· 21 21 #include <linux/utsname.h> 22 22 #include <net/net_namespace.h> 23 23 #include <linux/coredump.h> 24 + #include <linux/rhashtable.h> 24 25 #include <linux/xattr.h> 26 + #include <linux/cookie.h> 25 27 26 28 #include "internal.h" 27 29 #include "mount.h" ··· 57 55 __u32 coredump_signal; 58 56 }; 59 57 60 - static struct rb_root pidfs_ino_tree = RB_ROOT; 58 + static struct rhashtable pidfs_ino_ht; 59 + 60 + static const struct rhashtable_params pidfs_ino_ht_params = { 61 + .key_offset = offsetof(struct pid, ino), 62 + .key_len = sizeof(u64), 63 + .head_offset = offsetof(struct pid, pidfs_hash), 64 + .automatic_shrinking = true, 65 + }; 66 + 67 + /* 68 + * inode number handling 69 + * 70 + * On 64 bit nothing special happens. The 64bit number assigned 71 + * to struct pid is the inode number. 72 + * 73 + * On 32 bit the 64 bit number assigned to struct pid is split 74 + * into two 32 bit numbers. The lower 32 bits are used as the 75 + * inode number and the upper 32 bits are used as the inode 76 + * generation number. 77 + * 78 + * On 32 bit pidfs_ino() will return the lower 32 bit. When 79 + * pidfs_ino() returns zero a wrap around happened. When a 80 + * wraparound happens the 64 bit number will be incremented by 1 81 + * so inode numbering starts at 1 again. 82 + * 83 + * On 64 bit comparing two pidfds is as simple as comparing 84 + * inode numbers. 85 + * 86 + * When a wraparound happens on 32 bit multiple pidfds with the 87 + * same inode number are likely to exist (This isn't a problem 88 + * since before pidfs pidfds used the anonymous inode meaning 89 + * all pidfds had the same inode number.). Userspace can 90 + * reconstruct the 64 bit identifier by retrieving both the 91 + * inode number and the inode generation number to compare or 92 + * use file handles. 
93 + */ 61 94 62 95 #if BITS_PER_LONG == 32 96 + 97 + DEFINE_SPINLOCK(pidfs_ino_lock); 98 + static u64 pidfs_ino_nr = 1; 99 + 63 100 static inline unsigned long pidfs_ino(u64 ino) 64 101 { 65 102 return lower_32_bits(ino); ··· 108 67 static inline u32 pidfs_gen(u64 ino) 109 68 { 110 69 return upper_32_bits(ino); 70 + } 71 + 72 + static inline u64 pidfs_alloc_ino(void) 73 + { 74 + u64 ino; 75 + 76 + spin_lock(&pidfs_ino_lock); 77 + if (pidfs_ino(pidfs_ino_nr) == 0) 78 + pidfs_ino_nr++; 79 + ino = pidfs_ino_nr++; 80 + spin_unlock(&pidfs_ino_lock); 81 + return ino; 111 82 } 112 83 113 84 #else ··· 135 82 { 136 83 return 0; 137 84 } 138 - #endif 139 85 140 - static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) 86 + DEFINE_COOKIE(pidfs_ino_cookie); 87 + 88 + static u64 pidfs_alloc_ino(void) 141 89 { 142 - struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); 143 - struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); 144 - u64 pid_ino_a = pid_a->ino; 145 - u64 pid_ino_b = pid_b->ino; 90 + u64 ino; 146 91 147 - if (pid_ino_a < pid_ino_b) 148 - return -1; 149 - if (pid_ino_a > pid_ino_b) 150 - return 1; 151 - return 0; 92 + preempt_disable(); 93 + ino = gen_cookie_next(&pidfs_ino_cookie); 94 + preempt_enable(); 95 + 96 + VFS_WARN_ON_ONCE(ino < 1); 97 + return ino; 152 98 } 153 99 154 - void pidfs_add_pid(struct pid *pid) 100 + #endif 101 + 102 + void pidfs_prepare_pid(struct pid *pid) 155 103 { 156 - static u64 pidfs_ino_nr = 2; 157 - 158 - /* 159 - * On 64 bit nothing special happens. The 64bit number assigned 160 - * to struct pid is the inode number. 161 - * 162 - * On 32 bit the 64 bit number assigned to struct pid is split 163 - * into two 32 bit numbers. The lower 32 bits are used as the 164 - * inode number and the upper 32 bits are used as the inode 165 - * generation number. 166 - * 167 - * On 32 bit pidfs_ino() will return the lower 32 bit. When 168 - * pidfs_ino() returns zero a wrap around happened. 
When a 169 - * wraparound happens the 64 bit number will be incremented by 2 170 - * so inode numbering starts at 2 again. 171 - * 172 - * On 64 bit comparing two pidfds is as simple as comparing 173 - * inode numbers. 174 - * 175 - * When a wraparound happens on 32 bit multiple pidfds with the 176 - * same inode number are likely to exist (This isn't a problem 177 - * since before pidfs pidfds used the anonymous inode meaning 178 - * all pidfds had the same inode number.). Userspace can 179 - * reconstruct the 64 bit identifier by retrieving both the 180 - * inode number and the inode generation number to compare or 181 - * use file handles. 182 - */ 183 - if (pidfs_ino(pidfs_ino_nr) == 0) 184 - pidfs_ino_nr += 2; 185 - 186 - pid->ino = pidfs_ino_nr; 187 104 pid->stashed = NULL; 188 105 pid->attr = NULL; 189 - pidfs_ino_nr++; 106 + pid->ino = 0; 107 + } 190 108 191 - write_seqcount_begin(&pidmap_lock_seq); 192 - rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); 193 - write_seqcount_end(&pidmap_lock_seq); 109 + int pidfs_add_pid(struct pid *pid) 110 + { 111 + int ret; 112 + 113 + pid->ino = pidfs_alloc_ino(); 114 + ret = rhashtable_insert_fast(&pidfs_ino_ht, &pid->pidfs_hash, 115 + pidfs_ino_ht_params); 116 + if (unlikely(ret)) 117 + pid->ino = 0; 118 + return ret; 194 119 } 195 120 196 121 void pidfs_remove_pid(struct pid *pid) 197 122 { 198 - write_seqcount_begin(&pidmap_lock_seq); 199 - rb_erase(&pid->pidfs_node, &pidfs_ino_tree); 200 - write_seqcount_end(&pidmap_lock_seq); 123 + if (likely(pid->ino)) 124 + rhashtable_remove_fast(&pidfs_ino_ht, &pid->pidfs_hash, 125 + pidfs_ino_ht_params); 201 126 } 202 127 203 128 void pidfs_free_pid(struct pid *pid) ··· 446 415 * the fields are set correctly, or return ESRCH to avoid providing 447 416 * incomplete information. 
*/ 448 417 449 - kinfo.ppid = task_ppid_nr_ns(task, NULL); 418 + kinfo.ppid = task_ppid_vnr(task); 450 419 kinfo.tgid = task_tgid_vnr(task); 451 420 kinfo.pid = task_pid_vnr(task); 452 421 kinfo.mask |= PIDFD_INFO_PID; ··· 822 791 return FILEID_KERNFS; 823 792 } 824 793 825 - static int pidfs_ino_find(const void *key, const struct rb_node *node) 826 - { 827 - const u64 pid_ino = *(u64 *)key; 828 - const struct pid *pid = rb_entry(node, struct pid, pidfs_node); 829 - 830 - if (pid_ino < pid->ino) 831 - return -1; 832 - if (pid_ino > pid->ino) 833 - return 1; 834 - return 0; 835 - } 836 - 837 794 /* Find a struct pid based on the inode number. */ 838 795 static struct pid *pidfs_ino_get_pid(u64 ino) 839 796 { 840 797 struct pid *pid; 841 - struct rb_node *node; 842 - unsigned int seq; 798 + struct pidfs_attr *attr; 843 799 844 800 guard(rcu)(); 845 - do { 846 - seq = read_seqcount_begin(&pidmap_lock_seq); 847 - node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); 848 - if (node) 849 - break; 850 - } while (read_seqcount_retry(&pidmap_lock_seq, seq)); 851 - 852 - if (!node) 801 + pid = rhashtable_lookup(&pidfs_ino_ht, &ino, pidfs_ino_ht_params); 802 + if (!pid) 853 803 return NULL; 854 - 855 - pid = rb_entry(node, struct pid, pidfs_node); 856 - 804 + attr = READ_ONCE(pid->attr); 805 + if (IS_ERR_OR_NULL(attr)) 806 + return NULL; 807 + if (test_bit(PIDFS_ATTR_BIT_EXIT, &attr->attr_mask)) 808 + return NULL; 857 809 /* Within our pid namespace hierarchy? */ 858 810 if (pid_vnr(pid) == 0) 859 811 return NULL; 860 - 861 812 return get_pid(pid); 862 813 } 863 814 ··· 1117 1104 1118 1105 void __init pidfs_init(void) 1119 1106 { 1107 + if (rhashtable_init(&pidfs_ino_ht, &pidfs_ino_ht_params)) 1108 + panic("Failed to initialize pidfs hashtable"); 1109 + 1120 1110 pidfs_attr_cachep = kmem_cache_create("pidfs_attr_cache", sizeof(struct pidfs_attr), 0, 1121 1111 (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | 1122 1112 SLAB_ACCOUNT | SLAB_PANIC), NULL);
+7 -2
include/linux/pid.h
··· 6 6 #include <linux/rculist.h> 7 7 #include <linux/rcupdate.h> 8 8 #include <linux/refcount.h> 9 + #include <linux/rhashtable-types.h> 9 10 #include <linux/sched.h> 10 11 #include <linux/wait.h> 11 12 ··· 61 60 spinlock_t lock; 62 61 struct { 63 62 u64 ino; 64 - struct rb_node pidfs_node; 63 + struct rhash_head pidfs_hash; 65 64 struct dentry *stashed; 66 65 struct pidfs_attr *attr; 67 66 }; ··· 74 73 struct upid numbers[]; 75 74 }; 76 75 77 - extern seqcount_spinlock_t pidmap_lock_seq; 78 76 extern struct pid init_struct_pid; 79 77 80 78 struct file; ··· 308 308 rcu_read_unlock(); 309 309 310 310 return pid; 311 + } 312 + 313 + static inline pid_t task_ppid_vnr(const struct task_struct *tsk) 314 + { 315 + return task_ppid_nr_ns(tsk, NULL); 311 316 } 312 317 313 318 static inline pid_t task_ppid_nr(const struct task_struct *tsk)
+7 -7
include/linux/pid_namespace.h
··· 27 27 struct idr idr; 28 28 struct rcu_head rcu; 29 29 unsigned int pid_allocated; 30 + #ifdef CONFIG_SYSCTL 31 + #if defined(CONFIG_MEMFD_CREATE) 32 + int memfd_noexec_scope; 33 + #endif 34 + struct ctl_table_set set; 35 + struct ctl_table_header *sysctls; 36 + #endif 30 37 struct task_struct *child_reaper; 31 38 struct kmem_cache *pid_cachep; 32 39 unsigned int level; ··· 47 40 int reboot; /* group exit code if this pidns was rebooted */ 48 41 struct ns_common ns; 49 42 struct work_struct work; 50 - #ifdef CONFIG_SYSCTL 51 - struct ctl_table_set set; 52 - struct ctl_table_header *sysctls; 53 - #if defined(CONFIG_MEMFD_CREATE) 54 - int memfd_noexec_scope; 55 - #endif 56 - #endif 57 43 } __randomize_layout; 58 44 59 45 extern struct pid_namespace init_pid_ns;
+2 -1
include/linux/pidfs.h
··· 6 6 7 7 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); 8 8 void __init pidfs_init(void); 9 - void pidfs_add_pid(struct pid *pid); 9 + void pidfs_prepare_pid(struct pid *pid); 10 + int pidfs_add_pid(struct pid *pid); 10 11 void pidfs_remove_pid(struct pid *pid); 11 12 void pidfs_exit(struct task_struct *tsk); 12 13 #ifdef CONFIG_COREDUMP
+1 -2
ipc/mqueue.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 1 2 /* 2 3 * POSIX message queues filesystem for Linux. 3 4 * ··· 10 9 * Manfred Spraul (manfred@colorfullife.com) 11 10 * 12 11 * Audit: George Wilson (ltcgcw@us.ibm.com) 13 - * 14 - * This file is released under the GPL. 15 12 */ 16 13 17 14 #include <linux/capability.h>
+9 -6
kernel/pid.c
··· 43 43 #include <linux/sched/task.h> 44 44 #include <linux/idr.h> 45 45 #include <linux/pidfs.h> 46 - #include <linux/seqlock.h> 47 46 #include <net/sock.h> 48 47 #include <uapi/linux/pidfd.h> 49 48 ··· 84 85 EXPORT_SYMBOL_GPL(init_pid_ns); 85 86 86 87 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 87 - seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); 88 88 89 89 void put_pid(struct pid *pid) 90 90 { ··· 139 141 140 142 idr_remove(&ns->idr, upid->nr); 141 143 } 142 - pidfs_remove_pid(pid); 143 144 spin_unlock(&pidmap_lock); 144 145 146 + pidfs_remove_pid(pid); 145 147 call_rcu(&pid->rcu, delayed_put_pid); 146 148 } 147 149 ··· 198 200 INIT_HLIST_HEAD(&pid->tasks[type]); 199 201 init_waitqueue_head(&pid->wait_pidfd); 200 202 INIT_HLIST_HEAD(&pid->inodes); 203 + pidfs_prepare_pid(pid); 201 204 202 205 /* 203 206 * 2. perm check checkpoint_restore_ns_capable() ··· 315 316 retval = -ENOMEM; 316 317 if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) 317 318 goto out_free; 318 - pidfs_add_pid(pid); 319 319 for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { 320 320 /* Make the PID visible to find_pid_ns. */ 321 321 idr_replace(&upid->ns->idr, pid, upid->nr); ··· 323 325 spin_unlock(&pidmap_lock); 324 326 idr_preload_end(); 325 327 ns_ref_active_get(ns); 328 + 329 + retval = pidfs_add_pid(pid); 330 + if (unlikely(retval)) { 331 + free_pid(pid); 332 + pid = ERR_PTR(-ENOMEM); 333 + } 326 334 327 335 return pid; 328 336 ··· 558 554 rcu_read_lock(); 559 555 if (!ns) 560 556 ns = task_active_pid_ns(current); 561 - if (ns) 562 - nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); 557 + nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); 563 558 rcu_read_unlock(); 564 559 565 560 return nr;