Merge patch series "further damage-control lack of clone scalability"

Mateusz Guzik <mjguzik@gmail.com> says:

When spawning and killing threads in separate processes in parallel the
primary bottleneck on the stock kernel is pidmap_lock, largely because
of a back-to-back acquire in the common case.

Benchmark code at the end.

With this patchset alloc_pid() only takes the lock once and consequently
alleviates the problem. While scalability improves, the lock remains the
primary bottleneck by a large margin.

I believe idr is a poor choice for the task at hand to begin with, but
sorting out that out beyond the scope of this patchset. At the same time
any replacement would be best evaluated against a state where the
above relock problem is fixed.

Performance improvement varies between reboots. When benchmarking with
20 processes creating and killing threads in a loop, the unpatched
baseline hovers around 465k ops/s, while patched is anything between
~510k ops/s and ~560k depending on false-sharing (which I only minimally
sanitized). So this is at least 10% if you are unlucky.

bench from will-it-scale:

char *testcase_description = "Thread creation and teardown";

static void *worker(void *arg)
{
return (NULL);
}

void testcase(unsigned long long *iterations, unsigned long nr)
{
pthread_t thread[1];
int error;

while (1) {
for (int i = 0; i < 1; i++) {
error = pthread_create(&thread[i], NULL, worker, NULL);
assert(error == 0);
}
for (int i = 0; i < 1; i++) {
error = pthread_join(thread[i], NULL);
assert(error == 0);
}
(*iterations)++;
}
}

v2:
- cosmetic fixes from Oleg
- drop idr_preload_many, relock pidmap + call idr_preload again instead
- write a commit message for the alloc pid patch

* patches from https://patch.msgid.link/20251203092851.287617-1-mjguzik@gmail.com:
pid: only take pidmap_lock once on alloc
ns: pad refcount

Link: https://patch.msgid.link/20251203092851.287617-1-mjguzik@gmail.com
Signed-off-by: Christian Brauner <brauner@kernel.org>

Christian Brauner 6 months ago 961b2ad1 887e9774

+88 -47

2 changed files

expand all

include

linux

ns_common_types.h

kernel

pid.c

+3 -1

include/linux/ns/ns_common_types.h

··· 108 108 * @ns_tree: namespace tree nodes and active reference count 109 109 */ 110 110 struct ns_common { 111 + struct { 112 + refcount_t __ns_ref; /* do not use directly */ 113 + } ____cacheline_aligned_in_smp; 111 114 u32 ns_type; 112 115 struct dentry *stashed; 113 116 const struct proc_ns_operations *ops; 114 117 unsigned int inum; 115 - refcount_t __ns_ref; /* do not use directly */ 116 118 union { 117 119 struct ns_tree; 118 120 struct rcu_head ns_rcu;

+85 -46

kernel/pid.c

··· 159 159 free_pid(pids[tmp]); 160 160 } 161 161 162 - struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, 163 - size_t set_tid_size) 162 + struct pid *alloc_pid(struct pid_namespace *ns, pid_t *arg_set_tid, 163 + size_t arg_set_tid_size) 164 164 { 165 + int set_tid[MAX_PID_NS_LEVEL + 1] = {}; 166 + int pid_max[MAX_PID_NS_LEVEL + 1] = {}; 165 167 struct pid *pid; 166 168 enum pid_type type; 167 169 int i, nr; 168 170 struct pid_namespace *tmp; 169 171 struct upid *upid; 170 172 int retval = -ENOMEM; 173 + bool retried_preload; 171 174 172 175 /* 173 - * set_tid_size contains the size of the set_tid array. Starting at 176 + * arg_set_tid_size contains the size of the arg_set_tid array. Starting at 174 177 * the most nested currently active PID namespace it tells alloc_pid() 175 178 * which PID to set for a process in that most nested PID namespace 176 - * up to set_tid_size PID namespaces. It does not have to set the PID 177 - * for a process in all nested PID namespaces but set_tid_size must 179 + * up to arg_set_tid_size PID namespaces. It does not have to set the PID 180 + * for a process in all nested PID namespaces but arg_set_tid_size must 178 181 * never be greater than the current ns->level + 1. 179 182 */ 180 - if (set_tid_size > ns->level + 1) 183 + if (arg_set_tid_size > ns->level + 1) 181 184 return ERR_PTR(-EINVAL); 182 185 186 + /* 187 + * Prep before we take locks: 188 + * 189 + * 1. allocate and fill in pid struct 190 + */ 183 191 pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); 184 192 if (!pid) 185 193 return ERR_PTR(retval); 186 194 187 - tmp = ns; 195 + get_pid_ns(ns); 188 196 pid->level = ns->level; 197 + refcount_set(&pid->count, 1); 198 + spin_lock_init(&pid->lock); 199 + for (type = 0; type < PIDTYPE_MAX; ++type) 200 + INIT_HLIST_HEAD(&pid->tasks[type]); 201 + init_waitqueue_head(&pid->wait_pidfd); 202 + INIT_HLIST_HEAD(&pid->inodes); 189 203 190 - for (i = ns->level; i >= 0; i--) { 191 - int tid = 0; 192 - int pid_max = READ_ONCE(tmp->pid_max); 204 + /* 205 + * 2. perm check checkpoint_restore_ns_capable() 206 + * 207 + * This stores found pid_max to make sure the used value is the same should 208 + * later code need it. 209 + */ 210 + for (tmp = ns, i = ns->level; i >= 0; i--) { 211 + pid_max[ns->level - i] = READ_ONCE(tmp->pid_max); 193 212 194 - if (set_tid_size) { 195 - tid = set_tid[ns->level - i]; 213 + if (arg_set_tid_size) { 214 + int tid = set_tid[ns->level - i] = arg_set_tid[ns->level - i]; 196 215 197 216 retval = -EINVAL; 198 - if (tid < 1 || tid >= pid_max) 199 - goto out_free; 217 + if (tid < 1 || tid >= pid_max[ns->level - i]) 218 + goto out_abort; 200 219 /* 201 220 * Also fail if a PID != 1 is requested and 202 221 * no PID 1 exists. 203 222 */ 204 223 if (tid != 1 && !tmp->child_reaper) 205 - goto out_free; 224 + goto out_abort; 206 225 retval = -EPERM; 207 226 if (!checkpoint_restore_ns_capable(tmp->user_ns)) 208 - goto out_free; 209 - set_tid_size--; 227 + goto out_abort; 228 + arg_set_tid_size--; 210 229 } 211 230 212 - idr_preload(GFP_KERNEL); 213 - spin_lock(&pidmap_lock); 231 + tmp = tmp->parent; 232 + } 233 + 234 + /* 235 + * Prep is done, id allocation goes here: 236 + */ 237 + retried_preload = false; 238 + idr_preload(GFP_KERNEL); 239 + spin_lock(&pidmap_lock); 240 + for (tmp = ns, i = ns->level; i >= 0;) { 241 + int tid = set_tid[ns->level - i]; 214 242 215 243 if (tid) { 216 244 nr = idr_alloc(&tmp->idr, NULL, tid, ··· 248 220 * alreay in use. Return EEXIST in that case. 249 221 */ 250 222 if (nr == -ENOSPC) 223 + 251 224 nr = -EEXIST; 252 225 } else { 253 226 int pid_min = 1; ··· 264 235 * a partially initialized PID (see below). 265 236 */ 266 237 nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, 267 - pid_max, GFP_ATOMIC); 238 + pid_max[ns->level - i], GFP_ATOMIC); 239 + if (nr == -ENOSPC) 240 + nr = -EAGAIN; 268 241 } 269 - spin_unlock(&pidmap_lock); 270 - idr_preload_end(); 271 242 272 - if (nr < 0) { 273 - retval = (nr == -ENOSPC) ? -EAGAIN : nr; 243 + if (unlikely(nr < 0)) { 244 + /* 245 + * Preload more memory if idr_alloc{,cyclic} failed with -ENOMEM. 246 + * 247 + * The IDR API only allows us to preload memory for one call, while we may end 248 + * up doing several under pidmap_lock with GFP_ATOMIC. The situation may be 249 + * salvageable with GFP_KERNEL. But make sure to not loop indefinitely if preload 250 + * did not help (the routine unfortunately returns void, so we have no idea 251 + * if it got anywhere). 252 + * 253 + * The lock can be safely dropped and picked up as historically pid allocation 254 + * for different namespaces was *not* atomic -- we try to hold on to it the 255 + * entire time only for performance reasons. 256 + */ 257 + if (nr == -ENOMEM && !retried_preload) { 258 + spin_unlock(&pidmap_lock); 259 + idr_preload_end(); 260 + retried_preload = true; 261 + idr_preload(GFP_KERNEL); 262 + spin_lock(&pidmap_lock); 263 + continue; 264 + } 265 + retval = nr; 274 266 goto out_free; 275 267 } 276 268 277 269 pid->numbers[i].nr = nr; 278 270 pid->numbers[i].ns = tmp; 279 271 tmp = tmp->parent; 272 + i--; 273 + retried_preload = false; 280 274 } 281 275 282 276 /* ··· 309 257 * is what we have exposed to userspace for a long time and it is 310 258 * documented behavior for pid namespaces. So we can't easily 311 259 * change it even if there were an error code better suited. 260 + * 261 + * This can't be done earlier because we need to preserve other 262 + * error conditions. 312 263 */ 313 264 retval = -ENOMEM; 314 - 315 - get_pid_ns(ns); 316 - refcount_set(&pid->count, 1); 317 - spin_lock_init(&pid->lock); 318 - for (type = 0; type < PIDTYPE_MAX; ++type) 319 - INIT_HLIST_HEAD(&pid->tasks[type]); 320 - 321 - init_waitqueue_head(&pid->wait_pidfd); 322 - INIT_HLIST_HEAD(&pid->inodes); 323 - 324 - upid = pid->numbers + ns->level; 325 - idr_preload(GFP_KERNEL); 326 - spin_lock(&pidmap_lock); 327 - if (!(ns->pid_allocated & PIDNS_ADDING)) 328 - goto out_unlock; 265 + if (unlikely(!(ns->pid_allocated & PIDNS_ADDING))) 266 + goto out_free; 329 267 pidfs_add_pid(pid); 330 - for ( ; upid >= pid->numbers; --upid) { 268 + for (upid = pid->numbers + ns->level; upid >= pid->numbers; --upid) { 331 269 /* Make the PID visible to find_pid_ns. */ 332 270 idr_replace(&upid->ns->idr, pid, upid->nr); 333 271 upid->ns->pid_allocated++; ··· 328 286 329 287 return pid; 330 288 331 - out_unlock: 332 - spin_unlock(&pidmap_lock); 333 - idr_preload_end(); 334 - put_pid_ns(ns); 335 - 336 289 out_free: 337 - spin_lock(&pidmap_lock); 338 290 while (++i <= ns->level) { 339 291 upid = pid->numbers + i; 340 292 idr_remove(&upid->ns->idr, upid->nr); ··· 339 303 idr_set_cursor(&ns->idr, 0); 340 304 341 305 spin_unlock(&pidmap_lock); 306 + idr_preload_end(); 342 307 308 + out_abort: 309 + put_pid_ns(ns); 343 310 kmem_cache_free(ns->pid_cachep, pid); 344 311 return ERR_PTR(retval); 345 312 }

Configure Feed

Configure Feed