Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull pid_namespace updates from Christian Brauner:

- pid_namespace: make init creation more flexible

Annotate ->child_reaper accesses with {READ,WRITE}_ONCE() to protect
the unlocked readers from cpu/compiler reordering, and enforce that
pid 1 in a pid namespace is always the first allocated pid (the
set_tid path already required this).

On top of that, allow opening pid_for_children before the pid
namespace init has been created. This lets one process create the pid
namespace and a different process create the init via setns(), which
makes clone3(set_tid) usable uniformly in all cases and is particularly
useful to CRIU when restoring nested containers.

A new selftest covers both the basic create-pidns-then-init flow and
the cross-process variant, and a MAINTAINERS entry for the pid
namespace code is added.

- unrelated signal cleanup: update outdated comment for the removed
freezable_schedule()

* tag 'kernel-7.1-rc1.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
signal: update outdated comment for removed freezable_schedule()
MAINTAINERS: add a pid namespace entry
selftests: Add tests for creating pidns init via setns
pid_namespace: allow opening pid_for_children before init was created
pid: check init is created first after idr alloc
pid_namespace: avoid optimization of accesses to ->child_reaper

+269 -24
+9 -2
MAINTAINERS
··· 18191 18191 F: drivers/mtd/nand/ 18192 18192 F: include/linux/mtd/*nand*.h 18193 18193 18194 + NAMESPACES: 18195 + M: Christian Brauner <christian@brauner.io> 18196 + R: Pavel Tikhomirov <ptikhomirov@virtuozzo.com> 18197 + L: linux-kernel@vger.kernel.org 18198 + S: Maintained 18199 + F: rust/kernel/pid_namespace.rs 18200 + F: kernel/pid_namespace.c 18201 + F: tools/testing/selftests/pid_namespace/ 18202 + 18194 18203 NATIONAL INSTRUMENTS SERIAL DRIVER 18195 18204 M: Chaitanya Vadrevu <chaitanya.vadrevu@emerson.com> 18196 18205 L: linux-serial@vger.kernel.org ··· 20813 20804 L: linux-kernel@vger.kernel.org 20814 20805 S: Maintained 20815 20806 T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git 20816 - F: rust/kernel/pid_namespace.rs 20817 20807 F: samples/pidfd/ 20818 20808 F: tools/testing/selftests/clone3/ 20819 - F: tools/testing/selftests/pid_namespace/ 20820 20809 F: tools/testing/selftests/pidfd/ 20821 20810 K: (?i)pidfd 20822 20811 K: (?i)clone3
+2 -1
kernel/exit.c
··· 608 608 609 609 reaper = find_alive_thread(father); 610 610 if (reaper) { 611 - pid_ns->child_reaper = reaper; 611 + ASSERT_EXCLUSIVE_WRITER(pid_ns->child_reaper); 612 + WRITE_ONCE(pid_ns->child_reaper, reaper); 612 613 return reaper; 613 614 } 614 615
+4 -1
kernel/fork.c
··· 2469 2469 init_task_pid(p, PIDTYPE_SID, task_session(current)); 2470 2470 2471 2471 if (is_child_reaper(pid)) { 2472 - ns_of_pid(pid)->child_reaper = p; 2472 + struct pid_namespace *ns = ns_of_pid(pid); 2473 + 2474 + ASSERT_EXCLUSIVE_WRITER(ns->child_reaper); 2475 + WRITE_ONCE(ns->child_reaper, p); 2473 2476 p->signal->flags |= SIGNAL_UNKILLABLE; 2474 2477 } 2475 2478 p->signal->shared_pending.signal = delayed.signal;
+11 -8
kernel/pid.c
··· 128 128 * is the reaper wake up the reaper. The reaper 129 129 * may be sleeping in zap_pid_ns_processes(). 130 130 */ 131 - wake_up_process(ns->child_reaper); 131 + wake_up_process(READ_ONCE(ns->child_reaper)); 132 132 break; 133 133 case PIDNS_ADDING: 134 134 /* Handle a fork failure of the first process */ ··· 215 215 retval = -EINVAL; 216 216 if (tid < 1 || tid >= pid_max[ns->level - i]) 217 217 goto out_abort; 218 - /* 219 - * Also fail if a PID != 1 is requested and 220 - * no PID 1 exists. 221 - */ 222 - if (tid != 1 && !tmp->child_reaper) 223 - goto out_abort; 224 218 retval = -EPERM; 225 219 if (!checkpoint_restore_ns_capable(tmp->user_ns)) 226 220 goto out_abort; ··· 290 296 291 297 pid->numbers[i].nr = nr; 292 298 pid->numbers[i].ns = tmp; 293 - tmp = tmp->parent; 294 299 i--; 295 300 retried_preload = false; 301 + 302 + /* 303 + * PID 1 (init) must be created first. 304 + */ 305 + if (!READ_ONCE(tmp->child_reaper) && nr != 1) { 306 + retval = -EINVAL; 307 + goto out_free; 308 + } 309 + 310 + tmp = tmp->parent; 296 311 } 297 312 298 313 /*
-9
kernel/pid_namespace.c
··· 369 369 } 370 370 task_unlock(task); 371 371 372 - if (ns) { 373 - read_lock(&tasklist_lock); 374 - if (!ns->child_reaper) { 375 - put_pid_ns(ns); 376 - ns = NULL; 377 - } 378 - read_unlock(&tasklist_lock); 379 - } 380 - 381 372 return ns ? &ns->ns : NULL; 382 373 } 383 374
+3 -2
kernel/signal.c
··· 2818 2818 2819 2819 /* 2820 2820 * Do this once, we can't return to user-mode if freezing() == T. 2821 - * do_signal_stop() and ptrace_stop() do freezable_schedule() and 2822 - * thus do not need another check after return. 2821 + * do_signal_stop() and ptrace_stop() set TASK_STOPPED/TASK_TRACED 2822 + * and the freezer handles those states via TASK_FROZEN, thus they 2823 + * do not need another check after return. 2823 2824 */ 2824 2825 try_to_freeze(); 2825 2826
+1
tools/testing/selftests/pid_namespace/.gitignore
··· 1 1 pid_max 2 + pidns_init_via_setns 2 3 regression_enomem
+1 -1
tools/testing/selftests/pid_namespace/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 CFLAGS += -g $(KHDR_INCLUDES) 3 3 4 - TEST_GEN_PROGS = regression_enomem pid_max 4 + TEST_GEN_PROGS = regression_enomem pid_max pidns_init_via_setns 5 5 6 6 LOCAL_HDRS += $(selfdir)/pidfd/pidfd.h 7 7
+238
tools/testing/selftests/pid_namespace/pidns_init_via_setns.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <fcntl.h> 4 + #include <sched.h> 5 + #include <stdio.h> 6 + #include <sys/types.h> 7 + #include <unistd.h> 8 + 9 + #include "kselftest_harness.h" 10 + #include "../pidfd/pidfd.h" 11 + 12 + /* 13 + * Test that a process can become PID 1 (init) in a new PID namespace 14 + * created via unshare() and joined via setns(). 15 + * 16 + * Flow: 17 + * 1. Parent creates a pipe for synchronization. 18 + * 2. Parent forks a child. 19 + * 3. Parent calls unshare(CLONE_NEWPID) to create a new PID namespace. 20 + * 4. Parent signals the child via the pipe. 21 + * 5. Child opens parent's /proc/<ppid>/ns/pid_for_children and calls 22 + * setns(fd, CLONE_NEWPID) to join the new namespace. 23 + * 6. Child forks a grandchild. 24 + * 7. Grandchild verifies getpid() == 1. 25 + */ 26 + TEST(pidns_init_via_setns) 27 + { 28 + pid_t child, parent_pid; 29 + int pipe_fd[2]; 30 + char buf; 31 + 32 + if (geteuid()) 33 + ASSERT_EQ(0, unshare(CLONE_NEWUSER)); 34 + 35 + parent_pid = getpid(); 36 + 37 + ASSERT_EQ(0, pipe(pipe_fd)); 38 + 39 + child = fork(); 40 + ASSERT_GE(child, 0); 41 + 42 + if (child == 0) { 43 + char path[256]; 44 + int nsfd; 45 + pid_t grandchild; 46 + 47 + close(pipe_fd[1]); 48 + 49 + /* Wait for parent to complete unshare */ 50 + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1)); 51 + close(pipe_fd[0]); 52 + 53 + snprintf(path, sizeof(path), 54 + "/proc/%d/ns/pid_for_children", parent_pid); 55 + nsfd = open(path, O_RDONLY); 56 + ASSERT_GE(nsfd, 0); 57 + 58 + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID)); 59 + close(nsfd); 60 + 61 + grandchild = fork(); 62 + ASSERT_GE(grandchild, 0); 63 + 64 + if (grandchild == 0) { 65 + /* Should be init (PID 1) in the new namespace */ 66 + if (getpid() != 1) 67 + _exit(1); 68 + _exit(0); 69 + } 70 + 71 + ASSERT_EQ(0, wait_for_pid(grandchild)); 72 + _exit(0); 73 + } 74 + 75 + close(pipe_fd[0]); 76 + 77 + ASSERT_EQ(0, unshare(CLONE_NEWPID)); 78 + 79 + /* Signal child that 
the new PID namespace is ready */ 80 + buf = 0; 81 + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1)); 82 + close(pipe_fd[1]); 83 + 84 + ASSERT_EQ(0, wait_for_pid(child)); 85 + } 86 + 87 + /* 88 + * Similar to pidns_init_via_setns, but: 89 + * 1. Parent enters a new PID namespace right from the start to be able to 90 + * later freely use pid 1001 in it. 91 + * 2. After forking child, parent also calls unshare(CLONE_NEWUSER) 92 + * before unshare(CLONE_NEWPID) so that new old and new pid namespaces have 93 + * different user namespace owners. 94 + * 3. Child uses clone3() with set_tid={1, 1001} instead of fork() and 95 + * grandchild checks that it gets desired pids . 96 + * 97 + * Flow: 98 + * 1. Test process creates a new PID namespace and forks a wrapper 99 + * (PID 1 in the outer namespace). 100 + * 2. Wrapper forks a child. 101 + * 3. Wrapper calls unshare(CLONE_NEWUSER) + unshare(CLONE_NEWPID) 102 + * to create an inner PID namespace. 103 + * 4. Wrapper signals the child via pipe. 104 + * 5. Child opens wrapper's /proc/<pid>/ns/pid_for_children and calls 105 + * setns(fd, CLONE_NEWPID) to join the inner namespace. 106 + * 6. Child calls clone3() with set_tid={1, 1001}. 107 + * 7. Grandchild verifies its NSpid ends with "1001 1". 
108 + */ 109 + 110 + pid_t set_tid[] = {1, 1001}; 111 + 112 + static int pidns_init_via_setns_set_tid_grandchild(struct __test_metadata *_metadata) 113 + { 114 + char *line = NULL; 115 + size_t len = 0; 116 + int found = 0; 117 + FILE *gf; 118 + 119 + gf = fopen("/proc/self/status", "r"); 120 + ASSERT_NE(gf, NULL); 121 + 122 + while (getline(&line, &len, gf) != -1) { 123 + if (strncmp(line, "NSpid:", 6) != 0) 124 + continue; 125 + 126 + for (int i = 0; i < 2; i++) { 127 + char *last = strrchr(line, '\t'); 128 + pid_t pid; 129 + 130 + ASSERT_NE(last, NULL); 131 + ASSERT_EQ(sscanf(last, "%d", &pid), 1); 132 + ASSERT_EQ(pid, set_tid[i]); 133 + *last = '\0'; 134 + } 135 + 136 + found = true; 137 + break; 138 + } 139 + 140 + free(line); 141 + fclose(gf); 142 + ASSERT_TRUE(found); 143 + return 0; 144 + } 145 + 146 + static int pidns_init_via_setns_set_tid_child(struct __test_metadata *_metadata, 147 + pid_t parent_pid, int pipe_fd[2]) 148 + { 149 + struct __clone_args args = { 150 + .exit_signal = SIGCHLD, 151 + .set_tid = ptr_to_u64(set_tid), 152 + .set_tid_size = 2, 153 + }; 154 + pid_t grandchild; 155 + char path[256]; 156 + char buf; 157 + int nsfd; 158 + 159 + close(pipe_fd[1]); 160 + 161 + ASSERT_EQ(1, read_nointr(pipe_fd[0], &buf, 1)); 162 + close(pipe_fd[0]); 163 + 164 + snprintf(path, sizeof(path), 165 + "/proc/%d/ns/pid_for_children", parent_pid); 166 + nsfd = open(path, O_RDONLY); 167 + ASSERT_GE(nsfd, 0); 168 + 169 + ASSERT_EQ(0, setns(nsfd, CLONE_NEWPID)); 170 + close(nsfd); 171 + 172 + grandchild = sys_clone3(&args, sizeof(args)); 173 + ASSERT_GE(grandchild, 0); 174 + 175 + if (grandchild == 0) 176 + _exit(pidns_init_via_setns_set_tid_grandchild(_metadata)); 177 + 178 + ASSERT_EQ(0, wait_for_pid(grandchild)); 179 + return 0; 180 + } 181 + 182 + static int pidns_init_via_setns_set_tid_wrapper(struct __test_metadata *_metadata) 183 + { 184 + int pipe_fd[2]; 185 + pid_t child, parent_pid; 186 + char buf; 187 + FILE *f; 188 + 189 + /* 190 + * We are PID 1 
inside the new namespace, but /proc is 191 + * mounted from the host. Read our host-visible PID so 192 + * the child can reach our pid_for_children via /proc. 193 + */ 194 + f = fopen("/proc/self/stat", "r"); 195 + ASSERT_NE(f, NULL); 196 + ASSERT_EQ(fscanf(f, "%d", &parent_pid), 1); 197 + ASSERT_EQ(0, pipe(pipe_fd)); 198 + 199 + child = fork(); 200 + ASSERT_GE(child, 0); 201 + 202 + if (child == 0) 203 + _exit(pidns_init_via_setns_set_tid_child(_metadata, parent_pid, pipe_fd)); 204 + 205 + close(pipe_fd[0]); 206 + 207 + ASSERT_EQ(0, unshare(CLONE_NEWUSER)); 208 + ASSERT_EQ(0, unshare(CLONE_NEWPID)); 209 + 210 + buf = 0; 211 + ASSERT_EQ(1, write_nointr(pipe_fd[1], &buf, 1)); 212 + close(pipe_fd[1]); 213 + 214 + ASSERT_EQ(0, wait_for_pid(child)); 215 + 216 + fclose(f); 217 + return 0; 218 + } 219 + 220 + TEST(pidns_init_via_setns_set_tid) 221 + { 222 + pid_t wrapper; 223 + 224 + if (geteuid()) 225 + SKIP(return, "This test needs root to run!"); 226 + 227 + ASSERT_EQ(0, unshare(CLONE_NEWPID)); 228 + 229 + wrapper = fork(); 230 + ASSERT_GE(wrapper, 0); 231 + 232 + if (wrapper == 0) 233 + _exit(pidns_init_via_setns_set_tid_wrapper(_metadata)); 234 + 235 + ASSERT_EQ(0, wait_for_pid(wrapper)); 236 + } 237 + 238 + TEST_HARNESS_MAIN