Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge patch series "ns: fixes for namespace iteration and active reference counting"

Christian Brauner <brauner@kernel.org> says:

* Make sure to initialize the active reference count for the initial
network namespace and prevent __ns_common_init() from returning too
early.

* Make sure that passive reference counts are dropped outside of rcu
read locks as some namespaces such as the mount namespace do in fact
sleep when putting the last reference.

* The setns() system call supports:

(1) namespace file descriptors (nsfd)
(2) process file descriptors (pidfd)

When using nsfds the namespaces will remain active because they are
pinned by the vfs. However, when pidfds are used things are more
complicated.

When the target task exits and passes through exit_nsproxy_namespaces()
or is reaped and thus also passes through exit_cred_namespaces() after
the setns()'ing task has called prepare_nsset() but before it has called
commit_nsset(), the active reference count of the set of namespaces it
wants to setns() to might have been dropped already:

P1 P2

pid_p1 = clone(CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
pidfd = pidfd_open(pid_p1)
setns(pidfd, CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWNS)
prepare_nsset()

exit(0)
// ns->__ns_active_ref == 1
// parent_ns->__ns_active_ref == 1
-> exit_nsproxy_namespaces()
-> exit_cred_namespaces()

// ns_active_ref_put() will also put
// the reference on the owner of the
// namespace. If the only reason the
// owning namespace was alive was
// because it was a parent of @ns
// its active reference count now goes
// to zero... --------------------------------
// |
// ns->__ns_active_ref == 0 |
// parent_ns->__ns_active_ref == 0 |
| commit_nsset()
-----------------> // If setns()
// now manages to install the namespaces
// it will call ns_active_ref_get()
// on them thus bumping the active reference
// count from zero again but without also
// taking the required reference on the owner.
// Thus we get:
//
// ns->__ns_active_ref == 1
// parent_ns->__ns_active_ref == 0

When later someone does ns_active_ref_put() on @ns it will underflow
parent_ns->__ns_active_ref leading to a splat from our asserts
thinking there are still active references when in fact the counter
just underflowed.

So resurrect the ownership chain if necessary as well. If the caller
succeeded in grabbing passive references to the set of namespaces the
setns() should simply succeed even if the target task exits or gets
reaped in the meantime.

The race is rare and can only be triggered when using pidfds to setns()
to namespaces. Also note that active references on initial namespaces are
no-ops.

Since we now always handle parent references directly we can drop
ns_ref_active_get_owner() when adding a namespace to a namespace tree.
This is now all handled uniformly in the places where the new namespaces
actually become active.

* patches from https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org:
selftests/namespaces: test for efault
selftests/namespaces: add active reference count regression test
ns: add asserts for active refcount underflow
ns: handle setns(pidfd, ...) cleanly
ns: return EFAULT on put_user() error
ns: make sure reference are dropped outside of rcu lock
ns: don't increment or decrement initial namespaces
ns: don't skip active reference count initialization

Link: https://patch.msgid.link/20251109-namespace-6-19-fixes-v1-0-ae8a4ad5a3b3@kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>

+724 -74
+1 -1
fs/nsfs.c
··· 430 430 * ioctl on such a socket will resurrect the relevant namespace 431 431 * subtree. 432 432 */ 433 - __ns_ref_active_resurrect(ns); 433 + __ns_ref_active_get(ns); 434 434 return 0; 435 435 } 436 436
+13 -36
include/linux/ns_common.h
··· 141 141 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); 142 142 } 143 143 144 + static __always_inline bool is_ns_init_id(const struct ns_common *ns) 145 + { 146 + VFS_WARN_ON_ONCE(ns->ns_id == 0); 147 + return ns->ns_id <= NS_LAST_INIT_ID; 148 + } 149 + 144 150 #define to_ns_common(__ns) \ 145 151 _Generic((__ns), \ 146 152 struct cgroup_namespace *: &(__ns)->ns, \ ··· 287 281 #define ns_ref_active_read(__ns) \ 288 282 ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) 289 283 290 - void __ns_ref_active_get_owner(struct ns_common *ns); 284 + void __ns_ref_active_put(struct ns_common *ns); 291 285 292 - static __always_inline void __ns_ref_active_get(struct ns_common *ns) 293 - { 294 - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); 295 - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); 296 - } 297 - #define ns_ref_active_get(__ns) \ 298 - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) 299 - 300 - static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) 301 - { 302 - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { 303 - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 304 - return true; 305 - } 306 - return false; 307 - } 308 - 309 - #define ns_ref_active_get_owner(__ns) \ 310 - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) 311 - 312 - void __ns_ref_active_put_owner(struct ns_common *ns); 313 - 314 - static __always_inline void __ns_ref_active_put(struct ns_common *ns) 315 - { 316 - if (atomic_dec_and_test(&ns->__ns_ref_active)) { 317 - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); 318 - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 319 - __ns_ref_active_put_owner(ns); 320 - } 321 - } 322 286 #define ns_ref_active_put(__ns) \ 323 287 do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) 324 288 325 289 static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) 326 290 { 327 - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && 
!__ns_ref_read(ns)); 328 - if (!__ns_ref_active_read(ns)) 291 + if (!__ns_ref_active_read(ns)) { 292 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 329 293 return NULL; 294 + } 330 295 if (!__ns_ref_get(ns)) 331 296 return NULL; 332 297 return ns; 333 298 } 334 299 335 - void __ns_ref_active_resurrect(struct ns_common *ns); 300 + void __ns_ref_active_get(struct ns_common *ns); 336 301 337 - #define ns_ref_active_resurrect(__ns) \ 338 - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) 302 + #define ns_ref_active_get(__ns) \ 303 + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) 339 304 340 305 #endif
+35 -17
kernel/nscommon.c
··· 54 54 55 55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 56 56 { 57 - int ret; 57 + int ret = 0; 58 58 59 59 refcount_set(&ns->__ns_ref, 1); 60 60 ns->stashed = NULL; ··· 74 74 ns_debug(ns, ops); 75 75 #endif 76 76 77 - if (inum) { 77 + if (inum) 78 78 ns->inum = inum; 79 - return 0; 80 - } 81 - ret = proc_alloc_inum(&ns->inum); 79 + else 80 + ret = proc_alloc_inum(&ns->inum); 82 81 if (ret) 83 82 return ret; 84 83 /* ··· 112 113 if (owner == &init_user_ns) 113 114 return NULL; 114 115 return to_ns_common(owner); 115 - } 116 - 117 - void __ns_ref_active_get_owner(struct ns_common *ns) 118 - { 119 - ns = ns_owner(ns); 120 - if (ns) 121 - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); 122 116 } 123 117 124 118 /* ··· 164 172 * The iteration stops once we reach a namespace that still has active 165 173 * references. 166 174 */ 167 - void __ns_ref_active_put_owner(struct ns_common *ns) 175 + void __ns_ref_active_put(struct ns_common *ns) 168 176 { 177 + /* Initial namespaces are always active. */ 178 + if (is_ns_init_id(ns)) 179 + return; 180 + 181 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 182 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 183 + return; 184 + } 185 + 186 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 187 + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 188 + 169 189 for (;;) { 170 190 ns = ns_owner(ns); 171 191 if (!ns) 172 192 return; 173 - if (!atomic_dec_and_test(&ns->__ns_ref_active)) 193 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 194 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 195 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 174 196 return; 197 + } 175 198 } 176 199 } 177 200 ··· 282 275 * it also needs to take another reference on its owning user namespace 283 276 * and so on. 
284 277 */ 285 - void __ns_ref_active_resurrect(struct ns_common *ns) 278 + void __ns_ref_active_get(struct ns_common *ns) 286 279 { 280 + int prev; 281 + 282 + /* Initial namespaces are always active. */ 283 + if (is_ns_init_id(ns)) 284 + return; 285 + 287 286 /* If we didn't resurrect the namespace we're done. */ 288 - if (atomic_fetch_add(1, &ns->__ns_ref_active)) 287 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 288 + VFS_WARN_ON_ONCE(prev < 0); 289 + if (likely(prev)) 289 290 return; 290 291 291 292 /* ··· 305 290 if (!ns) 306 291 return; 307 292 308 - if (atomic_fetch_add(1, &ns->__ns_ref_active)) 293 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 294 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 295 + VFS_WARN_ON_ONCE(prev < 0); 296 + if (likely(prev)) 309 297 return; 310 298 } 311 299 }
+25 -19
kernel/nstree.c
··· 173 173 write_sequnlock(&ns_tree_lock); 174 174 175 175 VFS_WARN_ON_ONCE(node); 176 - 177 - /* 178 - * Take an active reference on the owner namespace. This ensures 179 - * that the owner remains visible while any of its child namespaces 180 - * are active. For init namespaces this is a no-op as ns_owner() 181 - * returns NULL for namespaces owned by init_user_ns. 182 - */ 183 - __ns_ref_active_get_owner(ns); 184 176 } 185 177 186 178 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) ··· 497 505 return false; 498 506 } 499 507 500 - static void __ns_put(struct ns_common *ns) 508 + static inline void ns_put(struct ns_common *ns) 501 509 { 502 - if (ns->ops) 510 + if (ns && ns->ops) 503 511 ns->ops->put(ns); 504 512 } 505 513 506 - DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) 514 + DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) 507 515 508 516 static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, 509 517 struct ns_common *candidate) ··· 527 535 { 528 536 u64 __user *ns_ids = kls->uns_ids; 529 537 size_t nr_ns_ids = kls->nr_ns_ids; 530 - struct ns_common *ns = NULL, *first_ns = NULL; 538 + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; 531 539 const struct list_head *head; 532 540 ssize_t ret; 533 541 ··· 560 568 561 569 if (!first_ns) 562 570 first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); 571 + 563 572 for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; 564 573 ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { 565 - struct ns_common *valid __free(ns_put); 574 + struct ns_common *valid; 566 575 567 576 valid = legitimize_ns(kls, ns); 568 577 if (!valid) ··· 571 578 572 579 rcu_read_unlock(); 573 580 574 - if (put_user(valid->ns_id, ns_ids + ret)) 575 - return -EINVAL; 581 + ns_put(prev); 582 + prev = valid; 583 + 584 + if (put_user(valid->ns_id, ns_ids + ret)) { 585 + 
ns_put(prev); 586 + return -EFAULT; 587 + } 588 + 576 589 nr_ns_ids--; 577 590 ret++; 578 591 ··· 586 587 } 587 588 588 589 rcu_read_unlock(); 590 + ns_put(prev); 589 591 return ret; 590 592 } 591 593 ··· 668 668 { 669 669 u64 __user *ns_ids = kls->uns_ids; 670 670 size_t nr_ns_ids = kls->nr_ns_ids; 671 - struct ns_common *ns, *first_ns = NULL; 671 + struct ns_common *ns, *first_ns = NULL, *prev = NULL; 672 672 struct ns_tree *ns_tree = NULL; 673 673 const struct list_head *head; 674 674 u32 ns_type; ··· 705 705 706 706 for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; 707 707 ns = next_ns_common(ns, ns_tree)) { 708 - struct ns_common *valid __free(ns_put); 708 + struct ns_common *valid; 709 709 710 710 valid = legitimize_ns(kls, ns); 711 711 if (!valid) ··· 713 713 714 714 rcu_read_unlock(); 715 715 716 - if (put_user(valid->ns_id, ns_ids + ret)) 717 - return -EINVAL; 716 + ns_put(prev); 717 + prev = valid; 718 + 719 + if (put_user(valid->ns_id, ns_ids + ret)) { 720 + ns_put(prev); 721 + return -EFAULT; 722 + } 718 723 719 724 nr_ns_ids--; 720 725 ret++; ··· 728 723 } 729 724 730 725 rcu_read_unlock(); 726 + ns_put(prev); 731 727 return ret; 732 728 } 733 729
+2
tools/testing/selftests/namespaces/.gitignore
··· 4 4 ns_active_ref_test 5 5 listns_test 6 6 listns_permissions_test 7 + listns_efault_test 7 8 siocgskns_test 8 9 cred_change_test 9 10 stress_test 10 11 listns_pagination_bug 12 + regression_pidfd_setns_test
+5 -1
tools/testing/selftests/namespaces/Makefile
··· 8 8 ns_active_ref_test \ 9 9 listns_test \ 10 10 listns_permissions_test \ 11 + listns_efault_test \ 11 12 siocgskns_test \ 12 13 cred_change_test \ 13 14 stress_test \ 14 - listns_pagination_bug 15 + listns_pagination_bug \ 16 + regression_pidfd_setns_test 15 17 16 18 include ../lib.mk 17 19 18 20 $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c 19 21 $(OUTPUT)/listns_test: ../filesystems/utils.c 20 22 $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c 23 + $(OUTPUT)/listns_efault_test: ../filesystems/utils.c 21 24 $(OUTPUT)/siocgskns_test: ../filesystems/utils.c 22 25 $(OUTPUT)/cred_change_test: ../filesystems/utils.c 23 26 $(OUTPUT)/stress_test: ../filesystems/utils.c 24 27 $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c 28 + $(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c 25 29
+530
tools/testing/selftests/namespaces/listns_efault_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <signal.h> 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <linux/nsfs.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/mman.h> 14 + #include <sys/mount.h> 15 + #include <sys/socket.h> 16 + #include <sys/stat.h> 17 + #include <sys/syscall.h> 18 + #include <sys/types.h> 19 + #include <sys/wait.h> 20 + #include <unistd.h> 21 + #include "../kselftest_harness.h" 22 + #include "../filesystems/utils.h" 23 + #include "../pidfd/pidfd.h" 24 + #include "wrappers.h" 25 + 26 + /* 27 + * Test listns() error handling with invalid buffer addresses. 28 + * 29 + * When the buffer pointer is invalid (e.g., crossing page boundaries 30 + * into unmapped memory), listns() returns EINVAL. 31 + * 32 + * This test also creates mount namespaces that get destroyed during 33 + * iteration, testing that namespace cleanup happens outside the RCU 34 + * read lock. 35 + */ 36 + TEST(listns_partial_fault_with_ns_cleanup) 37 + { 38 + void *map; 39 + __u64 *ns_ids; 40 + ssize_t ret; 41 + long page_size; 42 + pid_t pid, iter_pid; 43 + int pidfds[5]; 44 + int sv[5][2]; 45 + int iter_pidfd; 46 + int i, status; 47 + char c; 48 + 49 + page_size = sysconf(_SC_PAGESIZE); 50 + ASSERT_GT(page_size, 0); 51 + 52 + /* 53 + * Map two pages: 54 + * - First page: readable and writable 55 + * - Second page: will be unmapped to trigger EFAULT 56 + */ 57 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 58 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 59 + ASSERT_NE(map, MAP_FAILED); 60 + 61 + /* Unmap the second page */ 62 + ret = munmap((char *)map + page_size, page_size); 63 + ASSERT_EQ(ret, 0); 64 + 65 + /* 66 + * Position the buffer pointer so there's room for exactly one u64 67 + * before the page boundary. The second u64 would fall into the 68 + * unmapped page. 
69 + */ 70 + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 71 + 72 + /* 73 + * Create a separate process to run listns() in a loop concurrently 74 + * with namespace creation and destruction. 75 + */ 76 + iter_pid = create_child(&iter_pidfd, 0); 77 + ASSERT_NE(iter_pid, -1); 78 + 79 + if (iter_pid == 0) { 80 + struct ns_id_req req = { 81 + .size = sizeof(req), 82 + .spare = 0, 83 + .ns_id = 0, 84 + .ns_type = 0, /* All types */ 85 + .spare2 = 0, 86 + .user_ns_id = 0, /* Global listing */ 87 + }; 88 + int iter_ret; 89 + 90 + /* 91 + * Loop calling listns() until killed. 92 + * The kernel should: 93 + * 1. Successfully write the first namespace ID (within valid page) 94 + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 95 + * 3. Handle concurrent namespace destruction without deadlock 96 + */ 97 + while (1) { 98 + iter_ret = sys_listns(&req, ns_ids, 2, 0); 99 + 100 + if (iter_ret == -1 && errno == ENOSYS) 101 + _exit(PIDFD_SKIP); 102 + } 103 + } 104 + 105 + /* Small delay to let iterator start looping */ 106 + usleep(50000); 107 + 108 + /* 109 + * Create several child processes, each in its own mount namespace. 110 + * These will be destroyed while the iterator is running listns(). 
111 + */ 112 + for (i = 0; i < 5; i++) { 113 + /* Create socketpair for synchronization */ 114 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 115 + 116 + pid = create_child(&pidfds[i], CLONE_NEWNS); 117 + ASSERT_NE(pid, -1); 118 + 119 + if (pid == 0) { 120 + close(sv[i][0]); /* Close parent end */ 121 + 122 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 123 + _exit(1); 124 + 125 + /* Child: create a couple of tmpfs mounts */ 126 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 127 + _exit(1); 128 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 129 + _exit(1); 130 + 131 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 132 + _exit(1); 133 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 134 + _exit(1); 135 + 136 + /* Signal parent that setup is complete */ 137 + if (write_nointr(sv[i][1], "R", 1) != 1) 138 + _exit(1); 139 + 140 + /* Wait for parent to signal us to exit */ 141 + if (read_nointr(sv[i][1], &c, 1) != 1) 142 + _exit(1); 143 + 144 + close(sv[i][1]); 145 + _exit(0); 146 + } 147 + 148 + close(sv[i][1]); /* Close child end */ 149 + } 150 + 151 + /* Wait for all children to finish setup */ 152 + for (i = 0; i < 5; i++) { 153 + ret = read_nointr(sv[i][0], &c, 1); 154 + ASSERT_EQ(ret, 1); 155 + ASSERT_EQ(c, 'R'); 156 + } 157 + 158 + /* 159 + * Signal children to exit. This will destroy their mount namespaces 160 + * while listns() is iterating the namespace tree. 161 + * This tests that cleanup happens outside the RCU read lock. 
162 + */ 163 + for (i = 0; i < 5; i++) 164 + write_nointr(sv[i][0], "X", 1); 165 + 166 + /* Wait for all mount namespace children to exit and cleanup */ 167 + for (i = 0; i < 5; i++) { 168 + waitpid(-1, NULL, 0); 169 + close(sv[i][0]); 170 + close(pidfds[i]); 171 + } 172 + 173 + /* Kill iterator and wait for it */ 174 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 175 + ret = waitpid(iter_pid, &status, 0); 176 + ASSERT_EQ(ret, iter_pid); 177 + close(iter_pidfd); 178 + 179 + /* Should have been killed */ 180 + ASSERT_TRUE(WIFSIGNALED(status)); 181 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 182 + 183 + /* Clean up */ 184 + munmap(map, page_size); 185 + } 186 + 187 + /* 188 + * Test listns() error handling when the entire buffer is invalid. 189 + * This is a sanity check that basic invalid pointer detection works. 190 + */ 191 + TEST(listns_complete_fault) 192 + { 193 + struct ns_id_req req = { 194 + .size = sizeof(req), 195 + .spare = 0, 196 + .ns_id = 0, 197 + .ns_type = 0, 198 + .spare2 = 0, 199 + .user_ns_id = 0, 200 + }; 201 + __u64 *ns_ids; 202 + ssize_t ret; 203 + 204 + /* Use a clearly invalid pointer */ 205 + ns_ids = (__u64 *)0xdeadbeef; 206 + 207 + ret = sys_listns(&req, ns_ids, 10, 0); 208 + 209 + if (ret == -1 && errno == ENOSYS) 210 + SKIP(return, "listns() not supported"); 211 + 212 + /* Should fail with EFAULT */ 213 + ASSERT_EQ(ret, -1); 214 + ASSERT_EQ(errno, EFAULT); 215 + } 216 + 217 + /* 218 + * Test listns() error handling when the buffer is NULL. 
219 + */ 220 + TEST(listns_null_buffer) 221 + { 222 + struct ns_id_req req = { 223 + .size = sizeof(req), 224 + .spare = 0, 225 + .ns_id = 0, 226 + .ns_type = 0, 227 + .spare2 = 0, 228 + .user_ns_id = 0, 229 + }; 230 + ssize_t ret; 231 + 232 + /* NULL buffer with non-zero count should fail */ 233 + ret = sys_listns(&req, NULL, 10, 0); 234 + 235 + if (ret == -1 && errno == ENOSYS) 236 + SKIP(return, "listns() not supported"); 237 + 238 + /* Should fail with EFAULT */ 239 + ASSERT_EQ(ret, -1); 240 + ASSERT_EQ(errno, EFAULT); 241 + } 242 + 243 + /* 244 + * Test listns() with a buffer that becomes invalid mid-iteration 245 + * (after several successful writes), combined with mount namespace 246 + * destruction to test RCU cleanup logic. 247 + */ 248 + TEST(listns_late_fault_with_ns_cleanup) 249 + { 250 + void *map; 251 + __u64 *ns_ids; 252 + ssize_t ret; 253 + long page_size; 254 + pid_t pid, iter_pid; 255 + int pidfds[10]; 256 + int sv[10][2]; 257 + int iter_pidfd; 258 + int i, status; 259 + char c; 260 + 261 + page_size = sysconf(_SC_PAGESIZE); 262 + ASSERT_GT(page_size, 0); 263 + 264 + /* Map two pages */ 265 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 266 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 267 + ASSERT_NE(map, MAP_FAILED); 268 + 269 + /* Unmap the second page */ 270 + ret = munmap((char *)map + page_size, page_size); 271 + ASSERT_EQ(ret, 0); 272 + 273 + /* 274 + * Position buffer so we can write several u64s successfully 275 + * before hitting the page boundary. 276 + */ 277 + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 278 + 279 + /* 280 + * Create a separate process to run listns() concurrently. 
281 + */ 282 + iter_pid = create_child(&iter_pidfd, 0); 283 + ASSERT_NE(iter_pid, -1); 284 + 285 + if (iter_pid == 0) { 286 + struct ns_id_req req = { 287 + .size = sizeof(req), 288 + .spare = 0, 289 + .ns_id = 0, 290 + .ns_type = 0, 291 + .spare2 = 0, 292 + .user_ns_id = 0, 293 + }; 294 + int iter_ret; 295 + 296 + /* 297 + * Loop calling listns() until killed. 298 + * Request 10 namespace IDs while namespaces are being destroyed. 299 + * This tests: 300 + * 1. EFAULT handling when buffer becomes invalid 301 + * 2. Namespace cleanup outside RCU read lock during iteration 302 + */ 303 + while (1) { 304 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 305 + 306 + if (iter_ret == -1 && errno == ENOSYS) 307 + _exit(PIDFD_SKIP); 308 + } 309 + } 310 + 311 + /* Small delay to let iterator start looping */ 312 + usleep(50000); 313 + 314 + /* 315 + * Create more children with mount namespaces to increase the 316 + * likelihood that namespace cleanup happens during iteration. 317 + */ 318 + for (i = 0; i < 10; i++) { 319 + /* Create socketpair for synchronization */ 320 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 321 + 322 + pid = create_child(&pidfds[i], CLONE_NEWNS); 323 + ASSERT_NE(pid, -1); 324 + 325 + if (pid == 0) { 326 + close(sv[i][0]); /* Close parent end */ 327 + 328 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 329 + _exit(1); 330 + 331 + /* Child: create tmpfs mounts */ 332 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 333 + _exit(1); 334 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 335 + _exit(1); 336 + 337 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 338 + _exit(1); 339 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 340 + _exit(1); 341 + 342 + /* Signal parent that setup is complete */ 343 + if (write_nointr(sv[i][1], "R", 1) != 1) 344 + _exit(1); 345 + 346 + /* Wait for parent to signal us to exit */ 347 + if (read_nointr(sv[i][1], &c, 1) != 1) 348 + _exit(1); 349 + 350 + 
close(sv[i][1]); 351 + _exit(0); 352 + } 353 + 354 + close(sv[i][1]); /* Close child end */ 355 + } 356 + 357 + /* Wait for all children to finish setup */ 358 + for (i = 0; i < 10; i++) { 359 + ret = read_nointr(sv[i][0], &c, 1); 360 + ASSERT_EQ(ret, 1); 361 + ASSERT_EQ(c, 'R'); 362 + } 363 + 364 + /* Kill half the children */ 365 + for (i = 0; i < 5; i++) 366 + write_nointr(sv[i][0], "X", 1); 367 + 368 + /* Small delay to let some exit */ 369 + usleep(10000); 370 + 371 + /* Kill remaining children */ 372 + for (i = 5; i < 10; i++) 373 + write_nointr(sv[i][0], "X", 1); 374 + 375 + /* Wait for all children and cleanup */ 376 + for (i = 0; i < 10; i++) { 377 + waitpid(-1, NULL, 0); 378 + close(sv[i][0]); 379 + close(pidfds[i]); 380 + } 381 + 382 + /* Kill iterator and wait for it */ 383 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 384 + ret = waitpid(iter_pid, &status, 0); 385 + ASSERT_EQ(ret, iter_pid); 386 + close(iter_pidfd); 387 + 388 + /* Should have been killed */ 389 + ASSERT_TRUE(WIFSIGNALED(status)); 390 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 391 + 392 + /* Clean up */ 393 + munmap(map, page_size); 394 + } 395 + 396 + /* 397 + * Test specifically focused on mount namespace cleanup during EFAULT. 398 + * Filter for mount namespaces only. 
399 + */ 400 + TEST(listns_mnt_ns_cleanup_on_fault) 401 + { 402 + void *map; 403 + __u64 *ns_ids; 404 + ssize_t ret; 405 + long page_size; 406 + pid_t pid, iter_pid; 407 + int pidfds[8]; 408 + int sv[8][2]; 409 + int iter_pidfd; 410 + int i, status; 411 + char c; 412 + 413 + page_size = sysconf(_SC_PAGESIZE); 414 + ASSERT_GT(page_size, 0); 415 + 416 + /* Set up partial fault buffer */ 417 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 418 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 419 + ASSERT_NE(map, MAP_FAILED); 420 + 421 + ret = munmap((char *)map + page_size, page_size); 422 + ASSERT_EQ(ret, 0); 423 + 424 + /* Position for 3 successful writes, then fault */ 425 + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 426 + 427 + /* 428 + * Create a separate process to run listns() concurrently. 429 + */ 430 + iter_pid = create_child(&iter_pidfd, 0); 431 + ASSERT_NE(iter_pid, -1); 432 + 433 + if (iter_pid == 0) { 434 + struct ns_id_req req = { 435 + .size = sizeof(req), 436 + .spare = 0, 437 + .ns_id = 0, 438 + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 439 + .spare2 = 0, 440 + .user_ns_id = 0, 441 + }; 442 + int iter_ret; 443 + 444 + /* 445 + * Loop calling listns() until killed. 446 + * Call listns() to race with namespace destruction. 
447 + */ 448 + while (1) { 449 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 450 + 451 + if (iter_ret == -1 && errno == ENOSYS) 452 + _exit(PIDFD_SKIP); 453 + } 454 + } 455 + 456 + /* Small delay to let iterator start looping */ 457 + usleep(50000); 458 + 459 + /* Create children with mount namespaces */ 460 + for (i = 0; i < 8; i++) { 461 + /* Create socketpair for synchronization */ 462 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 463 + 464 + pid = create_child(&pidfds[i], CLONE_NEWNS); 465 + ASSERT_NE(pid, -1); 466 + 467 + if (pid == 0) { 468 + close(sv[i][0]); /* Close parent end */ 469 + 470 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 471 + _exit(1); 472 + 473 + /* Do some mount operations to make cleanup more interesting */ 474 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 475 + _exit(1); 476 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 477 + _exit(1); 478 + 479 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 480 + _exit(1); 481 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 482 + _exit(1); 483 + 484 + /* Signal parent that setup is complete */ 485 + if (write_nointr(sv[i][1], "R", 1) != 1) 486 + _exit(1); 487 + 488 + /* Wait for parent to signal us to exit */ 489 + if (read_nointr(sv[i][1], &c, 1) != 1) 490 + _exit(1); 491 + 492 + close(sv[i][1]); 493 + _exit(0); 494 + } 495 + 496 + close(sv[i][1]); /* Close child end */ 497 + } 498 + 499 + /* Wait for all children to finish setup */ 500 + for (i = 0; i < 8; i++) { 501 + ret = read_nointr(sv[i][0], &c, 1); 502 + ASSERT_EQ(ret, 1); 503 + ASSERT_EQ(c, 'R'); 504 + } 505 + 506 + /* Kill children to trigger namespace destruction during iteration */ 507 + for (i = 0; i < 8; i++) 508 + write_nointr(sv[i][0], "X", 1); 509 + 510 + /* Wait for children and cleanup */ 511 + for (i = 0; i < 8; i++) { 512 + waitpid(-1, NULL, 0); 513 + close(sv[i][0]); 514 + close(pidfds[i]); 515 + } 516 + 517 + /* Kill iterator and wait for it */ 
518 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 519 + ret = waitpid(iter_pid, &status, 0); 520 + ASSERT_EQ(ret, iter_pid); 521 + close(iter_pidfd); 522 + 523 + /* Should have been killed */ 524 + ASSERT_TRUE(WIFSIGNALED(status)); 525 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 526 + 527 + munmap(map, page_size); 528 + } 529 + 530 + TEST_HARNESS_MAIN
+113
tools/testing/selftests/namespaces/regression_pidfd_setns_test.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <sched.h> 5 + #include <signal.h> 6 + #include <stdio.h> 7 + #include <stdlib.h> 8 + #include <string.h> 9 + #include <sys/socket.h> 10 + #include <unistd.h> 11 + #include "../pidfd/pidfd.h" 12 + #include "../kselftest_harness.h" 13 + 14 + /* 15 + * Regression tests for the setns(pidfd) active reference counting bug. 16 + * 17 + * These tests are based on the reproducers that triggered the race condition 18 + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). 19 + * 20 + * The bug: When using setns() with a pidfd, if the target task exits between 21 + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. 22 + * Then ns_ref_active_get() would increment from 0 without properly resurrecting 23 + * the owner chain, causing active reference count underflows. 24 + */ 25 + 26 + /* 27 + * Simple pidfd setns test using create_child()+unshare(). 28 + * 29 + * Without the fix, this would trigger active refcount warnings when the 30 + * parent exits after doing setns(pidfd) on a child that has already exited. 
31 + */ 32 + TEST(simple_pidfd_setns) 33 + { 34 + pid_t child_pid; 35 + int pidfd = -1; 36 + int ret; 37 + int sv[2]; 38 + char c; 39 + 40 + /* Ignore SIGCHLD for autoreap */ 41 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 42 + 43 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 44 + 45 + /* Create a child process without namespaces initially */ 46 + child_pid = create_child(&pidfd, 0); 47 + ASSERT_GE(child_pid, 0); 48 + 49 + if (child_pid == 0) { 50 + close(sv[0]); 51 + 52 + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { 53 + close(sv[1]); 54 + _exit(1); 55 + } 56 + 57 + /* Signal parent that namespaces are ready */ 58 + if (write_nointr(sv[1], "1", 1) < 0) { 59 + close(sv[1]); 60 + _exit(1); 61 + } 62 + 63 + close(sv[1]); 64 + _exit(0); 65 + } 66 + ASSERT_GE(pidfd, 0); 67 + EXPECT_EQ(close(sv[1]), 0); 68 + 69 + ret = read_nointr(sv[0], &c, 1); 70 + ASSERT_EQ(ret, 1); 71 + EXPECT_EQ(close(sv[0]), 0); 72 + 73 + /* Set to child's namespaces via pidfd */ 74 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 75 + TH_LOG("setns() returned %d", ret); 76 + close(pidfd); 77 + } 78 + 79 + /* 80 + * Simple pidfd setns test using create_child(). 81 + * 82 + * This variation uses create_child() with namespace flags directly. 83 + * Namespaces are created immediately at clone time. 
84 + */ 85 + TEST(simple_pidfd_setns_clone) 86 + { 87 + pid_t child_pid; 88 + int pidfd = -1; 89 + int ret; 90 + 91 + /* Ignore SIGCHLD for autoreap */ 92 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 93 + 94 + /* Create a child process with new namespaces using create_child() */ 95 + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); 96 + ASSERT_GE(child_pid, 0); 97 + 98 + if (child_pid == 0) { 99 + /* Child: sleep for a while so parent can setns to us */ 100 + sleep(2); 101 + _exit(0); 102 + } 103 + 104 + /* Parent: pidfd was already created by create_child() */ 105 + ASSERT_GE(pidfd, 0); 106 + 107 + /* Set to child's namespaces via pidfd */ 108 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 109 + close(pidfd); 110 + TH_LOG("setns() returned %d", ret); 111 + } 112 + 113 + TEST_HARNESS_MAIN