Merge patch series "ns: fixes for namespace iteration and active reference counting"

+1 -1

fs/nsfs.c

··· 430 430 * ioctl on such a socket will resurrect the relevant namespace 431 431 * subtree. 432 432 */ 433 - __ns_ref_active_resurrect(ns); 433 + __ns_ref_active_get(ns); 434 434 return 0; 435 435 } 436 436

+13 -36

include/linux/ns_common.h

··· 141 141 IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); 142 142 } 143 143 144 + static __always_inline bool is_ns_init_id(const struct ns_common *ns) 145 + { 146 + VFS_WARN_ON_ONCE(ns->ns_id == 0); 147 + return ns->ns_id <= NS_LAST_INIT_ID; 148 + } 149 + 144 150 #define to_ns_common(__ns) \ 145 151 _Generic((__ns), \ 146 152 struct cgroup_namespace *: &(__ns)->ns, \ ··· 287 281 #define ns_ref_active_read(__ns) \ 288 282 ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) 289 283 290 - void __ns_ref_active_get_owner(struct ns_common *ns); 284 + void __ns_ref_active_put(struct ns_common *ns); 291 285 292 - static __always_inline void __ns_ref_active_get(struct ns_common *ns) 293 - { 294 - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); 295 - VFS_WARN_ON_ONCE(is_initial_namespace(ns) && __ns_ref_active_read(ns) <= 0); 296 - } 297 - #define ns_ref_active_get(__ns) \ 298 - do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) 299 - 300 - static __always_inline bool __ns_ref_active_get_not_zero(struct ns_common *ns) 301 - { 302 - if (atomic_inc_not_zero(&ns->__ns_ref_active)) { 303 - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 304 - return true; 305 - } 306 - return false; 307 - } 308 - 309 - #define ns_ref_active_get_owner(__ns) \ 310 - do { if (__ns) __ns_ref_active_get_owner(to_ns_common(__ns)); } while (0) 311 - 312 - void __ns_ref_active_put_owner(struct ns_common *ns); 313 - 314 - static __always_inline void __ns_ref_active_put(struct ns_common *ns) 315 - { 316 - if (atomic_dec_and_test(&ns->__ns_ref_active)) { 317 - VFS_WARN_ON_ONCE(is_initial_namespace(ns)); 318 - VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 319 - __ns_ref_active_put_owner(ns); 320 - } 321 - } 322 286 #define ns_ref_active_put(__ns) \ 323 287 do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) 324 288 325 289 static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) 326 290 { 327 - VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) && 
!__ns_ref_read(ns)); 328 - if (!__ns_ref_active_read(ns)) 291 + if (!__ns_ref_active_read(ns)) { 292 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 329 293 return NULL; 294 + } 330 295 if (!__ns_ref_get(ns)) 331 296 return NULL; 332 297 return ns; 333 298 } 334 299 335 - void __ns_ref_active_resurrect(struct ns_common *ns); 300 + void __ns_ref_active_get(struct ns_common *ns); 336 301 337 - #define ns_ref_active_resurrect(__ns) \ 338 - do { if (__ns) __ns_ref_active_resurrect(to_ns_common(__ns)); } while (0) 302 + #define ns_ref_active_get(__ns) \ 303 + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) 339 304 340 305 #endif

+35 -17

kernel/nscommon.c

··· 54 54 55 55 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 56 56 { 57 - int ret; 57 + int ret = 0; 58 58 59 59 refcount_set(&ns->__ns_ref, 1); 60 60 ns->stashed = NULL; ··· 74 74 ns_debug(ns, ops); 75 75 #endif 76 76 77 - if (inum) { 77 + if (inum) 78 78 ns->inum = inum; 79 - return 0; 80 - } 81 - ret = proc_alloc_inum(&ns->inum); 79 + else 80 + ret = proc_alloc_inum(&ns->inum); 82 81 if (ret) 83 82 return ret; 84 83 /* ··· 112 113 if (owner == &init_user_ns) 113 114 return NULL; 114 115 return to_ns_common(owner); 115 - } 116 - 117 - void __ns_ref_active_get_owner(struct ns_common *ns) 118 - { 119 - ns = ns_owner(ns); 120 - if (ns) 121 - WARN_ON_ONCE(atomic_add_negative(1, &ns->__ns_ref_active)); 122 116 } 123 117 124 118 /* ··· 164 172 * The iteration stops once we reach a namespace that still has active 165 173 * references. 166 174 */ 167 - void __ns_ref_active_put_owner(struct ns_common *ns) 175 + void __ns_ref_active_put(struct ns_common *ns) 168 176 { 177 + /* Initial namespaces are always active. */ 178 + if (is_ns_init_id(ns)) 179 + return; 180 + 181 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 182 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 183 + return; 184 + } 185 + 186 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 187 + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 188 + 169 189 for (;;) { 170 190 ns = ns_owner(ns); 171 191 if (!ns) 172 192 return; 173 - if (!atomic_dec_and_test(&ns->__ns_ref_active)) 193 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 194 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 195 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 174 196 return; 197 + } 175 198 } 176 199 } 177 200 ··· 282 275 * it also needs to take another reference on its owning user namespace 283 276 * and so on. 
284 277 */ 285 - void __ns_ref_active_resurrect(struct ns_common *ns) 278 + void __ns_ref_active_get(struct ns_common *ns) 286 279 { 280 + int prev; 281 + 282 + /* Initial namespaces are always active. */ 283 + if (is_ns_init_id(ns)) 284 + return; 285 + 287 286 /* If we didn't resurrect the namespace we're done. */ 288 - if (atomic_fetch_add(1, &ns->__ns_ref_active)) 287 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 288 + VFS_WARN_ON_ONCE(prev < 0); 289 + if (likely(prev)) 289 290 return; 290 291 291 292 /* ··· 305 290 if (!ns) 306 291 return; 307 292 308 - if (atomic_fetch_add(1, &ns->__ns_ref_active)) 293 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 294 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 295 + VFS_WARN_ON_ONCE(prev < 0); 296 + if (likely(prev)) 309 297 return; 310 298 } 311 299 }

+25 -19

kernel/nstree.c

··· 173 173 write_sequnlock(&ns_tree_lock); 174 174 175 175 VFS_WARN_ON_ONCE(node); 176 - 177 - /* 178 - * Take an active reference on the owner namespace. This ensures 179 - * that the owner remains visible while any of its child namespaces 180 - * are active. For init namespaces this is a no-op as ns_owner() 181 - * returns NULL for namespaces owned by init_user_ns. 182 - */ 183 - __ns_ref_active_get_owner(ns); 184 176 } 185 177 186 178 void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) ··· 497 505 return false; 498 506 } 499 507 500 - static void __ns_put(struct ns_common *ns) 508 + static inline void ns_put(struct ns_common *ns) 501 509 { 502 - if (ns->ops) 510 + if (ns && ns->ops) 503 511 ns->ops->put(ns); 504 512 } 505 513 506 - DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) __ns_put(_T)) 514 + DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) 507 515 508 516 static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, 509 517 struct ns_common *candidate) ··· 527 535 { 528 536 u64 __user *ns_ids = kls->uns_ids; 529 537 size_t nr_ns_ids = kls->nr_ns_ids; 530 - struct ns_common *ns = NULL, *first_ns = NULL; 538 + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; 531 539 const struct list_head *head; 532 540 ssize_t ret; 533 541 ··· 560 568 561 569 if (!first_ns) 562 570 first_ns = list_entry_rcu(head->next, typeof(*ns), ns_owner_entry); 571 + 563 572 for (ns = first_ns; &ns->ns_owner_entry != head && nr_ns_ids; 564 573 ns = list_entry_rcu(ns->ns_owner_entry.next, typeof(*ns), ns_owner_entry)) { 565 - struct ns_common *valid __free(ns_put); 574 + struct ns_common *valid; 566 575 567 576 valid = legitimize_ns(kls, ns); 568 577 if (!valid) ··· 571 578 572 579 rcu_read_unlock(); 573 580 574 - if (put_user(valid->ns_id, ns_ids + ret)) 575 - return -EINVAL; 581 + ns_put(prev); 582 + prev = valid; 583 + 584 + if (put_user(valid->ns_id, ns_ids + ret)) { 585 + 
ns_put(prev); 586 + return -EFAULT; 587 + } 588 + 576 589 nr_ns_ids--; 577 590 ret++; 578 591 ··· 586 587 } 587 588 588 589 rcu_read_unlock(); 590 + ns_put(prev); 589 591 return ret; 590 592 } 591 593 ··· 668 668 { 669 669 u64 __user *ns_ids = kls->uns_ids; 670 670 size_t nr_ns_ids = kls->nr_ns_ids; 671 - struct ns_common *ns, *first_ns = NULL; 671 + struct ns_common *ns, *first_ns = NULL, *prev = NULL; 672 672 struct ns_tree *ns_tree = NULL; 673 673 const struct list_head *head; 674 674 u32 ns_type; ··· 705 705 706 706 for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; 707 707 ns = next_ns_common(ns, ns_tree)) { 708 - struct ns_common *valid __free(ns_put); 708 + struct ns_common *valid; 709 709 710 710 valid = legitimize_ns(kls, ns); 711 711 if (!valid) ··· 713 713 714 714 rcu_read_unlock(); 715 715 716 - if (put_user(valid->ns_id, ns_ids + ret)) 717 - return -EINVAL; 716 + ns_put(prev); 717 + prev = valid; 718 + 719 + if (put_user(valid->ns_id, ns_ids + ret)) { 720 + ns_put(prev); 721 + return -EFAULT; 722 + } 718 723 719 724 nr_ns_ids--; 720 725 ret++; ··· 728 723 } 729 724 730 725 rcu_read_unlock(); 726 + ns_put(prev); 731 727 return ret; 732 728 } 733 729

+2

tools/testing/selftests/namespaces/.gitignore

··· 4 4 ns_active_ref_test 5 5 listns_test 6 6 listns_permissions_test 7 + listns_efault_test 7 8 siocgskns_test 8 9 cred_change_test 9 10 stress_test 10 11 listns_pagination_bug 12 + regression_pidfd_setns_test

+5 -1

tools/testing/selftests/namespaces/Makefile

··· 8 8 ns_active_ref_test \ 9 9 listns_test \ 10 10 listns_permissions_test \ 11 + listns_efault_test \ 11 12 siocgskns_test \ 12 13 cred_change_test \ 13 14 stress_test \ 14 - listns_pagination_bug 15 + listns_pagination_bug \ 16 + regression_pidfd_setns_test 15 17 16 18 include ../lib.mk 17 19 18 20 $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c 19 21 $(OUTPUT)/listns_test: ../filesystems/utils.c 20 22 $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c 23 + $(OUTPUT)/listns_efault_test: ../filesystems/utils.c 21 24 $(OUTPUT)/siocgskns_test: ../filesystems/utils.c 22 25 $(OUTPUT)/cred_change_test: ../filesystems/utils.c 23 26 $(OUTPUT)/stress_test: ../filesystems/utils.c 24 27 $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c 28 + $(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c 25 29

+530

tools/testing/selftests/namespaces/listns_efault_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <signal.h> 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <linux/nsfs.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/mman.h> 14 + #include <sys/mount.h> 15 + #include <sys/socket.h> 16 + #include <sys/stat.h> 17 + #include <sys/syscall.h> 18 + #include <sys/types.h> 19 + #include <sys/wait.h> 20 + #include <unistd.h> 21 + #include "../kselftest_harness.h" 22 + #include "../filesystems/utils.h" 23 + #include "../pidfd/pidfd.h" 24 + #include "wrappers.h" 25 + 26 + /* 27 + * Test listns() error handling with invalid buffer addresses. 28 + * 29 + * When the buffer pointer is invalid (e.g., crossing page boundaries 30 + * into unmapped memory), listns() fails with EFAULT. 31 + * 32 + * This test also creates mount namespaces that get destroyed during 33 + * iteration, testing that namespace cleanup happens outside the RCU 34 + * read lock. 35 + */ 36 + TEST(listns_partial_fault_with_ns_cleanup) 37 + { 38 + void *map; 39 + __u64 *ns_ids; 40 + ssize_t ret; 41 + long page_size; 42 + pid_t pid, iter_pid; 43 + int pidfds[5]; 44 + int sv[5][2]; 45 + int iter_pidfd; 46 + int i, status; 47 + char c; 48 + 49 + page_size = sysconf(_SC_PAGESIZE); 50 + ASSERT_GT(page_size, 0); 51 + 52 + /* 53 + * Map two pages: 54 + * - First page: readable and writable 55 + * - Second page: will be unmapped to trigger EFAULT 56 + */ 57 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 58 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 59 + ASSERT_NE(map, MAP_FAILED); 60 + 61 + /* Unmap the second page */ 62 + ret = munmap((char *)map + page_size, page_size); 63 + ASSERT_EQ(ret, 0); 64 + 65 + /* 66 + * Position the buffer pointer so there's room for exactly one u64 67 + * before the page boundary. The second u64 would fall into the 68 + * unmapped page. 
69 + */ 70 + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 71 + 72 + /* 73 + * Create a separate process to run listns() in a loop concurrently 74 + * with namespace creation and destruction. 75 + */ 76 + iter_pid = create_child(&iter_pidfd, 0); 77 + ASSERT_NE(iter_pid, -1); 78 + 79 + if (iter_pid == 0) { 80 + struct ns_id_req req = { 81 + .size = sizeof(req), 82 + .spare = 0, 83 + .ns_id = 0, 84 + .ns_type = 0, /* All types */ 85 + .spare2 = 0, 86 + .user_ns_id = 0, /* Global listing */ 87 + }; 88 + int iter_ret; 89 + 90 + /* 91 + * Loop calling listns() until killed. 92 + * The kernel should: 93 + * 1. Successfully write the first namespace ID (within valid page) 94 + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 95 + * 3. Handle concurrent namespace destruction without deadlock 96 + */ 97 + while (1) { 98 + iter_ret = sys_listns(&req, ns_ids, 2, 0); 99 + 100 + if (iter_ret == -1 && errno == ENOSYS) 101 + _exit(PIDFD_SKIP); 102 + } 103 + } 104 + 105 + /* Small delay to let iterator start looping */ 106 + usleep(50000); 107 + 108 + /* 109 + * Create several child processes, each in its own mount namespace. 110 + * These will be destroyed while the iterator is running listns(). 
111 + */ 112 + for (i = 0; i < 5; i++) { 113 + /* Create socketpair for synchronization */ 114 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 115 + 116 + pid = create_child(&pidfds[i], CLONE_NEWNS); 117 + ASSERT_NE(pid, -1); 118 + 119 + if (pid == 0) { 120 + close(sv[i][0]); /* Close parent end */ 121 + 122 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 123 + _exit(1); 124 + 125 + /* Child: create a couple of tmpfs mounts */ 126 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 127 + _exit(1); 128 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 129 + _exit(1); 130 + 131 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 132 + _exit(1); 133 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 134 + _exit(1); 135 + 136 + /* Signal parent that setup is complete */ 137 + if (write_nointr(sv[i][1], "R", 1) != 1) 138 + _exit(1); 139 + 140 + /* Wait for parent to signal us to exit */ 141 + if (read_nointr(sv[i][1], &c, 1) != 1) 142 + _exit(1); 143 + 144 + close(sv[i][1]); 145 + _exit(0); 146 + } 147 + 148 + close(sv[i][1]); /* Close child end */ 149 + } 150 + 151 + /* Wait for all children to finish setup */ 152 + for (i = 0; i < 5; i++) { 153 + ret = read_nointr(sv[i][0], &c, 1); 154 + ASSERT_EQ(ret, 1); 155 + ASSERT_EQ(c, 'R'); 156 + } 157 + 158 + /* 159 + * Signal children to exit. This will destroy their mount namespaces 160 + * while listns() is iterating the namespace tree. 161 + * This tests that cleanup happens outside the RCU read lock. 
162 + */ 163 + for (i = 0; i < 5; i++) 164 + write_nointr(sv[i][0], "X", 1); 165 + 166 + /* Wait for all mount namespace children to exit and cleanup */ 167 + for (i = 0; i < 5; i++) { 168 + waitpid(-1, NULL, 0); 169 + close(sv[i][0]); 170 + close(pidfds[i]); 171 + } 172 + 173 + /* Kill iterator and wait for it */ 174 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 175 + ret = waitpid(iter_pid, &status, 0); 176 + ASSERT_EQ(ret, iter_pid); 177 + close(iter_pidfd); 178 + 179 + /* Should have been killed */ 180 + ASSERT_TRUE(WIFSIGNALED(status)); 181 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 182 + 183 + /* Clean up */ 184 + munmap(map, page_size); 185 + } 186 + 187 + /* 188 + * Test listns() error handling when the entire buffer is invalid. 189 + * This is a sanity check that basic invalid pointer detection works. 190 + */ 191 + TEST(listns_complete_fault) 192 + { 193 + struct ns_id_req req = { 194 + .size = sizeof(req), 195 + .spare = 0, 196 + .ns_id = 0, 197 + .ns_type = 0, 198 + .spare2 = 0, 199 + .user_ns_id = 0, 200 + }; 201 + __u64 *ns_ids; 202 + ssize_t ret; 203 + 204 + /* Use a clearly invalid pointer */ 205 + ns_ids = (__u64 *)0xdeadbeef; 206 + 207 + ret = sys_listns(&req, ns_ids, 10, 0); 208 + 209 + if (ret == -1 && errno == ENOSYS) 210 + SKIP(return, "listns() not supported"); 211 + 212 + /* Should fail with EFAULT */ 213 + ASSERT_EQ(ret, -1); 214 + ASSERT_EQ(errno, EFAULT); 215 + } 216 + 217 + /* 218 + * Test listns() error handling when the buffer is NULL. 
219 + */ 220 + TEST(listns_null_buffer) 221 + { 222 + struct ns_id_req req = { 223 + .size = sizeof(req), 224 + .spare = 0, 225 + .ns_id = 0, 226 + .ns_type = 0, 227 + .spare2 = 0, 228 + .user_ns_id = 0, 229 + }; 230 + ssize_t ret; 231 + 232 + /* NULL buffer with non-zero count should fail */ 233 + ret = sys_listns(&req, NULL, 10, 0); 234 + 235 + if (ret == -1 && errno == ENOSYS) 236 + SKIP(return, "listns() not supported"); 237 + 238 + /* Should fail with EFAULT */ 239 + ASSERT_EQ(ret, -1); 240 + ASSERT_EQ(errno, EFAULT); 241 + } 242 + 243 + /* 244 + * Test listns() with a buffer that becomes invalid mid-iteration 245 + * (after several successful writes), combined with mount namespace 246 + * destruction to test RCU cleanup logic. 247 + */ 248 + TEST(listns_late_fault_with_ns_cleanup) 249 + { 250 + void *map; 251 + __u64 *ns_ids; 252 + ssize_t ret; 253 + long page_size; 254 + pid_t pid, iter_pid; 255 + int pidfds[10]; 256 + int sv[10][2]; 257 + int iter_pidfd; 258 + int i, status; 259 + char c; 260 + 261 + page_size = sysconf(_SC_PAGESIZE); 262 + ASSERT_GT(page_size, 0); 263 + 264 + /* Map two pages */ 265 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 266 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 267 + ASSERT_NE(map, MAP_FAILED); 268 + 269 + /* Unmap the second page */ 270 + ret = munmap((char *)map + page_size, page_size); 271 + ASSERT_EQ(ret, 0); 272 + 273 + /* 274 + * Position buffer so we can write several u64s successfully 275 + * before hitting the page boundary. 276 + */ 277 + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 278 + 279 + /* 280 + * Create a separate process to run listns() concurrently. 
281 + */ 282 + iter_pid = create_child(&iter_pidfd, 0); 283 + ASSERT_NE(iter_pid, -1); 284 + 285 + if (iter_pid == 0) { 286 + struct ns_id_req req = { 287 + .size = sizeof(req), 288 + .spare = 0, 289 + .ns_id = 0, 290 + .ns_type = 0, 291 + .spare2 = 0, 292 + .user_ns_id = 0, 293 + }; 294 + int iter_ret; 295 + 296 + /* 297 + * Loop calling listns() until killed. 298 + * Request 10 namespace IDs while namespaces are being destroyed. 299 + * This tests: 300 + * 1. EFAULT handling when buffer becomes invalid 301 + * 2. Namespace cleanup outside RCU read lock during iteration 302 + */ 303 + while (1) { 304 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 305 + 306 + if (iter_ret == -1 && errno == ENOSYS) 307 + _exit(PIDFD_SKIP); 308 + } 309 + } 310 + 311 + /* Small delay to let iterator start looping */ 312 + usleep(50000); 313 + 314 + /* 315 + * Create more children with mount namespaces to increase the 316 + * likelihood that namespace cleanup happens during iteration. 317 + */ 318 + for (i = 0; i < 10; i++) { 319 + /* Create socketpair for synchronization */ 320 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 321 + 322 + pid = create_child(&pidfds[i], CLONE_NEWNS); 323 + ASSERT_NE(pid, -1); 324 + 325 + if (pid == 0) { 326 + close(sv[i][0]); /* Close parent end */ 327 + 328 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 329 + _exit(1); 330 + 331 + /* Child: create tmpfs mounts */ 332 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 333 + _exit(1); 334 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 335 + _exit(1); 336 + 337 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 338 + _exit(1); 339 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 340 + _exit(1); 341 + 342 + /* Signal parent that setup is complete */ 343 + if (write_nointr(sv[i][1], "R", 1) != 1) 344 + _exit(1); 345 + 346 + /* Wait for parent to signal us to exit */ 347 + if (read_nointr(sv[i][1], &c, 1) != 1) 348 + _exit(1); 349 + 350 + 
close(sv[i][1]); 351 + _exit(0); 352 + } 353 + 354 + close(sv[i][1]); /* Close child end */ 355 + } 356 + 357 + /* Wait for all children to finish setup */ 358 + for (i = 0; i < 10; i++) { 359 + ret = read_nointr(sv[i][0], &c, 1); 360 + ASSERT_EQ(ret, 1); 361 + ASSERT_EQ(c, 'R'); 362 + } 363 + 364 + /* Kill half the children */ 365 + for (i = 0; i < 5; i++) 366 + write_nointr(sv[i][0], "X", 1); 367 + 368 + /* Small delay to let some exit */ 369 + usleep(10000); 370 + 371 + /* Kill remaining children */ 372 + for (i = 5; i < 10; i++) 373 + write_nointr(sv[i][0], "X", 1); 374 + 375 + /* Wait for all children and cleanup */ 376 + for (i = 0; i < 10; i++) { 377 + waitpid(-1, NULL, 0); 378 + close(sv[i][0]); 379 + close(pidfds[i]); 380 + } 381 + 382 + /* Kill iterator and wait for it */ 383 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 384 + ret = waitpid(iter_pid, &status, 0); 385 + ASSERT_EQ(ret, iter_pid); 386 + close(iter_pidfd); 387 + 388 + /* Should have been killed */ 389 + ASSERT_TRUE(WIFSIGNALED(status)); 390 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 391 + 392 + /* Clean up */ 393 + munmap(map, page_size); 394 + } 395 + 396 + /* 397 + * Test specifically focused on mount namespace cleanup during EFAULT. 398 + * Filter for mount namespaces only. 
399 + */ 400 + TEST(listns_mnt_ns_cleanup_on_fault) 401 + { 402 + void *map; 403 + __u64 *ns_ids; 404 + ssize_t ret; 405 + long page_size; 406 + pid_t pid, iter_pid; 407 + int pidfds[8]; 408 + int sv[8][2]; 409 + int iter_pidfd; 410 + int i, status; 411 + char c; 412 + 413 + page_size = sysconf(_SC_PAGESIZE); 414 + ASSERT_GT(page_size, 0); 415 + 416 + /* Set up partial fault buffer */ 417 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 418 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 419 + ASSERT_NE(map, MAP_FAILED); 420 + 421 + ret = munmap((char *)map + page_size, page_size); 422 + ASSERT_EQ(ret, 0); 423 + 424 + /* Position for 3 successful writes, then fault */ 425 + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 426 + 427 + /* 428 + * Create a separate process to run listns() concurrently. 429 + */ 430 + iter_pid = create_child(&iter_pidfd, 0); 431 + ASSERT_NE(iter_pid, -1); 432 + 433 + if (iter_pid == 0) { 434 + struct ns_id_req req = { 435 + .size = sizeof(req), 436 + .spare = 0, 437 + .ns_id = 0, 438 + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 439 + .spare2 = 0, 440 + .user_ns_id = 0, 441 + }; 442 + int iter_ret; 443 + 444 + /* 445 + * Loop calling listns() until killed. 446 + * Call listns() to race with namespace destruction. 
447 + */ 448 + while (1) { 449 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 450 + 451 + if (iter_ret == -1 && errno == ENOSYS) 452 + _exit(PIDFD_SKIP); 453 + } 454 + } 455 + 456 + /* Small delay to let iterator start looping */ 457 + usleep(50000); 458 + 459 + /* Create children with mount namespaces */ 460 + for (i = 0; i < 8; i++) { 461 + /* Create socketpair for synchronization */ 462 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 463 + 464 + pid = create_child(&pidfds[i], CLONE_NEWNS); 465 + ASSERT_NE(pid, -1); 466 + 467 + if (pid == 0) { 468 + close(sv[i][0]); /* Close parent end */ 469 + 470 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 471 + _exit(1); 472 + 473 + /* Do some mount operations to make cleanup more interesting */ 474 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 475 + _exit(1); 476 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 477 + _exit(1); 478 + 479 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 480 + _exit(1); 481 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 482 + _exit(1); 483 + 484 + /* Signal parent that setup is complete */ 485 + if (write_nointr(sv[i][1], "R", 1) != 1) 486 + _exit(1); 487 + 488 + /* Wait for parent to signal us to exit */ 489 + if (read_nointr(sv[i][1], &c, 1) != 1) 490 + _exit(1); 491 + 492 + close(sv[i][1]); 493 + _exit(0); 494 + } 495 + 496 + close(sv[i][1]); /* Close child end */ 497 + } 498 + 499 + /* Wait for all children to finish setup */ 500 + for (i = 0; i < 8; i++) { 501 + ret = read_nointr(sv[i][0], &c, 1); 502 + ASSERT_EQ(ret, 1); 503 + ASSERT_EQ(c, 'R'); 504 + } 505 + 506 + /* Kill children to trigger namespace destruction during iteration */ 507 + for (i = 0; i < 8; i++) 508 + write_nointr(sv[i][0], "X", 1); 509 + 510 + /* Wait for children and cleanup */ 511 + for (i = 0; i < 8; i++) { 512 + waitpid(-1, NULL, 0); 513 + close(sv[i][0]); 514 + close(pidfds[i]); 515 + } 516 + 517 + /* Kill iterator and wait for it */ 
518 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 519 + ret = waitpid(iter_pid, &status, 0); 520 + ASSERT_EQ(ret, iter_pid); 521 + close(iter_pidfd); 522 + 523 + /* Should have been killed */ 524 + ASSERT_TRUE(WIFSIGNALED(status)); 525 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 526 + 527 + munmap(map, page_size); 528 + } 529 + 530 + TEST_HARNESS_MAIN

+113

tools/testing/selftests/namespaces/regression_pidfd_setns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <sched.h> 5 + #include <signal.h> 6 + #include <stdio.h> 7 + #include <stdlib.h> 8 + #include <string.h> 9 + #include <sys/socket.h> 10 + #include <unistd.h> 11 + #include "../pidfd/pidfd.h" 12 + #include "../kselftest_harness.h" 13 + 14 + /* 15 + * Regression tests for the setns(pidfd) active reference counting bug. 16 + * 17 + * These tests are based on the reproducers that triggered the race condition 18 + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). 19 + * 20 + * The bug: When using setns() with a pidfd, if the target task exits between 21 + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. 22 + * Then ns_ref_active_get() would increment from 0 without properly resurrecting 23 + * the owner chain, causing active reference count underflows. 24 + */ 25 + 26 + /* 27 + * Simple pidfd setns test using create_child()+unshare(). 28 + * 29 + * Without the fix, this would trigger active refcount warnings when the 30 + * parent exits after doing setns(pidfd) on a child that has already exited. 
31 + */ 32 + TEST(simple_pidfd_setns) 33 + { 34 + pid_t child_pid; 35 + int pidfd = -1; 36 + int ret; 37 + int sv[2]; 38 + char c; 39 + 40 + /* Ignore SIGCHLD for autoreap */ 41 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 42 + 43 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 44 + 45 + /* Create a child process without namespaces initially */ 46 + child_pid = create_child(&pidfd, 0); 47 + ASSERT_GE(child_pid, 0); 48 + 49 + if (child_pid == 0) { 50 + close(sv[0]); 51 + 52 + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { 53 + close(sv[1]); 54 + _exit(1); 55 + } 56 + 57 + /* Signal parent that namespaces are ready */ 58 + if (write_nointr(sv[1], "1", 1) < 0) { 59 + close(sv[1]); 60 + _exit(1); 61 + } 62 + 63 + close(sv[1]); 64 + _exit(0); 65 + } 66 + ASSERT_GE(pidfd, 0); 67 + EXPECT_EQ(close(sv[1]), 0); 68 + 69 + ret = read_nointr(sv[0], &c, 1); 70 + ASSERT_EQ(ret, 1); 71 + EXPECT_EQ(close(sv[0]), 0); 72 + 73 + /* Set to child's namespaces via pidfd */ 74 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 75 + TH_LOG("setns() returned %d", ret); 76 + close(pidfd); 77 + } 78 + 79 + /* 80 + * Simple pidfd setns test using create_child(). 81 + * 82 + * This variation uses create_child() with namespace flags directly. 83 + * Namespaces are created immediately at clone time. 
84 + */ 85 + TEST(simple_pidfd_setns_clone) 86 + { 87 + pid_t child_pid; 88 + int pidfd = -1; 89 + int ret; 90 + 91 + /* Ignore SIGCHLD for autoreap */ 92 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 93 + 94 + /* Create a child process with new namespaces using create_child() */ 95 + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); 96 + ASSERT_GE(child_pid, 0); 97 + 98 + if (child_pid == 0) { 99 + /* Child: sleep for a while so parent can setns to us */ 100 + sleep(2); 101 + _exit(0); 102 + } 103 + 104 + /* Parent: pidfd was already created by create_child() */ 105 + ASSERT_GE(pidfd, 0); 106 + 107 + /* Set to child's namespaces via pidfd */ 108 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 109 + close(pidfd); 110 + TH_LOG("setns() returned %d", ret); 111 + } 112 + 113 + TEST_HARNESS_MAIN

Configure Feed

Configure Feed