Merge tag 'namespace-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

+1

arch/alpha/kernel/syscalls/syscall.tbl

··· 509 509 577 common open_tree_attr sys_open_tree_attr 510 510 578 common file_getattr sys_file_getattr 511 511 579 common file_setattr sys_file_setattr 512 + 580 common listns sys_listns

+1

arch/arm/tools/syscall.tbl

··· 484 484 467 common open_tree_attr sys_open_tree_attr 485 485 468 common file_getattr sys_file_getattr 486 486 469 common file_setattr sys_file_setattr 487 + 470 common listns sys_listns

+1

arch/arm64/tools/syscall_32.tbl

··· 481 481 467 common open_tree_attr sys_open_tree_attr 482 482 468 common file_getattr sys_file_getattr 483 483 469 common file_setattr sys_file_setattr 484 + 470 common listns sys_listns

+1

arch/m68k/kernel/syscalls/syscall.tbl

··· 469 469 467 common open_tree_attr sys_open_tree_attr 470 470 468 common file_getattr sys_file_getattr 471 471 469 common file_setattr sys_file_setattr 472 + 470 common listns sys_listns

+1

arch/microblaze/kernel/syscalls/syscall.tbl

··· 475 475 467 common open_tree_attr sys_open_tree_attr 476 476 468 common file_getattr sys_file_getattr 477 477 469 common file_setattr sys_file_setattr 478 + 470 common listns sys_listns

+1

arch/mips/kernel/syscalls/syscall_n32.tbl

··· 408 408 467 n32 open_tree_attr sys_open_tree_attr 409 409 468 n32 file_getattr sys_file_getattr 410 410 469 n32 file_setattr sys_file_setattr 411 + 470 n32 listns sys_listns

+1

arch/mips/kernel/syscalls/syscall_n64.tbl

··· 384 384 467 n64 open_tree_attr sys_open_tree_attr 385 385 468 n64 file_getattr sys_file_getattr 386 386 469 n64 file_setattr sys_file_setattr 387 + 470 n64 listns sys_listns

+1

arch/mips/kernel/syscalls/syscall_o32.tbl

··· 457 457 467 o32 open_tree_attr sys_open_tree_attr 458 458 468 o32 file_getattr sys_file_getattr 459 459 469 o32 file_setattr sys_file_setattr 460 + 470 o32 listns sys_listns

+1

arch/parisc/kernel/syscalls/syscall.tbl

··· 468 468 467 common open_tree_attr sys_open_tree_attr 469 469 468 common file_getattr sys_file_getattr 470 470 469 common file_setattr sys_file_setattr 471 + 470 common listns sys_listns

+1

arch/powerpc/kernel/syscalls/syscall.tbl

··· 560 560 467 common open_tree_attr sys_open_tree_attr 561 561 468 common file_getattr sys_file_getattr 562 562 469 common file_setattr sys_file_setattr 563 + 470 common listns sys_listns

+1

arch/s390/kernel/syscalls/syscall.tbl

··· 472 472 467 common open_tree_attr sys_open_tree_attr sys_open_tree_attr 473 473 468 common file_getattr sys_file_getattr sys_file_getattr 474 474 469 common file_setattr sys_file_setattr sys_file_setattr 475 + 470 common listns sys_listns sys_listns

+1

arch/sh/kernel/syscalls/syscall.tbl

··· 473 473 467 common open_tree_attr sys_open_tree_attr 474 474 468 common file_getattr sys_file_getattr 475 475 469 common file_setattr sys_file_setattr 476 + 470 common listns sys_listns

+1

arch/sparc/kernel/syscalls/syscall.tbl

··· 515 515 467 common open_tree_attr sys_open_tree_attr 516 516 468 common file_getattr sys_file_getattr 517 517 469 common file_setattr sys_file_setattr 518 + 470 common listns sys_listns

+1

arch/x86/entry/syscalls/syscall_32.tbl

··· 475 475 467 i386 open_tree_attr sys_open_tree_attr 476 476 468 i386 file_getattr sys_file_getattr 477 477 469 i386 file_setattr sys_file_setattr 478 + 470 i386 listns sys_listns

+1

arch/x86/entry/syscalls/syscall_64.tbl

··· 394 394 467 common open_tree_attr sys_open_tree_attr 395 395 468 common file_getattr sys_file_getattr 396 396 469 common file_setattr sys_file_setattr 397 + 470 common listns sys_listns 397 398 398 399 # 399 400 # Due to a historical design error, certain syscalls are numbered differently

+1

arch/xtensa/kernel/syscalls/syscall.tbl

··· 440 440 467 common open_tree_attr sys_open_tree_attr 441 441 468 common file_getattr sys_file_getattr 442 442 469 common file_setattr sys_file_setattr 443 + 470 common listns sys_listns

+1

fs/libfs.c

··· 680 680 s->s_export_op = ctx->eops; 681 681 s->s_xattr = ctx->xattr; 682 682 s->s_time_gran = 1; 683 + s->s_d_flags |= ctx->s_d_flags; 683 684 root = new_inode(s); 684 685 if (!root) 685 686 return -ENOMEM;

+2 -1

fs/mount.h

··· 27 27 unsigned int nr_mounts; /* # of mounts in the namespace */ 28 28 unsigned int pending_mounts; 29 29 refcount_t passive; /* number references not pinning @mounts */ 30 + bool is_anon; 30 31 } __randomize_layout; 31 32 32 33 struct mnt_pcp { ··· 176 175 177 176 static inline bool is_anon_ns(struct mnt_namespace *ns) 178 177 { 179 - return ns->ns.ns_id == 0; 178 + return ns->is_anon; 180 179 } 181 180 182 181 static inline bool anon_ns_root(const struct mount *m)

+4 -6

fs/namespace.c

··· 4090 4090 dec_mnt_namespaces(ucounts); 4091 4091 return ERR_PTR(ret); 4092 4092 } 4093 - if (!anon) 4094 - ns_tree_gen_id(&new_ns->ns); 4093 + ns_tree_gen_id(new_ns); 4094 + 4095 + new_ns->is_anon = anon; 4095 4096 refcount_set(&new_ns->passive, 1); 4096 4097 new_ns->mounts = RB_ROOT; 4097 4098 init_waitqueue_head(&new_ns->poll); ··· 5983 5982 } 5984 5983 5985 5984 struct mnt_namespace init_mnt_ns = { 5986 - .ns.inum = ns_init_inum(&init_mnt_ns), 5987 - .ns.ops = &mntns_operations, 5985 + .ns = NS_COMMON_INIT(init_mnt_ns), 5988 5986 .user_ns = &init_user_ns, 5989 - .ns.__ns_ref = REFCOUNT_INIT(1), 5990 - .ns.ns_type = ns_common_type(&init_mnt_ns), 5991 5987 .passive = REFCOUNT_INIT(1), 5992 5988 .mounts = RB_ROOT, 5993 5989 .poll = __WAIT_QUEUE_HEAD_INITIALIZER(init_mnt_ns.poll),

+98 -3

fs/nsfs.c

··· 58 58 static void nsfs_evict(struct inode *inode) 59 59 { 60 60 struct ns_common *ns = inode->i_private; 61 + 62 + __ns_ref_active_put(ns); 61 63 clear_inode(inode); 62 64 ns->ops->put(ns); 63 65 } ··· 410 408 .statfs = simple_statfs, 411 409 .evict_inode = nsfs_evict, 412 410 .show_path = nsfs_show_path, 411 + .drop_inode = inode_just_drop, 413 412 }; 414 413 415 414 static int nsfs_init_inode(struct inode *inode, void *data) ··· 421 418 inode->i_mode |= S_IRUGO; 422 419 inode->i_fop = &ns_file_operations; 423 420 inode->i_ino = ns->inum; 421 + 422 + /* 423 + * Bring the namespace subtree back to life if we have to. This 424 + * can happen when e.g., all processes using a network namespace 425 + * and all namespace files or namespace file bind-mounts have 426 + * died but there are still sockets pinning it. The SIOCGSKNS 427 + * ioctl on such a socket will resurrect the relevant namespace 428 + * subtree. 429 + */ 430 + __ns_ref_active_get(ns); 424 431 return 0; 425 432 } 426 433 ··· 471 458 return FILEID_NSFS; 472 459 } 473 460 461 + bool is_current_namespace(struct ns_common *ns) 462 + { 463 + switch (ns->ns_type) { 464 + #ifdef CONFIG_CGROUPS 465 + case CLONE_NEWCGROUP: 466 + return current_in_namespace(to_cg_ns(ns)); 467 + #endif 468 + #ifdef CONFIG_IPC_NS 469 + case CLONE_NEWIPC: 470 + return current_in_namespace(to_ipc_ns(ns)); 471 + #endif 472 + case CLONE_NEWNS: 473 + return current_in_namespace(to_mnt_ns(ns)); 474 + #ifdef CONFIG_NET_NS 475 + case CLONE_NEWNET: 476 + return current_in_namespace(to_net_ns(ns)); 477 + #endif 478 + #ifdef CONFIG_PID_NS 479 + case CLONE_NEWPID: 480 + return current_in_namespace(to_pid_ns(ns)); 481 + #endif 482 + #ifdef CONFIG_TIME_NS 483 + case CLONE_NEWTIME: 484 + return current_in_namespace(to_time_ns(ns)); 485 + #endif 486 + #ifdef CONFIG_USER_NS 487 + case CLONE_NEWUSER: 488 + return current_in_namespace(to_user_ns(ns)); 489 + #endif 490 + #ifdef CONFIG_UTS_NS 491 + case CLONE_NEWUTS: 492 + return 
current_in_namespace(to_uts_ns(ns)); 493 + #endif 494 + default: 495 + VFS_WARN_ON_ONCE(true); 496 + return false; 497 + } 498 + } 499 + 474 500 static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, 475 501 int fh_len, int fh_type) 476 502 { ··· 535 483 return NULL; 536 484 } 537 485 486 + if (!fid->ns_id) 487 + return NULL; 488 + /* Either both are set or both are unset. */ 489 + if (!fid->ns_inum != !fid->ns_type) 490 + return NULL; 491 + 538 492 scoped_guard(rcu) { 539 493 ns = ns_tree_lookup_rcu(fid->ns_id, fid->ns_type); 540 494 if (!ns) 541 495 return NULL; 542 496 543 497 VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); 544 - VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); 545 498 546 - if (ns->inum != fid->ns_inum) 499 + if (fid->ns_inum && (fid->ns_inum != ns->inum)) 500 + return NULL; 501 + if (fid->ns_type && (fid->ns_type != ns->ns_type)) 502 + return NULL; 503 504 - if (!__ns_ref_get(ns)) 504 + /* 505 + * This is racy because we're not actually taking an 506 + * active reference. IOW, it could happen that the 507 + * namespace becomes inactive after this check. 508 + * We don't care because nsfs_init_inode() will just 509 + * resurrect the relevant namespace tree for us. If it 510 + * has been active here we just allow its resurrection. 511 + * We could try to take an active reference here and 512 + * then drop it again. But really, why bother. 
513 + */ 514 + if (!ns_get_unless_inactive(ns)) 550 515 return NULL; 551 516 } 552 517 ··· 659 590 struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC); 660 591 if (!ctx) 661 592 return -ENOMEM; 593 + fc->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; 594 + ctx->s_d_flags |= DCACHE_DONTCACHE; 662 595 ctx->ops = &nsfs_ops; 663 596 ctx->eops = &nsfs_export_operations; 664 597 ctx->dops = &ns_dentry_operations; ··· 682 611 nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; 683 612 nsfs_root_path.mnt = nsfs_mnt; 684 613 nsfs_root_path.dentry = nsfs_mnt->mnt_root; 614 + } 615 + 616 + void nsproxy_ns_active_get(struct nsproxy *ns) 617 + { 618 + ns_ref_active_get(ns->mnt_ns); 619 + ns_ref_active_get(ns->uts_ns); 620 + ns_ref_active_get(ns->ipc_ns); 621 + ns_ref_active_get(ns->pid_ns_for_children); 622 + ns_ref_active_get(ns->cgroup_ns); 623 + ns_ref_active_get(ns->net_ns); 624 + ns_ref_active_get(ns->time_ns); 625 + ns_ref_active_get(ns->time_ns_for_children); 626 + } 627 + 628 + void nsproxy_ns_active_put(struct nsproxy *ns) 629 + { 630 + ns_ref_active_put(ns->mnt_ns); 631 + ns_ref_active_put(ns->uts_ns); 632 + ns_ref_active_put(ns->ipc_ns); 633 + ns_ref_active_put(ns->pid_ns_for_children); 634 + ns_ref_active_put(ns->cgroup_ns); 635 + ns_ref_active_put(ns->net_ns); 636 + ns_ref_active_put(ns->time_ns); 637 + ns_ref_active_put(ns->time_ns_for_children); 685 638 }

+37 -39

fs/pidfs.c

··· 454 454 struct task_struct *task __free(put_task) = NULL; 455 455 struct nsproxy *nsp __free(put_nsproxy) = NULL; 456 456 struct ns_common *ns_common = NULL; 457 - struct pid_namespace *pid_ns; 458 457 459 458 if (!pidfs_ioctl_valid(cmd)) 460 459 return -ENOIOCTLCMD; ··· 495 496 switch (cmd) { 496 497 /* Namespaces that hang of nsproxy. */ 497 498 case PIDFD_GET_CGROUP_NAMESPACE: 498 - if (IS_ENABLED(CONFIG_CGROUPS)) { 499 - get_cgroup_ns(nsp->cgroup_ns); 500 - ns_common = to_ns_common(nsp->cgroup_ns); 501 - } 499 + if (!ns_ref_get(nsp->cgroup_ns)) 500 + break; 501 + ns_common = to_ns_common(nsp->cgroup_ns); 502 502 break; 503 503 case PIDFD_GET_IPC_NAMESPACE: 504 - if (IS_ENABLED(CONFIG_IPC_NS)) { 505 - get_ipc_ns(nsp->ipc_ns); 506 - ns_common = to_ns_common(nsp->ipc_ns); 507 - } 504 + if (!ns_ref_get(nsp->ipc_ns)) 505 + break; 506 + ns_common = to_ns_common(nsp->ipc_ns); 508 507 break; 509 508 case PIDFD_GET_MNT_NAMESPACE: 510 - get_mnt_ns(nsp->mnt_ns); 509 + if (!ns_ref_get(nsp->mnt_ns)) 510 + break; 511 511 ns_common = to_ns_common(nsp->mnt_ns); 512 512 break; 513 513 case PIDFD_GET_NET_NAMESPACE: 514 - if (IS_ENABLED(CONFIG_NET_NS)) { 515 - ns_common = to_ns_common(nsp->net_ns); 516 - get_net_ns(ns_common); 517 - } 514 + if (!ns_ref_get(nsp->net_ns)) 515 + break; 516 + ns_common = to_ns_common(nsp->net_ns); 518 517 break; 519 518 case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: 520 - if (IS_ENABLED(CONFIG_PID_NS)) { 521 - get_pid_ns(nsp->pid_ns_for_children); 522 - ns_common = to_ns_common(nsp->pid_ns_for_children); 523 - } 519 + if (!ns_ref_get(nsp->pid_ns_for_children)) 520 + break; 521 + ns_common = to_ns_common(nsp->pid_ns_for_children); 524 522 break; 525 523 case PIDFD_GET_TIME_NAMESPACE: 526 - if (IS_ENABLED(CONFIG_TIME_NS)) { 527 - get_time_ns(nsp->time_ns); 528 - ns_common = to_ns_common(nsp->time_ns); 529 - } 524 + if (!ns_ref_get(nsp->time_ns)) 525 + break; 526 + ns_common = to_ns_common(nsp->time_ns); 530 527 break; 531 528 case 
PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: 532 - if (IS_ENABLED(CONFIG_TIME_NS)) { 533 - get_time_ns(nsp->time_ns_for_children); 534 - ns_common = to_ns_common(nsp->time_ns_for_children); 535 - } 529 + if (!ns_ref_get(nsp->time_ns_for_children)) 530 + break; 531 + ns_common = to_ns_common(nsp->time_ns_for_children); 536 532 break; 537 533 case PIDFD_GET_UTS_NAMESPACE: 538 - if (IS_ENABLED(CONFIG_UTS_NS)) { 539 - get_uts_ns(nsp->uts_ns); 540 - ns_common = to_ns_common(nsp->uts_ns); 541 - } 534 + if (!ns_ref_get(nsp->uts_ns)) 535 + break; 536 + ns_common = to_ns_common(nsp->uts_ns); 542 537 break; 543 538 /* Namespaces that don't hang of nsproxy. */ 544 539 case PIDFD_GET_USER_NAMESPACE: 545 - if (IS_ENABLED(CONFIG_USER_NS)) { 546 - rcu_read_lock(); 547 - ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); 548 - rcu_read_unlock(); 540 + scoped_guard(rcu) { 541 + struct user_namespace *user_ns; 542 + 543 + user_ns = task_cred_xxx(task, user_ns); 544 + if (!ns_ref_get(user_ns)) 545 + break; 546 + ns_common = to_ns_common(user_ns); 549 547 } 550 548 break; 551 549 case PIDFD_GET_PID_NAMESPACE: 552 - if (IS_ENABLED(CONFIG_PID_NS)) { 553 - rcu_read_lock(); 550 + scoped_guard(rcu) { 551 + struct pid_namespace *pid_ns; 552 + 554 553 pid_ns = task_active_pid_ns(task); 555 - if (pid_ns) 556 - ns_common = to_ns_common(get_pid_ns(pid_ns)); 557 - rcu_read_unlock(); 554 + if (!ns_ref_get(pid_ns)) 555 + break; 556 + ns_common = to_ns_common(pid_ns); 558 557 } 559 558 break; 560 559 default: ··· 1019 1022 1020 1023 fc->s_iflags |= SB_I_NOEXEC; 1021 1024 fc->s_iflags |= SB_I_NODEV; 1025 + ctx->s_d_flags |= DCACHE_DONTCACHE; 1022 1026 ctx->ops = &pidfs_sops; 1023 1027 ctx->eops = &pidfs_export_operations; 1024 1028 ctx->dops = &pidfs_dentry_operations;

+196

include/linux/ns/ns_common_types.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_NS_COMMON_TYPES_H 3 + #define _LINUX_NS_COMMON_TYPES_H 4 + 5 + #include <linux/atomic.h> 6 + #include <linux/ns/nstree_types.h> 7 + #include <linux/rbtree.h> 8 + #include <linux/refcount.h> 9 + #include <linux/types.h> 10 + 11 + struct cgroup_namespace; 12 + struct dentry; 13 + struct ipc_namespace; 14 + struct mnt_namespace; 15 + struct net; 16 + struct pid_namespace; 17 + struct proc_ns_operations; 18 + struct time_namespace; 19 + struct user_namespace; 20 + struct uts_namespace; 21 + 22 + extern struct cgroup_namespace init_cgroup_ns; 23 + extern struct ipc_namespace init_ipc_ns; 24 + extern struct mnt_namespace init_mnt_ns; 25 + extern struct net init_net; 26 + extern struct pid_namespace init_pid_ns; 27 + extern struct time_namespace init_time_ns; 28 + extern struct user_namespace init_user_ns; 29 + extern struct uts_namespace init_uts_ns; 30 + 31 + extern const struct proc_ns_operations cgroupns_operations; 32 + extern const struct proc_ns_operations ipcns_operations; 33 + extern const struct proc_ns_operations mntns_operations; 34 + extern const struct proc_ns_operations netns_operations; 35 + extern const struct proc_ns_operations pidns_operations; 36 + extern const struct proc_ns_operations pidns_for_children_operations; 37 + extern const struct proc_ns_operations timens_operations; 38 + extern const struct proc_ns_operations timens_for_children_operations; 39 + extern const struct proc_ns_operations userns_operations; 40 + extern const struct proc_ns_operations utsns_operations; 41 + 42 + /* 43 + * Namespace lifetimes are managed via a two-tier reference counting model: 44 + * 45 + * (1) __ns_ref (refcount_t): Main reference count tracking memory 46 + * lifetime. Controls when the namespace structure itself is freed. 47 + * It also pins the namespace on the namespace trees whereas (2) 48 + * only regulates their visibility to userspace. 
49 + * 50 + * (2) __ns_ref_active (atomic_t): Reference count tracking active users. 51 + * Controls visibility of the namespace in the namespace trees. 52 + * Any live task that uses the namespace (via nsproxy or cred) holds 53 + * an active reference. Any open file descriptor or bind-mount of 54 + * the namespace holds an active reference. Once all tasks have 55 + * exited their namespaces and all file descriptors and 56 + * bind-mounts have been released the active reference count drops 57 + * to zero and the namespace becomes inactive. IOW, the namespace 58 + * cannot be listed or opened via file handles anymore. 59 + * 60 + * Note that it is valid to transition from active to inactive and 61 + * back from inactive to active e.g., when resurrecting an inactive 62 + * namespace tree via the SIOCGSKNS ioctl(). 63 + * 64 + * Relationship and lifecycle states: 65 + * 66 + * - Active (__ns_ref_active > 0): 67 + * Namespace is actively used and visible to userspace. The namespace 68 + * can be reopened via /proc/<pid>/ns/<ns_type>, via namespace file 69 + * handles, or discovered via listns(). 70 + * 71 + * - Inactive (__ns_ref_active == 0, __ns_ref > 0): 72 + * No tasks are actively using the namespace and it isn't pinned by 73 + * any bind-mounts or open file descriptors anymore. But the namespace 74 + * is still kept alive by internal references. For example, the user 75 + * namespace could be pinned by an open file through file->f_cred 76 + * references when one of the now defunct tasks had opened a file and 77 + * handed the file descriptor off to another process via a UNIX 78 + * socket. Such references keep the namespace structure alive through 79 + * __ns_ref but will not hold an active reference. 80 + * 81 + * - Destroyed (__ns_ref == 0): 82 + * No references remain. The namespace is removed from the tree and freed. 
83 + * 84 + * State transitions: 85 + * 86 + * Active -> Inactive: 87 + * When the last task using the namespace exits it drops its active 88 + * references to all namespaces. However, user and pid namespaces 89 + * remain accessible until the task has been reaped. 90 + * 91 + * Inactive -> Active: 92 + * An inactive namespace tree might be resurrected due to e.g., the 93 + * SIOCGSKNS ioctl() on a socket. 94 + * 95 + * Inactive -> Destroyed: 96 + * When __ns_ref drops to zero the namespace is removed from the 97 + * namespace trees and the memory is freed (after RCU grace period). 98 + * 99 + * Initial namespaces: 100 + * Boot-time namespaces (init_net, init_pid_ns, etc.) start with 101 + * __ns_ref_active = 1 and remain active forever. 102 + * 103 + * @ns_type: type of namespace (e.g., CLONE_NEWNET) 104 + * @stashed: cached dentry to be used by the vfs 105 + * @ops: namespace operations 106 + * @inum: namespace inode number (quickly recycled for non-initial namespaces) 107 + * @__ns_ref: main reference count (do not use directly) 108 + * @ns_tree: namespace tree nodes and active reference count 109 + */ 110 + struct ns_common { 111 + u32 ns_type; 112 + struct dentry *stashed; 113 + const struct proc_ns_operations *ops; 114 + unsigned int inum; 115 + refcount_t __ns_ref; /* do not use directly */ 116 + union { 117 + struct ns_tree; 118 + struct rcu_head ns_rcu; 119 + }; 120 + }; 121 + 122 + #define to_ns_common(__ns) \ 123 + _Generic((__ns), \ 124 + struct cgroup_namespace *: &(__ns)->ns, \ 125 + const struct cgroup_namespace *: &(__ns)->ns, \ 126 + struct ipc_namespace *: &(__ns)->ns, \ 127 + const struct ipc_namespace *: &(__ns)->ns, \ 128 + struct mnt_namespace *: &(__ns)->ns, \ 129 + const struct mnt_namespace *: &(__ns)->ns, \ 130 + struct net *: &(__ns)->ns, \ 131 + const struct net *: &(__ns)->ns, \ 132 + struct pid_namespace *: &(__ns)->ns, \ 133 + const struct pid_namespace *: &(__ns)->ns, \ 134 + struct time_namespace *: &(__ns)->ns, \ 135 + const 
struct time_namespace *: &(__ns)->ns, \ 136 + struct user_namespace *: &(__ns)->ns, \ 137 + const struct user_namespace *: &(__ns)->ns, \ 138 + struct uts_namespace *: &(__ns)->ns, \ 139 + const struct uts_namespace *: &(__ns)->ns) 140 + 141 + #define ns_init_inum(__ns) \ 142 + _Generic((__ns), \ 143 + struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ 144 + struct ipc_namespace *: IPC_NS_INIT_INO, \ 145 + struct mnt_namespace *: MNT_NS_INIT_INO, \ 146 + struct net *: NET_NS_INIT_INO, \ 147 + struct pid_namespace *: PID_NS_INIT_INO, \ 148 + struct time_namespace *: TIME_NS_INIT_INO, \ 149 + struct user_namespace *: USER_NS_INIT_INO, \ 150 + struct uts_namespace *: UTS_NS_INIT_INO) 151 + 152 + #define ns_init_ns(__ns) \ 153 + _Generic((__ns), \ 154 + struct cgroup_namespace *: &init_cgroup_ns, \ 155 + struct ipc_namespace *: &init_ipc_ns, \ 156 + struct mnt_namespace *: &init_mnt_ns, \ 157 + struct net *: &init_net, \ 158 + struct pid_namespace *: &init_pid_ns, \ 159 + struct time_namespace *: &init_time_ns, \ 160 + struct user_namespace *: &init_user_ns, \ 161 + struct uts_namespace *: &init_uts_ns) 162 + 163 + #define ns_init_id(__ns) \ 164 + _Generic((__ns), \ 165 + struct cgroup_namespace *: CGROUP_NS_INIT_ID, \ 166 + struct ipc_namespace *: IPC_NS_INIT_ID, \ 167 + struct mnt_namespace *: MNT_NS_INIT_ID, \ 168 + struct net *: NET_NS_INIT_ID, \ 169 + struct pid_namespace *: PID_NS_INIT_ID, \ 170 + struct time_namespace *: TIME_NS_INIT_ID, \ 171 + struct user_namespace *: USER_NS_INIT_ID, \ 172 + struct uts_namespace *: UTS_NS_INIT_ID) 173 + 174 + #define to_ns_operations(__ns) \ 175 + _Generic((__ns), \ 176 + struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ 177 + struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ 178 + struct mnt_namespace *: &mntns_operations, \ 179 + struct net *: (IS_ENABLED(CONFIG_NET_NS) ? 
&netns_operations : NULL), \ 180 + struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ 181 + struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ 182 + struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ 183 + struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? &utsns_operations : NULL)) 184 + 185 + #define ns_common_type(__ns) \ 186 + _Generic((__ns), \ 187 + struct cgroup_namespace *: CLONE_NEWCGROUP, \ 188 + struct ipc_namespace *: CLONE_NEWIPC, \ 189 + struct mnt_namespace *: CLONE_NEWNS, \ 190 + struct net *: CLONE_NEWNET, \ 191 + struct pid_namespace *: CLONE_NEWPID, \ 192 + struct time_namespace *: CLONE_NEWTIME, \ 193 + struct user_namespace *: CLONE_NEWUSER, \ 194 + struct uts_namespace *: CLONE_NEWUTS) 195 + 196 + #endif /* _LINUX_NS_COMMON_TYPES_H */

+55

include/linux/ns/nstree_types.h

··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 3 + #ifndef _LINUX_NSTREE_TYPES_H 4 + #define _LINUX_NSTREE_TYPES_H 5 + 6 + #include <linux/rbtree.h> 7 + #include <linux/list.h> 8 + 9 + /** 10 + * struct ns_tree_root - Root of a namespace tree 11 + * @ns_rb: Red-black tree root for efficient lookups 12 + * @ns_list_head: List head for sequential iteration 13 + * 14 + * Each namespace tree maintains both an rbtree (for O(log n) lookups) 15 + * and a list (for efficient sequential iteration). The list is kept in 16 + * the same sorted order as the rbtree. 17 + */ 18 + struct ns_tree_root { 19 + struct rb_root ns_rb; 20 + struct list_head ns_list_head; 21 + }; 22 + 23 + /** 24 + * struct ns_tree_node - Node in a namespace tree 25 + * @ns_node: Red-black tree node 26 + * @ns_list_entry: List entry for sequential iteration 27 + * 28 + * Represents a namespace's position in a tree. Each namespace has 29 + * multiple tree nodes for different trees (unified, per-type, owner). 30 + */ 31 + struct ns_tree_node { 32 + struct rb_node ns_node; 33 + struct list_head ns_list_entry; 34 + }; 35 + 36 + /** 37 + * struct ns_tree - Namespace tree nodes and active reference count 38 + * @ns_id: Unique namespace identifier 39 + * @__ns_ref_active: Active reference count (do not use directly) 40 + * @ns_unified_node: Node in the global namespace tree 41 + * @ns_tree_node: Node in the per-type namespace tree 42 + * @ns_owner_node: Node in the owner namespace's tree of owned namespaces 43 + * @ns_owner_root: Root of the tree of namespaces owned by this namespace 44 + * (only used when this namespace is an owner) 45 + */ 46 + struct ns_tree { 47 + u64 ns_id; 48 + atomic_t __ns_ref_active; 49 + struct ns_tree_node ns_unified_node; 50 + struct ns_tree_node ns_tree_node; 51 + struct ns_tree_node ns_owner_node; 52 + struct ns_tree_root ns_owner_root; 53 + }; 54 + 55 + #endif /* _LINUX_NSTREE_TYPES_H */

+113 -116

include/linux/ns_common.h

··· 2 2 #ifndef _LINUX_NS_COMMON_H 3 3 #define _LINUX_NS_COMMON_H 4 4 5 + #include <linux/ns/ns_common_types.h> 5 6 #include <linux/refcount.h> 6 - #include <linux/rbtree.h> 7 + #include <linux/vfsdebug.h> 7 8 #include <uapi/linux/sched.h> 9 + #include <uapi/linux/nsfs.h> 8 10 9 - struct proc_ns_operations; 10 - 11 - struct cgroup_namespace; 12 - struct ipc_namespace; 13 - struct mnt_namespace; 14 - struct net; 15 - struct pid_namespace; 16 - struct time_namespace; 17 - struct user_namespace; 18 - struct uts_namespace; 19 - 20 - extern struct cgroup_namespace init_cgroup_ns; 21 - extern struct ipc_namespace init_ipc_ns; 22 - extern struct mnt_namespace init_mnt_ns; 23 - extern struct net init_net; 24 - extern struct pid_namespace init_pid_ns; 25 - extern struct time_namespace init_time_ns; 26 - extern struct user_namespace init_user_ns; 27 - extern struct uts_namespace init_uts_ns; 28 - 29 - extern const struct proc_ns_operations netns_operations; 30 - extern const struct proc_ns_operations utsns_operations; 31 - extern const struct proc_ns_operations ipcns_operations; 32 - extern const struct proc_ns_operations pidns_operations; 33 - extern const struct proc_ns_operations pidns_for_children_operations; 34 - extern const struct proc_ns_operations userns_operations; 35 - extern const struct proc_ns_operations mntns_operations; 36 - extern const struct proc_ns_operations cgroupns_operations; 37 - extern const struct proc_ns_operations timens_operations; 38 - extern const struct proc_ns_operations timens_for_children_operations; 39 - 40 - struct ns_common { 41 - u32 ns_type; 42 - struct dentry *stashed; 43 - const struct proc_ns_operations *ops; 44 - unsigned int inum; 45 - refcount_t __ns_ref; /* do not use directly */ 46 - union { 47 - struct { 48 - u64 ns_id; 49 - struct rb_node ns_tree_node; 50 - struct list_head ns_list_node; 51 - }; 52 - struct rcu_head ns_rcu; 53 - }; 54 - }; 55 - 11 + bool is_current_namespace(struct ns_common *ns); 56 12 int 
__ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum); 57 13 void __ns_common_free(struct ns_common *ns); 14 + struct ns_common *__must_check ns_owner(struct ns_common *ns); 58 15 59 - #define to_ns_common(__ns) \ 60 - _Generic((__ns), \ 61 - struct cgroup_namespace *: &(__ns)->ns, \ 62 - const struct cgroup_namespace *: &(__ns)->ns, \ 63 - struct ipc_namespace *: &(__ns)->ns, \ 64 - const struct ipc_namespace *: &(__ns)->ns, \ 65 - struct mnt_namespace *: &(__ns)->ns, \ 66 - const struct mnt_namespace *: &(__ns)->ns, \ 67 - struct net *: &(__ns)->ns, \ 68 - const struct net *: &(__ns)->ns, \ 69 - struct pid_namespace *: &(__ns)->ns, \ 70 - const struct pid_namespace *: &(__ns)->ns, \ 71 - struct time_namespace *: &(__ns)->ns, \ 72 - const struct time_namespace *: &(__ns)->ns, \ 73 - struct user_namespace *: &(__ns)->ns, \ 74 - const struct user_namespace *: &(__ns)->ns, \ 75 - struct uts_namespace *: &(__ns)->ns, \ 76 - const struct uts_namespace *: &(__ns)->ns) 16 + static __always_inline bool is_ns_init_inum(const struct ns_common *ns) 17 + { 18 + VFS_WARN_ON_ONCE(ns->inum == 0); 19 + return unlikely(in_range(ns->inum, MNT_NS_INIT_INO, 20 + IPC_NS_INIT_INO - MNT_NS_INIT_INO + 1)); 21 + } 77 22 78 - #define ns_init_inum(__ns) \ 79 - _Generic((__ns), \ 80 - struct cgroup_namespace *: CGROUP_NS_INIT_INO, \ 81 - struct ipc_namespace *: IPC_NS_INIT_INO, \ 82 - struct mnt_namespace *: MNT_NS_INIT_INO, \ 83 - struct net *: NET_NS_INIT_INO, \ 84 - struct pid_namespace *: PID_NS_INIT_INO, \ 85 - struct time_namespace *: TIME_NS_INIT_INO, \ 86 - struct user_namespace *: USER_NS_INIT_INO, \ 87 - struct uts_namespace *: UTS_NS_INIT_INO) 23 + static __always_inline bool is_ns_init_id(const struct ns_common *ns) 24 + { 25 + VFS_WARN_ON_ONCE(ns->ns_id == 0); 26 + return ns->ns_id <= NS_LAST_INIT_ID; 27 + } 88 28 89 - #define ns_init_ns(__ns) \ 90 - _Generic((__ns), \ 91 - struct cgroup_namespace *: &init_cgroup_ns, \ 92 - struct 
ipc_namespace *: &init_ipc_ns, \ 93 - struct mnt_namespace *: &init_mnt_ns, \ 94 - struct net *: &init_net, \ 95 - struct pid_namespace *: &init_pid_ns, \ 96 - struct time_namespace *: &init_time_ns, \ 97 - struct user_namespace *: &init_user_ns, \ 98 - struct uts_namespace *: &init_uts_ns) 99 - 100 - #define to_ns_operations(__ns) \ 101 - _Generic((__ns), \ 102 - struct cgroup_namespace *: (IS_ENABLED(CONFIG_CGROUPS) ? &cgroupns_operations : NULL), \ 103 - struct ipc_namespace *: (IS_ENABLED(CONFIG_IPC_NS) ? &ipcns_operations : NULL), \ 104 - struct mnt_namespace *: &mntns_operations, \ 105 - struct net *: (IS_ENABLED(CONFIG_NET_NS) ? &netns_operations : NULL), \ 106 - struct pid_namespace *: (IS_ENABLED(CONFIG_PID_NS) ? &pidns_operations : NULL), \ 107 - struct time_namespace *: (IS_ENABLED(CONFIG_TIME_NS) ? &timens_operations : NULL), \ 108 - struct user_namespace *: (IS_ENABLED(CONFIG_USER_NS) ? &userns_operations : NULL), \ 109 - struct uts_namespace *: (IS_ENABLED(CONFIG_UTS_NS) ? 
&utsns_operations : NULL)) 110 - 111 - #define ns_common_type(__ns) \ 112 - _Generic((__ns), \ 113 - struct cgroup_namespace *: CLONE_NEWCGROUP, \ 114 - struct ipc_namespace *: CLONE_NEWIPC, \ 115 - struct mnt_namespace *: CLONE_NEWNS, \ 116 - struct net *: CLONE_NEWNET, \ 117 - struct pid_namespace *: CLONE_NEWPID, \ 118 - struct time_namespace *: CLONE_NEWTIME, \ 119 - struct user_namespace *: CLONE_NEWUSER, \ 120 - struct uts_namespace *: CLONE_NEWUTS) 29 + #define NS_COMMON_INIT(nsname) \ 30 + { \ 31 + .ns_type = ns_common_type(&nsname), \ 32 + .ns_id = ns_init_id(&nsname), \ 33 + .inum = ns_init_inum(&nsname), \ 34 + .ops = to_ns_operations(&nsname), \ 35 + .stashed = NULL, \ 36 + .__ns_ref = REFCOUNT_INIT(1), \ 37 + .__ns_ref_active = ATOMIC_INIT(1), \ 38 + .ns_unified_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_unified_node.ns_list_entry), \ 39 + .ns_tree_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_tree_node.ns_list_entry), \ 40 + .ns_owner_node.ns_list_entry = LIST_HEAD_INIT(nsname.ns.ns_owner_node.ns_list_entry), \ 41 + .ns_owner_root.ns_list_head = LIST_HEAD_INIT(nsname.ns.ns_owner_root.ns_list_head), \ 42 + } 121 43 122 44 #define ns_common_init(__ns) \ 123 45 __ns_common_init(to_ns_common(__ns), \ ··· 55 133 56 134 #define ns_common_free(__ns) __ns_common_free(to_ns_common((__ns))) 57 135 136 + static __always_inline __must_check int __ns_ref_active_read(const struct ns_common *ns) 137 + { 138 + return atomic_read(&ns->__ns_ref_active); 139 + } 140 + 141 + static __always_inline __must_check int __ns_ref_read(const struct ns_common *ns) 142 + { 143 + return refcount_read(&ns->__ns_ref); 144 + } 145 + 58 146 static __always_inline __must_check bool __ns_ref_put(struct ns_common *ns) 59 147 { 60 - return refcount_dec_and_test(&ns->__ns_ref); 148 + if (is_ns_init_id(ns)) { 149 + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); 150 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); 151 + return false; 152 + } 153 + if 
(refcount_dec_and_test(&ns->__ns_ref)) { 154 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); 155 + return true; 156 + } 157 + return false; 61 158 } 62 159 63 160 static __always_inline __must_check bool __ns_ref_get(struct ns_common *ns) 64 161 { 65 - return refcount_inc_not_zero(&ns->__ns_ref); 162 + if (is_ns_init_id(ns)) { 163 + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); 164 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); 165 + return true; 166 + } 167 + if (refcount_inc_not_zero(&ns->__ns_ref)) 168 + return true; 169 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns)); 170 + return false; 66 171 } 67 172 68 - #define ns_ref_read(__ns) refcount_read(&to_ns_common((__ns))->__ns_ref) 69 - #define ns_ref_inc(__ns) refcount_inc(&to_ns_common((__ns))->__ns_ref) 70 - #define ns_ref_get(__ns) __ns_ref_get(to_ns_common((__ns))) 71 - #define ns_ref_put(__ns) __ns_ref_put(to_ns_common((__ns))) 72 - #define ns_ref_put_and_lock(__ns, __lock) \ 73 - refcount_dec_and_lock(&to_ns_common((__ns))->__ns_ref, (__lock)) 173 + static __always_inline void __ns_ref_inc(struct ns_common *ns) 174 + { 175 + if (is_ns_init_id(ns)) { 176 + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); 177 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); 178 + return; 179 + } 180 + refcount_inc(&ns->__ns_ref); 181 + } 182 + 183 + static __always_inline __must_check bool __ns_ref_dec_and_lock(struct ns_common *ns, 184 + spinlock_t *ns_lock) 185 + { 186 + if (is_ns_init_id(ns)) { 187 + VFS_WARN_ON_ONCE(__ns_ref_read(ns) != 1); 188 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) != 1); 189 + return false; 190 + } 191 + return refcount_dec_and_lock(&ns->__ns_ref, ns_lock); 192 + } 193 + 194 + #define ns_ref_read(__ns) __ns_ref_read(to_ns_common((__ns))) 195 + #define ns_ref_inc(__ns) \ 196 + do { if (__ns) __ns_ref_inc(to_ns_common((__ns))); } while (0) 197 + #define ns_ref_get(__ns) \ 198 + ((__ns) ? __ns_ref_get(to_ns_common((__ns))) : false) 199 + #define ns_ref_put(__ns) \ 200 + ((__ns) ? 
__ns_ref_put(to_ns_common((__ns))) : false) 201 + #define ns_ref_put_and_lock(__ns, __ns_lock) \ 202 + ((__ns) ? __ns_ref_dec_and_lock(to_ns_common((__ns)), __ns_lock) : false) 203 + 204 + #define ns_ref_active_read(__ns) \ 205 + ((__ns) ? __ns_ref_active_read(to_ns_common(__ns)) : 0) 206 + 207 + void __ns_ref_active_put(struct ns_common *ns); 208 + 209 + #define ns_ref_active_put(__ns) \ 210 + do { if (__ns) __ns_ref_active_put(to_ns_common(__ns)); } while (0) 211 + 212 + static __always_inline struct ns_common *__must_check ns_get_unless_inactive(struct ns_common *ns) 213 + { 214 + if (!__ns_ref_active_read(ns)) { 215 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 216 + return NULL; 217 + } 218 + if (!__ns_ref_get(ns)) 219 + return NULL; 220 + return ns; 221 + } 222 + 223 + void __ns_ref_active_get(struct ns_common *ns); 224 + 225 + #define ns_ref_active_get(__ns) \ 226 + do { if (__ns) __ns_ref_active_get(to_ns_common(__ns)); } while (0) 74 227 75 228 #endif

+3

include/linux/nsfs.h

··· 37 37 38 38 #define current_in_namespace(__ns) (__current_namespace_from_type(__ns) == __ns) 39 39 40 + void nsproxy_ns_active_get(struct nsproxy *ns); 41 + void nsproxy_ns_active_put(struct nsproxy *ns); 42 + 40 43 #endif /* _LINUX_NSFS_H */

+6 -3

include/linux/nsproxy.h

··· 93 93 */ 94 94 95 95 int copy_namespaces(u64 flags, struct task_struct *tsk); 96 - void exit_task_namespaces(struct task_struct *tsk); 96 + void switch_cred_namespaces(const struct cred *old, const struct cred *new); 97 + void exit_nsproxy_namespaces(struct task_struct *tsk); 98 + void get_cred_namespaces(struct task_struct *tsk); 99 + void exit_cred_namespaces(struct task_struct *tsk); 97 100 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); 98 101 int exec_task_namespaces(void); 99 - void free_nsproxy(struct nsproxy *ns); 102 + void deactivate_nsproxy(struct nsproxy *ns); 100 103 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, 101 104 struct cred *, struct fs_struct *); 102 105 int __init nsproxy_cache_init(void); ··· 107 104 static inline void put_nsproxy(struct nsproxy *ns) 108 105 { 109 106 if (refcount_dec_and_test(&ns->count)) 110 - free_nsproxy(ns); 107 + deactivate_nsproxy(ns); 111 108 } 112 109 113 110 static inline void get_nsproxy(struct nsproxy *ns)

+35 -17

include/linux/nstree.h

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 2 3 #ifndef _LINUX_NSTREE_H 3 4 #define _LINUX_NSTREE_H 4 5 5 - #include <linux/ns_common.h> 6 + #include <linux/ns/nstree_types.h> 6 7 #include <linux/nsproxy.h> 7 8 #include <linux/rbtree.h> 8 9 #include <linux/seqlock.h> 9 10 #include <linux/rculist.h> 10 11 #include <linux/cookie.h> 12 + #include <uapi/linux/nsfs.h> 11 13 12 - extern struct ns_tree cgroup_ns_tree; 13 - extern struct ns_tree ipc_ns_tree; 14 - extern struct ns_tree mnt_ns_tree; 15 - extern struct ns_tree net_ns_tree; 16 - extern struct ns_tree pid_ns_tree; 17 - extern struct ns_tree time_ns_tree; 18 - extern struct ns_tree user_ns_tree; 19 - extern struct ns_tree uts_ns_tree; 14 + struct ns_common; 15 + 16 + extern struct ns_tree_root cgroup_ns_tree; 17 + extern struct ns_tree_root ipc_ns_tree; 18 + extern struct ns_tree_root mnt_ns_tree; 19 + extern struct ns_tree_root net_ns_tree; 20 + extern struct ns_tree_root pid_ns_tree; 21 + extern struct ns_tree_root time_ns_tree; 22 + extern struct ns_tree_root user_ns_tree; 23 + extern struct ns_tree_root uts_ns_tree; 24 + 25 + void ns_tree_node_init(struct ns_tree_node *node); 26 + void ns_tree_root_init(struct ns_tree_root *root); 27 + bool ns_tree_node_empty(const struct ns_tree_node *node); 28 + struct rb_node *ns_tree_node_add(struct ns_tree_node *node, 29 + struct ns_tree_root *root, 30 + int (*cmp)(struct rb_node *, const struct rb_node *)); 31 + void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root); 20 32 21 33 #define to_ns_tree(__ns) \ 22 34 _Generic((__ns), \ ··· 41 29 struct user_namespace *: &(user_ns_tree), \ 42 30 struct uts_namespace *: &(uts_ns_tree)) 43 31 44 - u64 ns_tree_gen_id(struct ns_common *ns); 45 - void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree); 46 - void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree); 32 + #define ns_tree_gen_id(__ns) \ 33 + 
__ns_tree_gen_id(to_ns_common(__ns), \ 34 + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) 35 + 36 + u64 __ns_tree_gen_id(struct ns_common *ns, u64 id); 37 + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree); 38 + void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree); 47 39 struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type); 48 40 struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, 49 - struct ns_tree *ns_tree, 41 + struct ns_tree_root *ns_tree, 50 42 bool previous); 51 43 52 - static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree *ns_tree) 44 + static inline void __ns_tree_add(struct ns_common *ns, struct ns_tree_root *ns_tree, u64 id) 53 45 { 54 - ns_tree_gen_id(ns); 46 + __ns_tree_gen_id(ns, id); 55 47 __ns_tree_add_raw(ns, ns_tree); 56 48 } 57 49 ··· 75 59 * This function assigns a new id to the namespace and adds it to the 76 60 * appropriate namespace tree and list. 77 61 */ 78 - #define ns_tree_add(__ns) __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns)) 62 + #define ns_tree_add(__ns) \ 63 + __ns_tree_add(to_ns_common(__ns), to_ns_tree(__ns), \ 64 + (((__ns) == ns_init_ns(__ns)) ? ns_init_id(__ns) : 0)) 79 65 80 66 /** 81 67 * ns_tree_remove - Remove a namespace from a namespace tree ··· 91 73 #define ns_tree_adjoined_rcu(__ns, __previous) \ 92 74 __ns_tree_adjoined_rcu(to_ns_common(__ns), to_ns_tree(__ns), __previous) 93 75 94 - #define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node)) 76 + #define ns_tree_active(__ns) (!RB_EMPTY_NODE(&to_ns_common(__ns)->ns_tree_node.ns_node)) 95 77 96 78 #endif /* _LINUX_NSTREE_H */

+1 -2

include/linux/pid_namespace.h

··· 61 61 62 62 static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) 63 63 { 64 - if (ns != &init_pid_ns) 65 - ns_ref_inc(ns); 64 + ns_ref_inc(ns); 66 65 return ns; 67 66 } 68 67

+1

include/linux/pseudo_fs.h

··· 9 9 const struct xattr_handler * const *xattr; 10 10 const struct dentry_operations *dops; 11 11 unsigned long magic; 12 + unsigned int s_d_flags; 12 13 }; 13 14 14 15 struct pseudo_fs_context *init_pseudo(struct fs_context *fc,

+4

include/linux/syscalls.h

··· 77 77 struct cachestat; 78 78 struct statmount; 79 79 struct mnt_id_req; 80 + struct ns_id_req; 80 81 struct xattr_args; 81 82 struct file_attr; 82 83 ··· 438 437 asmlinkage long sys_listmount(const struct mnt_id_req __user *req, 439 438 u64 __user *mnt_ids, size_t nr_mnt_ids, 440 439 unsigned int flags); 440 + asmlinkage long sys_listns(const struct ns_id_req __user *req, 441 + u64 __user *ns_ids, size_t nr_ns_ids, 442 + unsigned int flags); 441 443 asmlinkage long sys_truncate(const char __user *path, long length); 442 444 asmlinkage long sys_ftruncate(unsigned int fd, off_t length); 443 445 #if BITS_PER_LONG == 32

+2 -2

include/linux/user_namespace.h

··· 166 166 ns->rlimit_max[type] = max <= LONG_MAX ? max : LONG_MAX; 167 167 } 168 168 169 - #ifdef CONFIG_USER_NS 170 - 171 169 static inline struct user_namespace *to_user_ns(struct ns_common *ns) 172 170 { 173 171 return container_of(ns, struct user_namespace, ns); 174 172 } 173 + 174 + #ifdef CONFIG_USER_NS 175 175 176 176 static inline struct user_namespace *get_user_ns(struct user_namespace *ns) 177 177 {

+3 -1

include/uapi/asm-generic/unistd.h

··· 857 857 __SYSCALL(__NR_file_getattr, sys_file_getattr) 858 858 #define __NR_file_setattr 469 859 859 __SYSCALL(__NR_file_setattr, sys_file_setattr) 860 + #define __NR_listns 470 861 + __SYSCALL(__NR_listns, sys_listns) 860 862 861 863 #undef __NR_syscalls 862 - #define __NR_syscalls 470 864 + #define __NR_syscalls 471 863 865 864 866 /* 865 867 * 32 bit systems traditionally used different

+58

include/uapi/linux/nsfs.h

··· 67 67 #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ 68 68 #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ 69 69 70 + enum init_ns_id { 71 + IPC_NS_INIT_ID = 1ULL, 72 + UTS_NS_INIT_ID = 2ULL, 73 + USER_NS_INIT_ID = 3ULL, 74 + PID_NS_INIT_ID = 4ULL, 75 + CGROUP_NS_INIT_ID = 5ULL, 76 + TIME_NS_INIT_ID = 6ULL, 77 + NET_NS_INIT_ID = 7ULL, 78 + MNT_NS_INIT_ID = 8ULL, 79 + #ifdef __KERNEL__ 80 + NS_LAST_INIT_ID = MNT_NS_INIT_ID, 81 + #endif 82 + }; 83 + 84 + enum ns_type { 85 + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ 86 + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ 87 + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ 88 + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ 89 + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ 90 + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ 91 + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ 92 + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ 93 + }; 94 + 95 + /** 96 + * struct ns_id_req - namespace ID request structure 97 + * @size: size of this structure 98 + * @spare: reserved for future use 99 + * @ns_type: filter mask of namespace types 100 + * @ns_id: last namespace id 101 + * @user_ns_id: owning user namespace ID 102 + * 103 + * Structure for passing namespace ID and miscellaneous parameters to 104 + * statns(2) and listns(2). 105 + * 106 + * For statns(2) @param represents the request mask. 107 + * For listns(2) @ns_id represents the last listed namespace id (or zero). 108 + */ 109 + struct ns_id_req { 110 + __u32 size; 111 + __u32 spare; 112 + __u64 ns_id; 113 + struct /* listns */ { 114 + __u32 ns_type; 115 + __u32 spare2; 116 + __u64 user_ns_id; 117 + }; 118 + }; 119 + 120 + /* 121 + * Special @user_ns_id value that can be passed to listns() 122 + */ 123 + #define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ 124 + 125 + /* List of all ns_id_req versions. */ 126 + #define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ 127 + 70 128 #endif /* __LINUX_NSFS_H */

+1 -6

init/version-timestamp.c

··· 8 8 #include <linux/utsname.h> 9 9 10 10 struct uts_namespace init_uts_ns = { 11 - .ns.ns_type = ns_common_type(&init_uts_ns), 12 - .ns.__ns_ref = REFCOUNT_INIT(2), 11 + .ns = NS_COMMON_INIT(init_uts_ns), 13 12 .name = { 14 13 .sysname = UTS_SYSNAME, 15 14 .nodename = UTS_NODENAME, ··· 18 19 .domainname = UTS_DOMAINNAME, 19 20 }, 20 21 .user_ns = &init_user_ns, 21 - .ns.inum = ns_init_inum(&init_uts_ns), 22 - #ifdef CONFIG_UTS_NS 23 - .ns.ops = &utsns_operations, 24 - #endif 25 22 }; 26 23 27 24 /* FIXED STRINGS! Don't touch! */

+1 -6

ipc/msgutil.c

··· 27 27 * and not CONFIG_IPC_NS. 28 28 */ 29 29 struct ipc_namespace init_ipc_ns = { 30 - .ns.__ns_ref = REFCOUNT_INIT(1), 30 + .ns = NS_COMMON_INIT(init_ipc_ns), 31 31 .user_ns = &init_user_ns, 32 - .ns.inum = ns_init_inum(&init_ipc_ns), 33 - #ifdef CONFIG_IPC_NS 34 - .ns.ops = &ipcns_operations, 35 - #endif 36 - .ns.ns_type = ns_common_type(&init_ipc_ns), 37 32 }; 38 33 39 34 struct msg_msgseg {

+2 -1

ipc/namespace.c

··· 66 66 if (err) 67 67 goto fail_free; 68 68 69 + ns_tree_gen_id(ns); 69 70 ns->user_ns = get_user_ns(user_ns); 70 71 ns->ucounts = ucounts; 71 72 ··· 87 86 88 87 sem_init_ns(ns); 89 88 shm_init_ns(ns); 90 - ns_tree_add(ns); 89 + ns_tree_add_raw(ns); 91 90 92 91 return ns; 93 92

+4 -7

kernel/cgroup/cgroup.c

··· 250 250 251 251 /* cgroup namespace for init task */ 252 252 struct cgroup_namespace init_cgroup_ns = { 253 - .ns.__ns_ref = REFCOUNT_INIT(2), 253 + .ns = NS_COMMON_INIT(init_cgroup_ns), 254 254 .user_ns = &init_user_ns, 255 - .ns.ops = &cgroupns_operations, 256 - .ns.inum = ns_init_inum(&init_cgroup_ns), 257 255 .root_cset = &init_css_set, 258 - .ns.ns_type = ns_common_type(&init_cgroup_ns), 259 256 }; 260 257 261 258 static struct file_system_type cgroup2_fs_type; ··· 1519 1522 } else { 1520 1523 /* 1521 1524 * NOTE: This function may be called from bpf_cgroup_from_id() 1522 - * on a task which has already passed exit_task_namespaces() and 1523 - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all 1524 - * cgroups visible for lookups. 1525 + * on a task which has already passed exit_nsproxy_namespaces() 1526 + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will 1527 + * make all cgroups visible for lookups. 1525 1528 */ 1526 1529 return &cgrp_dfl_root.cgrp; 1527 1530 }

+1 -1

kernel/cgroup/namespace.c

··· 30 30 ret = ns_common_init(new_ns); 31 31 if (ret) 32 32 return ERR_PTR(ret); 33 - ns_tree_add(new_ns); 34 33 return no_free_ptr(new_ns); 35 34 } 36 35 ··· 85 86 new_ns->ucounts = ucounts; 86 87 new_ns->root_cset = cset; 87 88 89 + ns_tree_add(new_ns); 88 90 return new_ns; 89 91 } 90 92

+6

kernel/cred.c

··· 306 306 kdebug("share_creds(%p{%ld})", 307 307 p->cred, atomic_long_read(&p->cred->usage)); 308 308 inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 309 + get_cred_namespaces(p); 309 310 return 0; 310 311 } 311 312 ··· 344 343 345 344 p->cred = p->real_cred = get_cred(new); 346 345 inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 346 + get_cred_namespaces(p); 347 + 347 348 return 0; 348 349 349 350 error_put: ··· 438 435 */ 439 436 if (new->user != old->user || new->user_ns != old->user_ns) 440 437 inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); 438 + 441 439 rcu_assign_pointer(task->real_cred, new); 442 440 rcu_assign_pointer(task->cred, new); 443 441 if (new->user != old->user || new->user_ns != old->user_ns) 444 442 dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); 443 + if (new->user_ns != old->user_ns) 444 + switch_cred_namespaces(old, new); 445 445 446 446 /* send notifications */ 447 447 if (!uid_eq(new->uid, old->uid) ||

+2 -1

kernel/exit.c

··· 291 291 write_unlock_irq(&tasklist_lock); 292 292 /* @thread_pid can't go away until free_pids() below */ 293 293 proc_flush_pid(thread_pid); 294 + exit_cred_namespaces(p); 294 295 add_device_randomness(&p->se.sum_exec_runtime, 295 296 sizeof(p->se.sum_exec_runtime)); 296 297 free_pids(post.pids); ··· 963 962 exit_fs(tsk); 964 963 if (group_dead) 965 964 disassociate_ctty(1); 966 - exit_task_namespaces(tsk); 965 + exit_nsproxy_namespaces(tsk); 967 966 exit_task_work(tsk); 968 967 exit_thread(tsk); 969 968

+2 -1

kernel/fork.c

··· 2453 2453 if (p->io_context) 2454 2454 exit_io_context(p); 2455 2455 bad_fork_cleanup_namespaces: 2456 - exit_task_namespaces(p); 2456 + exit_nsproxy_namespaces(p); 2457 2457 bad_fork_cleanup_mm: 2458 2458 if (p->mm) { 2459 2459 mm_clear_owner(p->mm, p); ··· 2487 2487 delayacct_tsk_free(p); 2488 2488 bad_fork_cleanup_count: 2489 2489 dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); 2490 + exit_cred_namespaces(p); 2490 2491 exit_creds(p); 2491 2492 bad_fork_free: 2492 2493 WRITE_ONCE(p->__state, TASK_DEAD);

+240 -6

kernel/nscommon.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 2 3 3 4 #include <linux/ns_common.h> 5 + #include <linux/nstree.h> 4 6 #include <linux/proc_ns.h> 7 + #include <linux/user_namespace.h> 5 8 #include <linux/vfsdebug.h> 6 9 7 10 #ifdef CONFIG_DEBUG_VFS ··· 55 52 56 53 int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) 57 54 { 55 + int ret = 0; 56 + 58 57 refcount_set(&ns->__ns_ref, 1); 59 58 ns->stashed = NULL; 60 59 ns->ops = ops; 61 60 ns->ns_id = 0; 62 61 ns->ns_type = ns_type; 63 - RB_CLEAR_NODE(&ns->ns_tree_node); 64 - INIT_LIST_HEAD(&ns->ns_list_node); 62 + ns_tree_node_init(&ns->ns_tree_node); 63 + ns_tree_node_init(&ns->ns_unified_node); 64 + ns_tree_node_init(&ns->ns_owner_node); 65 + ns_tree_root_init(&ns->ns_owner_root); 65 66 66 67 #ifdef CONFIG_DEBUG_VFS 67 68 ns_debug(ns, ops); 68 69 #endif 69 70 70 - if (inum) { 71 + if (inum) 71 72 ns->inum = inum; 72 - return 0; 73 - } 74 - return proc_alloc_inum(&ns->inum); 73 + else 74 + ret = proc_alloc_inum(&ns->inum); 75 + if (ret) 76 + return ret; 77 + /* 78 + * Tree ref starts at 0. It's incremented when namespace enters 79 + * active use (installed in nsproxy) and decremented when all 80 + * active uses are gone. Initial namespaces are always active. 
81 + */ 82 + if (is_ns_init_inum(ns)) 83 + atomic_set(&ns->__ns_ref_active, 1); 84 + else 85 + atomic_set(&ns->__ns_ref_active, 0); 86 + return 0; 75 87 } 76 88 77 89 void __ns_common_free(struct ns_common *ns) 78 90 { 79 91 proc_free_inum(ns->inum); 92 + } 93 + 94 + struct ns_common *__must_check ns_owner(struct ns_common *ns) 95 + { 96 + struct user_namespace *owner; 97 + 98 + if (unlikely(!ns->ops)) 99 + return NULL; 100 + VFS_WARN_ON_ONCE(!ns->ops->owner); 101 + owner = ns->ops->owner(ns); 102 + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); 103 + if (!owner) 104 + return NULL; 105 + /* Skip init_user_ns as it's always active */ 106 + if (owner == &init_user_ns) 107 + return NULL; 108 + return to_ns_common(owner); 109 + } 110 + 111 + /* 112 + * The active reference count works by having each namespace that gets 113 + * created take a single active reference on its owning user namespace. 114 + * That single reference is only released once the child namespace's 115 + * active count itself goes down. 
116 + * 117 + * A regular namespace tree might look as follows: 118 + * Legend: 119 + * + : adding active reference 120 + * - : dropping active reference 121 + * x : always active (initial namespace) 122 + * 123 + * 124 + * net_ns pid_ns 125 + * \ / 126 + * + + 127 + * user_ns1 (2) 128 + * | 129 + * ipc_ns | uts_ns 130 + * \ | / 131 + * + + + 132 + * user_ns2 (3) 133 + * | 134 + * cgroup_ns | mnt_ns 135 + * \ | / 136 + * x x x 137 + * init_user_ns (1) 138 + * 139 + * If both net_ns and pid_ns put their last active reference on 140 + * themselves it will cascade to user_ns1 dropping its own active 141 + * reference and dropping one active reference on user_ns2: 142 + * 143 + * net_ns pid_ns 144 + * \ / 145 + * - - 146 + * user_ns1 (0) 147 + * | 148 + * ipc_ns | uts_ns 149 + * \ | / 150 + * + - + 151 + * user_ns2 (2) 152 + * | 153 + * cgroup_ns | mnt_ns 154 + * \ | / 155 + * x x x 156 + * init_user_ns (1) 157 + * 158 + * The iteration stops once we reach a namespace that still has active 159 + * references. 160 + */ 161 + void __ns_ref_active_put(struct ns_common *ns) 162 + { 163 + /* Initial namespaces are always active. */ 164 + if (is_ns_init_id(ns)) 165 + return; 166 + 167 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 168 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 169 + return; 170 + } 171 + 172 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 173 + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); 174 + 175 + for (;;) { 176 + ns = ns_owner(ns); 177 + if (!ns) 178 + return; 179 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 180 + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { 181 + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); 182 + return; 183 + } 184 + } 185 + } 186 + 187 + /* 188 + * The active reference count works by having each namespace that gets 189 + * created take a single active reference on its owning user namespace. 190 + * That single reference is only released once the child namespace's 191 + * active count itself goes down.
This makes it possible to efficiently 192 + * resurrect a namespace tree: 193 + * 194 + * A regular namespace tree might look as follows: 195 + * Legend: 196 + * + : adding active reference 197 + * - : dropping active reference 198 + * x : always active (initial namespace) 199 + * 200 + * 201 + * net_ns pid_ns 202 + * \ / 203 + * + + 204 + * user_ns1 (2) 205 + * | 206 + * ipc_ns | uts_ns 207 + * \ | / 208 + * + + + 209 + * user_ns2 (3) 210 + * | 211 + * cgroup_ns | mnt_ns 212 + * \ | / 213 + * x x x 214 + * init_user_ns (1) 215 + * 216 + * If both net_ns and pid_ns put their last active reference on 217 + * themselves it will cascade to user_ns1 dropping its own active 218 + * reference and dropping one active reference on user_ns2: 219 + * 220 + * net_ns pid_ns 221 + * \ / 222 + * - - 223 + * user_ns1 (0) 224 + * | 225 + * ipc_ns | uts_ns 226 + * \ | / 227 + * + - + 228 + * user_ns2 (2) 229 + * | 230 + * cgroup_ns | mnt_ns 231 + * \ | / 232 + * x x x 233 + * init_user_ns (1) 234 + * 235 + * Assume the whole tree is dead but all namespaces are still active: 236 + * 237 + * net_ns pid_ns 238 + * \ / 239 + * - - 240 + * user_ns1 (0) 241 + * | 242 + * ipc_ns | uts_ns 243 + * \ | / 244 + * - - - 245 + * user_ns2 (0) 246 + * | 247 + * cgroup_ns | mnt_ns 248 + * \ | / 249 + * x x x 250 + * init_user_ns (1) 251 + * 252 + * Now assume the net_ns gets resurrected (e.g., via the SIOCGSKNS ioctl()): 253 + * 254 + * net_ns pid_ns 255 + * \ / 256 + * + - 257 + * user_ns1 (0) 258 + * | 259 + * ipc_ns | uts_ns 260 + * \ | / 261 + * - + - 262 + * user_ns2 (0) 263 + * | 264 + * cgroup_ns | mnt_ns 265 + * \ | / 266 + * x x x 267 + * init_user_ns (1) 268 + * 269 + * If net_ns had a zero reference count and we bumped it we also need to 270 + * take another reference on its owning user namespace. Similarly, if 271 + * pid_ns had a zero reference count it also needs to take another 272 + * reference on its owning user namespace.
So both net_ns and pid_ns 273 + * will each have their own reference on the owning user namespace. 274 + * 275 + * If the owning user namespace user_ns1 had a zero reference count then 276 + * it also needs to take another reference on its owning user namespace 277 + * and so on. 278 + */ 279 + void __ns_ref_active_get(struct ns_common *ns) 280 + { 281 + int prev; 282 + 283 + /* Initial namespaces are always active. */ 284 + if (is_ns_init_id(ns)) 285 + return; 286 + 287 + /* If we didn't resurrect the namespace we're done. */ 288 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 289 + VFS_WARN_ON_ONCE(prev < 0); 290 + if (likely(prev)) 291 + return; 292 + 293 + /* 294 + * We did resurrect it. Walk the ownership hierarchy upwards 295 + * until we find an owning user namespace that is active. 296 + */ 297 + for (;;) { 298 + ns = ns_owner(ns); 299 + if (!ns) 300 + return; 301 + 302 + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); 303 + prev = atomic_fetch_add(1, &ns->__ns_ref_active); 304 + VFS_WARN_ON_ONCE(prev < 0); 305 + if (likely(prev)) 306 + return; 307 + } 80 308 }

+42 -15

kernel/nsproxy.c

··· 26 26 #include <linux/syscalls.h> 27 27 #include <linux/cgroup.h> 28 28 #include <linux/perf_event.h> 29 + #include <linux/nstree.h> 29 30 30 31 static struct kmem_cache *nsproxy_cachep; 31 32 ··· 58 57 if (nsproxy) 59 58 refcount_set(&nsproxy->count, 1); 60 59 return nsproxy; 60 + } 61 + 62 + static inline void nsproxy_free(struct nsproxy *ns) 63 + { 64 + put_mnt_ns(ns->mnt_ns); 65 + put_uts_ns(ns->uts_ns); 66 + put_ipc_ns(ns->ipc_ns); 67 + put_pid_ns(ns->pid_ns_for_children); 68 + put_time_ns(ns->time_ns); 69 + put_time_ns(ns->time_ns_for_children); 70 + put_cgroup_ns(ns->cgroup_ns); 71 + put_net(ns->net_ns); 72 + kmem_cache_free(nsproxy_cachep, ns); 73 + } 74 + 75 + void deactivate_nsproxy(struct nsproxy *ns) 76 + { 77 + nsproxy_ns_active_put(ns); 78 + nsproxy_free(ns); 61 79 } 62 80 63 81 /* ··· 199 179 if ((flags & CLONE_VM) == 0) 200 180 timens_on_fork(new_ns, tsk); 201 181 182 + nsproxy_ns_active_get(new_ns); 202 183 tsk->nsproxy = new_ns; 203 184 return 0; 204 - } 205 - 206 - void free_nsproxy(struct nsproxy *ns) 207 - { 208 - put_mnt_ns(ns->mnt_ns); 209 - put_uts_ns(ns->uts_ns); 210 - put_ipc_ns(ns->ipc_ns); 211 - put_pid_ns(ns->pid_ns_for_children); 212 - put_time_ns(ns->time_ns); 213 - put_time_ns(ns->time_ns_for_children); 214 - put_cgroup_ns(ns->cgroup_ns); 215 - put_net(ns->net_ns); 216 - kmem_cache_free(nsproxy_cachep, ns); 217 185 } 218 186 219 187 /* ··· 240 232 241 233 might_sleep(); 242 234 235 + if (new) 236 + nsproxy_ns_active_get(new); 237 + 243 238 task_lock(p); 244 239 ns = p->nsproxy; 245 240 p->nsproxy = new; ··· 252 241 put_nsproxy(ns); 253 242 } 254 243 255 - void exit_task_namespaces(struct task_struct *p) 244 + void exit_nsproxy_namespaces(struct task_struct *p) 256 245 { 257 246 switch_task_namespaces(p, NULL); 247 + } 248 + 249 + void switch_cred_namespaces(const struct cred *old, const struct cred *new) 250 + { 251 + ns_ref_active_get(new->user_ns); 252 + ns_ref_active_put(old->user_ns); 253 + } 254 + 255 + void 
get_cred_namespaces(struct task_struct *tsk) 256 + { 257 + ns_ref_active_get(tsk->real_cred->user_ns); 258 + } 259 + 260 + void exit_cred_namespaces(struct task_struct *tsk) 261 + { 262 + ns_ref_active_put(tsk->real_cred->user_ns); 258 263 } 259 264 260 265 int exec_task_namespaces(void) ··· 342 315 if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) 343 316 free_fs_struct(nsset->fs); 344 317 if (nsset->nsproxy) 345 - free_nsproxy(nsset->nsproxy); 318 + nsproxy_free(nsset->nsproxy); 346 319 } 347 320 348 321 static int prepare_nsset(unsigned flags, struct nsset *nsset)

+675 -109

kernel/nstree.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 + /* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ 2 3 3 4 #include <linux/nstree.h> 4 5 #include <linux/proc_ns.h> 6 + #include <linux/rculist.h> 5 7 #include <linux/vfsdebug.h> 8 + #include <linux/syscalls.h> 9 + #include <linux/user_namespace.h> 6 10 7 - /** 8 - * struct ns_tree - Namespace tree 9 - * @ns_tree: Rbtree of namespaces of a particular type 10 - * @ns_list: Sequentially walkable list of all namespaces of this type 11 - * @ns_tree_lock: Seqlock to protect the tree and list 12 - * @type: type of namespaces in this tree 13 - */ 14 - struct ns_tree { 15 - struct rb_root ns_tree; 16 - struct list_head ns_list; 17 - seqlock_t ns_tree_lock; 18 - int type; 11 + static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); 12 + 13 + DEFINE_LOCK_GUARD_0(ns_tree_writer, 14 + write_seqlock(&ns_tree_lock), 15 + write_sequnlock(&ns_tree_lock)) 16 + 17 + DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, 18 + read_seqlock_excl(&ns_tree_lock), 19 + read_sequnlock_excl(&ns_tree_lock)) 20 + 21 + static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ 22 + .ns_rb = RB_ROOT, 23 + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), 19 24 }; 20 25 21 - struct ns_tree mnt_ns_tree = { 22 - .ns_tree = RB_ROOT, 23 - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), 24 - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), 25 - .type = CLONE_NEWNS, 26 + struct ns_tree_root mnt_ns_tree = { 27 + .ns_rb = RB_ROOT, 28 + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), 26 29 }; 27 30 28 - struct ns_tree net_ns_tree = { 29 - .ns_tree = RB_ROOT, 30 - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), 31 - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), 32 - .type = CLONE_NEWNET, 31 + struct ns_tree_root net_ns_tree = { 32 + .ns_rb = RB_ROOT, 33 + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), 33 34 }; 34 35 EXPORT_SYMBOL_GPL(net_ns_tree); 35 36 36 - 
struct ns_tree uts_ns_tree = { 37 - .ns_tree = RB_ROOT, 38 - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), 39 - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), 40 - .type = CLONE_NEWUTS, 37 + struct ns_tree_root uts_ns_tree = { 38 + .ns_rb = RB_ROOT, 39 + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), 41 40 }; 42 41 43 - struct ns_tree user_ns_tree = { 44 - .ns_tree = RB_ROOT, 45 - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), 46 - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), 47 - .type = CLONE_NEWUSER, 42 + struct ns_tree_root user_ns_tree = { 43 + .ns_rb = RB_ROOT, 44 + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), 48 45 }; 49 46 50 - struct ns_tree ipc_ns_tree = { 51 - .ns_tree = RB_ROOT, 52 - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), 53 - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), 54 - .type = CLONE_NEWIPC, 47 + struct ns_tree_root ipc_ns_tree = { 48 + .ns_rb = RB_ROOT, 49 + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), 55 50 }; 56 51 57 - struct ns_tree pid_ns_tree = { 58 - .ns_tree = RB_ROOT, 59 - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), 60 - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), 61 - .type = CLONE_NEWPID, 52 + struct ns_tree_root pid_ns_tree = { 53 + .ns_rb = RB_ROOT, 54 + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), 62 55 }; 63 56 64 - struct ns_tree cgroup_ns_tree = { 65 - .ns_tree = RB_ROOT, 66 - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), 67 - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), 68 - .type = CLONE_NEWCGROUP, 57 + struct ns_tree_root cgroup_ns_tree = { 58 + .ns_rb = RB_ROOT, 59 + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), 69 60 }; 70 61 71 - struct ns_tree time_ns_tree = { 72 - .ns_tree = RB_ROOT, 73 - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), 74 - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), 75 - .type = CLONE_NEWTIME, 62 + struct 
ns_tree_root time_ns_tree = { 63 + .ns_rb = RB_ROOT, 64 + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), 76 65 }; 77 66 78 - DEFINE_COOKIE(namespace_cookie); 67 + /** 68 + * ns_tree_node_init - Initialize a namespace tree node 69 + * @node: The node to initialize 70 + * 71 + * Initializes both the rbtree node and list entry. 72 + */ 73 + void ns_tree_node_init(struct ns_tree_node *node) 74 + { 75 + RB_CLEAR_NODE(&node->ns_node); 76 + INIT_LIST_HEAD(&node->ns_list_entry); 77 + } 78 + 79 + /** 80 + * ns_tree_root_init - Initialize a namespace tree root 81 + * @root: The root to initialize 82 + * 83 + * Initializes both the rbtree root and list head. 84 + */ 85 + void ns_tree_root_init(struct ns_tree_root *root) 86 + { 87 + root->ns_rb = RB_ROOT; 88 + INIT_LIST_HEAD(&root->ns_list_head); 89 + } 90 + 91 + /** 92 + * ns_tree_node_empty - Check if a namespace tree node is empty 93 + * @node: The node to check 94 + * 95 + * Returns true if the node is not in any tree. 96 + */ 97 + bool ns_tree_node_empty(const struct ns_tree_node *node) 98 + { 99 + return RB_EMPTY_NODE(&node->ns_node); 100 + } 101 + 102 + /** 103 + * ns_tree_node_add - Add a node to a namespace tree 104 + * @node: The node to add 105 + * @root: The tree root to add to 106 + * @cmp: Comparison function for rbtree insertion 107 + * 108 + * Adds the node to both the rbtree and the list, maintaining sorted order. 109 + * The list is maintained in the same order as the rbtree to enable efficient 110 + * iteration. 
111 + * 112 + * Returns: NULL if insertion succeeded, existing node if duplicate found 113 + */ 114 + struct rb_node *ns_tree_node_add(struct ns_tree_node *node, 115 + struct ns_tree_root *root, 116 + int (*cmp)(struct rb_node *, const struct rb_node *)) 117 + { 118 + struct rb_node *ret, *prev; 119 + 120 + /* Add to rbtree */ 121 + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); 122 + 123 + /* Add to list in sorted order */ 124 + prev = rb_prev(&node->ns_node); 125 + if (!prev) { 126 + /* No previous node, add at head */ 127 + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); 128 + } else { 129 + /* Add after previous node */ 130 + struct ns_tree_node *prev_node; 131 + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); 132 + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); 133 + } 134 + 135 + return ret; 136 + } 137 + 138 + /** 139 + * ns_tree_node_del - Remove a node from a namespace tree 140 + * @node: The node to remove 141 + * @root: The tree root to remove from 142 + * 143 + * Removes the node from both the rbtree and the list atomically. 
144 + */ 145 + void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) 146 + { 147 + rb_erase(&node->ns_node, &root->ns_rb); 148 + RB_CLEAR_NODE(&node->ns_node); 149 + list_bidir_del_rcu(&node->ns_list_entry); 150 + } 79 151 80 152 static inline struct ns_common *node_to_ns(const struct rb_node *node) 81 153 { 82 154 if (!node) 83 155 return NULL; 84 - return rb_entry(node, struct ns_common, ns_tree_node); 156 + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); 85 157 } 86 158 87 - static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) 159 + static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) 88 160 { 89 - struct ns_common *ns_a = node_to_ns(a); 90 - struct ns_common *ns_b = node_to_ns(b); 91 - u64 ns_id_a = ns_a->ns_id; 92 - u64 ns_id_b = ns_b->ns_id; 161 + if (!node) 162 + return NULL; 163 + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); 164 + } 93 165 94 - if (ns_id_a < ns_id_b) 166 + static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) 167 + { 168 + if (!node) 169 + return NULL; 170 + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); 171 + } 172 + 173 + static int ns_id_cmp(u64 id_a, u64 id_b) 174 + { 175 + if (id_a < id_b) 95 176 return -1; 96 - if (ns_id_a > ns_id_b) 177 + if (id_a > id_b) 97 178 return 1; 98 179 return 0; 99 180 } 100 181 101 - void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) 182 + static int ns_cmp(struct rb_node *a, const struct rb_node *b) 102 183 { 103 - struct rb_node *node, *prev; 184 + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); 185 + } 186 + 187 + static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) 188 + { 189 + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); 190 + } 191 + 192 + static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) 193 + { 194 + return ns_id_cmp(node_to_ns_owner(a)->ns_id, 
node_to_ns_owner(b)->ns_id); 195 + } 196 + 197 + void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) 198 + { 199 + struct rb_node *node; 200 + const struct proc_ns_operations *ops = ns->ops; 104 201 105 202 VFS_WARN_ON_ONCE(!ns->ns_id); 106 203 107 - write_seqlock(&ns_tree->ns_tree_lock); 204 + guard(ns_tree_writer)(); 108 205 109 - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); 206 + /* Add to per-type tree and list */ 207 + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); 110 208 111 - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); 112 - /* 113 - * If there's no previous entry simply add it after the 114 - * head and if there is add it after the previous entry. 115 - */ 116 - prev = rb_prev(&ns->ns_tree_node); 117 - if (!prev) 118 - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); 119 - else 120 - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); 209 + /* Add to unified tree and list */ 210 + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); 121 211 122 - write_sequnlock(&ns_tree->ns_tree_lock); 212 + /* Add to owner's tree if applicable */ 213 + if (ops) { 214 + struct user_namespace *user_ns; 215 + 216 + VFS_WARN_ON_ONCE(!ops->owner); 217 + user_ns = ops->owner(ns); 218 + if (user_ns) { 219 + struct ns_common *owner = &user_ns->ns; 220 + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); 221 + 222 + /* Insert into owner's tree and list */ 223 + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); 224 + } else { 225 + /* Only the initial user namespace doesn't have an owner. 
*/ 226 + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); 227 + } 228 + } 123 229 124 230 VFS_WARN_ON_ONCE(node); 125 231 } 126 232 127 - void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) 233 + void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) 128 234 { 129 - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); 130 - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); 131 - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); 235 + const struct proc_ns_operations *ops = ns->ops; 236 + struct user_namespace *user_ns; 132 237 133 - write_seqlock(&ns_tree->ns_tree_lock); 134 - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); 135 - list_bidir_del_rcu(&ns->ns_list_node); 136 - RB_CLEAR_NODE(&ns->ns_tree_node); 137 - write_sequnlock(&ns_tree->ns_tree_lock); 238 + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); 239 + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); 240 + 241 + write_seqlock(&ns_tree_lock); 242 + 243 + /* Remove from per-type tree and list */ 244 + ns_tree_node_del(&ns->ns_tree_node, ns_tree); 245 + 246 + /* Remove from unified tree and list */ 247 + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); 248 + 249 + /* Remove from owner's tree if applicable */ 250 + if (ops) { 251 + user_ns = ops->owner(ns); 252 + if (user_ns) { 253 + struct ns_common *owner = &user_ns->ns; 254 + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); 255 + } 256 + } 257 + 258 + write_sequnlock(&ns_tree_lock); 138 259 } 139 260 EXPORT_SYMBOL_GPL(__ns_tree_remove); 140 261 ··· 271 150 return 0; 272 151 } 273 152 153 + static int ns_find_unified(const void *key, const struct rb_node *node) 154 + { 155 + const u64 ns_id = *(u64 *)key; 156 + const struct ns_common *ns = node_to_ns_unified(node); 274 157 275 - static struct ns_tree *ns_tree_from_type(int ns_type) 158 + if (ns_id < ns->ns_id) 159 + return -1; 160 + if (ns_id > ns->ns_id) 161 + return 1; 162 + return 0; 163 + } 164 + 165 + static struct 
ns_tree_root *ns_tree_from_type(int ns_type) 276 166 { 277 167 switch (ns_type) { 278 168 case CLONE_NEWCGROUP: ··· 307 175 return NULL; 308 176 } 309 177 310 - struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) 178 + static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) 311 179 { 312 - struct ns_tree *ns_tree; 313 180 struct rb_node *node; 314 181 unsigned int seq; 315 182 316 - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); 183 + do { 184 + seq = read_seqbegin(&ns_tree_lock); 185 + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); 186 + if (node) 187 + break; 188 + } while (read_seqretry(&ns_tree_lock, seq)); 189 + 190 + return node_to_ns_unified(node); 191 + } 192 + 193 + static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) 194 + { 195 + struct ns_tree_root *ns_tree; 196 + struct rb_node *node; 197 + unsigned int seq; 317 198 318 199 ns_tree = ns_tree_from_type(ns_type); 319 200 if (!ns_tree) 320 201 return NULL; 321 202 322 203 do { 323 - seq = read_seqbegin(&ns_tree->ns_tree_lock); 324 - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); 204 + seq = read_seqbegin(&ns_tree_lock); 205 + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); 325 206 if (node) 326 207 break; 327 - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); 328 - 329 - if (!node) 330 - return NULL; 331 - 332 - VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); 208 + } while (read_seqretry(&ns_tree_lock, seq)); 333 209 334 210 return node_to_ns(node); 335 211 } 336 212 213 + struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) 214 + { 215 + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); 216 + 217 + if (ns_type) 218 + return __ns_tree_lookup_rcu(ns_id, ns_type); 219 + 220 + return __ns_unified_tree_lookup_rcu(ns_id); 221 + } 222 + 337 223 /** 338 - * ns_tree_adjoined_rcu - find the next/previous namespace in the same 224 + * __ns_tree_adjoined_rcu 
- find the next/previous namespace in the same 339 225 * tree 340 226 * @ns: namespace to start from 227 + * @ns_tree: namespace tree to search in 341 228 * @previous: if true find the previous namespace, otherwise the next 342 229 * 343 230 * Find the next or previous namespace in the same tree as @ns. If 344 231 * there is no next/previous namespace, -ENOENT is returned. 345 232 */ 346 233 struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, 347 - struct ns_tree *ns_tree, bool previous) 234 + struct ns_tree_root *ns_tree, bool previous) 348 235 { 349 236 struct list_head *list; 350 237 351 238 RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); 352 239 353 240 if (previous) 354 - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); 241 + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); 355 242 else 356 - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); 357 - if (list_is_head(list, &ns_tree->ns_list)) 243 + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); 244 + if (list_is_head(list, &ns_tree->ns_list_head)) 358 245 return ERR_PTR(-ENOENT); 359 246 360 - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); 361 - 362 - return list_entry_rcu(list, struct ns_common, ns_list_node); 247 + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); 363 248 } 364 249 365 250 /** 366 - * ns_tree_gen_id - generate a new namespace id 251 + * __ns_tree_gen_id - generate a new namespace id 367 252 * @ns: namespace to generate id for 253 + * @id: if non-zero, this is the initial namespace and this is a fixed id 368 254 * 369 255 * Generates a new namespace id and assigns it to the namespace. All 370 256 * namespace types share the same id space and thus can be compared 371 257 * directly. IOW, when two ids of two namespaces are equal, they are 372 258 * identical. 
373 259 */ 374 - u64 ns_tree_gen_id(struct ns_common *ns) 260 + u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) 375 261 { 376 - guard(preempt)(); 377 - ns->ns_id = gen_cookie_next(&namespace_cookie); 262 + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); 263 + 264 + if (id) 265 + ns->ns_id = id; 266 + else 267 + ns->ns_id = atomic64_inc_return(&namespace_cookie); 378 268 return ns->ns_id; 269 + } 270 + 271 + struct klistns { 272 + u64 __user *uns_ids; 273 + u32 nr_ns_ids; 274 + u64 last_ns_id; 275 + u64 user_ns_id; 276 + u32 ns_type; 277 + struct user_namespace *user_ns; 278 + bool userns_capable; 279 + struct ns_common *first_ns; 280 + }; 281 + 282 + static void __free_klistns_free(const struct klistns *kls) 283 + { 284 + if (kls->user_ns_id != LISTNS_CURRENT_USER) 285 + put_user_ns(kls->user_ns); 286 + if (kls->first_ns && kls->first_ns->ops) 287 + kls->first_ns->ops->put(kls->first_ns); 288 + } 289 + 290 + #define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) 291 + 292 + static int copy_ns_id_req(const struct ns_id_req __user *req, 293 + struct ns_id_req *kreq) 294 + { 295 + int ret; 296 + size_t usize; 297 + 298 + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); 299 + 300 + ret = get_user(usize, &req->size); 301 + if (ret) 302 + return -EFAULT; 303 + if (unlikely(usize > PAGE_SIZE)) 304 + return -E2BIG; 305 + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) 306 + return -EINVAL; 307 + memset(kreq, 0, sizeof(*kreq)); 308 + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); 309 + if (ret) 310 + return ret; 311 + if (kreq->spare != 0) 312 + return -EINVAL; 313 + if (kreq->ns_type & ~NS_ALL) 314 + return -EOPNOTSUPP; 315 + return 0; 316 + } 317 + 318 + static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, 319 + u64 __user *ns_ids, size_t nr_ns_ids) 320 + { 321 + kls->last_ns_id = kreq->ns_id; 322 + kls->user_ns_id = kreq->user_ns_id; 323 + kls->nr_ns_ids = 
nr_ns_ids; 324 + kls->ns_type = kreq->ns_type; 325 + kls->uns_ids = ns_ids; 326 + return 0; 327 + } 328 + 329 + /* 330 + * Lookup a namespace owned by owner with id >= ns_id. 331 + * Returns the namespace with the smallest id that is >= ns_id. 332 + */ 333 + static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) 334 + { 335 + struct ns_common *ret = NULL; 336 + struct rb_node *node; 337 + 338 + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); 339 + 340 + guard(ns_tree_locked_reader)(); 341 + 342 + node = owner->ns_owner_root.ns_rb.rb_node; 343 + while (node) { 344 + struct ns_common *ns; 345 + 346 + ns = node_to_ns_owner(node); 347 + if (ns_id <= ns->ns_id) { 348 + ret = ns; 349 + if (ns_id == ns->ns_id) 350 + break; 351 + node = node->rb_left; 352 + } else { 353 + node = node->rb_right; 354 + } 355 + } 356 + 357 + if (ret) 358 + ret = ns_get_unless_inactive(ret); 359 + return ret; 360 + } 361 + 362 + static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) 363 + { 364 + struct ns_common *ns; 365 + 366 + guard(rcu)(); 367 + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); 368 + if (!ns) 369 + return NULL; 370 + 371 + if (!ns_get_unless_inactive(ns)) 372 + return NULL; 373 + 374 + return ns; 375 + } 376 + 377 + static inline bool __must_check ns_requested(const struct klistns *kls, 378 + const struct ns_common *ns) 379 + { 380 + return !kls->ns_type || (kls->ns_type & ns->ns_type); 381 + } 382 + 383 + static inline bool __must_check may_list_ns(const struct klistns *kls, 384 + struct ns_common *ns) 385 + { 386 + if (kls->user_ns) { 387 + if (kls->userns_capable) 388 + return true; 389 + } else { 390 + struct ns_common *owner; 391 + struct user_namespace *user_ns; 392 + 393 + owner = ns_owner(ns); 394 + if (owner) 395 + user_ns = to_user_ns(owner); 396 + else 397 + user_ns = &init_user_ns; 398 + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) 399 + return true; 400 + } 401 + 402 + if (is_current_namespace(ns)) 403 + return true; 404 + 
405 + if (ns->ns_type != CLONE_NEWUSER) 406 + return false; 407 + 408 + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) 409 + return true; 410 + 411 + return false; 412 + } 413 + 414 + static inline void ns_put(struct ns_common *ns) 415 + { 416 + if (ns && ns->ops) 417 + ns->ops->put(ns); 418 + } 419 + 420 + DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) 421 + 422 + static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, 423 + struct ns_common *candidate) 424 + { 425 + struct ns_common *ns __free(ns_put) = NULL; 426 + 427 + if (!ns_requested(kls, candidate)) 428 + return NULL; 429 + 430 + ns = ns_get_unless_inactive(candidate); 431 + if (!ns) 432 + return NULL; 433 + 434 + if (!may_list_ns(kls, ns)) 435 + return NULL; 436 + 437 + return no_free_ptr(ns); 438 + } 439 + 440 + static ssize_t do_listns_userns(struct klistns *kls) 441 + { 442 + u64 __user *ns_ids = kls->uns_ids; 443 + size_t nr_ns_ids = kls->nr_ns_ids; 444 + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; 445 + const struct list_head *head; 446 + ssize_t ret; 447 + 448 + VFS_WARN_ON_ONCE(!kls->user_ns_id); 449 + 450 + if (kls->user_ns_id == LISTNS_CURRENT_USER) 451 + ns = to_ns_common(current_user_ns()); 452 + else if (kls->user_ns_id) 453 + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); 454 + if (!ns) 455 + return -EINVAL; 456 + kls->user_ns = to_user_ns(ns); 457 + 458 + /* 459 + * Use the rbtree to find the first namespace we care about and 460 + * then use its list entry to iterate from there. 
461 + */ 462 + if (kls->last_ns_id) { 463 + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); 464 + if (!kls->first_ns) 465 + return -ENOENT; 466 + first_ns = kls->first_ns; 467 + } 468 + 469 + ret = 0; 470 + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; 471 + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); 472 + 473 + rcu_read_lock(); 474 + 475 + if (!first_ns) 476 + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); 477 + 478 + ns = first_ns; 479 + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { 480 + struct ns_common *valid; 481 + 482 + if (!nr_ns_ids) 483 + break; 484 + 485 + valid = legitimize_ns(kls, ns); 486 + if (!valid) 487 + continue; 488 + 489 + rcu_read_unlock(); 490 + 491 + ns_put(prev); 492 + prev = valid; 493 + 494 + if (put_user(valid->ns_id, ns_ids + ret)) { 495 + ns_put(prev); 496 + return -EFAULT; 497 + } 498 + 499 + nr_ns_ids--; 500 + ret++; 501 + 502 + rcu_read_lock(); 503 + } 504 + 505 + rcu_read_unlock(); 506 + ns_put(prev); 507 + return ret; 508 + } 509 + 510 + /* 511 + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. 512 + * Returns the namespace with the smallest id that is >= ns_id. 
513 + */ 514 + static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) 515 + { 516 + struct ns_common *ret = NULL; 517 + struct ns_tree_root *ns_tree = NULL; 518 + struct rb_node *node; 519 + 520 + if (ns_type) { 521 + ns_tree = ns_tree_from_type(ns_type); 522 + if (!ns_tree) 523 + return NULL; 524 + } 525 + 526 + guard(ns_tree_locked_reader)(); 527 + 528 + if (ns_tree) 529 + node = ns_tree->ns_rb.rb_node; 530 + else 531 + node = ns_unified_root.ns_rb.rb_node; 532 + 533 + while (node) { 534 + struct ns_common *ns; 535 + 536 + if (ns_type) 537 + ns = node_to_ns(node); 538 + else 539 + ns = node_to_ns_unified(node); 540 + 541 + if (ns_id <= ns->ns_id) { 542 + if (ns_type) 543 + ret = node_to_ns(node); 544 + else 545 + ret = node_to_ns_unified(node); 546 + if (ns_id == ns->ns_id) 547 + break; 548 + node = node->rb_left; 549 + } else { 550 + node = node->rb_right; 551 + } 552 + } 553 + 554 + if (ret) 555 + ret = ns_get_unless_inactive(ret); 556 + return ret; 557 + } 558 + 559 + static inline struct ns_common *first_ns_common(const struct list_head *head, 560 + struct ns_tree_root *ns_tree) 561 + { 562 + if (ns_tree) 563 + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); 564 + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); 565 + } 566 + 567 + static inline struct ns_common *next_ns_common(struct ns_common *ns, 568 + struct ns_tree_root *ns_tree) 569 + { 570 + if (ns_tree) 571 + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); 572 + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); 573 + } 574 + 575 + static inline bool ns_common_is_head(struct ns_common *ns, 576 + const struct list_head *head, 577 + struct ns_tree_root *ns_tree) 578 + { 579 + if (ns_tree) 580 + return &ns->ns_tree_node.ns_list_entry == head; 581 + return &ns->ns_unified_node.ns_list_entry == head; 582 + } 583 
+ 584 + static ssize_t do_listns(struct klistns *kls) 585 + { 586 + u64 __user *ns_ids = kls->uns_ids; 587 + size_t nr_ns_ids = kls->nr_ns_ids; 588 + struct ns_common *ns, *first_ns = NULL, *prev = NULL; 589 + struct ns_tree_root *ns_tree = NULL; 590 + const struct list_head *head; 591 + u32 ns_type; 592 + ssize_t ret; 593 + 594 + if (hweight32(kls->ns_type) == 1) 595 + ns_type = kls->ns_type; 596 + else 597 + ns_type = 0; 598 + 599 + if (ns_type) { 600 + ns_tree = ns_tree_from_type(ns_type); 601 + if (!ns_tree) 602 + return -EINVAL; 603 + } 604 + 605 + if (kls->last_ns_id) { 606 + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); 607 + if (!kls->first_ns) 608 + return -ENOENT; 609 + first_ns = kls->first_ns; 610 + } 611 + 612 + ret = 0; 613 + if (ns_tree) 614 + head = &ns_tree->ns_list_head; 615 + else 616 + head = &ns_unified_root.ns_list_head; 617 + 618 + rcu_read_lock(); 619 + 620 + if (!first_ns) 621 + first_ns = first_ns_common(head, ns_tree); 622 + 623 + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; 624 + ns = next_ns_common(ns, ns_tree)) { 625 + struct ns_common *valid; 626 + 627 + valid = legitimize_ns(kls, ns); 628 + if (!valid) 629 + continue; 630 + 631 + rcu_read_unlock(); 632 + 633 + ns_put(prev); 634 + prev = valid; 635 + 636 + if (put_user(valid->ns_id, ns_ids + ret)) { 637 + ns_put(prev); 638 + return -EFAULT; 639 + } 640 + 641 + nr_ns_ids--; 642 + ret++; 643 + 644 + rcu_read_lock(); 645 + } 646 + 647 + rcu_read_unlock(); 648 + ns_put(prev); 649 + return ret; 650 + } 651 + 652 + SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, 653 + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) 654 + { 655 + struct klistns klns __free(klistns_free) = {}; 656 + const size_t maxcount = 1000000; 657 + struct ns_id_req kreq; 658 + ssize_t ret; 659 + 660 + if (flags) 661 + return -EINVAL; 662 + 663 + if (unlikely(nr_ns_ids > maxcount)) 664 + return -EOVERFLOW; 665 + 666 + if (!access_ok(ns_ids, nr_ns_ids 
* sizeof(*ns_ids))) 667 + return -EFAULT; 668 + 669 + ret = copy_ns_id_req(req, &kreq); 670 + if (ret) 671 + return ret; 672 + 673 + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); 674 + if (ret) 675 + return ret; 676 + 677 + if (kreq.user_ns_id) 678 + return do_listns_userns(&klns); 679 + 680 + return do_listns(&klns); 379 681 }

+6 -6

kernel/pid.c

··· 71 71 * the scheme scales to up to 4 million PIDs, runtime. 72 72 */ 73 73 struct pid_namespace init_pid_ns = { 74 - .ns.__ns_ref = REFCOUNT_INIT(2), 74 + .ns = NS_COMMON_INIT(init_pid_ns), 75 75 .idr = IDR_INIT(init_pid_ns.idr), 76 76 .pid_allocated = PIDNS_ADDING, 77 77 .level = 0, 78 78 .child_reaper = &init_task, 79 79 .user_ns = &init_user_ns, 80 - .ns.inum = ns_init_inum(&init_pid_ns), 81 - #ifdef CONFIG_PID_NS 82 - .ns.ops = &pidns_operations, 83 - #endif 84 80 .pid_max = PID_MAX_DEFAULT, 85 81 #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) 86 82 .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, 87 83 #endif 88 - .ns.ns_type = ns_common_type(&init_pid_ns), 89 84 }; 90 85 EXPORT_SYMBOL_GPL(init_pid_ns); 91 86 ··· 112 117 void free_pid(struct pid *pid) 113 118 { 114 119 int i; 120 + struct pid_namespace *active_ns; 115 121 116 122 lockdep_assert_not_held(&tasklist_lock); 123 + 124 + active_ns = pid->numbers[pid->level].ns; 125 + ns_ref_active_put(active_ns); 117 126 118 127 spin_lock(&pidmap_lock); 119 128 for (i = 0; i <= pid->level; i++) { ··· 282 283 } 283 284 spin_unlock(&pidmap_lock); 284 285 idr_preload_end(); 286 + ns_ref_active_get(ns); 285 287 286 288 return pid; 287 289

+1 -1

kernel/pid_namespace.c

··· 184 184 185 185 void put_pid_ns(struct pid_namespace *ns) 186 186 { 187 - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) 187 + if (ns && ns_ref_put(ns)) 188 188 schedule_work(&ns->work); 189 189 } 190 190 EXPORT_SYMBOL_GPL(put_pid_ns);

+1 -4

kernel/time/namespace.c

··· 478 478 }; 479 479 480 480 struct time_namespace init_time_ns = { 481 - .ns.ns_type = ns_common_type(&init_time_ns), 482 - .ns.__ns_ref = REFCOUNT_INIT(3), 481 + .ns = NS_COMMON_INIT(init_time_ns), 483 482 .user_ns = &init_user_ns, 484 - .ns.inum = ns_init_inum(&init_time_ns), 485 - .ns.ops = &timens_operations, 486 483 .frozen_offsets = true, 487 484 }; 488 485

+1 -6

kernel/user.c

··· 35 35 * and 1 for... ? 36 36 */ 37 37 struct user_namespace init_user_ns = { 38 + .ns = NS_COMMON_INIT(init_user_ns), 38 39 .uid_map = { 39 40 { 40 41 .extent[0] = { ··· 66 65 .nr_extents = 1, 67 66 }, 68 67 }, 69 - .ns.ns_type = ns_common_type(&init_user_ns), 70 - .ns.__ns_ref = REFCOUNT_INIT(3), 71 68 .owner = GLOBAL_ROOT_UID, 72 69 .group = GLOBAL_ROOT_GID, 73 - .ns.inum = ns_init_inum(&init_user_ns), 74 - #ifdef CONFIG_USER_NS 75 - .ns.ops = &userns_operations, 76 - #endif 77 70 .flags = USERNS_INIT_FLAGS, 78 71 #ifdef CONFIG_KEYS 79 72 .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list),

+1 -1

net/core/net_namespace.c

··· 439 439 LIST_HEAD(net_exit_list); 440 440 int error = 0; 441 441 442 - net->net_cookie = ns_tree_gen_id(&net->ns); 442 + net->net_cookie = ns_tree_gen_id(net); 443 443 444 444 list_for_each_entry(ops, &pernet_list, list) { 445 445 error = ops_init(ops, net);

+1

scripts/syscall.tbl

··· 410 410 467 common open_tree_attr sys_open_tree_attr 411 411 468 common file_getattr sys_file_getattr 412 412 469 common file_setattr sys_file_setattr 413 + 470 common listns sys_listns

+70

tools/include/uapi/linux/nsfs.h

··· 53 53 TIME_NS_INIT_INO = 0xEFFFFFFAU, 54 54 NET_NS_INIT_INO = 0xEFFFFFF9U, 55 55 MNT_NS_INIT_INO = 0xEFFFFFF8U, 56 + #ifdef __KERNEL__ 57 + MNT_NS_ANON_INO = 0xEFFFFFF7U, 58 + #endif 56 59 }; 60 + 61 + struct nsfs_file_handle { 62 + __u64 ns_id; 63 + __u32 ns_type; 64 + __u32 ns_inum; 65 + }; 66 + 67 + #define NSFS_FILE_HANDLE_SIZE_VER0 16 /* sizeof first published struct */ 68 + #define NSFS_FILE_HANDLE_SIZE_LATEST sizeof(struct nsfs_file_handle) /* sizeof latest published struct */ 69 + 70 + enum init_ns_id { 71 + IPC_NS_INIT_ID = 1ULL, 72 + UTS_NS_INIT_ID = 2ULL, 73 + USER_NS_INIT_ID = 3ULL, 74 + PID_NS_INIT_ID = 4ULL, 75 + CGROUP_NS_INIT_ID = 5ULL, 76 + TIME_NS_INIT_ID = 6ULL, 77 + NET_NS_INIT_ID = 7ULL, 78 + MNT_NS_INIT_ID = 8ULL, 79 + #ifdef __KERNEL__ 80 + NS_LAST_INIT_ID = MNT_NS_INIT_ID, 81 + #endif 82 + }; 83 + 84 + enum ns_type { 85 + TIME_NS = (1ULL << 7), /* CLONE_NEWTIME */ 86 + MNT_NS = (1ULL << 17), /* CLONE_NEWNS */ 87 + CGROUP_NS = (1ULL << 25), /* CLONE_NEWCGROUP */ 88 + UTS_NS = (1ULL << 26), /* CLONE_NEWUTS */ 89 + IPC_NS = (1ULL << 27), /* CLONE_NEWIPC */ 90 + USER_NS = (1ULL << 28), /* CLONE_NEWUSER */ 91 + PID_NS = (1ULL << 29), /* CLONE_NEWPID */ 92 + NET_NS = (1ULL << 30), /* CLONE_NEWNET */ 93 + }; 94 + 95 + /** 96 + * struct ns_id_req - namespace ID request structure 97 + * @size: size of this structure 98 + * @spare: reserved for future use 99 + * @ns_type: namespace type filter mask 100 + * @ns_id: last namespace id 101 + * @user_ns_id: owning user namespace ID 102 + * 103 + * Structure for passing namespace ID and miscellaneous parameters to 104 + * statns(2) and listns(2). 105 + * 106 + * For statns(2) @param represents the request mask. 107 + * For listns(2) @ns_id represents the last listed namespace id (or zero). 
108 + */ 109 + struct ns_id_req { 110 + __u32 size; 111 + __u32 spare; 112 + __u64 ns_id; 113 + struct /* listns */ { 114 + __u32 ns_type; 115 + __u32 spare2; 116 + __u64 user_ns_id; 117 + }; 118 + }; 119 + 120 + /* 121 + * Special @user_ns_id value that can be passed to listns() 122 + */ 123 + #define LISTNS_CURRENT_USER 0xffffffffffffffff /* Caller's userns */ 124 + 125 + /* List of all ns_id_req versions. */ 126 + #define NS_ID_REQ_SIZE_VER0 32 /* sizeof first published struct */ 57 127 58 128 #endif /* __LINUX_NSFS_H */

+1 -1

tools/testing/selftests/filesystems/utils.c

··· 487 487 uid_t uid = getuid(); 488 488 gid_t gid = getgid(); 489 489 490 - ret = unshare(CLONE_NEWNS|CLONE_NEWUSER|CLONE_NEWPID); 490 + ret = unshare(CLONE_NEWNS|CLONE_NEWUSER); 491 491 if (ret) { 492 492 ksft_exit_fail_msg("unsharing mountns and userns: %s\n", 493 493 strerror(errno));

+9

tools/testing/selftests/namespaces/.gitignore

··· 1 1 nsid_test 2 2 file_handle_test 3 3 init_ino_test 4 + ns_active_ref_test 5 + listns_test 6 + listns_permissions_test 7 + listns_efault_test 8 + siocgskns_test 9 + cred_change_test 10 + stress_test 11 + listns_pagination_bug 12 + regression_pidfd_setns_test

+23 -1

tools/testing/selftests/namespaces/Makefile

··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 CFLAGS += -Wall -O0 -g $(KHDR_INCLUDES) $(TOOLS_INCLUDES) 3 + LDLIBS += -lcap 3 4 4 - TEST_GEN_PROGS := nsid_test file_handle_test init_ino_test 5 + TEST_GEN_PROGS := nsid_test \ 6 + file_handle_test \ 7 + init_ino_test \ 8 + ns_active_ref_test \ 9 + listns_test \ 10 + listns_permissions_test \ 11 + listns_efault_test \ 12 + siocgskns_test \ 13 + cred_change_test \ 14 + stress_test \ 15 + listns_pagination_bug \ 16 + regression_pidfd_setns_test 5 17 6 18 include ../lib.mk 19 + 20 + $(OUTPUT)/ns_active_ref_test: ../filesystems/utils.c 21 + $(OUTPUT)/listns_test: ../filesystems/utils.c 22 + $(OUTPUT)/listns_permissions_test: ../filesystems/utils.c 23 + $(OUTPUT)/listns_efault_test: ../filesystems/utils.c 24 + $(OUTPUT)/siocgskns_test: ../filesystems/utils.c 25 + $(OUTPUT)/cred_change_test: ../filesystems/utils.c 26 + $(OUTPUT)/stress_test: ../filesystems/utils.c 27 + $(OUTPUT)/listns_pagination_bug: ../filesystems/utils.c 28 + $(OUTPUT)/regression_pidfd_setns_test: ../filesystems/utils.c 7 29

+814

tools/testing/selftests/namespaces/cred_change_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <sys/capability.h> 11 + #include <sys/ioctl.h> 12 + #include <sys/stat.h> 13 + #include <sys/syscall.h> 14 + #include <sys/types.h> 15 + #include <sys/wait.h> 16 + #include <unistd.h> 17 + #include <linux/nsfs.h> 18 + #include "../kselftest_harness.h" 19 + #include "../filesystems/utils.h" 20 + #include "wrappers.h" 21 + 22 + /* 23 + * Test credential changes and their impact on namespace active references. 24 + */ 25 + 26 + /* 27 + * Test setuid() in a user namespace properly swaps active references. 28 + * Create a user namespace with multiple UIDs mapped, then setuid() between them. 29 + * Verify that the user namespace remains active throughout. 30 + */ 31 + TEST(setuid_preserves_active_refs) 32 + { 33 + pid_t pid; 34 + int status; 35 + __u64 userns_id; 36 + struct ns_id_req req = { 37 + .size = sizeof(req), 38 + .spare = 0, 39 + .ns_id = 0, 40 + .ns_type = CLONE_NEWUSER, 41 + .spare2 = 0, 42 + .user_ns_id = 0, 43 + }; 44 + __u64 ns_ids[256]; 45 + ssize_t ret; 46 + int i; 47 + bool found = false; 48 + int pipefd[2]; 49 + 50 + ASSERT_EQ(pipe(pipefd), 0); 51 + 52 + pid = fork(); 53 + ASSERT_GE(pid, 0); 54 + 55 + if (pid == 0) { 56 + /* Child process */ 57 + int fd, userns_fd; 58 + __u64 child_userns_id; 59 + uid_t orig_uid = getuid(); 60 + int setuid_count; 61 + 62 + close(pipefd[0]); 63 + 64 + /* Create new user namespace with multiple UIDs mapped (0-9) */ 65 + userns_fd = get_userns_fd(0, orig_uid, 10); 66 + if (userns_fd < 0) { 67 + close(pipefd[1]); 68 + exit(1); 69 + } 70 + 71 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 72 + close(userns_fd); 73 + close(pipefd[1]); 74 + exit(1); 75 + } 76 + close(userns_fd); 77 + 78 + /* Get user namespace ID */ 79 + fd = open("/proc/self/ns/user", O_RDONLY); 80 + if (fd < 0) { 81 + 
close(pipefd[1]); 82 + exit(1); 83 + } 84 + 85 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 86 + close(fd); 87 + close(pipefd[1]); 88 + exit(1); 89 + } 90 + close(fd); 91 + 92 + /* Send namespace ID to parent */ 93 + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); 94 + 95 + /* 96 + * Perform multiple setuid() calls. 97 + * Each setuid() triggers commit_creds() which should properly 98 + * swap active references via switch_cred_namespaces(). 99 + */ 100 + for (setuid_count = 0; setuid_count < 50; setuid_count++) { 101 + uid_t target_uid = (setuid_count % 10); 102 + if (setuid(target_uid) < 0) { 103 + if (errno != EPERM) { 104 + close(pipefd[1]); 105 + exit(1); 106 + } 107 + } 108 + } 109 + 110 + close(pipefd[1]); 111 + exit(0); 112 + } 113 + 114 + /* Parent process */ 115 + close(pipefd[1]); 116 + 117 + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { 118 + close(pipefd[0]); 119 + kill(pid, SIGKILL); 120 + waitpid(pid, NULL, 0); 121 + SKIP(return, "Failed to get namespace ID from child"); 122 + } 123 + close(pipefd[0]); 124 + 125 + TH_LOG("Child user namespace ID: %llu", (unsigned long long)userns_id); 126 + 127 + /* Verify namespace is active while child is running */ 128 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 129 + if (ret < 0) { 130 + kill(pid, SIGKILL); 131 + waitpid(pid, NULL, 0); 132 + if (errno == ENOSYS) 133 + SKIP(return, "listns() not supported"); 134 + ASSERT_GE(ret, 0); 135 + } 136 + 137 + for (i = 0; i < ret; i++) { 138 + if (ns_ids[i] == userns_id) { 139 + found = true; 140 + break; 141 + } 142 + } 143 + ASSERT_TRUE(found); 144 + 145 + waitpid(pid, &status, 0); 146 + ASSERT_TRUE(WIFEXITED(status)); 147 + ASSERT_EQ(WEXITSTATUS(status), 0); 148 + 149 + /* Verify namespace becomes inactive after child exits */ 150 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 151 + ASSERT_GE(ret, 0); 152 + 153 + found = false; 154 + for (i = 0; i < ret; i++) { 155 + if (ns_ids[i] == userns_id) { 
156 + found = true; 157 + break; 158 + } 159 + } 160 + 161 + ASSERT_FALSE(found); 162 + TH_LOG("setuid() correctly preserved active references (no leak)"); 163 + } 164 + 165 + /* 166 + * Test setgid() in a user namespace properly handles active references. 167 + */ 168 + TEST(setgid_preserves_active_refs) 169 + { 170 + pid_t pid; 171 + int status; 172 + __u64 userns_id; 173 + struct ns_id_req req = { 174 + .size = sizeof(req), 175 + .spare = 0, 176 + .ns_id = 0, 177 + .ns_type = CLONE_NEWUSER, 178 + .spare2 = 0, 179 + .user_ns_id = 0, 180 + }; 181 + __u64 ns_ids[256]; 182 + ssize_t ret; 183 + int i; 184 + bool found = false; 185 + int pipefd[2]; 186 + 187 + ASSERT_EQ(pipe(pipefd), 0); 188 + 189 + pid = fork(); 190 + ASSERT_GE(pid, 0); 191 + 192 + if (pid == 0) { 193 + /* Child process */ 194 + int fd, userns_fd; 195 + __u64 child_userns_id; 196 + uid_t orig_uid = getuid(); 197 + int setgid_count; 198 + 199 + close(pipefd[0]); 200 + 201 + /* Create new user namespace with multiple GIDs mapped */ 202 + userns_fd = get_userns_fd(0, orig_uid, 10); 203 + if (userns_fd < 0) { 204 + close(pipefd[1]); 205 + exit(1); 206 + } 207 + 208 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 209 + close(userns_fd); 210 + close(pipefd[1]); 211 + exit(1); 212 + } 213 + close(userns_fd); 214 + 215 + /* Get user namespace ID */ 216 + fd = open("/proc/self/ns/user", O_RDONLY); 217 + if (fd < 0) { 218 + close(pipefd[1]); 219 + exit(1); 220 + } 221 + 222 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 223 + close(fd); 224 + close(pipefd[1]); 225 + exit(1); 226 + } 227 + close(fd); 228 + 229 + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); 230 + 231 + /* Perform multiple setgid() calls */ 232 + for (setgid_count = 0; setgid_count < 50; setgid_count++) { 233 + gid_t target_gid = (setgid_count % 10); 234 + if (setgid(target_gid) < 0) { 235 + if (errno != EPERM) { 236 + close(pipefd[1]); 237 + exit(1); 238 + } 239 + } 240 + } 241 + 242 + close(pipefd[1]); 243 + exit(0); 244 + } 
245 + 246 + /* Parent process */ 247 + close(pipefd[1]); 248 + 249 + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { 250 + close(pipefd[0]); 251 + kill(pid, SIGKILL); 252 + waitpid(pid, NULL, 0); 253 + SKIP(return, "Failed to get namespace ID from child"); 254 + } 255 + close(pipefd[0]); 256 + 257 + waitpid(pid, &status, 0); 258 + ASSERT_TRUE(WIFEXITED(status)); 259 + ASSERT_EQ(WEXITSTATUS(status), 0); 260 + 261 + /* Verify namespace becomes inactive */ 262 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 263 + if (ret < 0) { 264 + if (errno == ENOSYS) 265 + SKIP(return, "listns() not supported"); 266 + ASSERT_GE(ret, 0); 267 + } 268 + 269 + for (i = 0; i < ret; i++) { 270 + if (ns_ids[i] == userns_id) { 271 + found = true; 272 + break; 273 + } 274 + } 275 + 276 + ASSERT_FALSE(found); 277 + TH_LOG("setgid() correctly preserved active references (no leak)"); 278 + } 279 + 280 + /* 281 + * Test setresuid() which changes real, effective, and saved UIDs. 282 + * This should properly swap active references via commit_creds(). 
283 + */ 284 + TEST(setresuid_preserves_active_refs) 285 + { 286 + pid_t pid; 287 + int status; 288 + __u64 userns_id; 289 + struct ns_id_req req = { 290 + .size = sizeof(req), 291 + .spare = 0, 292 + .ns_id = 0, 293 + .ns_type = CLONE_NEWUSER, 294 + .spare2 = 0, 295 + .user_ns_id = 0, 296 + }; 297 + __u64 ns_ids[256]; 298 + ssize_t ret; 299 + int i; 300 + bool found = false; 301 + int pipefd[2]; 302 + 303 + ASSERT_EQ(pipe(pipefd), 0); 304 + 305 + pid = fork(); 306 + ASSERT_GE(pid, 0); 307 + 308 + if (pid == 0) { 309 + /* Child process */ 310 + int fd, userns_fd; 311 + __u64 child_userns_id; 312 + uid_t orig_uid = getuid(); 313 + int setres_count; 314 + 315 + close(pipefd[0]); 316 + 317 + /* Create new user namespace */ 318 + userns_fd = get_userns_fd(0, orig_uid, 10); 319 + if (userns_fd < 0) { 320 + close(pipefd[1]); 321 + exit(1); 322 + } 323 + 324 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 325 + close(userns_fd); 326 + close(pipefd[1]); 327 + exit(1); 328 + } 329 + close(userns_fd); 330 + 331 + /* Get user namespace ID */ 332 + fd = open("/proc/self/ns/user", O_RDONLY); 333 + if (fd < 0) { 334 + close(pipefd[1]); 335 + exit(1); 336 + } 337 + 338 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 339 + close(fd); 340 + close(pipefd[1]); 341 + exit(1); 342 + } 343 + close(fd); 344 + 345 + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); 346 + 347 + /* Perform multiple setresuid() calls */ 348 + for (setres_count = 0; setres_count < 30; setres_count++) { 349 + uid_t uid1 = (setres_count % 5); 350 + uid_t uid2 = ((setres_count + 1) % 5); 351 + uid_t uid3 = ((setres_count + 2) % 5); 352 + 353 + if (setresuid(uid1, uid2, uid3) < 0) { 354 + if (errno != EPERM) { 355 + close(pipefd[1]); 356 + exit(1); 357 + } 358 + } 359 + } 360 + 361 + close(pipefd[1]); 362 + exit(0); 363 + } 364 + 365 + /* Parent process */ 366 + close(pipefd[1]); 367 + 368 + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { 369 + close(pipefd[0]); 370 + 
kill(pid, SIGKILL); 371 + waitpid(pid, NULL, 0); 372 + SKIP(return, "Failed to get namespace ID from child"); 373 + } 374 + close(pipefd[0]); 375 + 376 + waitpid(pid, &status, 0); 377 + ASSERT_TRUE(WIFEXITED(status)); 378 + ASSERT_EQ(WEXITSTATUS(status), 0); 379 + 380 + /* Verify namespace becomes inactive */ 381 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 382 + if (ret < 0) { 383 + if (errno == ENOSYS) 384 + SKIP(return, "listns() not supported"); 385 + ASSERT_GE(ret, 0); 386 + } 387 + 388 + for (i = 0; i < ret; i++) { 389 + if (ns_ids[i] == userns_id) { 390 + found = true; 391 + break; 392 + } 393 + } 394 + 395 + ASSERT_FALSE(found); 396 + TH_LOG("setresuid() correctly preserved active references (no leak)"); 397 + } 398 + 399 + /* 400 + * Test credential changes across multiple user namespaces. 401 + * Create nested user namespaces and verify active reference tracking. 402 + */ 403 + TEST(cred_change_nested_userns) 404 + { 405 + pid_t pid; 406 + int status; 407 + __u64 parent_userns_id, child_userns_id; 408 + struct ns_id_req req = { 409 + .size = sizeof(req), 410 + .spare = 0, 411 + .ns_id = 0, 412 + .ns_type = CLONE_NEWUSER, 413 + .spare2 = 0, 414 + .user_ns_id = 0, 415 + }; 416 + __u64 ns_ids[256]; 417 + ssize_t ret; 418 + int i; 419 + bool found_parent = false, found_child = false; 420 + int pipefd[2]; 421 + 422 + ASSERT_EQ(pipe(pipefd), 0); 423 + 424 + pid = fork(); 425 + ASSERT_GE(pid, 0); 426 + 427 + if (pid == 0) { 428 + /* Child process */ 429 + int fd, userns_fd; 430 + __u64 parent_id, child_id; 431 + uid_t orig_uid = getuid(); 432 + 433 + close(pipefd[0]); 434 + 435 + /* Create first user namespace */ 436 + userns_fd = get_userns_fd(0, orig_uid, 1); 437 + if (userns_fd < 0) { 438 + close(pipefd[1]); 439 + exit(1); 440 + } 441 + 442 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 443 + close(userns_fd); 444 + close(pipefd[1]); 445 + exit(1); 446 + } 447 + close(userns_fd); 448 + 449 + /* Get first namespace ID */ 450 + fd = 
open("/proc/self/ns/user", O_RDONLY); 451 + if (fd < 0) { 452 + close(pipefd[1]); 453 + exit(1); 454 + } 455 + 456 + if (ioctl(fd, NS_GET_ID, &parent_id) < 0) { 457 + close(fd); 458 + close(pipefd[1]); 459 + exit(1); 460 + } 461 + close(fd); 462 + 463 + /* Create nested user namespace */ 464 + userns_fd = get_userns_fd(0, 0, 1); 465 + if (userns_fd < 0) { 466 + close(pipefd[1]); 467 + exit(1); 468 + } 469 + 470 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 471 + close(userns_fd); 472 + close(pipefd[1]); 473 + exit(1); 474 + } 475 + close(userns_fd); 476 + 477 + /* Get nested namespace ID */ 478 + fd = open("/proc/self/ns/user", O_RDONLY); 479 + if (fd < 0) { 480 + close(pipefd[1]); 481 + exit(1); 482 + } 483 + 484 + if (ioctl(fd, NS_GET_ID, &child_id) < 0) { 485 + close(fd); 486 + close(pipefd[1]); 487 + exit(1); 488 + } 489 + close(fd); 490 + 491 + /* Send both IDs to parent */ 492 + write(pipefd[1], &parent_id, sizeof(parent_id)); 493 + write(pipefd[1], &child_id, sizeof(child_id)); 494 + 495 + /* Perform some credential changes in nested namespace */ 496 + setuid(0); 497 + setgid(0); 498 + 499 + close(pipefd[1]); 500 + exit(0); 501 + } 502 + 503 + /* Parent process */ 504 + close(pipefd[1]); 505 + 506 + /* Read both namespace IDs */ 507 + if (read(pipefd[0], &parent_userns_id, sizeof(parent_userns_id)) != sizeof(parent_userns_id)) { 508 + close(pipefd[0]); 509 + kill(pid, SIGKILL); 510 + waitpid(pid, NULL, 0); 511 + SKIP(return, "Failed to get parent namespace ID"); 512 + } 513 + 514 + if (read(pipefd[0], &child_userns_id, sizeof(child_userns_id)) != sizeof(child_userns_id)) { 515 + close(pipefd[0]); 516 + kill(pid, SIGKILL); 517 + waitpid(pid, NULL, 0); 518 + SKIP(return, "Failed to get child namespace ID"); 519 + } 520 + close(pipefd[0]); 521 + 522 + TH_LOG("Parent userns: %llu, Child userns: %llu", 523 + (unsigned long long)parent_userns_id, 524 + (unsigned long long)child_userns_id); 525 + 526 + /* Verify both namespaces are active */ 527 + ret = 
sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 528 + if (ret < 0) { 529 + kill(pid, SIGKILL); 530 + waitpid(pid, NULL, 0); 531 + if (errno == ENOSYS) 532 + SKIP(return, "listns() not supported"); 533 + ASSERT_GE(ret, 0); 534 + } 535 + 536 + for (i = 0; i < ret; i++) { 537 + if (ns_ids[i] == parent_userns_id) 538 + found_parent = true; 539 + if (ns_ids[i] == child_userns_id) 540 + found_child = true; 541 + } 542 + 543 + ASSERT_TRUE(found_parent); 544 + ASSERT_TRUE(found_child); 545 + 546 + /* Wait for child */ 547 + waitpid(pid, &status, 0); 548 + ASSERT_TRUE(WIFEXITED(status)); 549 + ASSERT_EQ(WEXITSTATUS(status), 0); 550 + 551 + /* Verify both namespaces become inactive */ 552 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 553 + ASSERT_GE(ret, 0); 554 + 555 + found_parent = false; 556 + found_child = false; 557 + for (i = 0; i < ret; i++) { 558 + if (ns_ids[i] == parent_userns_id) 559 + found_parent = true; 560 + if (ns_ids[i] == child_userns_id) 561 + found_child = true; 562 + } 563 + 564 + ASSERT_FALSE(found_parent); 565 + ASSERT_FALSE(found_child); 566 + TH_LOG("Nested user namespace credential changes preserved active refs (no leak)"); 567 + } 568 + 569 + /* 570 + * Test rapid credential changes don't cause refcount imbalances. 571 + * This stress-tests the switch_cred_namespaces() logic. 
572 + */ 573 + TEST(rapid_cred_changes_no_leak) 574 + { 575 + pid_t pid; 576 + int status; 577 + __u64 userns_id; 578 + struct ns_id_req req = { 579 + .size = sizeof(req), 580 + .spare = 0, 581 + .ns_id = 0, 582 + .ns_type = CLONE_NEWUSER, 583 + .spare2 = 0, 584 + .user_ns_id = 0, 585 + }; 586 + __u64 ns_ids[256]; 587 + ssize_t ret; 588 + int i; 589 + bool found = false; 590 + int pipefd[2]; 591 + 592 + ASSERT_EQ(pipe(pipefd), 0); 593 + 594 + pid = fork(); 595 + ASSERT_GE(pid, 0); 596 + 597 + if (pid == 0) { 598 + /* Child process */ 599 + int fd, userns_fd; 600 + __u64 child_userns_id; 601 + uid_t orig_uid = getuid(); 602 + int change_count; 603 + 604 + close(pipefd[0]); 605 + 606 + /* Create new user namespace with wider range of UIDs/GIDs */ 607 + userns_fd = get_userns_fd(0, orig_uid, 100); 608 + if (userns_fd < 0) { 609 + close(pipefd[1]); 610 + exit(1); 611 + } 612 + 613 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 614 + close(userns_fd); 615 + close(pipefd[1]); 616 + exit(1); 617 + } 618 + close(userns_fd); 619 + 620 + /* Get user namespace ID */ 621 + fd = open("/proc/self/ns/user", O_RDONLY); 622 + if (fd < 0) { 623 + close(pipefd[1]); 624 + exit(1); 625 + } 626 + 627 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 628 + close(fd); 629 + close(pipefd[1]); 630 + exit(1); 631 + } 632 + close(fd); 633 + 634 + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); 635 + 636 + /* 637 + * Perform many rapid credential changes. 638 + * Mix setuid, setgid, setreuid, setregid, setresuid, setresgid. 
639 + */ 640 + for (change_count = 0; change_count < 200; change_count++) { 641 + switch (change_count % 6) { 642 + case 0: 643 + setuid(change_count % 50); 644 + break; 645 + case 1: 646 + setgid(change_count % 50); 647 + break; 648 + case 2: 649 + setreuid(change_count % 50, (change_count + 1) % 50); 650 + break; 651 + case 3: 652 + setregid(change_count % 50, (change_count + 1) % 50); 653 + break; 654 + case 4: 655 + setresuid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); 656 + break; 657 + case 5: 658 + setresgid(change_count % 50, (change_count + 1) % 50, (change_count + 2) % 50); 659 + break; 660 + } 661 + } 662 + 663 + close(pipefd[1]); 664 + exit(0); 665 + } 666 + 667 + /* Parent process */ 668 + close(pipefd[1]); 669 + 670 + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { 671 + close(pipefd[0]); 672 + kill(pid, SIGKILL); 673 + waitpid(pid, NULL, 0); 674 + SKIP(return, "Failed to get namespace ID from child"); 675 + } 676 + close(pipefd[0]); 677 + 678 + TH_LOG("Testing with user namespace ID: %llu", (unsigned long long)userns_id); 679 + 680 + waitpid(pid, &status, 0); 681 + ASSERT_TRUE(WIFEXITED(status)); 682 + ASSERT_EQ(WEXITSTATUS(status), 0); 683 + 684 + /* Verify namespace becomes inactive (no leaked active refs) */ 685 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 686 + if (ret < 0) { 687 + if (errno == ENOSYS) 688 + SKIP(return, "listns() not supported"); 689 + ASSERT_GE(ret, 0); 690 + } 691 + 692 + for (i = 0; i < ret; i++) { 693 + if (ns_ids[i] == userns_id) { 694 + found = true; 695 + break; 696 + } 697 + } 698 + 699 + ASSERT_FALSE(found); 700 + TH_LOG("200 rapid credential changes completed with no active ref leak"); 701 + } 702 + 703 + /* 704 + * Test setfsuid/setfsgid which change filesystem UID/GID. 705 + * These also trigger credential changes but may have different code paths. 
706 + */ 707 + TEST(setfsuid_preserves_active_refs) 708 + { 709 + pid_t pid; 710 + int status; 711 + __u64 userns_id; 712 + struct ns_id_req req = { 713 + .size = sizeof(req), 714 + .spare = 0, 715 + .ns_id = 0, 716 + .ns_type = CLONE_NEWUSER, 717 + .spare2 = 0, 718 + .user_ns_id = 0, 719 + }; 720 + __u64 ns_ids[256]; 721 + ssize_t ret; 722 + int i; 723 + bool found = false; 724 + int pipefd[2]; 725 + 726 + ASSERT_EQ(pipe(pipefd), 0); 727 + 728 + pid = fork(); 729 + ASSERT_GE(pid, 0); 730 + 731 + if (pid == 0) { 732 + /* Child process */ 733 + int fd, userns_fd; 734 + __u64 child_userns_id; 735 + uid_t orig_uid = getuid(); 736 + int change_count; 737 + 738 + close(pipefd[0]); 739 + 740 + /* Create new user namespace */ 741 + userns_fd = get_userns_fd(0, orig_uid, 10); 742 + if (userns_fd < 0) { 743 + close(pipefd[1]); 744 + exit(1); 745 + } 746 + 747 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 748 + close(userns_fd); 749 + close(pipefd[1]); 750 + exit(1); 751 + } 752 + close(userns_fd); 753 + 754 + /* Get user namespace ID */ 755 + fd = open("/proc/self/ns/user", O_RDONLY); 756 + if (fd < 0) { 757 + close(pipefd[1]); 758 + exit(1); 759 + } 760 + 761 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 762 + close(fd); 763 + close(pipefd[1]); 764 + exit(1); 765 + } 766 + close(fd); 767 + 768 + write(pipefd[1], &child_userns_id, sizeof(child_userns_id)); 769 + 770 + /* Perform multiple setfsuid/setfsgid calls */ 771 + for (change_count = 0; change_count < 50; change_count++) { 772 + setfsuid(change_count % 10); 773 + setfsgid(change_count % 10); 774 + } 775 + 776 + close(pipefd[1]); 777 + exit(0); 778 + } 779 + 780 + /* Parent process */ 781 + close(pipefd[1]); 782 + 783 + if (read(pipefd[0], &userns_id, sizeof(userns_id)) != sizeof(userns_id)) { 784 + close(pipefd[0]); 785 + kill(pid, SIGKILL); 786 + waitpid(pid, NULL, 0); 787 + SKIP(return, "Failed to get namespace ID from child"); 788 + } 789 + close(pipefd[0]); 790 + 791 + waitpid(pid, &status, 0); 792 + 
ASSERT_TRUE(WIFEXITED(status)); 793 + ASSERT_EQ(WEXITSTATUS(status), 0); 794 + 795 + /* Verify namespace becomes inactive */ 796 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 797 + if (ret < 0) { 798 + if (errno == ENOSYS) 799 + SKIP(return, "listns() not supported"); 800 + ASSERT_GE(ret, 0); 801 + } 802 + 803 + for (i = 0; i < ret; i++) { 804 + if (ns_ids[i] == userns_id) { 805 + found = true; 806 + break; 807 + } 808 + } 809 + 810 + ASSERT_FALSE(found); 811 + TH_LOG("setfsuid/setfsgid correctly preserved active references (no leak)"); 812 + } 813 + 814 + TEST_HARNESS_MAIN

+530

tools/testing/selftests/namespaces/listns_efault_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <signal.h> 8 + #include <stdio.h> 9 + #include <stdlib.h> 10 + #include <string.h> 11 + #include <linux/nsfs.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/mman.h> 14 + #include <sys/mount.h> 15 + #include <sys/socket.h> 16 + #include <sys/stat.h> 17 + #include <sys/syscall.h> 18 + #include <sys/types.h> 19 + #include <sys/wait.h> 20 + #include <unistd.h> 21 + #include "../kselftest_harness.h" 22 + #include "../filesystems/utils.h" 23 + #include "../pidfd/pidfd.h" 24 + #include "wrappers.h" 25 + 26 + /* 27 + * Test listns() error handling with invalid buffer addresses. 28 + * 29 + * When the buffer pointer is invalid (e.g., crossing page boundaries 30 + * into unmapped memory), listns() returns EFAULT. 31 + * 32 + * This test also creates mount namespaces that get destroyed during 33 + * iteration, testing that namespace cleanup happens outside the RCU 34 + * read lock. 35 + */ 36 + TEST(listns_partial_fault_with_ns_cleanup) 37 + { 38 + void *map; 39 + __u64 *ns_ids; 40 + ssize_t ret; 41 + long page_size; 42 + pid_t pid, iter_pid; 43 + int pidfds[5]; 44 + int sv[5][2]; 45 + int iter_pidfd; 46 + int i, status; 47 + char c; 48 + 49 + page_size = sysconf(_SC_PAGESIZE); 50 + ASSERT_GT(page_size, 0); 51 + 52 + /* 53 + * Map two pages: 54 + * - First page: readable and writable 55 + * - Second page: will be unmapped to trigger EFAULT 56 + */ 57 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 58 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 59 + ASSERT_NE(map, MAP_FAILED); 60 + 61 + /* Unmap the second page */ 62 + ret = munmap((char *)map + page_size, page_size); 63 + ASSERT_EQ(ret, 0); 64 + 65 + /* 66 + * Position the buffer pointer so there's room for exactly one u64 67 + * before the page boundary. The second u64 would fall into the 68 + * unmapped page.
69 + */ 70 + ns_ids = ((__u64 *)((char *)map + page_size)) - 1; 71 + 72 + /* 73 + * Create a separate process to run listns() in a loop concurrently 74 + * with namespace creation and destruction. 75 + */ 76 + iter_pid = create_child(&iter_pidfd, 0); 77 + ASSERT_NE(iter_pid, -1); 78 + 79 + if (iter_pid == 0) { 80 + struct ns_id_req req = { 81 + .size = sizeof(req), 82 + .spare = 0, 83 + .ns_id = 0, 84 + .ns_type = 0, /* All types */ 85 + .spare2 = 0, 86 + .user_ns_id = 0, /* Global listing */ 87 + }; 88 + int iter_ret; 89 + 90 + /* 91 + * Loop calling listns() until killed. 92 + * The kernel should: 93 + * 1. Successfully write the first namespace ID (within valid page) 94 + * 2. Fail with EFAULT when trying to write the second ID (unmapped page) 95 + * 3. Handle concurrent namespace destruction without deadlock 96 + */ 97 + while (1) { 98 + iter_ret = sys_listns(&req, ns_ids, 2, 0); 99 + 100 + if (iter_ret == -1 && errno == ENOSYS) 101 + _exit(PIDFD_SKIP); 102 + } 103 + } 104 + 105 + /* Small delay to let iterator start looping */ 106 + usleep(50000); 107 + 108 + /* 109 + * Create several child processes, each in its own mount namespace. 110 + * These will be destroyed while the iterator is running listns(). 
111 + */ 112 + for (i = 0; i < 5; i++) { 113 + /* Create socketpair for synchronization */ 114 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 115 + 116 + pid = create_child(&pidfds[i], CLONE_NEWNS); 117 + ASSERT_NE(pid, -1); 118 + 119 + if (pid == 0) { 120 + close(sv[i][0]); /* Close parent end */ 121 + 122 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 123 + _exit(1); 124 + 125 + /* Child: create a couple of tmpfs mounts */ 126 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 127 + _exit(1); 128 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 129 + _exit(1); 130 + 131 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 132 + _exit(1); 133 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 134 + _exit(1); 135 + 136 + /* Signal parent that setup is complete */ 137 + if (write_nointr(sv[i][1], "R", 1) != 1) 138 + _exit(1); 139 + 140 + /* Wait for parent to signal us to exit */ 141 + if (read_nointr(sv[i][1], &c, 1) != 1) 142 + _exit(1); 143 + 144 + close(sv[i][1]); 145 + _exit(0); 146 + } 147 + 148 + close(sv[i][1]); /* Close child end */ 149 + } 150 + 151 + /* Wait for all children to finish setup */ 152 + for (i = 0; i < 5; i++) { 153 + ret = read_nointr(sv[i][0], &c, 1); 154 + ASSERT_EQ(ret, 1); 155 + ASSERT_EQ(c, 'R'); 156 + } 157 + 158 + /* 159 + * Signal children to exit. This will destroy their mount namespaces 160 + * while listns() is iterating the namespace tree. 161 + * This tests that cleanup happens outside the RCU read lock. 
162 + */ 163 + for (i = 0; i < 5; i++) 164 + write_nointr(sv[i][0], "X", 1); 165 + 166 + /* Wait for all mount namespace children to exit and cleanup */ 167 + for (i = 0; i < 5; i++) { 168 + waitpid(-1, NULL, 0); 169 + close(sv[i][0]); 170 + close(pidfds[i]); 171 + } 172 + 173 + /* Kill iterator and wait for it */ 174 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 175 + ret = waitpid(iter_pid, &status, 0); 176 + ASSERT_EQ(ret, iter_pid); 177 + close(iter_pidfd); 178 + 179 + /* Should have been killed */ 180 + ASSERT_TRUE(WIFSIGNALED(status)); 181 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 182 + 183 + /* Clean up */ 184 + munmap(map, page_size); 185 + } 186 + 187 + /* 188 + * Test listns() error handling when the entire buffer is invalid. 189 + * This is a sanity check that basic invalid pointer detection works. 190 + */ 191 + TEST(listns_complete_fault) 192 + { 193 + struct ns_id_req req = { 194 + .size = sizeof(req), 195 + .spare = 0, 196 + .ns_id = 0, 197 + .ns_type = 0, 198 + .spare2 = 0, 199 + .user_ns_id = 0, 200 + }; 201 + __u64 *ns_ids; 202 + ssize_t ret; 203 + 204 + /* Use a clearly invalid pointer */ 205 + ns_ids = (__u64 *)0xdeadbeef; 206 + 207 + ret = sys_listns(&req, ns_ids, 10, 0); 208 + 209 + if (ret == -1 && errno == ENOSYS) 210 + SKIP(return, "listns() not supported"); 211 + 212 + /* Should fail with EFAULT */ 213 + ASSERT_EQ(ret, -1); 214 + ASSERT_EQ(errno, EFAULT); 215 + } 216 + 217 + /* 218 + * Test listns() error handling when the buffer is NULL. 
219 + */ 220 + TEST(listns_null_buffer) 221 + { 222 + struct ns_id_req req = { 223 + .size = sizeof(req), 224 + .spare = 0, 225 + .ns_id = 0, 226 + .ns_type = 0, 227 + .spare2 = 0, 228 + .user_ns_id = 0, 229 + }; 230 + ssize_t ret; 231 + 232 + /* NULL buffer with non-zero count should fail */ 233 + ret = sys_listns(&req, NULL, 10, 0); 234 + 235 + if (ret == -1 && errno == ENOSYS) 236 + SKIP(return, "listns() not supported"); 237 + 238 + /* Should fail with EFAULT */ 239 + ASSERT_EQ(ret, -1); 240 + ASSERT_EQ(errno, EFAULT); 241 + } 242 + 243 + /* 244 + * Test listns() with a buffer that becomes invalid mid-iteration 245 + * (after several successful writes), combined with mount namespace 246 + * destruction to test RCU cleanup logic. 247 + */ 248 + TEST(listns_late_fault_with_ns_cleanup) 249 + { 250 + void *map; 251 + __u64 *ns_ids; 252 + ssize_t ret; 253 + long page_size; 254 + pid_t pid, iter_pid; 255 + int pidfds[10]; 256 + int sv[10][2]; 257 + int iter_pidfd; 258 + int i, status; 259 + char c; 260 + 261 + page_size = sysconf(_SC_PAGESIZE); 262 + ASSERT_GT(page_size, 0); 263 + 264 + /* Map two pages */ 265 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 266 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 267 + ASSERT_NE(map, MAP_FAILED); 268 + 269 + /* Unmap the second page */ 270 + ret = munmap((char *)map + page_size, page_size); 271 + ASSERT_EQ(ret, 0); 272 + 273 + /* 274 + * Position buffer so we can write several u64s successfully 275 + * before hitting the page boundary. 276 + */ 277 + ns_ids = ((__u64 *)((char *)map + page_size)) - 5; 278 + 279 + /* 280 + * Create a separate process to run listns() concurrently. 
281 + */ 282 + iter_pid = create_child(&iter_pidfd, 0); 283 + ASSERT_NE(iter_pid, -1); 284 + 285 + if (iter_pid == 0) { 286 + struct ns_id_req req = { 287 + .size = sizeof(req), 288 + .spare = 0, 289 + .ns_id = 0, 290 + .ns_type = 0, 291 + .spare2 = 0, 292 + .user_ns_id = 0, 293 + }; 294 + int iter_ret; 295 + 296 + /* 297 + * Loop calling listns() until killed. 298 + * Request 10 namespace IDs while namespaces are being destroyed. 299 + * This tests: 300 + * 1. EFAULT handling when buffer becomes invalid 301 + * 2. Namespace cleanup outside RCU read lock during iteration 302 + */ 303 + while (1) { 304 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 305 + 306 + if (iter_ret == -1 && errno == ENOSYS) 307 + _exit(PIDFD_SKIP); 308 + } 309 + } 310 + 311 + /* Small delay to let iterator start looping */ 312 + usleep(50000); 313 + 314 + /* 315 + * Create more children with mount namespaces to increase the 316 + * likelihood that namespace cleanup happens during iteration. 317 + */ 318 + for (i = 0; i < 10; i++) { 319 + /* Create socketpair for synchronization */ 320 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 321 + 322 + pid = create_child(&pidfds[i], CLONE_NEWNS); 323 + ASSERT_NE(pid, -1); 324 + 325 + if (pid == 0) { 326 + close(sv[i][0]); /* Close parent end */ 327 + 328 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 329 + _exit(1); 330 + 331 + /* Child: create tmpfs mounts */ 332 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 333 + _exit(1); 334 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 335 + _exit(1); 336 + 337 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 338 + _exit(1); 339 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 340 + _exit(1); 341 + 342 + /* Signal parent that setup is complete */ 343 + if (write_nointr(sv[i][1], "R", 1) != 1) 344 + _exit(1); 345 + 346 + /* Wait for parent to signal us to exit */ 347 + if (read_nointr(sv[i][1], &c, 1) != 1) 348 + _exit(1); 349 + 350 + 
close(sv[i][1]); 351 + _exit(0); 352 + } 353 + 354 + close(sv[i][1]); /* Close child end */ 355 + } 356 + 357 + /* Wait for all children to finish setup */ 358 + for (i = 0; i < 10; i++) { 359 + ret = read_nointr(sv[i][0], &c, 1); 360 + ASSERT_EQ(ret, 1); 361 + ASSERT_EQ(c, 'R'); 362 + } 363 + 364 + /* Signal the first half of the children to exit */ 365 + for (i = 0; i < 5; i++) 366 + write_nointr(sv[i][0], "X", 1); 367 + 368 + /* Small delay to let some exit */ 369 + usleep(10000); 370 + 371 + /* Signal the remaining children to exit */ 372 + for (i = 5; i < 10; i++) 373 + write_nointr(sv[i][0], "X", 1); 374 + 375 + /* Wait for all children and cleanup */ 376 + for (i = 0; i < 10; i++) { 377 + waitpid(-1, NULL, 0); 378 + close(sv[i][0]); 379 + close(pidfds[i]); 380 + } 381 + 382 + /* Kill iterator and wait for it */ 383 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 384 + ret = waitpid(iter_pid, &status, 0); 385 + ASSERT_EQ(ret, iter_pid); 386 + close(iter_pidfd); 387 + 388 + /* Should have been killed */ 389 + ASSERT_TRUE(WIFSIGNALED(status)); 390 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 391 + 392 + /* Clean up */ 393 + munmap(map, page_size); 394 + } 395 + 396 + /* 397 + * Test specifically focused on mount namespace cleanup during EFAULT. 398 + * Filter for mount namespaces only.
399 + */ 400 + TEST(listns_mnt_ns_cleanup_on_fault) 401 + { 402 + void *map; 403 + __u64 *ns_ids; 404 + ssize_t ret; 405 + long page_size; 406 + pid_t pid, iter_pid; 407 + int pidfds[8]; 408 + int sv[8][2]; 409 + int iter_pidfd; 410 + int i, status; 411 + char c; 412 + 413 + page_size = sysconf(_SC_PAGESIZE); 414 + ASSERT_GT(page_size, 0); 415 + 416 + /* Set up partial fault buffer */ 417 + map = mmap(NULL, page_size * 2, PROT_READ | PROT_WRITE, 418 + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); 419 + ASSERT_NE(map, MAP_FAILED); 420 + 421 + ret = munmap((char *)map + page_size, page_size); 422 + ASSERT_EQ(ret, 0); 423 + 424 + /* Position for 3 successful writes, then fault */ 425 + ns_ids = ((__u64 *)((char *)map + page_size)) - 3; 426 + 427 + /* 428 + * Create a separate process to run listns() concurrently. 429 + */ 430 + iter_pid = create_child(&iter_pidfd, 0); 431 + ASSERT_NE(iter_pid, -1); 432 + 433 + if (iter_pid == 0) { 434 + struct ns_id_req req = { 435 + .size = sizeof(req), 436 + .spare = 0, 437 + .ns_id = 0, 438 + .ns_type = CLONE_NEWNS, /* Only mount namespaces */ 439 + .spare2 = 0, 440 + .user_ns_id = 0, 441 + }; 442 + int iter_ret; 443 + 444 + /* 445 + * Loop calling listns() until killed. 446 + * Call listns() to race with namespace destruction. 
447 + */ 448 + while (1) { 449 + iter_ret = sys_listns(&req, ns_ids, 10, 0); 450 + 451 + if (iter_ret == -1 && errno == ENOSYS) 452 + _exit(PIDFD_SKIP); 453 + } 454 + } 455 + 456 + /* Small delay to let iterator start looping */ 457 + usleep(50000); 458 + 459 + /* Create children with mount namespaces */ 460 + for (i = 0; i < 8; i++) { 461 + /* Create socketpair for synchronization */ 462 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv[i]), 0); 463 + 464 + pid = create_child(&pidfds[i], CLONE_NEWNS); 465 + ASSERT_NE(pid, -1); 466 + 467 + if (pid == 0) { 468 + close(sv[i][0]); /* Close parent end */ 469 + 470 + if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0)) 471 + _exit(1); 472 + 473 + /* Do some mount operations to make cleanup more interesting */ 474 + if (mkdir("/tmp/test_mnt1", 0755) == -1 && errno != EEXIST) 475 + _exit(1); 476 + if (mkdir("/tmp/test_mnt2", 0755) == -1 && errno != EEXIST) 477 + _exit(1); 478 + 479 + if (mount("tmpfs", "/tmp/test_mnt1", "tmpfs", 0, NULL) == -1) 480 + _exit(1); 481 + if (mount("tmpfs", "/tmp/test_mnt2", "tmpfs", 0, NULL) == -1) 482 + _exit(1); 483 + 484 + /* Signal parent that setup is complete */ 485 + if (write_nointr(sv[i][1], "R", 1) != 1) 486 + _exit(1); 487 + 488 + /* Wait for parent to signal us to exit */ 489 + if (read_nointr(sv[i][1], &c, 1) != 1) 490 + _exit(1); 491 + 492 + close(sv[i][1]); 493 + _exit(0); 494 + } 495 + 496 + close(sv[i][1]); /* Close child end */ 497 + } 498 + 499 + /* Wait for all children to finish setup */ 500 + for (i = 0; i < 8; i++) { 501 + ret = read_nointr(sv[i][0], &c, 1); 502 + ASSERT_EQ(ret, 1); 503 + ASSERT_EQ(c, 'R'); 504 + } 505 + 506 + /* Kill children to trigger namespace destruction during iteration */ 507 + for (i = 0; i < 8; i++) 508 + write_nointr(sv[i][0], "X", 1); 509 + 510 + /* Wait for children and cleanup */ 511 + for (i = 0; i < 8; i++) { 512 + waitpid(-1, NULL, 0); 513 + close(sv[i][0]); 514 + close(pidfds[i]); 515 + } 516 + 517 + /* Kill iterator and wait for it */ 
518 + sys_pidfd_send_signal(iter_pidfd, SIGKILL, NULL, 0); 519 + ret = waitpid(iter_pid, &status, 0); 520 + ASSERT_EQ(ret, iter_pid); 521 + close(iter_pidfd); 522 + 523 + /* Should have been killed */ 524 + ASSERT_TRUE(WIFSIGNALED(status)); 525 + ASSERT_EQ(WTERMSIG(status), SIGKILL); 526 + 527 + munmap(map, page_size); 528 + } 529 + 530 + TEST_HARNESS_MAIN

+138

tools/testing/selftests/namespaces/listns_pagination_bug.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <sched.h> 5 + #include <stdio.h> 6 + #include <stdlib.h> 7 + #include <sys/socket.h> 8 + #include <sys/wait.h> 9 + #include <unistd.h> 10 + #include "../kselftest_harness.h" 11 + #include "../filesystems/utils.h" 12 + #include "wrappers.h" 13 + 14 + /* 15 + * Minimal test case to reproduce KASAN out-of-bounds in listns pagination. 16 + * 17 + * The bug occurs when: 18 + * 1. Filtering by a specific namespace type (e.g., CLONE_NEWUSER) 19 + * 2. Using pagination (req.ns_id != 0) 20 + * 3. The lookup_ns_id_at() call in do_listns() passes ns_type=0 instead of 21 + * the filtered type, causing it to search the unified tree and potentially 22 + * return a namespace of the wrong type. 23 + */ 24 + TEST(pagination_with_type_filter) 25 + { 26 + struct ns_id_req req = { 27 + .size = sizeof(req), 28 + .spare = 0, 29 + .ns_id = 0, 30 + .ns_type = CLONE_NEWUSER, /* Filter by user namespace */ 31 + .spare2 = 0, 32 + .user_ns_id = 0, 33 + }; 34 + pid_t pids[10]; 35 + int num_children = 10; 36 + int i; 37 + int sv[2]; 38 + __u64 first_batch[3]; 39 + ssize_t ret; 40 + 41 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 42 + 43 + /* Create children with user namespaces */ 44 + for (i = 0; i < num_children; i++) { 45 + pids[i] = fork(); 46 + ASSERT_GE(pids[i], 0); 47 + 48 + if (pids[i] == 0) { 49 + char c; 50 + close(sv[0]); 51 + 52 + if (setup_userns() < 0) { 53 + close(sv[1]); 54 + exit(1); 55 + } 56 + 57 + /* Signal parent we're ready */ 58 + if (write(sv[1], &c, 1) != 1) { 59 + close(sv[1]); 60 + exit(1); 61 + } 62 + 63 + /* Wait for parent signal to exit */ 64 + if (read(sv[1], &c, 1) != 1) { 65 + close(sv[1]); 66 + exit(1); 67 + } 68 + 69 + close(sv[1]); 70 + exit(0); 71 + } 72 + } 73 + 74 + close(sv[1]); 75 + 76 + /* Wait for all children to signal ready */ 77 + for (i = 0; i < num_children; i++) { 78 + char c; 79 + if (read(sv[0], &c, 1) != 1) { 80 + close(sv[0]); 
81 + for (int j = 0; j < num_children; j++) 82 + kill(pids[j], SIGKILL); 83 + for (int j = 0; j < num_children; j++) 84 + waitpid(pids[j], NULL, 0); 85 + ASSERT_TRUE(false); 86 + } 87 + } 88 + 89 + /* First batch - this should work */ 90 + ret = sys_listns(&req, first_batch, 3, 0); 91 + if (ret < 0) { 92 + if (errno == ENOSYS) { 93 + close(sv[0]); 94 + for (i = 0; i < num_children; i++) 95 + kill(pids[i], SIGKILL); 96 + for (i = 0; i < num_children; i++) 97 + waitpid(pids[i], NULL, 0); 98 + SKIP(return, "listns() not supported"); 99 + } 100 + ASSERT_GE(ret, 0); 101 + } 102 + 103 + TH_LOG("First batch returned %zd entries", ret); 104 + 105 + if (ret == 3) { 106 + __u64 second_batch[3]; 107 + 108 + /* Second batch - pagination triggers the bug */ 109 + req.ns_id = first_batch[2]; /* Continue from last ID */ 110 + ret = sys_listns(&req, second_batch, 3, 0); 111 + 112 + TH_LOG("Second batch returned %zd entries", ret); 113 + ASSERT_GE(ret, 0); 114 + } 115 + 116 + /* Signal all children to exit */ 117 + for (i = 0; i < num_children; i++) { 118 + char c = 'X'; 119 + if (write(sv[0], &c, 1) != 1) { 120 + close(sv[0]); 121 + for (int j = i; j < num_children; j++) 122 + kill(pids[j], SIGKILL); 123 + for (int j = 0; j < num_children; j++) 124 + waitpid(pids[j], NULL, 0); 125 + ASSERT_TRUE(false); 126 + } 127 + } 128 + 129 + close(sv[0]); 130 + 131 + /* Cleanup */ 132 + for (i = 0; i < num_children; i++) { 133 + int status; 134 + waitpid(pids[i], &status, 0); 135 + } 136 + } 137 + 138 + TEST_HARNESS_MAIN

+759

tools/testing/selftests/namespaces/listns_permissions_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <linux/nsfs.h> 11 + #include <sys/capability.h> 12 + #include <sys/ioctl.h> 13 + #include <sys/prctl.h> 14 + #include <sys/stat.h> 15 + #include <sys/syscall.h> 16 + #include <sys/types.h> 17 + #include <sys/wait.h> 18 + #include <unistd.h> 19 + #include "../kselftest_harness.h" 20 + #include "../filesystems/utils.h" 21 + #include "wrappers.h" 22 + 23 + /* 24 + * Test that unprivileged users can only see namespaces they're currently in. 25 + * Create a namespace, drop privileges, verify we can only see our own namespaces. 26 + */ 27 + TEST(listns_unprivileged_current_only) 28 + { 29 + struct ns_id_req req = { 30 + .size = sizeof(req), 31 + .spare = 0, 32 + .ns_id = 0, 33 + .ns_type = CLONE_NEWNET, 34 + .spare2 = 0, 35 + .user_ns_id = 0, 36 + }; 37 + __u64 ns_ids[100]; 38 + ssize_t ret; 39 + int pipefd[2]; 40 + pid_t pid; 41 + int status; 42 + bool found_ours; 43 + int unexpected_count; 44 + 45 + ASSERT_EQ(pipe(pipefd), 0); 46 + 47 + pid = fork(); 48 + ASSERT_GE(pid, 0); 49 + 50 + if (pid == 0) { 51 + int fd; 52 + __u64 our_netns_id; 53 + bool found_ours; 54 + int unexpected_count; 55 + 56 + close(pipefd[0]); 57 + 58 + /* Create user namespace to be unprivileged */ 59 + if (setup_userns() < 0) { 60 + close(pipefd[1]); 61 + exit(1); 62 + } 63 + 64 + /* Create a network namespace */ 65 + if (unshare(CLONE_NEWNET) < 0) { 66 + close(pipefd[1]); 67 + exit(1); 68 + } 69 + 70 + /* Get our network namespace ID */ 71 + fd = open("/proc/self/ns/net", O_RDONLY); 72 + if (fd < 0) { 73 + close(pipefd[1]); 74 + exit(1); 75 + } 76 + 77 + if (ioctl(fd, NS_GET_ID, &our_netns_id) < 0) { 78 + close(fd); 79 + close(pipefd[1]); 80 + exit(1); 81 + } 82 + close(fd); 83 + 84 + /* Now we're unprivileged - list all network namespaces */ 85 + ret = 
sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 86 + if (ret < 0) { 87 + close(pipefd[1]); 88 + exit(1); 89 + } 90 + 91 + /* We should only see our own network namespace */ 92 + found_ours = false; 93 + unexpected_count = 0; 94 + 95 + for (ssize_t i = 0; i < ret; i++) { 96 + if (ns_ids[i] == our_netns_id) { 97 + found_ours = true; 98 + } else { 99 + /* This is either init_net (which we can see) or unexpected */ 100 + unexpected_count++; 101 + } 102 + } 103 + 104 + /* Send results to parent */ 105 + write(pipefd[1], &found_ours, sizeof(found_ours)); 106 + write(pipefd[1], &unexpected_count, sizeof(unexpected_count)); 107 + close(pipefd[1]); 108 + exit(0); 109 + } 110 + 111 + /* Parent */ 112 + close(pipefd[1]); 113 + 114 + found_ours = false; 115 + unexpected_count = 0; 116 + read(pipefd[0], &found_ours, sizeof(found_ours)); 117 + read(pipefd[0], &unexpected_count, sizeof(unexpected_count)); 118 + close(pipefd[0]); 119 + 120 + waitpid(pid, &status, 0); 121 + ASSERT_TRUE(WIFEXITED(status)); 122 + ASSERT_EQ(WEXITSTATUS(status), 0); 123 + 124 + /* Child should have seen its own namespace */ 125 + ASSERT_TRUE(found_ours); 126 + 127 + TH_LOG("Unprivileged child saw its own namespace, plus %d others (likely init_net)", 128 + unexpected_count); 129 + } 130 + 131 + /* 132 + * Test that users with CAP_SYS_ADMIN in a user namespace can see 133 + * all namespaces owned by that user namespace. 
134 + */ 135 + TEST(listns_cap_sys_admin_in_userns) 136 + { 137 + struct ns_id_req req = { 138 + .size = sizeof(req), 139 + .spare = 0, 140 + .ns_id = 0, 141 + .ns_type = 0, /* All types */ 142 + .spare2 = 0, 143 + .user_ns_id = 0, /* Will be set to our created user namespace */ 144 + }; 145 + __u64 ns_ids[100]; 146 + int pipefd[2]; 147 + pid_t pid; 148 + int status; 149 + bool success; 150 + ssize_t count; 151 + 152 + ASSERT_EQ(pipe(pipefd), 0); 153 + 154 + pid = fork(); 155 + ASSERT_GE(pid, 0); 156 + 157 + if (pid == 0) { 158 + int fd; 159 + __u64 userns_id; 160 + ssize_t ret; 161 + int min_expected; 162 + bool success; 163 + 164 + close(pipefd[0]); 165 + 166 + /* Create user namespace - we'll have CAP_SYS_ADMIN in it */ 167 + if (setup_userns() < 0) { 168 + close(pipefd[1]); 169 + exit(1); 170 + } 171 + 172 + /* Get the user namespace ID */ 173 + fd = open("/proc/self/ns/user", O_RDONLY); 174 + if (fd < 0) { 175 + close(pipefd[1]); 176 + exit(1); 177 + } 178 + 179 + if (ioctl(fd, NS_GET_ID, &userns_id) < 0) { 180 + close(fd); 181 + close(pipefd[1]); 182 + exit(1); 183 + } 184 + close(fd); 185 + 186 + /* Create several namespaces owned by this user namespace */ 187 + unshare(CLONE_NEWNET); 188 + unshare(CLONE_NEWUTS); 189 + unshare(CLONE_NEWIPC); 190 + 191 + /* List namespaces owned by our user namespace */ 192 + req.user_ns_id = userns_id; 193 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 194 + if (ret < 0) { 195 + close(pipefd[1]); 196 + exit(1); 197 + } 198 + 199 + /* 200 + * We have CAP_SYS_ADMIN in this user namespace, 201 + * so we should see all namespaces owned by it. 202 + * That includes: net, uts, ipc, and the user namespace itself. 
203 + */ 204 + min_expected = 4; 205 + success = (ret >= min_expected); 206 + 207 + write(pipefd[1], &success, sizeof(success)); 208 + write(pipefd[1], &ret, sizeof(ret)); 209 + close(pipefd[1]); 210 + exit(0); 211 + } 212 + 213 + /* Parent */ 214 + close(pipefd[1]); 215 + 216 + success = false; 217 + count = 0; 218 + read(pipefd[0], &success, sizeof(success)); 219 + read(pipefd[0], &count, sizeof(count)); 220 + close(pipefd[0]); 221 + 222 + waitpid(pid, &status, 0); 223 + ASSERT_TRUE(WIFEXITED(status)); 224 + ASSERT_EQ(WEXITSTATUS(status), 0); 225 + 226 + ASSERT_TRUE(success); 227 + TH_LOG("User with CAP_SYS_ADMIN saw %zd namespaces owned by their user namespace", 228 + count); 229 + } 230 + 231 + /* 232 + * Test that users cannot see namespaces from unrelated user namespaces. 233 + * Create two sibling user namespaces, verify they can't see each other's 234 + * owned namespaces. 235 + */ 236 + TEST(listns_cannot_see_sibling_userns_namespaces) 237 + { 238 + int pipefd[2]; 239 + pid_t pid1, pid2; 240 + int status; 241 + __u64 netns_a_id; 242 + int pipefd2[2]; 243 + bool found_sibling_netns; 244 + 245 + ASSERT_EQ(pipe(pipefd), 0); 246 + 247 + /* Fork first child - creates user namespace A */ 248 + pid1 = fork(); 249 + ASSERT_GE(pid1, 0); 250 + 251 + if (pid1 == 0) { 252 + int fd; 253 + __u64 netns_a_id; 254 + char buf; 255 + 256 + close(pipefd[0]); 257 + 258 + /* Create user namespace A */ 259 + if (setup_userns() < 0) { 260 + close(pipefd[1]); 261 + exit(1); 262 + } 263 + 264 + /* Create network namespace owned by user namespace A */ 265 + if (unshare(CLONE_NEWNET) < 0) { 266 + close(pipefd[1]); 267 + exit(1); 268 + } 269 + 270 + /* Get network namespace ID */ 271 + fd = open("/proc/self/ns/net", O_RDONLY); 272 + if (fd < 0) { 273 + close(pipefd[1]); 274 + exit(1); 275 + } 276 + 277 + if (ioctl(fd, NS_GET_ID, &netns_a_id) < 0) { 278 + close(fd); 279 + close(pipefd[1]); 280 + exit(1); 281 + } 282 + close(fd); 283 + 284 + /* Send namespace ID to parent */ 285 + 
write(pipefd[1], &netns_a_id, sizeof(netns_a_id)); 286 + 287 + /* Keep alive for sibling to check */ 288 + read(pipefd[1], &buf, 1); 289 + close(pipefd[1]); 290 + exit(0); 291 + } 292 + 293 + /* Parent reads namespace A ID */ 294 + close(pipefd[1]); 295 + netns_a_id = 0; 296 + read(pipefd[0], &netns_a_id, sizeof(netns_a_id)); 297 + 298 + TH_LOG("User namespace A created network namespace with ID %llu", 299 + (unsigned long long)netns_a_id); 300 + 301 + /* Fork second child - creates user namespace B */ 302 + ASSERT_EQ(pipe(pipefd2), 0); 303 + 304 + pid2 = fork(); 305 + ASSERT_GE(pid2, 0); 306 + 307 + if (pid2 == 0) { 308 + struct ns_id_req req = { 309 + .size = sizeof(req), 310 + .spare = 0, 311 + .ns_id = 0, 312 + .ns_type = CLONE_NEWNET, 313 + .spare2 = 0, 314 + .user_ns_id = 0, 315 + }; 316 + __u64 ns_ids[100]; 317 + ssize_t ret; 318 + bool found_sibling_netns; 319 + 320 + close(pipefd[0]); 321 + close(pipefd2[0]); 322 + 323 + /* Create user namespace B (sibling to A) */ 324 + if (setup_userns() < 0) { 325 + close(pipefd2[1]); 326 + exit(1); 327 + } 328 + 329 + /* Try to list all network namespaces */ 330 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 331 + 332 + found_sibling_netns = false; 333 + if (ret > 0) { 334 + for (ssize_t i = 0; i < ret; i++) { 335 + if (ns_ids[i] == netns_a_id) { 336 + found_sibling_netns = true; 337 + break; 338 + } 339 + } 340 + } 341 + 342 + /* We should NOT see the sibling's network namespace */ 343 + write(pipefd2[1], &found_sibling_netns, sizeof(found_sibling_netns)); 344 + close(pipefd2[1]); 345 + exit(0); 346 + } 347 + 348 + /* Parent reads result from second child */ 349 + close(pipefd2[1]); 350 + found_sibling_netns = false; 351 + read(pipefd2[0], &found_sibling_netns, sizeof(found_sibling_netns)); 352 + close(pipefd2[0]); 353 + 354 + /* Signal first child to exit */ 355 + close(pipefd[0]); 356 + 357 + /* Wait for both children */ 358 + waitpid(pid2, &status, 0); 359 + ASSERT_TRUE(WIFEXITED(status)); 360 + 361 + 
waitpid(pid1, &status, 0); 362 + ASSERT_TRUE(WIFEXITED(status)); 363 + 364 + /* Second child should NOT have seen first child's namespace */ 365 + ASSERT_FALSE(found_sibling_netns); 366 + TH_LOG("User namespace B correctly could not see sibling namespace A's network namespace"); 367 + } 368 + 369 + /* 370 + * Test permission checking with LISTNS_CURRENT_USER. 371 + * Verify that listing with LISTNS_CURRENT_USER respects permissions. 372 + */ 373 + TEST(listns_current_user_permissions) 374 + { 375 + int pipefd[2]; 376 + pid_t pid; 377 + int status; 378 + bool success; 379 + ssize_t count; 380 + 381 + ASSERT_EQ(pipe(pipefd), 0); 382 + 383 + pid = fork(); 384 + ASSERT_GE(pid, 0); 385 + 386 + if (pid == 0) { 387 + struct ns_id_req req = { 388 + .size = sizeof(req), 389 + .spare = 0, 390 + .ns_id = 0, 391 + .ns_type = 0, 392 + .spare2 = 0, 393 + .user_ns_id = LISTNS_CURRENT_USER, 394 + }; 395 + __u64 ns_ids[100]; 396 + ssize_t ret; 397 + bool success; 398 + 399 + close(pipefd[0]); 400 + 401 + /* Create user namespace */ 402 + if (setup_userns() < 0) { 403 + close(pipefd[1]); 404 + exit(1); 405 + } 406 + 407 + /* Create some namespaces owned by this user namespace */ 408 + if (unshare(CLONE_NEWNET) < 0) { 409 + close(pipefd[1]); 410 + exit(1); 411 + } 412 + 413 + if (unshare(CLONE_NEWUTS) < 0) { 414 + close(pipefd[1]); 415 + exit(1); 416 + } 417 + 418 + /* List with LISTNS_CURRENT_USER - should see our owned namespaces */ 419 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 420 + 421 + success = (ret >= 3); /* At least user, net, uts */ 422 + write(pipefd[1], &success, sizeof(success)); 423 + write(pipefd[1], &ret, sizeof(ret)); 424 + close(pipefd[1]); 425 + exit(0); 426 + } 427 + 428 + /* Parent */ 429 + close(pipefd[1]); 430 + 431 + success = false; 432 + count = 0; 433 + read(pipefd[0], &success, sizeof(success)); 434 + read(pipefd[0], &count, sizeof(count)); 435 + close(pipefd[0]); 436 + 437 + waitpid(pid, &status, 0); 438 + ASSERT_TRUE(WIFEXITED(status)); 
439 + ASSERT_EQ(WEXITSTATUS(status), 0); 440 + 441 + ASSERT_TRUE(success); 442 + TH_LOG("LISTNS_CURRENT_USER returned %zd namespaces", count); 443 + } 444 + 445 + /* 446 + * Test that CAP_SYS_ADMIN in parent user namespace allows seeing 447 + * child user namespace's owned namespaces. 448 + */ 449 + TEST(listns_parent_userns_cap_sys_admin) 450 + { 451 + int pipefd[2]; 452 + pid_t pid; 453 + int status; 454 + bool found_child_userns; 455 + ssize_t count; 456 + 457 + ASSERT_EQ(pipe(pipefd), 0); 458 + 459 + pid = fork(); 460 + ASSERT_GE(pid, 0); 461 + 462 + if (pid == 0) { 463 + int fd; 464 + __u64 parent_userns_id; 465 + __u64 child_userns_id; 466 + struct ns_id_req req; 467 + __u64 ns_ids[100]; 468 + ssize_t ret; 469 + bool found_child_userns; 470 + 471 + close(pipefd[0]); 472 + 473 + /* Create parent user namespace - we have CAP_SYS_ADMIN in it */ 474 + if (setup_userns() < 0) { 475 + close(pipefd[1]); 476 + exit(1); 477 + } 478 + 479 + /* Get parent user namespace ID */ 480 + fd = open("/proc/self/ns/user", O_RDONLY); 481 + if (fd < 0) { 482 + close(pipefd[1]); 483 + exit(1); 484 + } 485 + 486 + if (ioctl(fd, NS_GET_ID, &parent_userns_id) < 0) { 487 + close(fd); 488 + close(pipefd[1]); 489 + exit(1); 490 + } 491 + close(fd); 492 + 493 + /* Create child user namespace */ 494 + if (setup_userns() < 0) { 495 + close(pipefd[1]); 496 + exit(1); 497 + } 498 + 499 + /* Get child user namespace ID */ 500 + fd = open("/proc/self/ns/user", O_RDONLY); 501 + if (fd < 0) { 502 + close(pipefd[1]); 503 + exit(1); 504 + } 505 + 506 + if (ioctl(fd, NS_GET_ID, &child_userns_id) < 0) { 507 + close(fd); 508 + close(pipefd[1]); 509 + exit(1); 510 + } 511 + close(fd); 512 + 513 + /* Create namespaces owned by child user namespace */ 514 + if (unshare(CLONE_NEWNET) < 0) { 515 + close(pipefd[1]); 516 + exit(1); 517 + } 518 + 519 + /* List namespaces owned by parent user namespace */ 520 + req.size = sizeof(req); 521 + req.spare = 0; 522 + req.ns_id = 0; 523 + req.ns_type = 0; 524 + 
req.spare2 = 0; 525 + req.user_ns_id = parent_userns_id; 526 + 527 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 528 + 529 + /* Should see child user namespace in the list */ 530 + found_child_userns = false; 531 + if (ret > 0) { 532 + for (ssize_t i = 0; i < ret; i++) { 533 + if (ns_ids[i] == child_userns_id) { 534 + found_child_userns = true; 535 + break; 536 + } 537 + } 538 + } 539 + 540 + write(pipefd[1], &found_child_userns, sizeof(found_child_userns)); 541 + write(pipefd[1], &ret, sizeof(ret)); 542 + close(pipefd[1]); 543 + exit(0); 544 + } 545 + 546 + /* Parent */ 547 + close(pipefd[1]); 548 + 549 + found_child_userns = false; 550 + count = 0; 551 + read(pipefd[0], &found_child_userns, sizeof(found_child_userns)); 552 + read(pipefd[0], &count, sizeof(count)); 553 + close(pipefd[0]); 554 + 555 + waitpid(pid, &status, 0); 556 + ASSERT_TRUE(WIFEXITED(status)); 557 + ASSERT_EQ(WEXITSTATUS(status), 0); 558 + 559 + ASSERT_TRUE(found_child_userns); 560 + TH_LOG("Process with CAP_SYS_ADMIN in parent user namespace saw child user namespace (total: %zd)", 561 + count); 562 + } 563 + 564 + /* 565 + * Test that we can see user namespaces we have CAP_SYS_ADMIN inside of. 566 + * This is different from seeing namespaces owned by a user namespace. 
567 + */ 568 + TEST(listns_cap_sys_admin_inside_userns) 569 + { 570 + int pipefd[2]; 571 + pid_t pid; 572 + int status; 573 + bool found_ours; 574 + 575 + ASSERT_EQ(pipe(pipefd), 0); 576 + 577 + pid = fork(); 578 + ASSERT_GE(pid, 0); 579 + 580 + if (pid == 0) { 581 + int fd; 582 + __u64 our_userns_id; 583 + struct ns_id_req req; 584 + __u64 ns_ids[100]; 585 + ssize_t ret; 586 + bool found_ours; 587 + 588 + close(pipefd[0]); 589 + 590 + /* Create user namespace - we have CAP_SYS_ADMIN inside it */ 591 + if (setup_userns() < 0) { 592 + close(pipefd[1]); 593 + exit(1); 594 + } 595 + 596 + /* Get our user namespace ID */ 597 + fd = open("/proc/self/ns/user", O_RDONLY); 598 + if (fd < 0) { 599 + close(pipefd[1]); 600 + exit(1); 601 + } 602 + 603 + if (ioctl(fd, NS_GET_ID, &our_userns_id) < 0) { 604 + close(fd); 605 + close(pipefd[1]); 606 + exit(1); 607 + } 608 + close(fd); 609 + 610 + /* List all user namespaces globally */ 611 + req.size = sizeof(req); 612 + req.spare = 0; 613 + req.ns_id = 0; 614 + req.ns_type = CLONE_NEWUSER; 615 + req.spare2 = 0; 616 + req.user_ns_id = 0; 617 + 618 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 619 + 620 + /* We should be able to see our own user namespace */ 621 + found_ours = false; 622 + if (ret > 0) { 623 + for (ssize_t i = 0; i < ret; i++) { 624 + if (ns_ids[i] == our_userns_id) { 625 + found_ours = true; 626 + break; 627 + } 628 + } 629 + } 630 + 631 + write(pipefd[1], &found_ours, sizeof(found_ours)); 632 + close(pipefd[1]); 633 + exit(0); 634 + } 635 + 636 + /* Parent */ 637 + close(pipefd[1]); 638 + 639 + found_ours = false; 640 + read(pipefd[0], &found_ours, sizeof(found_ours)); 641 + close(pipefd[0]); 642 + 643 + waitpid(pid, &status, 0); 644 + ASSERT_TRUE(WIFEXITED(status)); 645 + ASSERT_EQ(WEXITSTATUS(status), 0); 646 + 647 + ASSERT_TRUE(found_ours); 648 + TH_LOG("Process can see user namespace it has CAP_SYS_ADMIN inside of"); 649 + } 650 + 651 + /* 652 + * Test that dropping CAP_SYS_ADMIN restricts what we 
can see. 653 + */ 654 + TEST(listns_drop_cap_sys_admin) 655 + { 656 + cap_t caps; 657 + cap_value_t cap_list[1] = { CAP_SYS_ADMIN }; 658 + 659 + /* This test needs to start with CAP_SYS_ADMIN */ 660 + caps = cap_get_proc(); 661 + if (!caps) { 662 + SKIP(return, "Cannot get capabilities"); 663 + } 664 + 665 + cap_flag_value_t cap_val; 666 + if (cap_get_flag(caps, CAP_SYS_ADMIN, CAP_EFFECTIVE, &cap_val) < 0) { 667 + cap_free(caps); 668 + SKIP(return, "Cannot check CAP_SYS_ADMIN"); 669 + } 670 + 671 + if (cap_val != CAP_SET) { 672 + cap_free(caps); 673 + SKIP(return, "Test needs CAP_SYS_ADMIN to start"); 674 + } 675 + cap_free(caps); 676 + 677 + int pipefd[2]; 678 + pid_t pid; 679 + int status; 680 + bool correct; 681 + ssize_t count_before, count_after; 682 + 683 + ASSERT_EQ(pipe(pipefd), 0); 684 + 685 + pid = fork(); 686 + ASSERT_GE(pid, 0); 687 + 688 + if (pid == 0) { 689 + struct ns_id_req req = { 690 + .size = sizeof(req), 691 + .spare = 0, 692 + .ns_id = 0, 693 + .ns_type = CLONE_NEWNET, 694 + .spare2 = 0, 695 + .user_ns_id = LISTNS_CURRENT_USER, 696 + }; 697 + __u64 ns_ids_before[100]; 698 + ssize_t count_before; 699 + __u64 ns_ids_after[100]; 700 + ssize_t count_after; 701 + bool correct; 702 + 703 + close(pipefd[0]); 704 + 705 + /* Create user namespace */ 706 + if (setup_userns() < 0) { 707 + close(pipefd[1]); 708 + exit(1); 709 + } 710 + 711 + /* Count namespaces with CAP_SYS_ADMIN */ 712 + count_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 713 + 714 + /* Drop CAP_SYS_ADMIN */ 715 + caps = cap_get_proc(); 716 + if (caps) { 717 + cap_set_flag(caps, CAP_EFFECTIVE, 1, cap_list, CAP_CLEAR); 718 + cap_set_flag(caps, CAP_PERMITTED, 1, cap_list, CAP_CLEAR); 719 + cap_set_proc(caps); 720 + cap_free(caps); 721 + } 722 + 723 + /* Ensure we can't regain the capability */ 724 + prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); 725 + 726 + /* Count namespaces without CAP_SYS_ADMIN */ 727 + count_after = sys_listns(&req, ns_ids_after, 
ARRAY_SIZE(ns_ids_after), 0); 728 + 729 + /* Without CAP_SYS_ADMIN, we should see same or fewer namespaces */ 730 + correct = (count_after <= count_before); 731 + 732 + write(pipefd[1], &correct, sizeof(correct)); 733 + write(pipefd[1], &count_before, sizeof(count_before)); 734 + write(pipefd[1], &count_after, sizeof(count_after)); 735 + close(pipefd[1]); 736 + exit(0); 737 + } 738 + 739 + /* Parent */ 740 + close(pipefd[1]); 741 + 742 + correct = false; 743 + count_before = 0; 744 + count_after = 0; 745 + read(pipefd[0], &correct, sizeof(correct)); 746 + read(pipefd[0], &count_before, sizeof(count_before)); 747 + read(pipefd[0], &count_after, sizeof(count_after)); 748 + close(pipefd[0]); 749 + 750 + waitpid(pid, &status, 0); 751 + ASSERT_TRUE(WIFEXITED(status)); 752 + ASSERT_EQ(WEXITSTATUS(status), 0); 753 + 754 + ASSERT_TRUE(correct); 755 + TH_LOG("With CAP_SYS_ADMIN: %zd namespaces, without: %zd namespaces", 756 + count_before, count_after); 757 + } 758 + 759 + TEST_HARNESS_MAIN

+679

tools/testing/selftests/namespaces/listns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <linux/nsfs.h> 11 + #include <sys/ioctl.h> 12 + #include <sys/socket.h> 13 + #include <sys/stat.h> 14 + #include <sys/syscall.h> 15 + #include <sys/types.h> 16 + #include <sys/wait.h> 17 + #include <unistd.h> 18 + #include "../kselftest_harness.h" 19 + #include "../filesystems/utils.h" 20 + #include "wrappers.h" 21 + 22 + /* 23 + * Test basic listns() functionality with the unified namespace tree. 24 + * List all active namespaces globally. 25 + */ 26 + TEST(listns_basic_unified) 27 + { 28 + struct ns_id_req req = { 29 + .size = sizeof(req), 30 + .spare = 0, 31 + .ns_id = 0, 32 + .ns_type = 0, /* All types */ 33 + .spare2 = 0, 34 + .user_ns_id = 0, /* Global listing */ 35 + }; 36 + __u64 ns_ids[100]; 37 + ssize_t ret; 38 + 39 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 40 + if (ret < 0) { 41 + if (errno == ENOSYS) 42 + SKIP(return, "listns() not supported"); 43 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 44 + ASSERT_TRUE(false); 45 + } 46 + 47 + /* Should find at least the initial namespaces */ 48 + ASSERT_GT(ret, 0); 49 + TH_LOG("Found %zd active namespaces", ret); 50 + 51 + /* Verify all returned IDs are non-zero */ 52 + for (ssize_t i = 0; i < ret; i++) { 53 + ASSERT_NE(ns_ids[i], 0); 54 + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); 55 + } 56 + } 57 + 58 + /* 59 + * Test listns() with type filtering. 60 + * List only network namespaces. 
61 + */ 62 + TEST(listns_filter_by_type) 63 + { 64 + struct ns_id_req req = { 65 + .size = sizeof(req), 66 + .spare = 0, 67 + .ns_id = 0, 68 + .ns_type = CLONE_NEWNET, /* Only network namespaces */ 69 + .spare2 = 0, 70 + .user_ns_id = 0, 71 + }; 72 + __u64 ns_ids[100]; 73 + ssize_t ret; 74 + 75 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 76 + if (ret < 0) { 77 + if (errno == ENOSYS) 78 + SKIP(return, "listns() not supported"); 79 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 80 + ASSERT_TRUE(false); 81 + } 82 + ASSERT_GE(ret, 0); 83 + 84 + /* Should find at least init_net */ 85 + ASSERT_GT(ret, 0); 86 + TH_LOG("Found %zd active network namespaces", ret); 87 + 88 + /* Verify we can open each namespace and it's actually a network namespace */ 89 + for (ssize_t i = 0; i < ret && i < 5; i++) { 90 + struct nsfs_file_handle nsfh = { 91 + .ns_id = ns_ids[i], 92 + .ns_type = CLONE_NEWNET, 93 + .ns_inum = 0, 94 + }; 95 + struct file_handle *fh; 96 + int fd; 97 + 98 + fh = (struct file_handle *)malloc(sizeof(*fh) + sizeof(nsfh)); 99 + ASSERT_NE(fh, NULL); 100 + fh->handle_bytes = sizeof(nsfh); 101 + fh->handle_type = 0; 102 + memcpy(fh->f_handle, &nsfh, sizeof(nsfh)); 103 + 104 + fd = open_by_handle_at(-10003, fh, O_RDONLY); 105 + free(fh); 106 + 107 + if (fd >= 0) { 108 + int ns_type; 109 + /* Verify it's a network namespace via ioctl */ 110 + ns_type = ioctl(fd, NS_GET_NSTYPE); 111 + if (ns_type >= 0) { 112 + ASSERT_EQ(ns_type, CLONE_NEWNET); 113 + } 114 + close(fd); 115 + } 116 + } 117 + } 118 + 119 + /* 120 + * Test listns() pagination. 121 + * List namespaces in batches. 
122 + */ 123 + TEST(listns_pagination) 124 + { 125 + struct ns_id_req req = { 126 + .size = sizeof(req), 127 + .spare = 0, 128 + .ns_id = 0, 129 + .ns_type = 0, 130 + .spare2 = 0, 131 + .user_ns_id = 0, 132 + }; 133 + __u64 batch1[2], batch2[2]; 134 + ssize_t ret1, ret2; 135 + 136 + /* Get first batch */ 137 + ret1 = sys_listns(&req, batch1, ARRAY_SIZE(batch1), 0); 138 + if (ret1 < 0) { 139 + if (errno == ENOSYS) 140 + SKIP(return, "listns() not supported"); 141 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 142 + ASSERT_TRUE(false); 143 + } 144 + ASSERT_GE(ret1, 0); 145 + 146 + if (ret1 == 0) 147 + SKIP(return, "No namespaces found"); 148 + 149 + TH_LOG("First batch: %zd namespaces", ret1); 150 + 151 + /* Get second batch using last ID from first batch */ 152 + if (ret1 == ARRAY_SIZE(batch1)) { 153 + req.ns_id = batch1[ret1 - 1]; 154 + ret2 = sys_listns(&req, batch2, ARRAY_SIZE(batch2), 0); 155 + ASSERT_GE(ret2, 0); 156 + 157 + TH_LOG("Second batch: %zd namespaces (after ns_id=%llu)", 158 + ret2, (unsigned long long)req.ns_id); 159 + 160 + /* If we got more results, verify IDs are monotonically increasing */ 161 + if (ret2 > 0) { 162 + ASSERT_GT(batch2[0], batch1[ret1 - 1]); 163 + TH_LOG("Pagination working: %llu > %llu", 164 + (unsigned long long)batch2[0], 165 + (unsigned long long)batch1[ret1 - 1]); 166 + } 167 + } else { 168 + TH_LOG("All namespaces fit in first batch"); 169 + } 170 + } 171 + 172 + /* 173 + * Test listns() with LISTNS_CURRENT_USER. 174 + * List namespaces owned by current user namespace. 
175 + */ 176 + TEST(listns_current_user) 177 + { 178 + struct ns_id_req req = { 179 + .size = sizeof(req), 180 + .spare = 0, 181 + .ns_id = 0, 182 + .ns_type = 0, 183 + .spare2 = 0, 184 + .user_ns_id = LISTNS_CURRENT_USER, 185 + }; 186 + __u64 ns_ids[100]; 187 + ssize_t ret; 188 + 189 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 190 + if (ret < 0) { 191 + if (errno == ENOSYS) 192 + SKIP(return, "listns() not supported"); 193 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 194 + ASSERT_TRUE(false); 195 + } 196 + ASSERT_GE(ret, 0); 197 + 198 + /* Should find at least the initial namespaces if we're in init_user_ns */ 199 + TH_LOG("Found %zd namespaces owned by current user namespace", ret); 200 + 201 + for (ssize_t i = 0; i < ret; i++) 202 + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); 203 + } 204 + 205 + /* 206 + * Test that listns() only returns active namespaces. 207 + * Create a namespace, let it become inactive, verify it's not listed. 
208 + */ 209 + TEST(listns_only_active) 210 + { 211 + struct ns_id_req req = { 212 + .size = sizeof(req), 213 + .spare = 0, 214 + .ns_id = 0, 215 + .ns_type = CLONE_NEWNET, 216 + .spare2 = 0, 217 + .user_ns_id = 0, 218 + }; 219 + __u64 ns_ids_before[100], ns_ids_after[100]; 220 + ssize_t ret_before, ret_after; 221 + int pipefd[2]; 222 + pid_t pid; 223 + __u64 new_ns_id = 0; 224 + int status; 225 + 226 + /* Get initial list */ 227 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 228 + if (ret_before < 0) { 229 + if (errno == ENOSYS) 230 + SKIP(return, "listns() not supported"); 231 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 232 + ASSERT_TRUE(false); 233 + } 234 + ASSERT_GE(ret_before, 0); 235 + 236 + TH_LOG("Before: %zd active network namespaces", ret_before); 237 + 238 + /* Create a new namespace in a child process and get its ID */ 239 + ASSERT_EQ(pipe(pipefd), 0); 240 + 241 + pid = fork(); 242 + ASSERT_GE(pid, 0); 243 + 244 + if (pid == 0) { 245 + int fd; 246 + __u64 ns_id; 247 + 248 + close(pipefd[0]); 249 + 250 + /* Create new network namespace */ 251 + if (unshare(CLONE_NEWNET) < 0) { 252 + close(pipefd[1]); 253 + exit(1); 254 + } 255 + 256 + /* Get its ID */ 257 + fd = open("/proc/self/ns/net", O_RDONLY); 258 + if (fd < 0) { 259 + close(pipefd[1]); 260 + exit(1); 261 + } 262 + 263 + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { 264 + close(fd); 265 + close(pipefd[1]); 266 + exit(1); 267 + } 268 + close(fd); 269 + 270 + /* Send ID to parent */ 271 + write(pipefd[1], &ns_id, sizeof(ns_id)); 272 + close(pipefd[1]); 273 + 274 + /* Keep namespace active briefly */ 275 + usleep(100000); 276 + exit(0); 277 + } 278 + 279 + /* Parent reads the new namespace ID */ 280 + { 281 + int bytes; 282 + 283 + close(pipefd[1]); 284 + bytes = read(pipefd[0], &new_ns_id, sizeof(new_ns_id)); 285 + close(pipefd[0]); 286 + 287 + if (bytes == sizeof(new_ns_id)) { 288 + __u64 ns_ids_during[100]; 289 + int ret_during; 290 + 291 + 
TH_LOG("Child created namespace with ID %llu", (unsigned long long)new_ns_id); 292 + 293 + /* List namespaces while child is still alive - should see new one */ 294 + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); 295 + ASSERT_GE(ret_during, 0); 296 + TH_LOG("During: %d active network namespaces", ret_during); 297 + 298 + /* Should have more namespaces than before */ 299 + ASSERT_GE(ret_during, ret_before); 300 + } 301 + } 302 + 303 + /* Wait for child to exit */ 304 + waitpid(pid, &status, 0); 305 + 306 + /* Give time for namespace to become inactive */ 307 + usleep(100000); 308 + 309 + /* List namespaces after child exits - should not see new one */ 310 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 311 + ASSERT_GE(ret_after, 0); 312 + TH_LOG("After: %zd active network namespaces", ret_after); 313 + 314 + /* Verify the new namespace ID is not in the after list */ 315 + if (new_ns_id != 0) { 316 + bool found = false; 317 + 318 + for (ssize_t i = 0; i < ret_after; i++) { 319 + if (ns_ids_after[i] == new_ns_id) { 320 + found = true; 321 + break; 322 + } 323 + } 324 + ASSERT_FALSE(found); 325 + } 326 + } 327 + 328 + /* 329 + * Test listns() with specific user namespace ID. 330 + * Create a user namespace and list namespaces it owns. 
331 + */ 332 + TEST(listns_specific_userns) 333 + { 334 + struct ns_id_req req = { 335 + .size = sizeof(req), 336 + .spare = 0, 337 + .ns_id = 0, 338 + .ns_type = 0, 339 + .spare2 = 0, 340 + .user_ns_id = 0, /* Will be filled with created userns ID */ 341 + }; 342 + __u64 ns_ids[100]; 343 + int sv[2]; 344 + pid_t pid; 345 + int status; 346 + __u64 user_ns_id = 0; 347 + int bytes; 348 + ssize_t ret; 349 + 350 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 351 + 352 + pid = fork(); 353 + ASSERT_GE(pid, 0); 354 + 355 + if (pid == 0) { 356 + int fd; 357 + __u64 ns_id; 358 + char buf; 359 + 360 + close(sv[0]); 361 + 362 + /* Create new user namespace */ 363 + if (setup_userns() < 0) { 364 + close(sv[1]); 365 + exit(1); 366 + } 367 + 368 + /* Get user namespace ID */ 369 + fd = open("/proc/self/ns/user", O_RDONLY); 370 + if (fd < 0) { 371 + close(sv[1]); 372 + exit(1); 373 + } 374 + 375 + if (ioctl(fd, NS_GET_ID, &ns_id) < 0) { 376 + close(fd); 377 + close(sv[1]); 378 + exit(1); 379 + } 380 + close(fd); 381 + 382 + /* Send ID to parent */ 383 + if (write(sv[1], &ns_id, sizeof(ns_id)) != sizeof(ns_id)) { 384 + close(sv[1]); 385 + exit(1); 386 + } 387 + 388 + /* Create some namespaces owned by this user namespace */ 389 + unshare(CLONE_NEWNET); 390 + unshare(CLONE_NEWUTS); 391 + 392 + /* Wait for parent signal */ 393 + if (read(sv[1], &buf, 1) != 1) { 394 + close(sv[1]); 395 + exit(1); 396 + } 397 + close(sv[1]); 398 + exit(0); 399 + } 400 + 401 + /* Parent */ 402 + close(sv[1]); 403 + bytes = read(sv[0], &user_ns_id, sizeof(user_ns_id)); 404 + 405 + if (bytes != sizeof(user_ns_id)) { 406 + close(sv[0]); 407 + kill(pid, SIGKILL); 408 + waitpid(pid, NULL, 0); 409 + SKIP(return, "Failed to get user namespace ID from child"); 410 + } 411 + 412 + TH_LOG("Child created user namespace with ID %llu", (unsigned long long)user_ns_id); 413 + 414 + /* List namespaces owned by this user namespace */ 415 + req.user_ns_id = user_ns_id; 416 + ret = sys_listns(&req, ns_ids, 
ARRAY_SIZE(ns_ids), 0); 417 + 418 + if (ret < 0) { 419 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 420 + close(sv[0]); 421 + kill(pid, SIGKILL); 422 + waitpid(pid, NULL, 0); 423 + if (errno == ENOSYS) { 424 + SKIP(return, "listns() not supported"); 425 + } 426 + ASSERT_GE(ret, 0); 427 + } 428 + 429 + TH_LOG("Found %zd namespaces owned by user namespace %llu", ret, 430 + (unsigned long long)user_ns_id); 431 + 432 + /* Should find at least the network and UTS namespaces we created */ 433 + if (ret > 0) { 434 + for (ssize_t i = 0; i < ret && i < 10; i++) 435 + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); 436 + } 437 + 438 + /* Signal child to exit */ 439 + if (write(sv[0], "X", 1) != 1) { 440 + close(sv[0]); 441 + kill(pid, SIGKILL); 442 + waitpid(pid, NULL, 0); 443 + ASSERT_TRUE(false); 444 + } 445 + close(sv[0]); 446 + waitpid(pid, &status, 0); 447 + } 448 + 449 + /* 450 + * Test listns() with multiple namespace types filter. 451 + */ 452 + TEST(listns_multiple_types) 453 + { 454 + struct ns_id_req req = { 455 + .size = sizeof(req), 456 + .spare = 0, 457 + .ns_id = 0, 458 + .ns_type = CLONE_NEWNET | CLONE_NEWUTS, /* Network and UTS */ 459 + .spare2 = 0, 460 + .user_ns_id = 0, 461 + }; 462 + __u64 ns_ids[100]; 463 + ssize_t ret; 464 + 465 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 466 + if (ret < 0) { 467 + if (errno == ENOSYS) 468 + SKIP(return, "listns() not supported"); 469 + TH_LOG("listns failed: %s (errno=%d)", strerror(errno), errno); 470 + ASSERT_TRUE(false); 471 + } 472 + ASSERT_GE(ret, 0); 473 + 474 + TH_LOG("Found %zd active network/UTS namespaces", ret); 475 + 476 + for (ssize_t i = 0; i < ret; i++) 477 + TH_LOG(" [%zd] ns_id: %llu", i, (unsigned long long)ns_ids[i]); 478 + } 479 + 480 + /* 481 + * Test that hierarchical active reference propagation keeps parent 482 + * user namespaces visible in listns(). 
483 + */ 484 + TEST(listns_hierarchical_visibility) 485 + { 486 + struct ns_id_req req = { 487 + .size = sizeof(req), 488 + .spare = 0, 489 + .ns_id = 0, 490 + .ns_type = CLONE_NEWUSER, 491 + .spare2 = 0, 492 + .user_ns_id = 0, 493 + }; 494 + __u64 parent_ns_id = 0, child_ns_id = 0; 495 + int sv[2]; 496 + pid_t pid; 497 + int status; 498 + int bytes; 499 + __u64 ns_ids[100]; 500 + ssize_t ret; 501 + bool found_parent, found_child; 502 + 503 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 504 + 505 + pid = fork(); 506 + ASSERT_GE(pid, 0); 507 + 508 + if (pid == 0) { 509 + int fd; 510 + char buf; 511 + 512 + close(sv[0]); 513 + 514 + /* Create parent user namespace */ 515 + if (setup_userns() < 0) { 516 + close(sv[1]); 517 + exit(1); 518 + } 519 + 520 + fd = open("/proc/self/ns/user", O_RDONLY); 521 + if (fd < 0) { 522 + close(sv[1]); 523 + exit(1); 524 + } 525 + 526 + if (ioctl(fd, NS_GET_ID, &parent_ns_id) < 0) { 527 + close(fd); 528 + close(sv[1]); 529 + exit(1); 530 + } 531 + close(fd); 532 + 533 + /* Create child user namespace */ 534 + if (setup_userns() < 0) { 535 + close(sv[1]); 536 + exit(1); 537 + } 538 + 539 + fd = open("/proc/self/ns/user", O_RDONLY); 540 + if (fd < 0) { 541 + close(sv[1]); 542 + exit(1); 543 + } 544 + 545 + if (ioctl(fd, NS_GET_ID, &child_ns_id) < 0) { 546 + close(fd); 547 + close(sv[1]); 548 + exit(1); 549 + } 550 + close(fd); 551 + 552 + /* Send both IDs to parent */ 553 + if (write(sv[1], &parent_ns_id, sizeof(parent_ns_id)) != sizeof(parent_ns_id)) { 554 + close(sv[1]); 555 + exit(1); 556 + } 557 + if (write(sv[1], &child_ns_id, sizeof(child_ns_id)) != sizeof(child_ns_id)) { 558 + close(sv[1]); 559 + exit(1); 560 + } 561 + 562 + /* Wait for parent signal */ 563 + if (read(sv[1], &buf, 1) != 1) { 564 + close(sv[1]); 565 + exit(1); 566 + } 567 + close(sv[1]); 568 + exit(0); 569 + } 570 + 571 + /* Parent */ 572 + close(sv[1]); 573 + 574 + /* Read both namespace IDs */ 575 + bytes = read(sv[0], &parent_ns_id, 
sizeof(parent_ns_id)); 576 + bytes += read(sv[0], &child_ns_id, sizeof(child_ns_id)); 577 + 578 + if (bytes != (int)(2 * sizeof(__u64))) { 579 + close(sv[0]); 580 + kill(pid, SIGKILL); 581 + waitpid(pid, NULL, 0); 582 + SKIP(return, "Failed to get namespace IDs from child"); 583 + } 584 + 585 + TH_LOG("Parent user namespace ID: %llu", (unsigned long long)parent_ns_id); 586 + TH_LOG("Child user namespace ID: %llu", (unsigned long long)child_ns_id); 587 + 588 + /* List all user namespaces */ 589 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 590 + 591 + if (ret < 0 && errno == ENOSYS) { 592 + close(sv[0]); 593 + kill(pid, SIGKILL); 594 + waitpid(pid, NULL, 0); 595 + SKIP(return, "listns() not supported"); 596 + } 597 + 598 + ASSERT_GE(ret, 0); 599 + TH_LOG("Found %zd active user namespaces", ret); 600 + 601 + /* Both parent and child should be visible (active due to child process) */ 602 + found_parent = false; 603 + found_child = false; 604 + for (ssize_t i = 0; i < ret; i++) { 605 + if (ns_ids[i] == parent_ns_id) 606 + found_parent = true; 607 + if (ns_ids[i] == child_ns_id) 608 + found_child = true; 609 + } 610 + 611 + TH_LOG("Parent namespace %s, child namespace %s", 612 + found_parent ? "found" : "NOT FOUND", 613 + found_child ? "found" : "NOT FOUND"); 614 + 615 + ASSERT_TRUE(found_child); 616 + /* With hierarchical propagation, parent should also be active */ 617 + ASSERT_TRUE(found_parent); 618 + 619 + /* Signal child to exit */ 620 + if (write(sv[0], "X", 1) != 1) { 621 + close(sv[0]); 622 + kill(pid, SIGKILL); 623 + waitpid(pid, NULL, 0); 624 + ASSERT_TRUE(false); 625 + } 626 + close(sv[0]); 627 + waitpid(pid, &status, 0); 628 + } 629 + 630 + /* 631 + * Test error cases for listns(). 
632 + */ 633 + TEST(listns_error_cases) 634 + { 635 + struct ns_id_req req = { 636 + .size = sizeof(req), 637 + .spare = 0, 638 + .ns_id = 0, 639 + .ns_type = 0, 640 + .spare2 = 0, 641 + .user_ns_id = 0, 642 + }; 643 + __u64 ns_ids[10]; 644 + int ret; 645 + 646 + /* Test with invalid flags */ 647 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0xFFFF); 648 + if (errno == ENOSYS) { 649 + /* listns() not supported, skip this check */ 650 + } else { 651 + ASSERT_LT(ret, 0); 652 + ASSERT_EQ(errno, EINVAL); 653 + } 654 + 655 + /* Test with NULL ns_ids array */ 656 + ret = sys_listns(&req, NULL, 10, 0); 657 + ASSERT_LT(ret, 0); 658 + 659 + /* Test with invalid spare field */ 660 + req.spare = 1; 661 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 662 + if (errno == ENOSYS) { 663 + /* listns() not supported, skip this check */ 664 + } else { 665 + ASSERT_LT(ret, 0); 666 + ASSERT_EQ(errno, EINVAL); 667 + } 668 + req.spare = 0; 669 + 670 + /* Test with huge nr_ns_ids */ 671 + ret = sys_listns(&req, ns_ids, 2000000, 0); 672 + if (errno == ENOSYS) { 673 + /* listns() not supported, skip this check */ 674 + } else { 675 + ASSERT_LT(ret, 0); 676 + } 677 + } 678 + 679 + TEST_HARNESS_MAIN

+2672

tools/testing/selftests/namespaces/ns_active_ref_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <linux/nsfs.h> 11 + #include <sys/mount.h> 12 + #include <sys/socket.h> 13 + #include <sys/stat.h> 14 + #include <sys/types.h> 15 + #include <sys/wait.h> 16 + #include <sys/syscall.h> 17 + #include <unistd.h> 18 + #include <pthread.h> 19 + #include "../kselftest_harness.h" 20 + #include "../filesystems/utils.h" 21 + #include "wrappers.h" 22 + 23 + #ifndef FD_NSFS_ROOT 24 + #define FD_NSFS_ROOT -10003 /* Root of the nsfs filesystem */ 25 + #endif 26 + 27 + #ifndef FILEID_NSFS 28 + #define FILEID_NSFS 0xf1 29 + #endif 30 + 31 + /* 32 + * Test that initial namespaces can be reopened via file handle. 33 + * Initial namespaces should have active ref count of 1 from boot. 34 + */ 35 + TEST(init_ns_always_active) 36 + { 37 + struct file_handle *handle; 38 + int mount_id; 39 + int ret; 40 + int fd1, fd2; 41 + struct stat st1, st2; 42 + 43 + handle = malloc(sizeof(*handle) + MAX_HANDLE_SZ); 44 + ASSERT_NE(handle, NULL); 45 + 46 + /* Open initial network namespace */ 47 + fd1 = open("/proc/1/ns/net", O_RDONLY); 48 + ASSERT_GE(fd1, 0); 49 + 50 + /* Get file handle for initial namespace */ 51 + handle->handle_bytes = MAX_HANDLE_SZ; 52 + ret = name_to_handle_at(fd1, "", handle, &mount_id, AT_EMPTY_PATH); 53 + if (ret < 0 && errno == EOPNOTSUPP) { 54 + SKIP(free(handle); close(fd1); 55 + return, "nsfs doesn't support file handles"); 56 + } 57 + ASSERT_EQ(ret, 0); 58 + 59 + /* Close the namespace fd */ 60 + close(fd1); 61 + 62 + /* Try to reopen via file handle - should succeed since init ns is always active */ 63 + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 64 + if (fd2 < 0 && (errno == EINVAL || errno == EOPNOTSUPP)) { 65 + SKIP(free(handle); 66 + return, "open_by_handle_at with FD_NSFS_ROOT not supported"); 67 + } 68 + 
ASSERT_GE(fd2, 0); 69 + 70 + /* Verify we opened the same namespace */ 71 + fd1 = open("/proc/1/ns/net", O_RDONLY); 72 + ASSERT_GE(fd1, 0); 73 + ASSERT_EQ(fstat(fd1, &st1), 0); 74 + ASSERT_EQ(fstat(fd2, &st2), 0); 75 + ASSERT_EQ(st1.st_ino, st2.st_ino); 76 + 77 + close(fd1); 78 + close(fd2); 79 + free(handle); 80 + } 81 + 82 + /* 83 + * Test namespace lifecycle: create a namespace in a child process, 84 + * get a file handle while it's active, then try to reopen after 85 + * the process exits (namespace becomes inactive). 86 + */ 87 + TEST(ns_inactive_after_exit) 88 + { 89 + struct file_handle *handle; 90 + int mount_id; 91 + int ret; 92 + int fd; 93 + int pipefd[2]; 94 + pid_t pid; 95 + int status; 96 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 97 + 98 + /* Create pipe for passing file handle from child */ 99 + ASSERT_EQ(pipe(pipefd), 0); 100 + 101 + pid = fork(); 102 + ASSERT_GE(pid, 0); 103 + 104 + if (pid == 0) { 105 + /* Child process */ 106 + close(pipefd[0]); 107 + 108 + /* Create new network namespace */ 109 + ret = unshare(CLONE_NEWNET); 110 + if (ret < 0) { 111 + close(pipefd[1]); 112 + exit(1); 113 + } 114 + 115 + /* Open our new namespace */ 116 + fd = open("/proc/self/ns/net", O_RDONLY); 117 + if (fd < 0) { 118 + close(pipefd[1]); 119 + exit(1); 120 + } 121 + 122 + /* Get file handle for the namespace */ 123 + handle = (struct file_handle *)buf; 124 + handle->handle_bytes = MAX_HANDLE_SZ; 125 + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); 126 + close(fd); 127 + 128 + if (ret < 0) { 129 + close(pipefd[1]); 130 + exit(1); 131 + } 132 + 133 + /* Send handle to parent */ 134 + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); 135 + close(pipefd[1]); 136 + 137 + /* Exit - namespace should become inactive */ 138 + exit(0); 139 + } 140 + 141 + /* Parent process */ 142 + close(pipefd[1]); 143 + 144 + /* Read file handle from child */ 145 + ret = read(pipefd[0], buf, sizeof(buf)); 146 + close(pipefd[0]); 147 + 148 + /* Wait 
for child to exit */ 149 + waitpid(pid, &status, 0); 150 + ASSERT_TRUE(WIFEXITED(status)); 151 + ASSERT_EQ(WEXITSTATUS(status), 0); 152 + 153 + ASSERT_GT(ret, 0); 154 + handle = (struct file_handle *)buf; 155 + 156 + /* Try to reopen namespace - should fail with ENOENT since it's inactive */ 157 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 158 + ASSERT_LT(fd, 0); 159 + /* Should fail with ENOENT (namespace inactive) or ESTALE */ 160 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 161 + } 162 + 163 + /* 164 + * Test that a namespace remains active while a process is using it, 165 + * even after the creating process exits. 166 + */ 167 + TEST(ns_active_with_multiple_processes) 168 + { 169 + struct file_handle *handle; 170 + int mount_id; 171 + int ret; 172 + int fd; 173 + int pipefd[2]; 174 + int syncpipe[2]; 175 + pid_t pid1, pid2; 176 + int status; 177 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 178 + char sync_byte; 179 + 180 + /* Create pipes for communication */ 181 + ASSERT_EQ(pipe(pipefd), 0); 182 + ASSERT_EQ(pipe(syncpipe), 0); 183 + 184 + pid1 = fork(); 185 + ASSERT_GE(pid1, 0); 186 + 187 + if (pid1 == 0) { 188 + /* First child - creates namespace */ 189 + close(pipefd[0]); 190 + close(syncpipe[1]); 191 + 192 + /* Create new network namespace */ 193 + ret = unshare(CLONE_NEWNET); 194 + if (ret < 0) { 195 + close(pipefd[1]); 196 + close(syncpipe[0]); 197 + exit(1); 198 + } 199 + 200 + /* Open and get handle */ 201 + fd = open("/proc/self/ns/net", O_RDONLY); 202 + if (fd < 0) { 203 + close(pipefd[1]); 204 + close(syncpipe[0]); 205 + exit(1); 206 + } 207 + 208 + handle = (struct file_handle *)buf; 209 + handle->handle_bytes = MAX_HANDLE_SZ; 210 + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); 211 + close(fd); 212 + 213 + if (ret < 0) { 214 + close(pipefd[1]); 215 + close(syncpipe[0]); 216 + exit(1); 217 + } 218 + 219 + /* Send handle to parent */ 220 + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); 221 + 
close(pipefd[1]); 222 + 223 + /* Wait for signal before exiting */ 224 + read(syncpipe[0], &sync_byte, 1); 225 + close(syncpipe[0]); 226 + exit(0); 227 + } 228 + 229 + /* Parent reads handle */ 230 + close(pipefd[1]); 231 + ret = read(pipefd[0], buf, sizeof(buf)); 232 + close(pipefd[0]); 233 + ASSERT_GT(ret, 0); 234 + 235 + handle = (struct file_handle *)buf; 236 + 237 + /* Create second child that will keep namespace active */ 238 + pid2 = fork(); 239 + ASSERT_GE(pid2, 0); 240 + 241 + if (pid2 == 0) { 242 + /* Second child - reopens the namespace */ 243 + close(syncpipe[0]); 244 + close(syncpipe[1]); 245 + 246 + /* Open the namespace via handle */ 247 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 248 + if (fd < 0) { 249 + exit(1); 250 + } 251 + 252 + /* Join the namespace */ 253 + ret = setns(fd, CLONE_NEWNET); 254 + close(fd); 255 + if (ret < 0) { 256 + exit(1); 257 + } 258 + 259 + /* Sleep to keep namespace active */ 260 + sleep(1); 261 + exit(0); 262 + } 263 + 264 + /* Let second child enter the namespace */ 265 + usleep(100000); /* 100ms */ 266 + 267 + /* Signal first child to exit */ 268 + close(syncpipe[0]); 269 + sync_byte = 'X'; 270 + write(syncpipe[1], &sync_byte, 1); 271 + close(syncpipe[1]); 272 + 273 + /* Wait for first child */ 274 + waitpid(pid1, &status, 0); 275 + ASSERT_TRUE(WIFEXITED(status)); 276 + 277 + /* Namespace should still be active because second child is using it */ 278 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 279 + ASSERT_GE(fd, 0); 280 + close(fd); 281 + 282 + /* Wait for second child */ 283 + waitpid(pid2, &status, 0); 284 + ASSERT_TRUE(WIFEXITED(status)); 285 + } 286 + 287 + /* 288 + * Test user namespace active ref tracking via credential lifecycle 289 + */ 290 + TEST(userns_active_ref_lifecycle) 291 + { 292 + struct file_handle *handle; 293 + int mount_id; 294 + int ret; 295 + int fd; 296 + int pipefd[2]; 297 + pid_t pid; 298 + int status; 299 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 300 + 301 + 
ASSERT_EQ(pipe(pipefd), 0); 302 + 303 + pid = fork(); 304 + ASSERT_GE(pid, 0); 305 + 306 + if (pid == 0) { 307 + /* Child process */ 308 + close(pipefd[0]); 309 + 310 + /* Create new user namespace */ 311 + ret = unshare(CLONE_NEWUSER); 312 + if (ret < 0) { 313 + close(pipefd[1]); 314 + exit(1); 315 + } 316 + 317 + /* Set up uid/gid mappings */ 318 + int uid_map_fd = open("/proc/self/uid_map", O_WRONLY); 319 + int gid_map_fd = open("/proc/self/gid_map", O_WRONLY); 320 + int setgroups_fd = open("/proc/self/setgroups", O_WRONLY); 321 + 322 + if (uid_map_fd >= 0 && gid_map_fd >= 0 && setgroups_fd >= 0) { 323 + write(setgroups_fd, "deny", 4); 324 + close(setgroups_fd); 325 + 326 + char mapping[64]; 327 + snprintf(mapping, sizeof(mapping), "0 %d 1", getuid()); 328 + write(uid_map_fd, mapping, strlen(mapping)); 329 + close(uid_map_fd); 330 + 331 + snprintf(mapping, sizeof(mapping), "0 %d 1", getgid()); 332 + write(gid_map_fd, mapping, strlen(mapping)); 333 + close(gid_map_fd); 334 + } 335 + 336 + /* Get file handle */ 337 + fd = open("/proc/self/ns/user", O_RDONLY); 338 + if (fd < 0) { 339 + close(pipefd[1]); 340 + exit(1); 341 + } 342 + 343 + handle = (struct file_handle *)buf; 344 + handle->handle_bytes = MAX_HANDLE_SZ; 345 + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); 346 + close(fd); 347 + 348 + if (ret < 0) { 349 + close(pipefd[1]); 350 + exit(1); 351 + } 352 + 353 + /* Send handle to parent */ 354 + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); 355 + close(pipefd[1]); 356 + exit(0); 357 + } 358 + 359 + /* Parent */ 360 + close(pipefd[1]); 361 + ret = read(pipefd[0], buf, sizeof(buf)); 362 + close(pipefd[0]); 363 + 364 + waitpid(pid, &status, 0); 365 + ASSERT_TRUE(WIFEXITED(status)); 366 + ASSERT_EQ(WEXITSTATUS(status), 0); 367 + 368 + ASSERT_GT(ret, 0); 369 + handle = (struct file_handle *)buf; 370 + 371 + /* Namespace should be inactive after all tasks exit */ 372 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 373 
+ ASSERT_LT(fd, 0); 374 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 375 + } 376 + 377 + /* 378 + * Test PID namespace active ref tracking 379 + */ 380 + TEST(pidns_active_ref_lifecycle) 381 + { 382 + struct file_handle *handle; 383 + int mount_id; 384 + int ret; 385 + int fd; 386 + int pipefd[2]; 387 + pid_t pid; 388 + int status; 389 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 390 + 391 + ASSERT_EQ(pipe(pipefd), 0); 392 + 393 + pid = fork(); 394 + ASSERT_GE(pid, 0); 395 + 396 + if (pid == 0) { 397 + /* Child process */ 398 + close(pipefd[0]); 399 + 400 + /* Create new PID namespace */ 401 + ret = unshare(CLONE_NEWPID); 402 + if (ret < 0) { 403 + close(pipefd[1]); 404 + exit(1); 405 + } 406 + 407 + /* Fork to actually enter the PID namespace */ 408 + pid_t child = fork(); 409 + if (child < 0) { 410 + close(pipefd[1]); 411 + exit(1); 412 + } 413 + 414 + if (child == 0) { 415 + /* Grandchild - in new PID namespace */ 416 + fd = open("/proc/self/ns/pid", O_RDONLY); 417 + if (fd < 0) { 418 + exit(1); 419 + } 420 + 421 + handle = (struct file_handle *)buf; 422 + handle->handle_bytes = MAX_HANDLE_SZ; 423 + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); 424 + close(fd); 425 + 426 + if (ret < 0) { 427 + exit(1); 428 + } 429 + 430 + /* Send handle to grandparent */ 431 + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); 432 + close(pipefd[1]); 433 + exit(0); 434 + } 435 + 436 + /* Wait for grandchild */ 437 + waitpid(child, NULL, 0); 438 + exit(0); 439 + } 440 + 441 + /* Parent */ 442 + close(pipefd[1]); 443 + ret = read(pipefd[0], buf, sizeof(buf)); 444 + close(pipefd[0]); 445 + 446 + waitpid(pid, &status, 0); 447 + ASSERT_TRUE(WIFEXITED(status)); 448 + ASSERT_EQ(WEXITSTATUS(status), 0); 449 + 450 + ASSERT_GT(ret, 0); 451 + handle = (struct file_handle *)buf; 452 + 453 + /* Namespace should be inactive after all processes exit */ 454 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 455 + ASSERT_LT(fd, 0); 456 + 
ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 457 + } 458 + 459 + /* 460 + * Test that an open file descriptor keeps a namespace active. 461 + * Even after the creating process exits, the namespace should remain 462 + * active as long as an fd is held open. 463 + */ 464 + TEST(ns_fd_keeps_active) 465 + { 466 + struct file_handle *handle; 467 + int mount_id; 468 + int ret; 469 + int nsfd; 470 + int pipe_child_ready[2]; 471 + int pipe_parent_ready[2]; 472 + pid_t pid; 473 + int status; 474 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 475 + char sync_byte; 476 + char proc_path[64]; 477 + 478 + ASSERT_EQ(pipe(pipe_child_ready), 0); 479 + ASSERT_EQ(pipe(pipe_parent_ready), 0); 480 + 481 + pid = fork(); 482 + ASSERT_GE(pid, 0); 483 + 484 + if (pid == 0) { 485 + /* Child process */ 486 + close(pipe_child_ready[0]); 487 + close(pipe_parent_ready[1]); 488 + 489 + TH_LOG("Child: creating new network namespace"); 490 + 491 + /* Create new network namespace */ 492 + ret = unshare(CLONE_NEWNET); 493 + if (ret < 0) { 494 + TH_LOG("Child: unshare(CLONE_NEWNET) failed: %s", strerror(errno)); 495 + close(pipe_child_ready[1]); 496 + close(pipe_parent_ready[0]); 497 + exit(1); 498 + } 499 + 500 + TH_LOG("Child: network namespace created successfully"); 501 + 502 + /* Get file handle for the namespace */ 503 + nsfd = open("/proc/self/ns/net", O_RDONLY); 504 + if (nsfd < 0) { 505 + TH_LOG("Child: failed to open /proc/self/ns/net: %s", strerror(errno)); 506 + close(pipe_child_ready[1]); 507 + close(pipe_parent_ready[0]); 508 + exit(1); 509 + } 510 + 511 + TH_LOG("Child: opened namespace fd %d", nsfd); 512 + 513 + handle = (struct file_handle *)buf; 514 + handle->handle_bytes = MAX_HANDLE_SZ; 515 + ret = name_to_handle_at(nsfd, "", handle, &mount_id, AT_EMPTY_PATH); 516 + close(nsfd); 517 + 518 + if (ret < 0) { 519 + TH_LOG("Child: name_to_handle_at failed: %s", strerror(errno)); 520 + close(pipe_child_ready[1]); 521 + close(pipe_parent_ready[0]); 522 + exit(1); 523 + } 524 + 525 + 
TH_LOG("Child: got file handle (bytes=%u)", handle->handle_bytes); 526 + 527 + /* Send file handle to parent */ 528 + ret = write(pipe_child_ready[1], buf, sizeof(*handle) + handle->handle_bytes); 529 + TH_LOG("Child: sent %d bytes of file handle to parent", ret); 530 + close(pipe_child_ready[1]); 531 + 532 + /* Wait for parent to open the fd */ 533 + TH_LOG("Child: waiting for parent to open fd"); 534 + ret = read(pipe_parent_ready[0], &sync_byte, 1); 535 + close(pipe_parent_ready[0]); 536 + 537 + TH_LOG("Child: parent signaled (read %d bytes), exiting now", ret); 538 + /* Exit - namespace should stay active because parent holds fd */ 539 + exit(0); 540 + } 541 + 542 + /* Parent process */ 543 + close(pipe_child_ready[1]); 544 + close(pipe_parent_ready[0]); 545 + 546 + TH_LOG("Parent: reading file handle from child"); 547 + 548 + /* Read file handle from child */ 549 + ret = read(pipe_child_ready[0], buf, sizeof(buf)); 550 + close(pipe_child_ready[0]); 551 + ASSERT_GT(ret, 0); 552 + handle = (struct file_handle *)buf; 553 + 554 + TH_LOG("Parent: received %d bytes, handle size=%u", ret, handle->handle_bytes); 555 + 556 + /* Open the child's namespace while it's still alive */ 557 + snprintf(proc_path, sizeof(proc_path), "/proc/%d/ns/net", pid); 558 + TH_LOG("Parent: opening child's namespace at %s", proc_path); 559 + nsfd = open(proc_path, O_RDONLY); 560 + if (nsfd < 0) { 561 + TH_LOG("Parent: failed to open %s: %s", proc_path, strerror(errno)); 562 + close(pipe_parent_ready[1]); 563 + kill(pid, SIGKILL); 564 + waitpid(pid, NULL, 0); 565 + SKIP(return, "Failed to open child's namespace"); 566 + } 567 + 568 + TH_LOG("Parent: opened child's namespace, got fd %d", nsfd); 569 + 570 + /* Signal child that we have the fd */ 571 + sync_byte = 'G'; 572 + write(pipe_parent_ready[1], &sync_byte, 1); 573 + close(pipe_parent_ready[1]); 574 + TH_LOG("Parent: signaled child that we have the fd"); 575 + 576 + /* Wait for child to exit */ 577 + waitpid(pid, &status, 0); 578 + 
ASSERT_TRUE(WIFEXITED(status)); 579 + ASSERT_EQ(WEXITSTATUS(status), 0); 580 + 581 + TH_LOG("Child exited, parent holds fd %d to namespace", nsfd); 582 + 583 + /* 584 + * Namespace should still be ACTIVE because we hold an fd. 585 + * We should be able to reopen it via file handle. 586 + */ 587 + TH_LOG("Attempting to reopen namespace via file handle (should succeed - fd held)"); 588 + int fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 589 + ASSERT_GE(fd2, 0); 590 + 591 + TH_LOG("Successfully reopened namespace via file handle, got fd %d", fd2); 592 + 593 + /* Verify it's the same namespace */ 594 + struct stat st1, st2; 595 + ASSERT_EQ(fstat(nsfd, &st1), 0); 596 + ASSERT_EQ(fstat(fd2, &st2), 0); 597 + TH_LOG("Namespace inodes: nsfd=%lu, fd2=%lu", st1.st_ino, st2.st_ino); 598 + ASSERT_EQ(st1.st_ino, st2.st_ino); 599 + close(fd2); 600 + 601 + /* Now close the fd - namespace should become inactive */ 602 + TH_LOG("Closing fd %d - namespace should become inactive", nsfd); 603 + close(nsfd); 604 + 605 + /* Now reopening should fail - namespace is inactive */ 606 + TH_LOG("Attempting to reopen namespace via file handle (should fail - inactive)"); 607 + fd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 608 + ASSERT_LT(fd2, 0); 609 + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ 610 + TH_LOG("Reopen failed as expected: %s (errno=%d)", strerror(errno), errno); 611 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 612 + } 613 + 614 + /* 615 + * Test hierarchical active reference propagation. 616 + * When a child namespace is active, its owning user namespace should also 617 + * be active automatically due to hierarchical active reference propagation. 618 + * This ensures parents are always reachable when children are active. 
619 + */ 620 + TEST(ns_parent_always_reachable) 621 + { 622 + struct file_handle *parent_handle, *child_handle; 623 + int ret; 624 + int child_nsfd; 625 + int pipefd[2]; 626 + pid_t pid; 627 + int status; 628 + __u64 parent_id, child_id; 629 + char parent_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; 630 + char child_buf[sizeof(*child_handle) + MAX_HANDLE_SZ]; 631 + 632 + ASSERT_EQ(pipe(pipefd), 0); 633 + 634 + pid = fork(); 635 + ASSERT_GE(pid, 0); 636 + 637 + if (pid == 0) { 638 + /* Child process */ 639 + close(pipefd[0]); 640 + 641 + TH_LOG("Child: creating parent user namespace and setting up mappings"); 642 + 643 + /* Create parent user namespace with mappings */ 644 + ret = setup_userns(); 645 + if (ret < 0) { 646 + TH_LOG("Child: setup_userns() for parent failed: %s", strerror(errno)); 647 + close(pipefd[1]); 648 + exit(1); 649 + } 650 + 651 + TH_LOG("Child: parent user namespace created, now uid=%d gid=%d", getuid(), getgid()); 652 + 653 + /* Get namespace ID for parent user namespace */ 654 + int parent_fd = open("/proc/self/ns/user", O_RDONLY); 655 + if (parent_fd < 0) { 656 + TH_LOG("Child: failed to open parent /proc/self/ns/user: %s", strerror(errno)); 657 + close(pipefd[1]); 658 + exit(1); 659 + } 660 + 661 + TH_LOG("Child: opened parent userns fd %d", parent_fd); 662 + 663 + if (ioctl(parent_fd, NS_GET_ID, &parent_id) < 0) { 664 + TH_LOG("Child: NS_GET_ID for parent failed: %s", strerror(errno)); 665 + close(parent_fd); 666 + close(pipefd[1]); 667 + exit(1); 668 + } 669 + close(parent_fd); 670 + 671 + TH_LOG("Child: got parent namespace ID %llu", (unsigned long long)parent_id); 672 + 673 + /* Create child user namespace within parent */ 674 + TH_LOG("Child: creating nested child user namespace"); 675 + ret = setup_userns(); 676 + if (ret < 0) { 677 + TH_LOG("Child: setup_userns() for child failed: %s", strerror(errno)); 678 + close(pipefd[1]); 679 + exit(1); 680 + } 681 + 682 + TH_LOG("Child: nested child user namespace created, uid=%d gid=%d", 
getuid(), getgid()); 683 + 684 + /* Get namespace ID for child user namespace */ 685 + int child_fd = open("/proc/self/ns/user", O_RDONLY); 686 + if (child_fd < 0) { 687 + TH_LOG("Child: failed to open child /proc/self/ns/user: %s", strerror(errno)); 688 + close(pipefd[1]); 689 + exit(1); 690 + } 691 + 692 + TH_LOG("Child: opened child userns fd %d", child_fd); 693 + 694 + if (ioctl(child_fd, NS_GET_ID, &child_id) < 0) { 695 + TH_LOG("Child: NS_GET_ID for child failed: %s", strerror(errno)); 696 + close(child_fd); 697 + close(pipefd[1]); 698 + exit(1); 699 + } 700 + close(child_fd); 701 + 702 + TH_LOG("Child: got child namespace ID %llu", (unsigned long long)child_id); 703 + 704 + /* Send both namespace IDs to parent */ 705 + TH_LOG("Child: sending both namespace IDs to parent"); 706 + write(pipefd[1], &parent_id, sizeof(parent_id)); 707 + write(pipefd[1], &child_id, sizeof(child_id)); 708 + close(pipefd[1]); 709 + 710 + TH_LOG("Child: exiting - parent userns should become inactive"); 711 + /* Exit - parent user namespace should become inactive */ 712 + exit(0); 713 + } 714 + 715 + /* Parent process */ 716 + close(pipefd[1]); 717 + 718 + TH_LOG("Parent: reading both namespace IDs from child"); 719 + 720 + /* Read both namespace IDs - fixed size, no parsing needed */ 721 + ret = read(pipefd[0], &parent_id, sizeof(parent_id)); 722 + if (ret != sizeof(parent_id)) { 723 + close(pipefd[0]); 724 + waitpid(pid, NULL, 0); 725 + SKIP(return, "Failed to read parent namespace ID from child"); 726 + } 727 + 728 + ret = read(pipefd[0], &child_id, sizeof(child_id)); 729 + close(pipefd[0]); 730 + if (ret != sizeof(child_id)) { 731 + waitpid(pid, NULL, 0); 732 + SKIP(return, "Failed to read child namespace ID from child"); 733 + } 734 + 735 + TH_LOG("Parent: received parent_id=%llu, child_id=%llu", 736 + (unsigned long long)parent_id, (unsigned long long)child_id); 737 + 738 + /* Construct file handles from namespace IDs */ 739 + parent_handle = (struct file_handle *)parent_buf; 
740 + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); 741 + parent_handle->handle_type = FILEID_NSFS; 742 + struct nsfs_file_handle *parent_fh = (struct nsfs_file_handle *)parent_handle->f_handle; 743 + parent_fh->ns_id = parent_id; 744 + parent_fh->ns_type = 0; 745 + parent_fh->ns_inum = 0; 746 + 747 + child_handle = (struct file_handle *)child_buf; 748 + child_handle->handle_bytes = sizeof(struct nsfs_file_handle); 749 + child_handle->handle_type = FILEID_NSFS; 750 + struct nsfs_file_handle *child_fh = (struct nsfs_file_handle *)child_handle->f_handle; 751 + child_fh->ns_id = child_id; 752 + child_fh->ns_type = 0; 753 + child_fh->ns_inum = 0; 754 + 755 + TH_LOG("Parent: opening child namespace BEFORE child exits"); 756 + 757 + /* Open child namespace while child is still alive to keep it active */ 758 + child_nsfd = open_by_handle_at(FD_NSFS_ROOT, child_handle, O_RDONLY); 759 + if (child_nsfd < 0) { 760 + TH_LOG("Failed to open child namespace: %s (errno=%d)", strerror(errno), errno); 761 + waitpid(pid, NULL, 0); 762 + SKIP(return, "Failed to open child namespace"); 763 + } 764 + 765 + TH_LOG("Opened child namespace fd %d", child_nsfd); 766 + 767 + /* Now wait for child to exit */ 768 + TH_LOG("Parent: waiting for child to exit"); 769 + waitpid(pid, &status, 0); 770 + ASSERT_TRUE(WIFEXITED(status)); 771 + ASSERT_EQ(WEXITSTATUS(status), 0); 772 + 773 + TH_LOG("Child process exited, parent holds fd to child namespace"); 774 + 775 + /* 776 + * With hierarchical active reference propagation: 777 + * Since the child namespace is active (parent process holds fd), 778 + * the parent user namespace should ALSO be active automatically. 779 + * This is because when we took an active reference on the child, 780 + * it propagated up to the owning user namespace. 
781 + */ 782 + TH_LOG("Attempting to reopen parent namespace (should SUCCEED - hierarchical propagation)"); 783 + int parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 784 + ASSERT_GE(parent_fd, 0); 785 + 786 + TH_LOG("SUCCESS: Parent namespace is active (fd=%d) due to active child", parent_fd); 787 + 788 + /* Verify we can also get parent via NS_GET_USERNS */ 789 + TH_LOG("Verifying NS_GET_USERNS also works"); 790 + int parent_fd2 = ioctl(child_nsfd, NS_GET_USERNS); 791 + if (parent_fd2 < 0) { 792 + close(parent_fd); 793 + close(child_nsfd); 794 + TH_LOG("NS_GET_USERNS failed: %s (errno=%d)", strerror(errno), errno); 795 + SKIP(return, "NS_GET_USERNS not supported or failed"); 796 + } 797 + 798 + TH_LOG("NS_GET_USERNS succeeded, got parent fd %d", parent_fd2); 799 + 800 + /* Verify both methods give us the same namespace */ 801 + struct stat st1, st2; 802 + ASSERT_EQ(fstat(parent_fd, &st1), 0); 803 + ASSERT_EQ(fstat(parent_fd2, &st2), 0); 804 + TH_LOG("Parent namespace inodes: parent_fd=%lu, parent_fd2=%lu", st1.st_ino, st2.st_ino); 805 + ASSERT_EQ(st1.st_ino, st2.st_ino); 806 + 807 + /* 808 + * Close child fd - parent should remain active because we still 809 + * hold direct references to it (parent_fd and parent_fd2). 
810 + */ 811 + TH_LOG("Closing child fd - parent should remain active (direct refs held)"); 812 + close(child_nsfd); 813 + 814 + /* Parent should still be openable */ 815 + TH_LOG("Verifying parent still active via file handle"); 816 + int parent_fd3 = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 817 + ASSERT_GE(parent_fd3, 0); 818 + close(parent_fd3); 819 + 820 + TH_LOG("Closing all fds to parent namespace"); 821 + close(parent_fd); 822 + close(parent_fd2); 823 + 824 + /* Both should now be inactive */ 825 + TH_LOG("Attempting to reopen parent (should fail - inactive, no refs)"); 826 + parent_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 827 + ASSERT_LT(parent_fd, 0); 828 + TH_LOG("Parent inactive as expected: %s (errno=%d)", strerror(errno), errno); 829 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 830 + } 831 + 832 + /* 833 + * Test that bind mounts keep namespaces in the tree even when inactive 834 + */ 835 + TEST(ns_bind_mount_keeps_in_tree) 836 + { 837 + struct file_handle *handle; 838 + int mount_id; 839 + int ret; 840 + int fd; 841 + int pipefd[2]; 842 + pid_t pid; 843 + int status; 844 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 845 + char tmpfile[] = "/tmp/ns-test-XXXXXX"; 846 + int tmpfd; 847 + 848 + /* Create temporary file for bind mount */ 849 + tmpfd = mkstemp(tmpfile); 850 + if (tmpfd < 0) { 851 + SKIP(return, "Cannot create temporary file"); 852 + } 853 + close(tmpfd); 854 + 855 + ASSERT_EQ(pipe(pipefd), 0); 856 + 857 + pid = fork(); 858 + ASSERT_GE(pid, 0); 859 + 860 + if (pid == 0) { 861 + /* Child process */ 862 + close(pipefd[0]); 863 + 864 + /* Unshare mount namespace and make mounts private to avoid propagation */ 865 + ret = unshare(CLONE_NEWNS); 866 + if (ret < 0) { 867 + close(pipefd[1]); 868 + unlink(tmpfile); 869 + exit(1); 870 + } 871 + ret = mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL); 872 + if (ret < 0) { 873 + close(pipefd[1]); 874 + unlink(tmpfile); 875 + exit(1); 876 + } 877 + 878 + /* 
Create new network namespace */ 879 + ret = unshare(CLONE_NEWNET); 880 + if (ret < 0) { 881 + close(pipefd[1]); 882 + unlink(tmpfile); 883 + exit(1); 884 + } 885 + 886 + /* Bind mount the namespace */ 887 + ret = mount("/proc/self/ns/net", tmpfile, NULL, MS_BIND, NULL); 888 + if (ret < 0) { 889 + close(pipefd[1]); 890 + unlink(tmpfile); 891 + exit(1); 892 + } 893 + 894 + /* Get file handle */ 895 + fd = open("/proc/self/ns/net", O_RDONLY); 896 + if (fd < 0) { 897 + umount(tmpfile); 898 + close(pipefd[1]); 899 + unlink(tmpfile); 900 + exit(1); 901 + } 902 + 903 + handle = (struct file_handle *)buf; 904 + handle->handle_bytes = MAX_HANDLE_SZ; 905 + ret = name_to_handle_at(fd, "", handle, &mount_id, AT_EMPTY_PATH); 906 + close(fd); 907 + 908 + if (ret < 0) { 909 + umount(tmpfile); 910 + close(pipefd[1]); 911 + unlink(tmpfile); 912 + exit(1); 913 + } 914 + 915 + /* Send handle to parent */ 916 + write(pipefd[1], buf, sizeof(*handle) + handle->handle_bytes); 917 + close(pipefd[1]); 918 + exit(0); 919 + } 920 + 921 + /* Parent */ 922 + close(pipefd[1]); 923 + ret = read(pipefd[0], buf, sizeof(buf)); 924 + close(pipefd[0]); 925 + 926 + waitpid(pid, &status, 0); 927 + ASSERT_TRUE(WIFEXITED(status)); 928 + ASSERT_EQ(WEXITSTATUS(status), 0); 929 + 930 + ASSERT_GT(ret, 0); 931 + handle = (struct file_handle *)buf; 932 + 933 + /* 934 + * Namespace should be inactive but still in tree due to bind mount. 935 + * Reopening should fail with ENOENT (inactive) not ESTALE (not in tree). 936 + */ 937 + fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 938 + ASSERT_LT(fd, 0); 939 + /* Should be ENOENT (inactive) since bind mount keeps it in tree */ 940 + if (errno != ENOENT && errno != ESTALE) { 941 + TH_LOG("Unexpected error: %d", errno); 942 + } 943 + 944 + /* Cleanup */ 945 + umount(tmpfile); 946 + unlink(tmpfile); 947 + } 948 + 949 + /* 950 + * Test multi-level hierarchy (3+ levels deep). 
951 + * Grandparent → Parent → Child 952 + * When child is active, both parent AND grandparent should be active. 953 + */ 954 + TEST(ns_multilevel_hierarchy) 955 + { 956 + struct file_handle *gp_handle, *p_handle, *c_handle; 957 + int ret, pipefd[2]; 958 + pid_t pid; 959 + int status; 960 + __u64 gp_id, p_id, c_id; 961 + char gp_buf[sizeof(*gp_handle) + MAX_HANDLE_SZ]; 962 + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; 963 + char c_buf[sizeof(*c_handle) + MAX_HANDLE_SZ]; 964 + 965 + ASSERT_EQ(pipe(pipefd), 0); 966 + pid = fork(); 967 + ASSERT_GE(pid, 0); 968 + 969 + if (pid == 0) { 970 + close(pipefd[0]); 971 + 972 + /* Create grandparent user namespace */ 973 + if (setup_userns() < 0) { 974 + close(pipefd[1]); 975 + exit(1); 976 + } 977 + 978 + int gp_fd = open("/proc/self/ns/user", O_RDONLY); 979 + if (gp_fd < 0) { 980 + close(pipefd[1]); 981 + exit(1); 982 + } 983 + if (ioctl(gp_fd, NS_GET_ID, &gp_id) < 0) { 984 + close(gp_fd); 985 + close(pipefd[1]); 986 + exit(1); 987 + } 988 + close(gp_fd); 989 + 990 + /* Create parent user namespace */ 991 + if (setup_userns() < 0) { 992 + close(pipefd[1]); 993 + exit(1); 994 + } 995 + 996 + int p_fd = open("/proc/self/ns/user", O_RDONLY); 997 + if (p_fd < 0) { 998 + close(pipefd[1]); 999 + exit(1); 1000 + } 1001 + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { 1002 + close(p_fd); 1003 + close(pipefd[1]); 1004 + exit(1); 1005 + } 1006 + close(p_fd); 1007 + 1008 + /* Create child user namespace */ 1009 + if (setup_userns() < 0) { 1010 + close(pipefd[1]); 1011 + exit(1); 1012 + } 1013 + 1014 + int c_fd = open("/proc/self/ns/user", O_RDONLY); 1015 + if (c_fd < 0) { 1016 + close(pipefd[1]); 1017 + exit(1); 1018 + } 1019 + if (ioctl(c_fd, NS_GET_ID, &c_id) < 0) { 1020 + close(c_fd); 1021 + close(pipefd[1]); 1022 + exit(1); 1023 + } 1024 + close(c_fd); 1025 + 1026 + /* Send all three namespace IDs */ 1027 + write(pipefd[1], &gp_id, sizeof(gp_id)); 1028 + write(pipefd[1], &p_id, sizeof(p_id)); 1029 + write(pipefd[1], &c_id, 
sizeof(c_id)); 1030 + close(pipefd[1]); 1031 + exit(0); 1032 + } 1033 + 1034 + close(pipefd[1]); 1035 + 1036 + /* Read all three namespace IDs - fixed size, no parsing needed */ 1037 + ret = read(pipefd[0], &gp_id, sizeof(gp_id)); 1038 + if (ret != sizeof(gp_id)) { 1039 + close(pipefd[0]); 1040 + waitpid(pid, NULL, 0); 1041 + SKIP(return, "Failed to read grandparent namespace ID from child"); 1042 + } 1043 + 1044 + ret = read(pipefd[0], &p_id, sizeof(p_id)); 1045 + if (ret != sizeof(p_id)) { 1046 + close(pipefd[0]); 1047 + waitpid(pid, NULL, 0); 1048 + SKIP(return, "Failed to read parent namespace ID from child"); 1049 + } 1050 + 1051 + ret = read(pipefd[0], &c_id, sizeof(c_id)); 1052 + close(pipefd[0]); 1053 + if (ret != sizeof(c_id)) { 1054 + waitpid(pid, NULL, 0); 1055 + SKIP(return, "Failed to read child namespace ID from child"); 1056 + } 1057 + 1058 + /* Construct file handles from namespace IDs */ 1059 + gp_handle = (struct file_handle *)gp_buf; 1060 + gp_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1061 + gp_handle->handle_type = FILEID_NSFS; 1062 + struct nsfs_file_handle *gp_fh = (struct nsfs_file_handle *)gp_handle->f_handle; 1063 + gp_fh->ns_id = gp_id; 1064 + gp_fh->ns_type = 0; 1065 + gp_fh->ns_inum = 0; 1066 + 1067 + p_handle = (struct file_handle *)p_buf; 1068 + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1069 + p_handle->handle_type = FILEID_NSFS; 1070 + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; 1071 + p_fh->ns_id = p_id; 1072 + p_fh->ns_type = 0; 1073 + p_fh->ns_inum = 0; 1074 + 1075 + c_handle = (struct file_handle *)c_buf; 1076 + c_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1077 + c_handle->handle_type = FILEID_NSFS; 1078 + struct nsfs_file_handle *c_fh = (struct nsfs_file_handle *)c_handle->f_handle; 1079 + c_fh->ns_id = c_id; 1080 + c_fh->ns_type = 0; 1081 + c_fh->ns_inum = 0; 1082 + 1083 + /* Open child before process exits */ 1084 + int c_fd = 
open_by_handle_at(FD_NSFS_ROOT, c_handle, O_RDONLY); 1085 + if (c_fd < 0) { 1086 + waitpid(pid, NULL, 0); 1087 + SKIP(return, "Failed to open child namespace"); 1088 + } 1089 + 1090 + waitpid(pid, &status, 0); 1091 + ASSERT_TRUE(WIFEXITED(status)); 1092 + ASSERT_EQ(WEXITSTATUS(status), 0); 1093 + 1094 + /* 1095 + * With 3-level hierarchy and child active: 1096 + * - Child is active (we hold fd) 1097 + * - Parent should be active (propagated from child) 1098 + * - Grandparent should be active (propagated from parent) 1099 + */ 1100 + TH_LOG("Testing parent active when child is active"); 1101 + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); 1102 + ASSERT_GE(p_fd, 0); 1103 + 1104 + TH_LOG("Testing grandparent active when child is active"); 1105 + int gp_fd = open_by_handle_at(FD_NSFS_ROOT, gp_handle, O_RDONLY); 1106 + ASSERT_GE(gp_fd, 0); 1107 + 1108 + close(c_fd); 1109 + close(p_fd); 1110 + close(gp_fd); 1111 + } 1112 + 1113 + /* 1114 + * Test multiple children sharing same parent. 1115 + * Parent should stay active as long as ANY child is active. 
1116 + */ 1117 + TEST(ns_multiple_children_same_parent) 1118 + { 1119 + struct file_handle *p_handle, *c1_handle, *c2_handle; 1120 + int ret, pipefd[2]; 1121 + pid_t pid; 1122 + int status; 1123 + __u64 p_id, c1_id, c2_id; 1124 + char p_buf[sizeof(*p_handle) + MAX_HANDLE_SZ]; 1125 + char c1_buf[sizeof(*c1_handle) + MAX_HANDLE_SZ]; 1126 + char c2_buf[sizeof(*c2_handle) + MAX_HANDLE_SZ]; 1127 + 1128 + ASSERT_EQ(pipe(pipefd), 0); 1129 + pid = fork(); 1130 + ASSERT_GE(pid, 0); 1131 + 1132 + if (pid == 0) { 1133 + close(pipefd[0]); 1134 + 1135 + /* Create parent user namespace */ 1136 + if (setup_userns() < 0) { 1137 + close(pipefd[1]); 1138 + exit(1); 1139 + } 1140 + 1141 + int p_fd = open("/proc/self/ns/user", O_RDONLY); 1142 + if (p_fd < 0) { 1143 + close(pipefd[1]); 1144 + exit(1); 1145 + } 1146 + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { 1147 + close(p_fd); 1148 + close(pipefd[1]); 1149 + exit(1); 1150 + } 1151 + close(p_fd); 1152 + 1153 + /* Create first child user namespace */ 1154 + if (setup_userns() < 0) { 1155 + close(pipefd[1]); 1156 + exit(1); 1157 + } 1158 + 1159 + int c1_fd = open("/proc/self/ns/user", O_RDONLY); 1160 + if (c1_fd < 0) { 1161 + close(pipefd[1]); 1162 + exit(1); 1163 + } 1164 + if (ioctl(c1_fd, NS_GET_ID, &c1_id) < 0) { 1165 + close(c1_fd); 1166 + close(pipefd[1]); 1167 + exit(1); 1168 + } 1169 + close(c1_fd); 1170 + 1171 + /* Return to parent user namespace and create second child */ 1172 + /* We can't actually do this easily, so let's create a sibling namespace 1173 + * by creating a network namespace instead */ 1174 + if (unshare(CLONE_NEWNET) < 0) { 1175 + close(pipefd[1]); 1176 + exit(1); 1177 + } 1178 + 1179 + int c2_fd = open("/proc/self/ns/net", O_RDONLY); 1180 + if (c2_fd < 0) { 1181 + close(pipefd[1]); 1182 + exit(1); 1183 + } 1184 + if (ioctl(c2_fd, NS_GET_ID, &c2_id) < 0) { 1185 + close(c2_fd); 1186 + close(pipefd[1]); 1187 + exit(1); 1188 + } 1189 + close(c2_fd); 1190 + 1191 + /* Send all namespace IDs */ 1192 + write(pipefd[1], 
&p_id, sizeof(p_id)); 1193 + write(pipefd[1], &c1_id, sizeof(c1_id)); 1194 + write(pipefd[1], &c2_id, sizeof(c2_id)); 1195 + close(pipefd[1]); 1196 + exit(0); 1197 + } 1198 + 1199 + close(pipefd[1]); 1200 + 1201 + /* Read all three namespace IDs - fixed size, no parsing needed */ 1202 + ret = read(pipefd[0], &p_id, sizeof(p_id)); 1203 + if (ret != sizeof(p_id)) { 1204 + close(pipefd[0]); 1205 + waitpid(pid, NULL, 0); 1206 + SKIP(return, "Failed to read parent namespace ID"); 1207 + } 1208 + 1209 + ret = read(pipefd[0], &c1_id, sizeof(c1_id)); 1210 + if (ret != sizeof(c1_id)) { 1211 + close(pipefd[0]); 1212 + waitpid(pid, NULL, 0); 1213 + SKIP(return, "Failed to read first child namespace ID"); 1214 + } 1215 + 1216 + ret = read(pipefd[0], &c2_id, sizeof(c2_id)); 1217 + close(pipefd[0]); 1218 + if (ret != sizeof(c2_id)) { 1219 + waitpid(pid, NULL, 0); 1220 + SKIP(return, "Failed to read second child namespace ID"); 1221 + } 1222 + 1223 + /* Construct file handles from namespace IDs */ 1224 + p_handle = (struct file_handle *)p_buf; 1225 + p_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1226 + p_handle->handle_type = FILEID_NSFS; 1227 + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)p_handle->f_handle; 1228 + p_fh->ns_id = p_id; 1229 + p_fh->ns_type = 0; 1230 + p_fh->ns_inum = 0; 1231 + 1232 + c1_handle = (struct file_handle *)c1_buf; 1233 + c1_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1234 + c1_handle->handle_type = FILEID_NSFS; 1235 + struct nsfs_file_handle *c1_fh = (struct nsfs_file_handle *)c1_handle->f_handle; 1236 + c1_fh->ns_id = c1_id; 1237 + c1_fh->ns_type = 0; 1238 + c1_fh->ns_inum = 0; 1239 + 1240 + c2_handle = (struct file_handle *)c2_buf; 1241 + c2_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1242 + c2_handle->handle_type = FILEID_NSFS; 1243 + struct nsfs_file_handle *c2_fh = (struct nsfs_file_handle *)c2_handle->f_handle; 1244 + c2_fh->ns_id = c2_id; 1245 + c2_fh->ns_type = 0; 1246 + c2_fh->ns_inum = 0; 1247 
+ 1248 + /* Open both children before process exits */ 1249 + int c1_fd = open_by_handle_at(FD_NSFS_ROOT, c1_handle, O_RDONLY); 1250 + int c2_fd = open_by_handle_at(FD_NSFS_ROOT, c2_handle, O_RDONLY); 1251 + 1252 + if (c1_fd < 0 || c2_fd < 0) { 1253 + if (c1_fd >= 0) close(c1_fd); 1254 + if (c2_fd >= 0) close(c2_fd); 1255 + waitpid(pid, NULL, 0); 1256 + SKIP(return, "Failed to open child namespaces"); 1257 + } 1258 + 1259 + waitpid(pid, &status, 0); 1260 + ASSERT_TRUE(WIFEXITED(status)); 1261 + ASSERT_EQ(WEXITSTATUS(status), 0); 1262 + 1263 + /* Parent should be active (both children active) */ 1264 + TH_LOG("Both children active - parent should be active"); 1265 + int p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); 1266 + ASSERT_GE(p_fd, 0); 1267 + close(p_fd); 1268 + 1269 + /* Close first child - parent should STILL be active */ 1270 + TH_LOG("Closing first child - parent should still be active"); 1271 + close(c1_fd); 1272 + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); 1273 + ASSERT_GE(p_fd, 0); 1274 + close(p_fd); 1275 + 1276 + /* Close second child - NOW parent should become inactive */ 1277 + TH_LOG("Closing second child - parent should become inactive"); 1278 + close(c2_fd); 1279 + p_fd = open_by_handle_at(FD_NSFS_ROOT, p_handle, O_RDONLY); 1280 + ASSERT_LT(p_fd, 0); 1281 + } 1282 + 1283 + /* 1284 + * Test that different namespace types with same owner all contribute 1285 + * active references to the owning user namespace. 
1286 + */ 1287 + TEST(ns_different_types_same_owner) 1288 + { 1289 + struct file_handle *u_handle, *n_handle, *ut_handle; 1290 + int ret, pipefd[2]; 1291 + pid_t pid; 1292 + int status; 1293 + __u64 u_id, n_id, ut_id; 1294 + char u_buf[sizeof(*u_handle) + MAX_HANDLE_SZ]; 1295 + char n_buf[sizeof(*n_handle) + MAX_HANDLE_SZ]; 1296 + char ut_buf[sizeof(*ut_handle) + MAX_HANDLE_SZ]; 1297 + 1298 + ASSERT_EQ(pipe(pipefd), 0); 1299 + pid = fork(); 1300 + ASSERT_GE(pid, 0); 1301 + 1302 + if (pid == 0) { 1303 + close(pipefd[0]); 1304 + 1305 + /* Create user namespace */ 1306 + if (setup_userns() < 0) { 1307 + close(pipefd[1]); 1308 + exit(1); 1309 + } 1310 + 1311 + int u_fd = open("/proc/self/ns/user", O_RDONLY); 1312 + if (u_fd < 0) { 1313 + close(pipefd[1]); 1314 + exit(1); 1315 + } 1316 + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { 1317 + close(u_fd); 1318 + close(pipefd[1]); 1319 + exit(1); 1320 + } 1321 + close(u_fd); 1322 + 1323 + /* Create network namespace (owned by user namespace) */ 1324 + if (unshare(CLONE_NEWNET) < 0) { 1325 + close(pipefd[1]); 1326 + exit(1); 1327 + } 1328 + 1329 + int n_fd = open("/proc/self/ns/net", O_RDONLY); 1330 + if (n_fd < 0) { 1331 + close(pipefd[1]); 1332 + exit(1); 1333 + } 1334 + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { 1335 + close(n_fd); 1336 + close(pipefd[1]); 1337 + exit(1); 1338 + } 1339 + close(n_fd); 1340 + 1341 + /* Create UTS namespace (also owned by user namespace) */ 1342 + if (unshare(CLONE_NEWUTS) < 0) { 1343 + close(pipefd[1]); 1344 + exit(1); 1345 + } 1346 + 1347 + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); 1348 + if (ut_fd < 0) { 1349 + close(pipefd[1]); 1350 + exit(1); 1351 + } 1352 + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { 1353 + close(ut_fd); 1354 + close(pipefd[1]); 1355 + exit(1); 1356 + } 1357 + close(ut_fd); 1358 + 1359 + /* Send all namespace IDs */ 1360 + write(pipefd[1], &u_id, sizeof(u_id)); 1361 + write(pipefd[1], &n_id, sizeof(n_id)); 1362 + write(pipefd[1], &ut_id, sizeof(ut_id)); 1363 + 
close(pipefd[1]); 1364 + exit(0); 1365 + } 1366 + 1367 + close(pipefd[1]); 1368 + 1369 + /* Read all three namespace IDs - fixed size, no parsing needed */ 1370 + ret = read(pipefd[0], &u_id, sizeof(u_id)); 1371 + if (ret != sizeof(u_id)) { 1372 + close(pipefd[0]); 1373 + waitpid(pid, NULL, 0); 1374 + SKIP(return, "Failed to read user namespace ID"); 1375 + } 1376 + 1377 + ret = read(pipefd[0], &n_id, sizeof(n_id)); 1378 + if (ret != sizeof(n_id)) { 1379 + close(pipefd[0]); 1380 + waitpid(pid, NULL, 0); 1381 + SKIP(return, "Failed to read network namespace ID"); 1382 + } 1383 + 1384 + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); 1385 + close(pipefd[0]); 1386 + if (ret != sizeof(ut_id)) { 1387 + waitpid(pid, NULL, 0); 1388 + SKIP(return, "Failed to read UTS namespace ID"); 1389 + } 1390 + 1391 + /* Construct file handles from namespace IDs */ 1392 + u_handle = (struct file_handle *)u_buf; 1393 + u_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1394 + u_handle->handle_type = FILEID_NSFS; 1395 + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)u_handle->f_handle; 1396 + u_fh->ns_id = u_id; 1397 + u_fh->ns_type = 0; 1398 + u_fh->ns_inum = 0; 1399 + 1400 + n_handle = (struct file_handle *)n_buf; 1401 + n_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1402 + n_handle->handle_type = FILEID_NSFS; 1403 + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)n_handle->f_handle; 1404 + n_fh->ns_id = n_id; 1405 + n_fh->ns_type = 0; 1406 + n_fh->ns_inum = 0; 1407 + 1408 + ut_handle = (struct file_handle *)ut_buf; 1409 + ut_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1410 + ut_handle->handle_type = FILEID_NSFS; 1411 + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle *)ut_handle->f_handle; 1412 + ut_fh->ns_id = ut_id; 1413 + ut_fh->ns_type = 0; 1414 + ut_fh->ns_inum = 0; 1415 + 1416 + /* Open both non-user namespaces before process exits */ 1417 + int n_fd = open_by_handle_at(FD_NSFS_ROOT, n_handle, O_RDONLY); 1418 + int 
ut_fd = open_by_handle_at(FD_NSFS_ROOT, ut_handle, O_RDONLY); 1419 + 1420 + if (n_fd < 0 || ut_fd < 0) { 1421 + if (n_fd >= 0) close(n_fd); 1422 + if (ut_fd >= 0) close(ut_fd); 1423 + waitpid(pid, NULL, 0); 1424 + SKIP(return, "Failed to open namespaces"); 1425 + } 1426 + 1427 + waitpid(pid, &status, 0); 1428 + ASSERT_TRUE(WIFEXITED(status)); 1429 + ASSERT_EQ(WEXITSTATUS(status), 0); 1430 + 1431 + /* 1432 + * Both network and UTS namespaces are active. 1433 + * User namespace should be active (gets 2 active refs). 1434 + */ 1435 + TH_LOG("Both net and uts active - user namespace should be active"); 1436 + int u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); 1437 + ASSERT_GE(u_fd, 0); 1438 + close(u_fd); 1439 + 1440 + /* Close network namespace - user namespace should STILL be active */ 1441 + TH_LOG("Closing network ns - user ns should still be active (uts still active)"); 1442 + close(n_fd); 1443 + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); 1444 + ASSERT_GE(u_fd, 0); 1445 + close(u_fd); 1446 + 1447 + /* Close UTS namespace - user namespace should become inactive */ 1448 + TH_LOG("Closing uts ns - user ns should become inactive"); 1449 + close(ut_fd); 1450 + u_fd = open_by_handle_at(FD_NSFS_ROOT, u_handle, O_RDONLY); 1451 + ASSERT_LT(u_fd, 0); 1452 + } 1453 + 1454 + /* 1455 + * Test hierarchical propagation with deep namespace hierarchy. 1456 + * Create: init_user_ns -> user_A -> user_B -> net_ns 1457 + * When net_ns is active, both user_A and user_B should be active. 1458 + * This verifies the conditional recursion in __ns_ref_active_put() works. 
1459 + */ 1460 + TEST(ns_deep_hierarchy_propagation) 1461 + { 1462 + struct file_handle *ua_handle, *ub_handle, *net_handle; 1463 + int ret, pipefd[2]; 1464 + pid_t pid; 1465 + int status; 1466 + __u64 ua_id, ub_id, net_id; 1467 + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; 1468 + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; 1469 + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; 1470 + 1471 + ASSERT_EQ(pipe(pipefd), 0); 1472 + pid = fork(); 1473 + ASSERT_GE(pid, 0); 1474 + 1475 + if (pid == 0) { 1476 + close(pipefd[0]); 1477 + 1478 + /* Create user_A -> user_B -> net hierarchy */ 1479 + if (setup_userns() < 0) { 1480 + close(pipefd[1]); 1481 + exit(1); 1482 + } 1483 + 1484 + int ua_fd = open("/proc/self/ns/user", O_RDONLY); 1485 + if (ua_fd < 0) { 1486 + close(pipefd[1]); 1487 + exit(1); 1488 + } 1489 + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { 1490 + close(ua_fd); 1491 + close(pipefd[1]); 1492 + exit(1); 1493 + } 1494 + close(ua_fd); 1495 + 1496 + if (setup_userns() < 0) { 1497 + close(pipefd[1]); 1498 + exit(1); 1499 + } 1500 + 1501 + int ub_fd = open("/proc/self/ns/user", O_RDONLY); 1502 + if (ub_fd < 0) { 1503 + close(pipefd[1]); 1504 + exit(1); 1505 + } 1506 + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { 1507 + close(ub_fd); 1508 + close(pipefd[1]); 1509 + exit(1); 1510 + } 1511 + close(ub_fd); 1512 + 1513 + if (unshare(CLONE_NEWNET) < 0) { 1514 + close(pipefd[1]); 1515 + exit(1); 1516 + } 1517 + 1518 + int net_fd = open("/proc/self/ns/net", O_RDONLY); 1519 + if (net_fd < 0) { 1520 + close(pipefd[1]); 1521 + exit(1); 1522 + } 1523 + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { 1524 + close(net_fd); 1525 + close(pipefd[1]); 1526 + exit(1); 1527 + } 1528 + close(net_fd); 1529 + 1530 + /* Send all three namespace IDs */ 1531 + write(pipefd[1], &ua_id, sizeof(ua_id)); 1532 + write(pipefd[1], &ub_id, sizeof(ub_id)); 1533 + write(pipefd[1], &net_id, sizeof(net_id)); 1534 + close(pipefd[1]); 1535 + exit(0); 1536 + } 1537 + 1538 + close(pipefd[1]); 1539 + 
1540 + /* Read all three namespace IDs - fixed size, no parsing needed */ 1541 + ret = read(pipefd[0], &ua_id, sizeof(ua_id)); 1542 + if (ret != sizeof(ua_id)) { 1543 + close(pipefd[0]); 1544 + waitpid(pid, NULL, 0); 1545 + SKIP(return, "Failed to read user_A namespace ID"); 1546 + } 1547 + 1548 + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); 1549 + if (ret != sizeof(ub_id)) { 1550 + close(pipefd[0]); 1551 + waitpid(pid, NULL, 0); 1552 + SKIP(return, "Failed to read user_B namespace ID"); 1553 + } 1554 + 1555 + ret = read(pipefd[0], &net_id, sizeof(net_id)); 1556 + close(pipefd[0]); 1557 + if (ret != sizeof(net_id)) { 1558 + waitpid(pid, NULL, 0); 1559 + SKIP(return, "Failed to read network namespace ID"); 1560 + } 1561 + 1562 + /* Construct file handles from namespace IDs */ 1563 + ua_handle = (struct file_handle *)ua_buf; 1564 + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1565 + ua_handle->handle_type = FILEID_NSFS; 1566 + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; 1567 + ua_fh->ns_id = ua_id; 1568 + ua_fh->ns_type = 0; 1569 + ua_fh->ns_inum = 0; 1570 + 1571 + ub_handle = (struct file_handle *)ub_buf; 1572 + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1573 + ub_handle->handle_type = FILEID_NSFS; 1574 + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; 1575 + ub_fh->ns_id = ub_id; 1576 + ub_fh->ns_type = 0; 1577 + ub_fh->ns_inum = 0; 1578 + 1579 + net_handle = (struct file_handle *)net_buf; 1580 + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1581 + net_handle->handle_type = FILEID_NSFS; 1582 + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; 1583 + net_fh->ns_id = net_id; 1584 + net_fh->ns_type = 0; 1585 + net_fh->ns_inum = 0; 1586 + 1587 + /* Open net_ns before child exits to keep it active */ 1588 + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); 1589 + if (net_fd < 0) { 1590 + waitpid(pid, NULL, 
0); 1591 + SKIP(return, "Failed to open network namespace"); 1592 + } 1593 + 1594 + waitpid(pid, &status, 0); 1595 + ASSERT_TRUE(WIFEXITED(status)); 1596 + ASSERT_EQ(WEXITSTATUS(status), 0); 1597 + 1598 + /* With net_ns active, both user_A and user_B should be active */ 1599 + TH_LOG("Testing user_B active (net_ns active causes propagation)"); 1600 + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); 1601 + ASSERT_GE(ub_fd, 0); 1602 + 1603 + TH_LOG("Testing user_A active (propagated through user_B)"); 1604 + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1605 + ASSERT_GE(ua_fd, 0); 1606 + 1607 + /* Close net_ns - user_B should stay active (we hold direct ref) */ 1608 + TH_LOG("Closing net_ns, user_B should remain active (direct ref held)"); 1609 + close(net_fd); 1610 + int ub_fd2 = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); 1611 + ASSERT_GE(ub_fd2, 0); 1612 + close(ub_fd2); 1613 + 1614 + /* Close user_B - user_A should stay active (we hold direct ref) */ 1615 + TH_LOG("Closing user_B, user_A should remain active (direct ref held)"); 1616 + close(ub_fd); 1617 + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1618 + ASSERT_GE(ua_fd2, 0); 1619 + close(ua_fd2); 1620 + 1621 + /* Close user_A - everything should become inactive */ 1622 + TH_LOG("Closing user_A, all should become inactive"); 1623 + close(ua_fd); 1624 + 1625 + /* All should now be inactive */ 1626 + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1627 + ASSERT_LT(ua_fd, 0); 1628 + } 1629 + 1630 + /* 1631 + * Test that parent stays active as long as ANY child is active. 1632 + * Create parent user namespace with two child net namespaces. 1633 + * Parent should remain active until BOTH children are inactive. 
1634 + */ 1635 + TEST(ns_parent_multiple_children_refcount) 1636 + { 1637 + struct file_handle *parent_handle, *net1_handle, *net2_handle; 1638 + int ret, pipefd[2], syncpipe[2]; 1639 + pid_t pid; 1640 + int status; 1641 + __u64 p_id, n1_id, n2_id; 1642 + char p_buf[sizeof(*parent_handle) + MAX_HANDLE_SZ]; 1643 + char n1_buf[sizeof(*net1_handle) + MAX_HANDLE_SZ]; 1644 + char n2_buf[sizeof(*net2_handle) + MAX_HANDLE_SZ]; 1645 + char sync_byte; 1646 + 1647 + ASSERT_EQ(pipe(pipefd), 0); 1648 + ASSERT_EQ(pipe(syncpipe), 0); 1649 + pid = fork(); 1650 + ASSERT_GE(pid, 0); 1651 + 1652 + if (pid == 0) { 1653 + close(pipefd[0]); 1654 + close(syncpipe[1]); 1655 + 1656 + /* Create parent user namespace */ 1657 + if (setup_userns() < 0) { 1658 + close(pipefd[1]); 1659 + exit(1); 1660 + } 1661 + 1662 + int p_fd = open("/proc/self/ns/user", O_RDONLY); 1663 + if (p_fd < 0) { 1664 + close(pipefd[1]); 1665 + exit(1); 1666 + } 1667 + if (ioctl(p_fd, NS_GET_ID, &p_id) < 0) { 1668 + close(p_fd); 1669 + close(pipefd[1]); 1670 + exit(1); 1671 + } 1672 + close(p_fd); 1673 + 1674 + /* Create first network namespace */ 1675 + if (unshare(CLONE_NEWNET) < 0) { 1676 + close(pipefd[1]); 1677 + close(syncpipe[0]); 1678 + exit(1); 1679 + } 1680 + 1681 + int n1_fd = open("/proc/self/ns/net", O_RDONLY); 1682 + if (n1_fd < 0) { 1683 + close(pipefd[1]); 1684 + close(syncpipe[0]); 1685 + exit(1); 1686 + } 1687 + if (ioctl(n1_fd, NS_GET_ID, &n1_id) < 0) { 1688 + close(n1_fd); 1689 + close(pipefd[1]); 1690 + close(syncpipe[0]); 1691 + exit(1); 1692 + } 1693 + /* Keep n1_fd open so first namespace stays active */ 1694 + 1695 + /* Create second network namespace */ 1696 + if (unshare(CLONE_NEWNET) < 0) { 1697 + close(n1_fd); 1698 + close(pipefd[1]); 1699 + close(syncpipe[0]); 1700 + exit(1); 1701 + } 1702 + 1703 + int n2_fd = open("/proc/self/ns/net", O_RDONLY); 1704 + if (n2_fd < 0) { 1705 + close(n1_fd); 1706 + close(pipefd[1]); 1707 + close(syncpipe[0]); 1708 + exit(1); 1709 + } 1710 + if 
(ioctl(n2_fd, NS_GET_ID, &n2_id) < 0) { 1711 + close(n1_fd); 1712 + close(n2_fd); 1713 + close(pipefd[1]); 1714 + close(syncpipe[0]); 1715 + exit(1); 1716 + } 1717 + /* Keep both n1_fd and n2_fd open */ 1718 + 1719 + /* Send all namespace IDs */ 1720 + write(pipefd[1], &p_id, sizeof(p_id)); 1721 + write(pipefd[1], &n1_id, sizeof(n1_id)); 1722 + write(pipefd[1], &n2_id, sizeof(n2_id)); 1723 + close(pipefd[1]); 1724 + 1725 + /* Wait for parent to signal before exiting */ 1726 + read(syncpipe[0], &sync_byte, 1); 1727 + close(syncpipe[0]); 1728 + exit(0); 1729 + } 1730 + 1731 + close(pipefd[1]); 1732 + close(syncpipe[0]); 1733 + 1734 + /* Read all three namespace IDs - fixed size, no parsing needed */ 1735 + ret = read(pipefd[0], &p_id, sizeof(p_id)); 1736 + if (ret != sizeof(p_id)) { 1737 + close(pipefd[0]); 1738 + waitpid(pid, NULL, 0); 1739 + SKIP(return, "Failed to read parent namespace ID"); 1740 + } 1741 + 1742 + ret = read(pipefd[0], &n1_id, sizeof(n1_id)); 1743 + if (ret != sizeof(n1_id)) { 1744 + close(pipefd[0]); 1745 + waitpid(pid, NULL, 0); 1746 + SKIP(return, "Failed to read first network namespace ID"); 1747 + } 1748 + 1749 + ret = read(pipefd[0], &n2_id, sizeof(n2_id)); 1750 + close(pipefd[0]); 1751 + if (ret != sizeof(n2_id)) { 1752 + waitpid(pid, NULL, 0); 1753 + SKIP(return, "Failed to read second network namespace ID"); 1754 + } 1755 + 1756 + /* Construct file handles from namespace IDs */ 1757 + parent_handle = (struct file_handle *)p_buf; 1758 + parent_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1759 + parent_handle->handle_type = FILEID_NSFS; 1760 + struct nsfs_file_handle *p_fh = (struct nsfs_file_handle *)parent_handle->f_handle; 1761 + p_fh->ns_id = p_id; 1762 + p_fh->ns_type = 0; 1763 + p_fh->ns_inum = 0; 1764 + 1765 + net1_handle = (struct file_handle *)n1_buf; 1766 + net1_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1767 + net1_handle->handle_type = FILEID_NSFS; 1768 + struct nsfs_file_handle *n1_fh = (struct 
nsfs_file_handle *)net1_handle->f_handle; 1769 + n1_fh->ns_id = n1_id; 1770 + n1_fh->ns_type = 0; 1771 + n1_fh->ns_inum = 0; 1772 + 1773 + net2_handle = (struct file_handle *)n2_buf; 1774 + net2_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1775 + net2_handle->handle_type = FILEID_NSFS; 1776 + struct nsfs_file_handle *n2_fh = (struct nsfs_file_handle *)net2_handle->f_handle; 1777 + n2_fh->ns_id = n2_id; 1778 + n2_fh->ns_type = 0; 1779 + n2_fh->ns_inum = 0; 1780 + 1781 + /* Open both net namespaces while child is still alive */ 1782 + int n1_fd = open_by_handle_at(FD_NSFS_ROOT, net1_handle, O_RDONLY); 1783 + int n2_fd = open_by_handle_at(FD_NSFS_ROOT, net2_handle, O_RDONLY); 1784 + if (n1_fd < 0 || n2_fd < 0) { 1785 + if (n1_fd >= 0) close(n1_fd); 1786 + if (n2_fd >= 0) close(n2_fd); 1787 + sync_byte = 'G'; 1788 + write(syncpipe[1], &sync_byte, 1); 1789 + close(syncpipe[1]); 1790 + waitpid(pid, NULL, 0); 1791 + SKIP(return, "Failed to open net namespaces"); 1792 + } 1793 + 1794 + /* Signal child that we have opened the namespaces */ 1795 + sync_byte = 'G'; 1796 + write(syncpipe[1], &sync_byte, 1); 1797 + close(syncpipe[1]); 1798 + 1799 + /* Wait for child to exit */ 1800 + waitpid(pid, &status, 0); 1801 + ASSERT_TRUE(WIFEXITED(status)); 1802 + ASSERT_EQ(WEXITSTATUS(status), 0); 1803 + 1804 + /* Parent should be active (has 2 active children) */ 1805 + TH_LOG("Both net namespaces active - parent should be active"); 1806 + int p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 1807 + ASSERT_GE(p_fd, 0); 1808 + close(p_fd); 1809 + 1810 + /* Close first net namespace - parent should STILL be active */ 1811 + TH_LOG("Closing first net ns - parent should still be active"); 1812 + close(n1_fd); 1813 + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 1814 + ASSERT_GE(p_fd, 0); 1815 + close(p_fd); 1816 + 1817 + /* Close second net namespace - parent should become inactive */ 1818 + TH_LOG("Closing second net ns - parent should become 
inactive"); 1819 + close(n2_fd); 1820 + p_fd = open_by_handle_at(FD_NSFS_ROOT, parent_handle, O_RDONLY); 1821 + ASSERT_LT(p_fd, 0); 1822 + } 1823 + 1824 + /* 1825 + * Test that user namespace as a child also propagates correctly. 1826 + * Create user_A -> user_B, verify when user_B is active that user_A 1827 + * is also active. This is different from non-user namespace children. 1828 + */ 1829 + TEST(ns_userns_child_propagation) 1830 + { 1831 + struct file_handle *ua_handle, *ub_handle; 1832 + int ret, pipefd[2]; 1833 + pid_t pid; 1834 + int status; 1835 + __u64 ua_id, ub_id; 1836 + char ua_buf[sizeof(*ua_handle) + MAX_HANDLE_SZ]; 1837 + char ub_buf[sizeof(*ub_handle) + MAX_HANDLE_SZ]; 1838 + 1839 + ASSERT_EQ(pipe(pipefd), 0); 1840 + pid = fork(); 1841 + ASSERT_GE(pid, 0); 1842 + 1843 + if (pid == 0) { 1844 + close(pipefd[0]); 1845 + 1846 + /* Create user_A */ 1847 + if (setup_userns() < 0) { 1848 + close(pipefd[1]); 1849 + exit(1); 1850 + } 1851 + 1852 + int ua_fd = open("/proc/self/ns/user", O_RDONLY); 1853 + if (ua_fd < 0) { 1854 + close(pipefd[1]); 1855 + exit(1); 1856 + } 1857 + if (ioctl(ua_fd, NS_GET_ID, &ua_id) < 0) { 1858 + close(ua_fd); 1859 + close(pipefd[1]); 1860 + exit(1); 1861 + } 1862 + close(ua_fd); 1863 + 1864 + /* Create user_B (child of user_A) */ 1865 + if (setup_userns() < 0) { 1866 + close(pipefd[1]); 1867 + exit(1); 1868 + } 1869 + 1870 + int ub_fd = open("/proc/self/ns/user", O_RDONLY); 1871 + if (ub_fd < 0) { 1872 + close(pipefd[1]); 1873 + exit(1); 1874 + } 1875 + if (ioctl(ub_fd, NS_GET_ID, &ub_id) < 0) { 1876 + close(ub_fd); 1877 + close(pipefd[1]); 1878 + exit(1); 1879 + } 1880 + close(ub_fd); 1881 + 1882 + /* Send both namespace IDs */ 1883 + write(pipefd[1], &ua_id, sizeof(ua_id)); 1884 + write(pipefd[1], &ub_id, sizeof(ub_id)); 1885 + close(pipefd[1]); 1886 + exit(0); 1887 + } 1888 + 1889 + close(pipefd[1]); 1890 + 1891 + /* Read both namespace IDs - fixed size, no parsing needed */ 1892 + ret = read(pipefd[0], &ua_id, 
sizeof(ua_id)); 1893 + if (ret != sizeof(ua_id)) { 1894 + close(pipefd[0]); 1895 + waitpid(pid, NULL, 0); 1896 + SKIP(return, "Failed to read user_A namespace ID"); 1897 + } 1898 + 1899 + ret = read(pipefd[0], &ub_id, sizeof(ub_id)); 1900 + close(pipefd[0]); 1901 + if (ret != sizeof(ub_id)) { 1902 + waitpid(pid, NULL, 0); 1903 + SKIP(return, "Failed to read user_B namespace ID"); 1904 + } 1905 + 1906 + /* Construct file handles from namespace IDs */ 1907 + ua_handle = (struct file_handle *)ua_buf; 1908 + ua_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1909 + ua_handle->handle_type = FILEID_NSFS; 1910 + struct nsfs_file_handle *ua_fh = (struct nsfs_file_handle *)ua_handle->f_handle; 1911 + ua_fh->ns_id = ua_id; 1912 + ua_fh->ns_type = 0; 1913 + ua_fh->ns_inum = 0; 1914 + 1915 + ub_handle = (struct file_handle *)ub_buf; 1916 + ub_handle->handle_bytes = sizeof(struct nsfs_file_handle); 1917 + ub_handle->handle_type = FILEID_NSFS; 1918 + struct nsfs_file_handle *ub_fh = (struct nsfs_file_handle *)ub_handle->f_handle; 1919 + ub_fh->ns_id = ub_id; 1920 + ub_fh->ns_type = 0; 1921 + ub_fh->ns_inum = 0; 1922 + 1923 + /* Open user_B before child exits */ 1924 + int ub_fd = open_by_handle_at(FD_NSFS_ROOT, ub_handle, O_RDONLY); 1925 + if (ub_fd < 0) { 1926 + waitpid(pid, NULL, 0); 1927 + SKIP(return, "Failed to open user_B"); 1928 + } 1929 + 1930 + waitpid(pid, &status, 0); 1931 + ASSERT_TRUE(WIFEXITED(status)); 1932 + ASSERT_EQ(WEXITSTATUS(status), 0); 1933 + 1934 + /* With user_B active, user_A should also be active */ 1935 + TH_LOG("Testing user_A active when child user_B is active"); 1936 + int ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1937 + ASSERT_GE(ua_fd, 0); 1938 + 1939 + /* Close user_B */ 1940 + TH_LOG("Closing user_B"); 1941 + close(ub_fd); 1942 + 1943 + /* user_A should remain active (we hold direct ref) */ 1944 + int ua_fd2 = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1945 + ASSERT_GE(ua_fd2, 0); 1946 + close(ua_fd2); 
1947 + 1948 + /* Close user_A - should become inactive */ 1949 + TH_LOG("Closing user_A - should become inactive"); 1950 + close(ua_fd); 1951 + 1952 + ua_fd = open_by_handle_at(FD_NSFS_ROOT, ua_handle, O_RDONLY); 1953 + ASSERT_LT(ua_fd, 0); 1954 + } 1955 + 1956 + /* 1957 + * Test different namespace types (net, uts, ipc) all contributing 1958 + * active references to the same owning user namespace. 1959 + */ 1960 + TEST(ns_mixed_types_same_owner) 1961 + { 1962 + struct file_handle *user_handle, *net_handle, *uts_handle; 1963 + int ret, pipefd[2]; 1964 + pid_t pid; 1965 + int status; 1966 + __u64 u_id, n_id, ut_id; 1967 + char u_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; 1968 + char n_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; 1969 + char ut_buf[sizeof(*uts_handle) + MAX_HANDLE_SZ]; 1970 + 1971 + ASSERT_EQ(pipe(pipefd), 0); 1972 + pid = fork(); 1973 + ASSERT_GE(pid, 0); 1974 + 1975 + if (pid == 0) { 1976 + close(pipefd[0]); 1977 + 1978 + if (setup_userns() < 0) { 1979 + close(pipefd[1]); 1980 + exit(1); 1981 + } 1982 + 1983 + int u_fd = open("/proc/self/ns/user", O_RDONLY); 1984 + if (u_fd < 0) { 1985 + close(pipefd[1]); 1986 + exit(1); 1987 + } 1988 + if (ioctl(u_fd, NS_GET_ID, &u_id) < 0) { 1989 + close(u_fd); 1990 + close(pipefd[1]); 1991 + exit(1); 1992 + } 1993 + close(u_fd); 1994 + 1995 + if (unshare(CLONE_NEWNET) < 0) { 1996 + close(pipefd[1]); 1997 + exit(1); 1998 + } 1999 + 2000 + int n_fd = open("/proc/self/ns/net", O_RDONLY); 2001 + if (n_fd < 0) { 2002 + close(pipefd[1]); 2003 + exit(1); 2004 + } 2005 + if (ioctl(n_fd, NS_GET_ID, &n_id) < 0) { 2006 + close(n_fd); 2007 + close(pipefd[1]); 2008 + exit(1); 2009 + } 2010 + close(n_fd); 2011 + 2012 + if (unshare(CLONE_NEWUTS) < 0) { 2013 + close(pipefd[1]); 2014 + exit(1); 2015 + } 2016 + 2017 + int ut_fd = open("/proc/self/ns/uts", O_RDONLY); 2018 + if (ut_fd < 0) { 2019 + close(pipefd[1]); 2020 + exit(1); 2021 + } 2022 + if (ioctl(ut_fd, NS_GET_ID, &ut_id) < 0) { 2023 + close(ut_fd); 2024 + close(pipefd[1]); 
2025 + exit(1); 2026 + } 2027 + close(ut_fd); 2028 + 2029 + /* Send all namespace IDs */ 2030 + write(pipefd[1], &u_id, sizeof(u_id)); 2031 + write(pipefd[1], &n_id, sizeof(n_id)); 2032 + write(pipefd[1], &ut_id, sizeof(ut_id)); 2033 + close(pipefd[1]); 2034 + exit(0); 2035 + } 2036 + 2037 + close(pipefd[1]); 2038 + 2039 + /* Read all three namespace IDs - fixed size, no parsing needed */ 2040 + ret = read(pipefd[0], &u_id, sizeof(u_id)); 2041 + if (ret != sizeof(u_id)) { 2042 + close(pipefd[0]); 2043 + waitpid(pid, NULL, 0); 2044 + SKIP(return, "Failed to read user namespace ID"); 2045 + } 2046 + 2047 + ret = read(pipefd[0], &n_id, sizeof(n_id)); 2048 + if (ret != sizeof(n_id)) { 2049 + close(pipefd[0]); 2050 + waitpid(pid, NULL, 0); 2051 + SKIP(return, "Failed to read network namespace ID"); 2052 + } 2053 + 2054 + ret = read(pipefd[0], &ut_id, sizeof(ut_id)); 2055 + close(pipefd[0]); 2056 + if (ret != sizeof(ut_id)) { 2057 + waitpid(pid, NULL, 0); 2058 + SKIP(return, "Failed to read UTS namespace ID"); 2059 + } 2060 + 2061 + /* Construct file handles from namespace IDs */ 2062 + user_handle = (struct file_handle *)u_buf; 2063 + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); 2064 + user_handle->handle_type = FILEID_NSFS; 2065 + struct nsfs_file_handle *u_fh = (struct nsfs_file_handle *)user_handle->f_handle; 2066 + u_fh->ns_id = u_id; 2067 + u_fh->ns_type = 0; 2068 + u_fh->ns_inum = 0; 2069 + 2070 + net_handle = (struct file_handle *)n_buf; 2071 + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); 2072 + net_handle->handle_type = FILEID_NSFS; 2073 + struct nsfs_file_handle *n_fh = (struct nsfs_file_handle *)net_handle->f_handle; 2074 + n_fh->ns_id = n_id; 2075 + n_fh->ns_type = 0; 2076 + n_fh->ns_inum = 0; 2077 + 2078 + uts_handle = (struct file_handle *)ut_buf; 2079 + uts_handle->handle_bytes = sizeof(struct nsfs_file_handle); 2080 + uts_handle->handle_type = FILEID_NSFS; 2081 + struct nsfs_file_handle *ut_fh = (struct nsfs_file_handle 
*)uts_handle->f_handle; 2082 + ut_fh->ns_id = ut_id; 2083 + ut_fh->ns_type = 0; 2084 + ut_fh->ns_inum = 0; 2085 + 2086 + /* Open both non-user namespaces */ 2087 + int n_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); 2088 + int ut_fd = open_by_handle_at(FD_NSFS_ROOT, uts_handle, O_RDONLY); 2089 + if (n_fd < 0 || ut_fd < 0) { 2090 + if (n_fd >= 0) close(n_fd); 2091 + if (ut_fd >= 0) close(ut_fd); 2092 + waitpid(pid, NULL, 0); 2093 + SKIP(return, "Failed to open namespaces"); 2094 + } 2095 + 2096 + waitpid(pid, &status, 0); 2097 + ASSERT_TRUE(WIFEXITED(status)); 2098 + ASSERT_EQ(WEXITSTATUS(status), 0); 2099 + 2100 + /* User namespace should be active (2 active children) */ 2101 + TH_LOG("Both net and uts active - user ns should be active"); 2102 + int u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); 2103 + ASSERT_GE(u_fd, 0); 2104 + close(u_fd); 2105 + 2106 + /* Close net - user ns should STILL be active (uts still active) */ 2107 + TH_LOG("Closing net - user ns should still be active"); 2108 + close(n_fd); 2109 + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); 2110 + ASSERT_GE(u_fd, 0); 2111 + close(u_fd); 2112 + 2113 + /* Close uts - user ns should become inactive */ 2114 + TH_LOG("Closing uts - user ns should become inactive"); 2115 + close(ut_fd); 2116 + u_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); 2117 + ASSERT_LT(u_fd, 0); 2118 + } 2119 + 2120 + /* Thread test helpers and structures */ 2121 + struct thread_ns_info { 2122 + __u64 ns_id; 2123 + int pipefd; 2124 + int syncfd_read; 2125 + int syncfd_write; 2126 + int exit_code; 2127 + }; 2128 + 2129 + static void *thread_create_namespace(void *arg) 2130 + { 2131 + struct thread_ns_info *info = (struct thread_ns_info *)arg; 2132 + int ret; 2133 + 2134 + /* Create new network namespace */ 2135 + ret = unshare(CLONE_NEWNET); 2136 + if (ret < 0) { 2137 + info->exit_code = 1; 2138 + return NULL; 2139 + } 2140 + 2141 + /* Get namespace ID */ 2142 + int fd = 
open("/proc/thread-self/ns/net", O_RDONLY); 2143 + if (fd < 0) { 2144 + info->exit_code = 2; 2145 + return NULL; 2146 + } 2147 + 2148 + ret = ioctl(fd, NS_GET_ID, &info->ns_id); 2149 + close(fd); 2150 + if (ret < 0) { 2151 + info->exit_code = 3; 2152 + return NULL; 2153 + } 2154 + 2155 + /* Send namespace ID to main thread */ 2156 + if (write(info->pipefd, &info->ns_id, sizeof(info->ns_id)) != sizeof(info->ns_id)) { 2157 + info->exit_code = 4; 2158 + return NULL; 2159 + } 2160 + 2161 + /* Wait for signal to exit */ 2162 + char sync_byte; 2163 + if (read(info->syncfd_read, &sync_byte, 1) != 1) { 2164 + info->exit_code = 5; 2165 + return NULL; 2166 + } 2167 + 2168 + info->exit_code = 0; 2169 + return NULL; 2170 + } 2171 + 2172 + /* 2173 + * Test that namespace becomes inactive after thread exits. 2174 + * This verifies active reference counting works with threads, not just processes. 2175 + */ 2176 + TEST(thread_ns_inactive_after_exit) 2177 + { 2178 + pthread_t thread; 2179 + struct thread_ns_info info; 2180 + struct file_handle *handle; 2181 + int pipefd[2]; 2182 + int syncpipe[2]; 2183 + int ret; 2184 + char sync_byte; 2185 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 2186 + 2187 + ASSERT_EQ(pipe(pipefd), 0); 2188 + ASSERT_EQ(pipe(syncpipe), 0); 2189 + 2190 + info.pipefd = pipefd[1]; 2191 + info.syncfd_read = syncpipe[0]; 2192 + info.syncfd_write = -1; 2193 + info.exit_code = -1; 2194 + 2195 + /* Create thread that will create a namespace */ 2196 + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); 2197 + ASSERT_EQ(ret, 0); 2198 + 2199 + /* Read namespace ID from thread */ 2200 + __u64 ns_id; 2201 + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); 2202 + if (ret != sizeof(ns_id)) { 2203 + sync_byte = 'X'; 2204 + write(syncpipe[1], &sync_byte, 1); 2205 + pthread_join(thread, NULL); 2206 + close(pipefd[0]); 2207 + close(pipefd[1]); 2208 + close(syncpipe[0]); 2209 + close(syncpipe[1]); 2210 + SKIP(return, "Failed to read namespace ID from thread"); 2211 
+ } 2212 + 2213 + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); 2214 + 2215 + /* Construct file handle */ 2216 + handle = (struct file_handle *)buf; 2217 + handle->handle_bytes = sizeof(struct nsfs_file_handle); 2218 + handle->handle_type = FILEID_NSFS; 2219 + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; 2220 + fh->ns_id = ns_id; 2221 + fh->ns_type = 0; 2222 + fh->ns_inum = 0; 2223 + 2224 + /* Namespace should be active while thread is alive */ 2225 + TH_LOG("Attempting to open namespace while thread is alive (should succeed)"); 2226 + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 2227 + ASSERT_GE(nsfd, 0); 2228 + close(nsfd); 2229 + 2230 + /* Signal thread to exit */ 2231 + TH_LOG("Signaling thread to exit"); 2232 + sync_byte = 'X'; 2233 + ASSERT_EQ(write(syncpipe[1], &sync_byte, 1), 1); 2234 + close(syncpipe[1]); 2235 + 2236 + /* Wait for thread to exit */ 2237 + ASSERT_EQ(pthread_join(thread, NULL), 0); 2238 + close(pipefd[0]); 2239 + close(pipefd[1]); 2240 + close(syncpipe[0]); 2241 + 2242 + if (info.exit_code != 0) 2243 + SKIP(return, "Thread failed to create namespace"); 2244 + 2245 + TH_LOG("Thread exited, namespace should be inactive"); 2246 + 2247 + /* Namespace should now be inactive */ 2248 + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 2249 + ASSERT_LT(nsfd, 0); 2250 + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ 2251 + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); 2252 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 2253 + } 2254 + 2255 + /* 2256 + * Test that a namespace remains active while a thread holds an fd to it. 2257 + * Even after the thread exits, the namespace should remain active as long as 2258 + * another thread holds a file descriptor to it. 
2259 + */ 2260 + TEST(thread_ns_fd_keeps_active) 2261 + { 2262 + pthread_t thread; 2263 + struct thread_ns_info info; 2264 + struct file_handle *handle; 2265 + int pipefd[2]; 2266 + int syncpipe[2]; 2267 + int ret; 2268 + char sync_byte; 2269 + char buf[sizeof(*handle) + MAX_HANDLE_SZ]; 2270 + 2271 + ASSERT_EQ(pipe(pipefd), 0); 2272 + ASSERT_EQ(pipe(syncpipe), 0); 2273 + 2274 + info.pipefd = pipefd[1]; 2275 + info.syncfd_read = syncpipe[0]; 2276 + info.syncfd_write = -1; 2277 + info.exit_code = -1; 2278 + 2279 + /* Create thread that will create a namespace */ 2280 + ret = pthread_create(&thread, NULL, thread_create_namespace, &info); 2281 + ASSERT_EQ(ret, 0); 2282 + 2283 + /* Read namespace ID from thread */ 2284 + __u64 ns_id; 2285 + ret = read(pipefd[0], &ns_id, sizeof(ns_id)); 2286 + if (ret != sizeof(ns_id)) { 2287 + sync_byte = 'X'; 2288 + write(syncpipe[1], &sync_byte, 1); 2289 + pthread_join(thread, NULL); 2290 + close(pipefd[0]); 2291 + close(pipefd[1]); 2292 + close(syncpipe[0]); 2293 + close(syncpipe[1]); 2294 + SKIP(return, "Failed to read namespace ID from thread"); 2295 + } 2296 + 2297 + TH_LOG("Thread created namespace with ID %llu", (unsigned long long)ns_id); 2298 + 2299 + /* Construct file handle */ 2300 + handle = (struct file_handle *)buf; 2301 + handle->handle_bytes = sizeof(struct nsfs_file_handle); 2302 + handle->handle_type = FILEID_NSFS; 2303 + struct nsfs_file_handle *fh = (struct nsfs_file_handle *)handle->f_handle; 2304 + fh->ns_id = ns_id; 2305 + fh->ns_type = 0; 2306 + fh->ns_inum = 0; 2307 + 2308 + /* Open namespace while thread is alive */ 2309 + TH_LOG("Opening namespace while thread is alive"); 2310 + int nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 2311 + ASSERT_GE(nsfd, 0); 2312 + 2313 + /* Signal thread to exit */ 2314 + TH_LOG("Signaling thread to exit"); 2315 + sync_byte = 'X'; 2316 + write(syncpipe[1], &sync_byte, 1); 2317 + close(syncpipe[1]); 2318 + 2319 + /* Wait for thread to exit */ 2320 + 
pthread_join(thread, NULL); 2321 + close(pipefd[0]); 2322 + close(pipefd[1]); 2323 + close(syncpipe[0]); 2324 + 2325 + if (info.exit_code != 0) { 2326 + close(nsfd); 2327 + SKIP(return, "Thread failed to create namespace"); 2328 + } 2329 + 2330 + TH_LOG("Thread exited, but main thread holds fd - namespace should remain active"); 2331 + 2332 + /* Namespace should still be active because we hold an fd */ 2333 + int nsfd2 = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 2334 + ASSERT_GE(nsfd2, 0); 2335 + 2336 + /* Verify it's the same namespace */ 2337 + struct stat st1, st2; 2338 + ASSERT_EQ(fstat(nsfd, &st1), 0); 2339 + ASSERT_EQ(fstat(nsfd2, &st2), 0); 2340 + ASSERT_EQ(st1.st_ino, st2.st_ino); 2341 + close(nsfd2); 2342 + 2343 + TH_LOG("Closing fd - namespace should become inactive"); 2344 + close(nsfd); 2345 + 2346 + /* Now namespace should be inactive */ 2347 + nsfd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 2348 + ASSERT_LT(nsfd, 0); 2349 + /* Should fail with ENOENT (inactive) or ESTALE (gone) */ 2350 + TH_LOG("Namespace inactive as expected: %s (errno=%d)", strerror(errno), errno); 2351 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 2352 + } 2353 + 2354 + /* Structure for thread data in subprocess */ 2355 + struct thread_sleep_data { 2356 + int syncfd_read; 2357 + }; 2358 + 2359 + static void *thread_sleep_and_wait(void *arg) 2360 + { 2361 + struct thread_sleep_data *data = (struct thread_sleep_data *)arg; 2362 + char sync_byte; 2363 + 2364 + /* Wait for signal to exit - read will unblock when pipe is closed */ 2365 + (void)read(data->syncfd_read, &sync_byte, 1); 2366 + return NULL; 2367 + } 2368 + 2369 + /* 2370 + * Test that namespaces become inactive after subprocess with multiple threads exits. 2371 + * Create a subprocess that unshares user and network namespaces, then creates two 2372 + * threads that share those namespaces. 
Verify that after all threads and subprocess 2373 + * exit, the namespaces are no longer listed by listns() and cannot be opened by 2374 + * open_by_handle_at(). 2375 + */ 2376 + TEST(thread_subprocess_ns_inactive_after_all_exit) 2377 + { 2378 + int pipefd[2]; 2379 + int sv[2]; 2380 + pid_t pid; 2381 + int status; 2382 + __u64 user_id, net_id; 2383 + struct file_handle *user_handle, *net_handle; 2384 + char user_buf[sizeof(*user_handle) + MAX_HANDLE_SZ]; 2385 + char net_buf[sizeof(*net_handle) + MAX_HANDLE_SZ]; 2386 + char sync_byte; 2387 + int ret; 2388 + 2389 + ASSERT_EQ(pipe(pipefd), 0); 2390 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 2391 + 2392 + pid = fork(); 2393 + ASSERT_GE(pid, 0); 2394 + 2395 + if (pid == 0) { 2396 + /* Child process */ 2397 + close(pipefd[0]); 2398 + close(sv[0]); 2399 + 2400 + /* Create user namespace with mappings */ 2401 + if (setup_userns() < 0) { 2402 + fprintf(stderr, "Child: setup_userns() failed: %s\n", strerror(errno)); 2403 + close(pipefd[1]); 2404 + close(sv[1]); 2405 + exit(1); 2406 + } 2407 + fprintf(stderr, "Child: setup_userns() succeeded\n"); 2408 + 2409 + /* Get user namespace ID */ 2410 + int user_fd = open("/proc/self/ns/user", O_RDONLY); 2411 + if (user_fd < 0) { 2412 + fprintf(stderr, "Child: open(/proc/self/ns/user) failed: %s\n", strerror(errno)); 2413 + close(pipefd[1]); 2414 + close(sv[1]); 2415 + exit(1); 2416 + } 2417 + 2418 + if (ioctl(user_fd, NS_GET_ID, &user_id) < 0) { 2419 + fprintf(stderr, "Child: ioctl(NS_GET_ID) for user ns failed: %s\n", strerror(errno)); 2420 + close(user_fd); 2421 + close(pipefd[1]); 2422 + close(sv[1]); 2423 + exit(1); 2424 + } 2425 + close(user_fd); 2426 + fprintf(stderr, "Child: user ns ID = %llu\n", (unsigned long long)user_id); 2427 + 2428 + /* Unshare network namespace */ 2429 + if (unshare(CLONE_NEWNET) < 0) { 2430 + fprintf(stderr, "Child: unshare(CLONE_NEWNET) failed: %s\n", strerror(errno)); 2431 + close(pipefd[1]); 2432 + close(sv[1]); 2433 + exit(1); 2434 + 
} 2435 + fprintf(stderr, "Child: unshare(CLONE_NEWNET) succeeded\n"); 2436 + 2437 + /* Get network namespace ID */ 2438 + int net_fd = open("/proc/self/ns/net", O_RDONLY); 2439 + if (net_fd < 0) { 2440 + fprintf(stderr, "Child: open(/proc/self/ns/net) failed: %s\n", strerror(errno)); 2441 + close(pipefd[1]); 2442 + close(sv[1]); 2443 + exit(1); 2444 + } 2445 + 2446 + if (ioctl(net_fd, NS_GET_ID, &net_id) < 0) { 2447 + fprintf(stderr, "Child: ioctl(NS_GET_ID) for net ns failed: %s\n", strerror(errno)); 2448 + close(net_fd); 2449 + close(pipefd[1]); 2450 + close(sv[1]); 2451 + exit(1); 2452 + } 2453 + close(net_fd); 2454 + fprintf(stderr, "Child: net ns ID = %llu\n", (unsigned long long)net_id); 2455 + 2456 + /* Send namespace IDs to parent */ 2457 + if (write(pipefd[1], &user_id, sizeof(user_id)) != sizeof(user_id)) { 2458 + fprintf(stderr, "Child: write(user_id) failed: %s\n", strerror(errno)); 2459 + exit(1); 2460 + } 2461 + if (write(pipefd[1], &net_id, sizeof(net_id)) != sizeof(net_id)) { 2462 + fprintf(stderr, "Child: write(net_id) failed: %s\n", strerror(errno)); 2463 + exit(1); 2464 + } 2465 + close(pipefd[1]); 2466 + fprintf(stderr, "Child: sent namespace IDs to parent\n"); 2467 + 2468 + /* Create two threads that share the namespaces */ 2469 + pthread_t thread1, thread2; 2470 + struct thread_sleep_data data; 2471 + data.syncfd_read = sv[1]; 2472 + 2473 + int ret_thread = pthread_create(&thread1, NULL, thread_sleep_and_wait, &data); 2474 + if (ret_thread != 0) { 2475 + fprintf(stderr, "Child: pthread_create(thread1) failed: %s\n", strerror(ret_thread)); 2476 + close(sv[1]); 2477 + exit(1); 2478 + } 2479 + fprintf(stderr, "Child: created thread1\n"); 2480 + 2481 + ret_thread = pthread_create(&thread2, NULL, thread_sleep_and_wait, &data); 2482 + if (ret_thread != 0) { 2483 + fprintf(stderr, "Child: pthread_create(thread2) failed: %s\n", strerror(ret_thread)); 2484 + close(sv[1]); 2485 + pthread_cancel(thread1); 2486 + exit(1); 2487 + } 2488 + fprintf(stderr, 
"Child: created thread2\n"); 2489 + 2490 + /* Wait for threads to complete - they will unblock when parent writes */ 2491 + fprintf(stderr, "Child: waiting for threads to exit\n"); 2492 + pthread_join(thread1, NULL); 2493 + fprintf(stderr, "Child: thread1 exited\n"); 2494 + pthread_join(thread2, NULL); 2495 + fprintf(stderr, "Child: thread2 exited\n"); 2496 + 2497 + close(sv[1]); 2498 + 2499 + /* Exit - namespaces should become inactive */ 2500 + fprintf(stderr, "Child: all threads joined, exiting with success\n"); 2501 + exit(0); 2502 + } 2503 + 2504 + /* Parent process */ 2505 + close(pipefd[1]); 2506 + close(sv[1]); 2507 + 2508 + TH_LOG("Parent: waiting to read namespace IDs from child"); 2509 + 2510 + /* Read namespace IDs from child */ 2511 + ret = read(pipefd[0], &user_id, sizeof(user_id)); 2512 + if (ret != sizeof(user_id)) { 2513 + TH_LOG("Parent: failed to read user_id, ret=%d, errno=%s", ret, strerror(errno)); 2514 + close(pipefd[0]); 2515 + sync_byte = 'X'; 2516 + (void)write(sv[0], &sync_byte, 1); 2517 + close(sv[0]); 2518 + waitpid(pid, NULL, 0); 2519 + SKIP(return, "Failed to read user namespace ID from child"); 2520 + } 2521 + 2522 + ret = read(pipefd[0], &net_id, sizeof(net_id)); 2523 + close(pipefd[0]); 2524 + if (ret != sizeof(net_id)) { 2525 + TH_LOG("Parent: failed to read net_id, ret=%d, errno=%s", ret, strerror(errno)); 2526 + sync_byte = 'X'; 2527 + (void)write(sv[0], &sync_byte, 1); 2528 + close(sv[0]); 2529 + waitpid(pid, NULL, 0); 2530 + SKIP(return, "Failed to read network namespace ID from child"); 2531 + } 2532 + 2533 + TH_LOG("Child created user ns %llu and net ns %llu with 2 threads", 2534 + (unsigned long long)user_id, (unsigned long long)net_id); 2535 + 2536 + /* Construct file handles */ 2537 + user_handle = (struct file_handle *)user_buf; 2538 + user_handle->handle_bytes = sizeof(struct nsfs_file_handle); 2539 + user_handle->handle_type = FILEID_NSFS; 2540 + struct nsfs_file_handle *user_fh = (struct nsfs_file_handle 
*)user_handle->f_handle; 2541 + user_fh->ns_id = user_id; 2542 + user_fh->ns_type = 0; 2543 + user_fh->ns_inum = 0; 2544 + 2545 + net_handle = (struct file_handle *)net_buf; 2546 + net_handle->handle_bytes = sizeof(struct nsfs_file_handle); 2547 + net_handle->handle_type = FILEID_NSFS; 2548 + struct nsfs_file_handle *net_fh = (struct nsfs_file_handle *)net_handle->f_handle; 2549 + net_fh->ns_id = net_id; 2550 + net_fh->ns_type = 0; 2551 + net_fh->ns_inum = 0; 2552 + 2553 + /* Verify namespaces are active while subprocess and threads are alive */ 2554 + TH_LOG("Verifying namespaces are active while subprocess with threads is running"); 2555 + int user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); 2556 + ASSERT_GE(user_fd, 0); 2557 + 2558 + int net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); 2559 + ASSERT_GE(net_fd, 0); 2560 + 2561 + close(user_fd); 2562 + close(net_fd); 2563 + 2564 + /* Also verify they appear in listns() */ 2565 + TH_LOG("Verifying namespaces appear in listns() while active"); 2566 + struct ns_id_req req = { 2567 + .size = sizeof(struct ns_id_req), 2568 + .spare = 0, 2569 + .ns_id = 0, 2570 + .ns_type = CLONE_NEWUSER, 2571 + .spare2 = 0, 2572 + .user_ns_id = 0, 2573 + }; 2574 + __u64 ns_ids[256]; 2575 + int nr_ids = sys_listns(&req, ns_ids, 256, 0); 2576 + if (nr_ids < 0) { 2577 + TH_LOG("listns() not available, skipping listns verification"); 2578 + } else { 2579 + /* Check if user_id is in the list */ 2580 + int found_user = 0; 2581 + for (int i = 0; i < nr_ids; i++) { 2582 + if (ns_ids[i] == user_id) { 2583 + found_user = 1; 2584 + break; 2585 + } 2586 + } 2587 + ASSERT_TRUE(found_user); 2588 + TH_LOG("User namespace found in listns() as expected"); 2589 + 2590 + /* Check network namespace */ 2591 + req.ns_type = CLONE_NEWNET; 2592 + nr_ids = sys_listns(&req, ns_ids, 256, 0); 2593 + if (nr_ids >= 0) { 2594 + int found_net = 0; 2595 + for (int i = 0; i < nr_ids; i++) { 2596 + if (ns_ids[i] == net_id) { 2597 + 
found_net = 1; 2598 + break; 2599 + } 2600 + } 2601 + ASSERT_TRUE(found_net); 2602 + TH_LOG("Network namespace found in listns() as expected"); 2603 + } 2604 + } 2605 + 2606 + /* Signal threads to exit */ 2607 + TH_LOG("Signaling threads to exit"); 2608 + sync_byte = 'X'; 2609 + /* Write two bytes - one for each thread */ 2610 + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); 2611 + ASSERT_EQ(write(sv[0], &sync_byte, 1), 1); 2612 + close(sv[0]); 2613 + 2614 + /* Wait for child process to exit */ 2615 + waitpid(pid, &status, 0); 2616 + ASSERT_TRUE(WIFEXITED(status)); 2617 + if (WEXITSTATUS(status) != 0) { 2618 + TH_LOG("Child process failed with exit code %d", WEXITSTATUS(status)); 2619 + SKIP(return, "Child process failed"); 2620 + } 2621 + 2622 + TH_LOG("Subprocess and all threads have exited successfully"); 2623 + 2624 + /* Verify namespaces are now inactive - open_by_handle_at should fail */ 2625 + TH_LOG("Verifying namespaces are inactive after subprocess and threads exit"); 2626 + user_fd = open_by_handle_at(FD_NSFS_ROOT, user_handle, O_RDONLY); 2627 + ASSERT_LT(user_fd, 0); 2628 + TH_LOG("User namespace inactive as expected: %s (errno=%d)", 2629 + strerror(errno), errno); 2630 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 2631 + 2632 + net_fd = open_by_handle_at(FD_NSFS_ROOT, net_handle, O_RDONLY); 2633 + ASSERT_LT(net_fd, 0); 2634 + TH_LOG("Network namespace inactive as expected: %s (errno=%d)", 2635 + strerror(errno), errno); 2636 + ASSERT_TRUE(errno == ENOENT || errno == ESTALE); 2637 + 2638 + /* Verify namespaces do NOT appear in listns() */ 2639 + TH_LOG("Verifying namespaces do NOT appear in listns() when inactive"); 2640 + memset(&req, 0, sizeof(req)); 2641 + req.size = sizeof(struct ns_id_req); 2642 + req.ns_type = CLONE_NEWUSER; 2643 + nr_ids = sys_listns(&req, ns_ids, 256, 0); 2644 + if (nr_ids >= 0) { 2645 + int found_user = 0; 2646 + for (int i = 0; i < nr_ids; i++) { 2647 + if (ns_ids[i] == user_id) { 2648 + found_user = 1; 2649 + break; 2650 + } 
2651 + } 2652 + ASSERT_FALSE(found_user); 2653 + TH_LOG("User namespace correctly not listed in listns()"); 2654 + 2655 + /* Check network namespace */ 2656 + req.ns_type = CLONE_NEWNET; 2657 + nr_ids = sys_listns(&req, ns_ids, 256, 0); 2658 + if (nr_ids >= 0) { 2659 + int found_net = 0; 2660 + for (int i = 0; i < nr_ids; i++) { 2661 + if (ns_ids[i] == net_id) { 2662 + found_net = 1; 2663 + break; 2664 + } 2665 + } 2666 + ASSERT_FALSE(found_net); 2667 + TH_LOG("Network namespace correctly not listed in listns()"); 2668 + } 2669 + } 2670 + } 2671 + 2672 + TEST_HARNESS_MAIN

+51 -56

tools/testing/selftests/namespaces/nsid_test.c

··· 6 6 #include <libgen.h> 7 7 #include <limits.h> 8 8 #include <pthread.h> 9 + #include <signal.h> 9 10 #include <string.h> 10 11 #include <sys/mount.h> 11 12 #include <poll.h> ··· 15 14 #include <sys/stat.h> 16 15 #include <sys/socket.h> 17 16 #include <sys/un.h> 17 + #include <sys/wait.h> 18 18 #include <unistd.h> 19 19 #include <linux/fs.h> 20 20 #include <linux/limits.h> 21 21 #include <linux/nsfs.h> 22 22 #include "../kselftest_harness.h" 23 + 24 + /* Fixture for tests that create child processes */ 25 + FIXTURE(nsid) { 26 + pid_t child_pid; 27 + }; 28 + 29 + FIXTURE_SETUP(nsid) { 30 + self->child_pid = 0; 31 + } 32 + 33 + FIXTURE_TEARDOWN(nsid) { 34 + /* Clean up any child process that may still be running */ 35 + if (self->child_pid > 0) { 36 + kill(self->child_pid, SIGKILL); 37 + waitpid(self->child_pid, NULL, 0); 38 + } 39 + } 23 40 24 41 TEST(nsid_mntns_basic) 25 42 { ··· 63 44 close(fd_mntns); 64 45 } 65 46 66 - TEST(nsid_mntns_separate) 47 + TEST_F(nsid, mntns_separate) 67 48 { 68 49 __u64 parent_mnt_ns_id = 0; 69 50 __u64 child_mnt_ns_id = 0; ··· 109 90 _exit(0); 110 91 } 111 92 93 + /* Track child for cleanup */ 94 + self->child_pid = pid; 95 + 112 96 /* Parent process */ 113 97 close(pipefd[1]); 114 98 ··· 121 99 122 100 if (buf == 'S') { 123 101 /* Child couldn't create namespace, skip test */ 124 - kill(pid, SIGTERM); 125 - waitpid(pid, NULL, 0); 126 102 close(fd_parent_mntns); 127 103 SKIP(return, "No permission to create mount namespace"); 128 104 } ··· 143 123 144 124 close(fd_parent_mntns); 145 125 close(fd_child_mntns); 146 - 147 - /* Clean up child process */ 148 - kill(pid, SIGTERM); 149 - waitpid(pid, NULL, 0); 150 126 } 151 127 152 128 TEST(nsid_cgroupns_basic) ··· 169 153 close(fd_cgroupns); 170 154 } 171 155 172 - TEST(nsid_cgroupns_separate) 156 + TEST_F(nsid, cgroupns_separate) 173 157 { 174 158 __u64 parent_cgroup_ns_id = 0; 175 159 __u64 child_cgroup_ns_id = 0; ··· 215 199 _exit(0); 216 200 } 217 201 202 + /* Track child for 
cleanup */ 203 + self->child_pid = pid; 204 + 218 205 /* Parent process */ 219 206 close(pipefd[1]); 220 207 ··· 227 208 228 209 if (buf == 'S') { 229 210 /* Child couldn't create namespace, skip test */ 230 - kill(pid, SIGTERM); 231 - waitpid(pid, NULL, 0); 232 211 close(fd_parent_cgroupns); 233 212 SKIP(return, "No permission to create cgroup namespace"); 234 213 } ··· 249 232 250 233 close(fd_parent_cgroupns); 251 234 close(fd_child_cgroupns); 252 - 253 - /* Clean up child process */ 254 - kill(pid, SIGTERM); 255 - waitpid(pid, NULL, 0); 256 235 } 257 236 258 237 TEST(nsid_ipcns_basic) ··· 275 262 close(fd_ipcns); 276 263 } 277 264 278 - TEST(nsid_ipcns_separate) 265 + TEST_F(nsid, ipcns_separate) 279 266 { 280 267 __u64 parent_ipc_ns_id = 0; 281 268 __u64 child_ipc_ns_id = 0; ··· 321 308 _exit(0); 322 309 } 323 310 311 + /* Track child for cleanup */ 312 + self->child_pid = pid; 313 + 324 314 /* Parent process */ 325 315 close(pipefd[1]); 326 316 ··· 333 317 334 318 if (buf == 'S') { 335 319 /* Child couldn't create namespace, skip test */ 336 - kill(pid, SIGTERM); 337 - waitpid(pid, NULL, 0); 338 320 close(fd_parent_ipcns); 339 321 SKIP(return, "No permission to create IPC namespace"); 340 322 } ··· 355 341 356 342 close(fd_parent_ipcns); 357 343 close(fd_child_ipcns); 358 - 359 - /* Clean up child process */ 360 - kill(pid, SIGTERM); 361 - waitpid(pid, NULL, 0); 362 344 } 363 345 364 346 TEST(nsid_utsns_basic) ··· 381 371 close(fd_utsns); 382 372 } 383 373 384 - TEST(nsid_utsns_separate) 374 + TEST_F(nsid, utsns_separate) 385 375 { 386 376 __u64 parent_uts_ns_id = 0; 387 377 __u64 child_uts_ns_id = 0; ··· 427 417 _exit(0); 428 418 } 429 419 420 + /* Track child for cleanup */ 421 + self->child_pid = pid; 422 + 430 423 /* Parent process */ 431 424 close(pipefd[1]); 432 425 ··· 439 426 440 427 if (buf == 'S') { 441 428 /* Child couldn't create namespace, skip test */ 442 - kill(pid, SIGTERM); 443 - waitpid(pid, NULL, 0); 444 429 close(fd_parent_utsns); 445 430 
SKIP(return, "No permission to create UTS namespace"); 446 431 } ··· 461 450 462 451 close(fd_parent_utsns); 463 452 close(fd_child_utsns); 464 - 465 - /* Clean up child process */ 466 - kill(pid, SIGTERM); 467 - waitpid(pid, NULL, 0); 468 453 } 469 454 470 455 TEST(nsid_userns_basic) ··· 487 480 close(fd_userns); 488 481 } 489 482 490 - TEST(nsid_userns_separate) 483 + TEST_F(nsid, userns_separate) 491 484 { 492 485 __u64 parent_user_ns_id = 0; 493 486 __u64 child_user_ns_id = 0; ··· 533 526 _exit(0); 534 527 } 535 528 529 + /* Track child for cleanup */ 530 + self->child_pid = pid; 531 + 536 532 /* Parent process */ 537 533 close(pipefd[1]); 538 534 ··· 545 535 546 536 if (buf == 'S') { 547 537 /* Child couldn't create namespace, skip test */ 548 - kill(pid, SIGTERM); 549 - waitpid(pid, NULL, 0); 550 538 close(fd_parent_userns); 551 539 SKIP(return, "No permission to create user namespace"); 552 540 } ··· 567 559 568 560 close(fd_parent_userns); 569 561 close(fd_child_userns); 570 - 571 - /* Clean up child process */ 572 - kill(pid, SIGTERM); 573 - waitpid(pid, NULL, 0); 574 562 } 575 563 576 564 TEST(nsid_timens_basic) ··· 595 591 close(fd_timens); 596 592 } 597 593 598 - TEST(nsid_timens_separate) 594 + TEST_F(nsid, timens_separate) 599 595 { 600 596 __u64 parent_time_ns_id = 0; 601 597 __u64 child_time_ns_id = 0; ··· 656 652 } 657 653 } 658 654 655 + /* Track child for cleanup */ 656 + self->child_pid = pid; 657 + 659 658 /* Parent process */ 660 659 close(pipefd[1]); 661 660 ··· 667 660 668 661 if (buf == 'S') { 669 662 /* Child couldn't create namespace, skip test */ 670 - kill(pid, SIGTERM); 671 - waitpid(pid, NULL, 0); 672 663 close(fd_parent_timens); 673 664 close(pipefd[0]); 674 665 SKIP(return, "Cannot create time namespace"); ··· 694 689 695 690 close(fd_parent_timens); 696 691 close(fd_child_timens); 697 - 698 - /* Clean up child process */ 699 - kill(pid, SIGTERM); 700 - waitpid(pid, NULL, 0); 701 692 } 702 693 703 694 TEST(nsid_pidns_basic) ··· 720 
719 close(fd_pidns); 721 720 } 722 721 723 - TEST(nsid_pidns_separate) 722 + TEST_F(nsid, pidns_separate) 724 723 { 725 724 __u64 parent_pid_ns_id = 0; 726 725 __u64 child_pid_ns_id = 0; ··· 777 776 } 778 777 } 779 778 779 + /* Track child for cleanup */ 780 + self->child_pid = pid; 781 + 780 782 /* Parent process */ 781 783 close(pipefd[1]); 782 784 ··· 788 784 789 785 if (buf == 'S') { 790 786 /* Child couldn't create namespace, skip test */ 791 - kill(pid, SIGTERM); 792 - waitpid(pid, NULL, 0); 793 787 close(fd_parent_pidns); 794 788 close(pipefd[0]); 795 789 SKIP(return, "No permission to create PID namespace"); ··· 815 813 816 814 close(fd_parent_pidns); 817 815 close(fd_child_pidns); 818 - 819 - /* Clean up child process */ 820 - kill(pid, SIGTERM); 821 - waitpid(pid, NULL, 0); 822 816 } 823 817 824 818 TEST(nsid_netns_basic) ··· 858 860 close(fd_netns); 859 861 } 860 862 861 - TEST(nsid_netns_separate) 863 + TEST_F(nsid, netns_separate) 862 864 { 863 865 __u64 parent_net_ns_id = 0; 864 866 __u64 parent_netns_cookie = 0; ··· 918 920 _exit(0); 919 921 } 920 922 923 + /* Track child for cleanup */ 924 + self->child_pid = pid; 925 + 921 926 /* Parent process */ 922 927 close(pipefd[1]); 923 928 ··· 930 929 931 930 if (buf == 'S') { 932 931 /* Child couldn't create namespace, skip test */ 933 - kill(pid, SIGTERM); 934 - waitpid(pid, NULL, 0); 935 932 close(fd_parent_netns); 936 933 close(parent_sock); 937 934 SKIP(return, "No permission to create network namespace"); ··· 976 977 close(fd_parent_netns); 977 978 close(fd_child_netns); 978 979 close(parent_sock); 979 - 980 - /* Clean up child process */ 981 - kill(pid, SIGTERM); 982 - waitpid(pid, NULL, 0); 983 980 } 984 981 985 982 TEST_HARNESS_MAIN

+113

tools/testing/selftests/namespaces/regression_pidfd_setns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <sched.h> 5 + #include <signal.h> 6 + #include <stdio.h> 7 + #include <stdlib.h> 8 + #include <string.h> 9 + #include <sys/socket.h> 10 + #include <unistd.h> 11 + #include "../pidfd/pidfd.h" 12 + #include "../kselftest_harness.h" 13 + 14 + /* 15 + * Regression tests for the setns(pidfd) active reference counting bug. 16 + * 17 + * These tests are based on the reproducers that triggered the race condition 18 + * fixed by commit 1c465d0518dc ("ns: handle setns(pidfd, ...) cleanly"). 19 + * 20 + * The bug: When using setns() with a pidfd, if the target task exits between 21 + * prepare_nsset() and commit_nsset(), the namespaces would become inactive. 22 + * Then ns_ref_active_get() would increment from 0 without properly resurrecting 23 + * the owner chain, causing active reference count underflows. 24 + */ 25 + 26 + /* 27 + * Simple pidfd setns test using create_child()+unshare(). 28 + * 29 + * Without the fix, this would trigger active refcount warnings when the 30 + * parent exits after doing setns(pidfd) on a child that has already exited. 
31 + */ 32 + TEST(simple_pidfd_setns) 33 + { 34 + pid_t child_pid; 35 + int pidfd = -1; 36 + int ret; 37 + int sv[2]; 38 + char c; 39 + 40 + /* Ignore SIGCHLD for autoreap */ 41 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 42 + 43 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 44 + 45 + /* Create a child process without namespaces initially */ 46 + child_pid = create_child(&pidfd, 0); 47 + ASSERT_GE(child_pid, 0); 48 + 49 + if (child_pid == 0) { 50 + close(sv[0]); 51 + 52 + if (unshare(CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET | CLONE_NEWUSER) < 0) { 53 + close(sv[1]); 54 + _exit(1); 55 + } 56 + 57 + /* Signal parent that namespaces are ready */ 58 + if (write_nointr(sv[1], "1", 1) < 0) { 59 + close(sv[1]); 60 + _exit(1); 61 + } 62 + 63 + close(sv[1]); 64 + _exit(0); 65 + } 66 + ASSERT_GE(pidfd, 0); 67 + EXPECT_EQ(close(sv[1]), 0); 68 + 69 + ret = read_nointr(sv[0], &c, 1); 70 + ASSERT_EQ(ret, 1); 71 + EXPECT_EQ(close(sv[0]), 0); 72 + 73 + /* Set to child's namespaces via pidfd */ 74 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 75 + TH_LOG("setns() returned %d", ret); 76 + close(pidfd); 77 + } 78 + 79 + /* 80 + * Simple pidfd setns test using create_child(). 81 + * 82 + * This variation uses create_child() with namespace flags directly. 83 + * Namespaces are created immediately at clone time. 
84 + */ 85 + TEST(simple_pidfd_setns_clone) 86 + { 87 + pid_t child_pid; 88 + int pidfd = -1; 89 + int ret; 90 + 91 + /* Ignore SIGCHLD for autoreap */ 92 + ASSERT_NE(signal(SIGCHLD, SIG_IGN), SIG_ERR); 93 + 94 + /* Create a child process with new namespaces using create_child() */ 95 + child_pid = create_child(&pidfd, CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWNET); 96 + ASSERT_GE(child_pid, 0); 97 + 98 + if (child_pid == 0) { 99 + /* Child: sleep for a while so parent can setns to us */ 100 + sleep(2); 101 + _exit(0); 102 + } 103 + 104 + /* Parent: pidfd was already created by create_child() */ 105 + ASSERT_GE(pidfd, 0); 106 + 107 + /* Set to child's namespaces via pidfd */ 108 + ret = setns(pidfd, CLONE_NEWUTS | CLONE_NEWIPC); 109 + close(pidfd); 110 + TH_LOG("setns() returned %d", ret); 111 + } 112 + 113 + TEST_HARNESS_MAIN

+1824

tools/testing/selftests/namespaces/siocgskns_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <sys/ioctl.h> 11 + #include <sys/socket.h> 12 + #include <sys/stat.h> 13 + #include <sys/types.h> 14 + #include <sys/wait.h> 15 + #include <unistd.h> 16 + #include <linux/if.h> 17 + #include <linux/sockios.h> 18 + #include <linux/nsfs.h> 19 + #include <arpa/inet.h> 20 + #include "../kselftest_harness.h" 21 + #include "../filesystems/utils.h" 22 + #include "wrappers.h" 23 + 24 + #ifndef SIOCGSKNS 25 + #define SIOCGSKNS 0x894C 26 + #endif 27 + 28 + #ifndef FD_NSFS_ROOT 29 + #define FD_NSFS_ROOT -10003 30 + #endif 31 + 32 + #ifndef FILEID_NSFS 33 + #define FILEID_NSFS 0xf1 34 + #endif 35 + 36 + /* 37 + * Test basic SIOCGSKNS functionality. 38 + * Create a socket and verify SIOCGSKNS returns the correct network namespace. 39 + */ 40 + TEST(siocgskns_basic) 41 + { 42 + int sock_fd, netns_fd, current_netns_fd; 43 + struct stat st1, st2; 44 + 45 + /* Create a TCP socket */ 46 + sock_fd = socket(AF_INET, SOCK_STREAM, 0); 47 + ASSERT_GE(sock_fd, 0); 48 + 49 + /* Use SIOCGSKNS to get network namespace */ 50 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 51 + if (netns_fd < 0) { 52 + close(sock_fd); 53 + if (errno == ENOTTY || errno == EINVAL) 54 + SKIP(return, "SIOCGSKNS not supported"); 55 + ASSERT_GE(netns_fd, 0); 56 + } 57 + 58 + /* Get current network namespace */ 59 + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); 60 + ASSERT_GE(current_netns_fd, 0); 61 + 62 + /* Verify they match */ 63 + ASSERT_EQ(fstat(netns_fd, &st1), 0); 64 + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); 65 + ASSERT_EQ(st1.st_ino, st2.st_ino); 66 + 67 + close(sock_fd); 68 + close(netns_fd); 69 + close(current_netns_fd); 70 + } 71 + 72 + /* 73 + * Test that socket file descriptors keep network namespaces active. 
74 + * Create a network namespace, create a socket in it, then exit the namespace. 75 + * The namespace should remain active while the socket FD is held. 76 + */ 77 + TEST(siocgskns_keeps_netns_active) 78 + { 79 + int sock_fd, netns_fd, test_fd; 80 + int ipc_sockets[2]; 81 + pid_t pid; 82 + int status; 83 + struct stat st; 84 + 85 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 86 + 87 + pid = fork(); 88 + ASSERT_GE(pid, 0); 89 + 90 + if (pid == 0) { 91 + /* Child: create new netns and socket */ 92 + close(ipc_sockets[0]); 93 + 94 + if (unshare(CLONE_NEWNET) < 0) { 95 + TH_LOG("unshare(CLONE_NEWNET) failed: %s", strerror(errno)); 96 + close(ipc_sockets[1]); 97 + exit(1); 98 + } 99 + 100 + /* Create a socket in the new network namespace */ 101 + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); 102 + if (sock_fd < 0) { 103 + TH_LOG("socket() failed: %s", strerror(errno)); 104 + close(ipc_sockets[1]); 105 + exit(1); 106 + } 107 + 108 + /* Send socket FD to parent via SCM_RIGHTS */ 109 + struct msghdr msg = {0}; 110 + struct iovec iov = {0}; 111 + char buf[1] = {'X'}; 112 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 113 + 114 + iov.iov_base = buf; 115 + iov.iov_len = 1; 116 + msg.msg_iov = &iov; 117 + msg.msg_iovlen = 1; 118 + msg.msg_control = cmsg_buf; 119 + msg.msg_controllen = sizeof(cmsg_buf); 120 + 121 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 122 + cmsg->cmsg_level = SOL_SOCKET; 123 + cmsg->cmsg_type = SCM_RIGHTS; 124 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 125 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 126 + 127 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 128 + close(sock_fd); 129 + close(ipc_sockets[1]); 130 + exit(1); 131 + } 132 + 133 + close(sock_fd); 134 + close(ipc_sockets[1]); 135 + exit(0); 136 + } 137 + 138 + /* Parent: receive socket FD */ 139 + close(ipc_sockets[1]); 140 + 141 + struct msghdr msg = {0}; 142 + struct iovec iov = {0}; 143 + char buf[1]; 144 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 145 + 146 + 
iov.iov_base = buf; 147 + iov.iov_len = 1; 148 + msg.msg_iov = &iov; 149 + msg.msg_iovlen = 1; 150 + msg.msg_control = cmsg_buf; 151 + msg.msg_controllen = sizeof(cmsg_buf); 152 + 153 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 154 + close(ipc_sockets[0]); 155 + ASSERT_EQ(n, 1); 156 + 157 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 158 + ASSERT_NE(cmsg, NULL); 159 + ASSERT_EQ(cmsg->cmsg_type, SCM_RIGHTS); 160 + 161 + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); 162 + 163 + /* Wait for child to exit */ 164 + waitpid(pid, &status, 0); 165 + ASSERT_TRUE(WIFEXITED(status)); 166 + ASSERT_EQ(WEXITSTATUS(status), 0); 167 + 168 + /* Get network namespace from socket */ 169 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 170 + if (netns_fd < 0) { 171 + close(sock_fd); 172 + if (errno == ENOTTY || errno == EINVAL) 173 + SKIP(return, "SIOCGSKNS not supported"); 174 + ASSERT_GE(netns_fd, 0); 175 + } 176 + 177 + ASSERT_EQ(fstat(netns_fd, &st), 0); 178 + 179 + /* 180 + * Namespace should still be active because socket FD keeps it alive. 181 + * Try to access it via /proc/self/fd/<fd>. 182 + */ 183 + char path[64]; 184 + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fd); 185 + test_fd = open(path, O_RDONLY); 186 + ASSERT_GE(test_fd, 0); 187 + close(test_fd); 188 + close(netns_fd); 189 + 190 + /* Close socket - namespace should become inactive */ 191 + close(sock_fd); 192 + 193 + /* Try SIOCGSKNS again - should fail since socket is closed */ 194 + ASSERT_LT(ioctl(sock_fd, SIOCGSKNS), 0); 195 + } 196 + 197 + /* 198 + * Test SIOCGSKNS with different socket types (TCP, UDP, RAW). 
199 + */ 200 + TEST(siocgskns_socket_types) 201 + { 202 + int sock_tcp, sock_udp, sock_raw; 203 + int netns_tcp, netns_udp, netns_raw; 204 + struct stat st_tcp, st_udp, st_raw; 205 + 206 + /* TCP socket */ 207 + sock_tcp = socket(AF_INET, SOCK_STREAM, 0); 208 + ASSERT_GE(sock_tcp, 0); 209 + 210 + /* UDP socket */ 211 + sock_udp = socket(AF_INET, SOCK_DGRAM, 0); 212 + ASSERT_GE(sock_udp, 0); 213 + 214 + /* RAW socket (may require privileges) */ 215 + sock_raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); 216 + if (sock_raw < 0 && (errno == EPERM || errno == EACCES)) { 217 + sock_raw = -1; /* Skip raw socket test */ 218 + } 219 + 220 + /* Test SIOCGSKNS on TCP */ 221 + netns_tcp = ioctl(sock_tcp, SIOCGSKNS); 222 + if (netns_tcp < 0) { 223 + close(sock_tcp); 224 + close(sock_udp); 225 + if (sock_raw >= 0) close(sock_raw); 226 + if (errno == ENOTTY || errno == EINVAL) 227 + SKIP(return, "SIOCGSKNS not supported"); 228 + ASSERT_GE(netns_tcp, 0); 229 + } 230 + 231 + /* Test SIOCGSKNS on UDP */ 232 + netns_udp = ioctl(sock_udp, SIOCGSKNS); 233 + ASSERT_GE(netns_udp, 0); 234 + 235 + /* Test SIOCGSKNS on RAW (if available) */ 236 + if (sock_raw >= 0) { 237 + netns_raw = ioctl(sock_raw, SIOCGSKNS); 238 + ASSERT_GE(netns_raw, 0); 239 + } 240 + 241 + /* Verify all return the same network namespace */ 242 + ASSERT_EQ(fstat(netns_tcp, &st_tcp), 0); 243 + ASSERT_EQ(fstat(netns_udp, &st_udp), 0); 244 + ASSERT_EQ(st_tcp.st_ino, st_udp.st_ino); 245 + 246 + if (sock_raw >= 0) { 247 + ASSERT_EQ(fstat(netns_raw, &st_raw), 0); 248 + ASSERT_EQ(st_tcp.st_ino, st_raw.st_ino); 249 + close(netns_raw); 250 + close(sock_raw); 251 + } 252 + 253 + close(netns_tcp); 254 + close(netns_udp); 255 + close(sock_tcp); 256 + close(sock_udp); 257 + } 258 + 259 + /* 260 + * Test SIOCGSKNS across setns. 261 + * Create a socket in netns A, switch to netns B, verify SIOCGSKNS still 262 + * returns netns A. 
263 + */ 264 + TEST(siocgskns_across_setns) 265 + { 266 + int sock_fd, netns_a_fd, netns_b_fd, result_fd; 267 + struct stat st_a; 268 + 269 + /* Get current netns (A) */ 270 + netns_a_fd = open("/proc/self/ns/net", O_RDONLY); 271 + ASSERT_GE(netns_a_fd, 0); 272 + ASSERT_EQ(fstat(netns_a_fd, &st_a), 0); 273 + 274 + /* Create socket in netns A */ 275 + sock_fd = socket(AF_INET, SOCK_STREAM, 0); 276 + ASSERT_GE(sock_fd, 0); 277 + 278 + /* Create new netns (B) */ 279 + ASSERT_EQ(unshare(CLONE_NEWNET), 0); 280 + 281 + netns_b_fd = open("/proc/self/ns/net", O_RDONLY); 282 + ASSERT_GE(netns_b_fd, 0); 283 + 284 + /* Get netns from socket created in A */ 285 + result_fd = ioctl(sock_fd, SIOCGSKNS); 286 + if (result_fd < 0) { 287 + close(sock_fd); 288 + setns(netns_a_fd, CLONE_NEWNET); 289 + close(netns_a_fd); 290 + close(netns_b_fd); 291 + if (errno == ENOTTY || errno == EINVAL) 292 + SKIP(return, "SIOCGSKNS not supported"); 293 + ASSERT_GE(result_fd, 0); 294 + } 295 + 296 + /* Verify it still points to netns A */ 297 + struct stat st_result_stat; 298 + ASSERT_EQ(fstat(result_fd, &st_result_stat), 0); 299 + ASSERT_EQ(st_a.st_ino, st_result_stat.st_ino); 300 + 301 + close(result_fd); 302 + close(sock_fd); 303 + close(netns_b_fd); 304 + 305 + /* Restore original netns */ 306 + ASSERT_EQ(setns(netns_a_fd, CLONE_NEWNET), 0); 307 + close(netns_a_fd); 308 + } 309 + 310 + /* 311 + * Test SIOCGSKNS fails on non-socket file descriptors. 
312 + */ 313 + TEST(siocgskns_non_socket) 314 + { 315 + int fd; 316 + int pipefd[2]; 317 + 318 + /* Test on regular file */ 319 + fd = open("/dev/null", O_RDONLY); 320 + ASSERT_GE(fd, 0); 321 + 322 + ASSERT_LT(ioctl(fd, SIOCGSKNS), 0); 323 + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); 324 + close(fd); 325 + 326 + /* Test on pipe */ 327 + ASSERT_EQ(pipe(pipefd), 0); 328 + 329 + ASSERT_LT(ioctl(pipefd[0], SIOCGSKNS), 0); 330 + ASSERT_TRUE(errno == ENOTTY || errno == EINVAL); 331 + 332 + close(pipefd[0]); 333 + close(pipefd[1]); 334 + } 335 + 336 + /* 337 + * Test multiple sockets keep the same network namespace active. 338 + * Create multiple sockets, verify closing some doesn't affect others. 339 + */ 340 + TEST(siocgskns_multiple_sockets) 341 + { 342 + int socks[5]; 343 + int netns_fds[5]; 344 + int i; 345 + struct stat st; 346 + ino_t netns_ino; 347 + 348 + /* Create new network namespace */ 349 + ASSERT_EQ(unshare(CLONE_NEWNET), 0); 350 + 351 + /* Create multiple sockets */ 352 + for (i = 0; i < 5; i++) { 353 + socks[i] = socket(AF_INET, SOCK_STREAM, 0); 354 + ASSERT_GE(socks[i], 0); 355 + } 356 + 357 + /* Get netns from all sockets */ 358 + for (i = 0; i < 5; i++) { 359 + netns_fds[i] = ioctl(socks[i], SIOCGSKNS); 360 + if (netns_fds[i] < 0) { 361 + int j; 362 + for (j = 0; j <= i; j++) { 363 + close(socks[j]); 364 + if (j < i && netns_fds[j] >= 0) 365 + close(netns_fds[j]); 366 + } 367 + if (errno == ENOTTY || errno == EINVAL) 368 + SKIP(return, "SIOCGSKNS not supported"); 369 + ASSERT_GE(netns_fds[i], 0); 370 + } 371 + } 372 + 373 + /* Verify all point to same netns */ 374 + ASSERT_EQ(fstat(netns_fds[0], &st), 0); 375 + netns_ino = st.st_ino; 376 + 377 + for (i = 1; i < 5; i++) { 378 + ASSERT_EQ(fstat(netns_fds[i], &st), 0); 379 + ASSERT_EQ(st.st_ino, netns_ino); 380 + } 381 + 382 + /* Close some sockets */ 383 + for (i = 0; i < 3; i++) { 384 + close(socks[i]); 385 + } 386 + 387 + /* Remaining netns FDs should still be valid */ 388 + for (i = 3; i < 5; 
i++) { 389 + char path[64]; 390 + snprintf(path, sizeof(path), "/proc/self/fd/%d", netns_fds[i]); 391 + int test_fd = open(path, O_RDONLY); 392 + ASSERT_GE(test_fd, 0); 393 + close(test_fd); 394 + } 395 + 396 + /* Cleanup */ 397 + for (i = 0; i < 5; i++) { 398 + if (i >= 3) 399 + close(socks[i]); 400 + close(netns_fds[i]); 401 + } 402 + } 403 + 404 + /* 405 + * Test socket keeps netns active after creating process exits. 406 + * Verify that as long as the socket FD exists, the namespace remains active. 407 + */ 408 + TEST(siocgskns_netns_lifecycle) 409 + { 410 + int sock_fd, netns_fd; 411 + int ipc_sockets[2]; 412 + int syncpipe[2]; 413 + pid_t pid; 414 + int status; 415 + char sync_byte; 416 + struct stat st; 417 + ino_t netns_ino; 418 + 419 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 420 + 421 + ASSERT_EQ(pipe(syncpipe), 0); 422 + 423 + pid = fork(); 424 + ASSERT_GE(pid, 0); 425 + 426 + if (pid == 0) { 427 + /* Child */ 428 + close(ipc_sockets[0]); 429 + close(syncpipe[1]); 430 + 431 + if (unshare(CLONE_NEWNET) < 0) { 432 + close(ipc_sockets[1]); 433 + close(syncpipe[0]); 434 + exit(1); 435 + } 436 + 437 + sock_fd = socket(AF_INET, SOCK_STREAM, 0); 438 + if (sock_fd < 0) { 439 + close(ipc_sockets[1]); 440 + close(syncpipe[0]); 441 + exit(1); 442 + } 443 + 444 + /* Send socket to parent */ 445 + struct msghdr msg = {0}; 446 + struct iovec iov = {0}; 447 + char buf[1] = {'X'}; 448 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 449 + 450 + iov.iov_base = buf; 451 + iov.iov_len = 1; 452 + msg.msg_iov = &iov; 453 + msg.msg_iovlen = 1; 454 + msg.msg_control = cmsg_buf; 455 + msg.msg_controllen = sizeof(cmsg_buf); 456 + 457 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 458 + cmsg->cmsg_level = SOL_SOCKET; 459 + cmsg->cmsg_type = SCM_RIGHTS; 460 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 461 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 462 + 463 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 464 + close(sock_fd); 465 + 
close(ipc_sockets[1]); 466 + close(syncpipe[0]); 467 + exit(1); 468 + } 469 + 470 + close(sock_fd); 471 + close(ipc_sockets[1]); 472 + 473 + /* Wait for parent signal */ 474 + read(syncpipe[0], &sync_byte, 1); 475 + close(syncpipe[0]); 476 + exit(0); 477 + } 478 + 479 + /* Parent */ 480 + close(ipc_sockets[1]); 481 + close(syncpipe[0]); 482 + 483 + /* Receive socket FD */ 484 + struct msghdr msg = {0}; 485 + struct iovec iov = {0}; 486 + char buf[1]; 487 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 488 + 489 + iov.iov_base = buf; 490 + iov.iov_len = 1; 491 + msg.msg_iov = &iov; 492 + msg.msg_iovlen = 1; 493 + msg.msg_control = cmsg_buf; 494 + msg.msg_controllen = sizeof(cmsg_buf); 495 + 496 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 497 + close(ipc_sockets[0]); 498 + ASSERT_EQ(n, 1); 499 + 500 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 501 + ASSERT_NE(cmsg, NULL); 502 + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); 503 + 504 + /* Get netns from socket while child is alive */ 505 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 506 + if (netns_fd < 0) { 507 + sync_byte = 'G'; 508 + write(syncpipe[1], &sync_byte, 1); 509 + close(syncpipe[1]); 510 + close(sock_fd); 511 + waitpid(pid, NULL, 0); 512 + if (errno == ENOTTY || errno == EINVAL) 513 + SKIP(return, "SIOCGSKNS not supported"); 514 + ASSERT_GE(netns_fd, 0); 515 + } 516 + ASSERT_EQ(fstat(netns_fd, &st), 0); 517 + netns_ino = st.st_ino; 518 + 519 + /* Signal child to exit */ 520 + sync_byte = 'G'; 521 + write(syncpipe[1], &sync_byte, 1); 522 + close(syncpipe[1]); 523 + 524 + waitpid(pid, &status, 0); 525 + ASSERT_TRUE(WIFEXITED(status)); 526 + 527 + /* 528 + * Socket FD should still keep namespace active even after 529 + * the creating process exited. 
530 + */ 531 + int test_fd = ioctl(sock_fd, SIOCGSKNS); 532 + ASSERT_GE(test_fd, 0); 533 + 534 + struct stat st_test; 535 + ASSERT_EQ(fstat(test_fd, &st_test), 0); 536 + ASSERT_EQ(st_test.st_ino, netns_ino); 537 + 538 + close(test_fd); 539 + close(netns_fd); 540 + 541 + /* Close socket - namespace should become inactive */ 542 + close(sock_fd); 543 + } 544 + 545 + /* 546 + * Test IPv6 sockets also work with SIOCGSKNS. 547 + */ 548 + TEST(siocgskns_ipv6) 549 + { 550 + int sock_fd, netns_fd, current_netns_fd; 551 + struct stat st1, st2; 552 + 553 + /* Create an IPv6 TCP socket */ 554 + sock_fd = socket(AF_INET6, SOCK_STREAM, 0); 555 + ASSERT_GE(sock_fd, 0); 556 + 557 + /* Use SIOCGSKNS */ 558 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 559 + if (netns_fd < 0) { 560 + close(sock_fd); 561 + if (errno == ENOTTY || errno == EINVAL) 562 + SKIP(return, "SIOCGSKNS not supported"); 563 + ASSERT_GE(netns_fd, 0); 564 + } 565 + 566 + /* Verify it matches current namespace */ 567 + current_netns_fd = open("/proc/self/ns/net", O_RDONLY); 568 + ASSERT_GE(current_netns_fd, 0); 569 + 570 + ASSERT_EQ(fstat(netns_fd, &st1), 0); 571 + ASSERT_EQ(fstat(current_netns_fd, &st2), 0); 572 + ASSERT_EQ(st1.st_ino, st2.st_ino); 573 + 574 + close(sock_fd); 575 + close(netns_fd); 576 + close(current_netns_fd); 577 + } 578 + 579 + /* 580 + * Test that socket-kept netns appears in listns() output. 581 + * Verify that a network namespace kept alive by a socket FD appears in 582 + * listns() output even after the creating process exits, and that it 583 + * disappears when the socket is closed. 
584 + */ 585 + TEST(siocgskns_listns_visibility) 586 + { 587 + int sock_fd, netns_fd, owner_fd; 588 + int ipc_sockets[2]; 589 + pid_t pid; 590 + int status; 591 + __u64 netns_id, owner_id; 592 + struct ns_id_req req = { 593 + .size = sizeof(req), 594 + .spare = 0, 595 + .ns_id = 0, 596 + .ns_type = CLONE_NEWNET, 597 + .spare2 = 0, 598 + .user_ns_id = 0, 599 + }; 600 + __u64 ns_ids[256]; 601 + int ret, i; 602 + bool found_netns = false; 603 + 604 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 605 + 606 + pid = fork(); 607 + ASSERT_GE(pid, 0); 608 + 609 + if (pid == 0) { 610 + /* Child: create new netns and socket */ 611 + close(ipc_sockets[0]); 612 + 613 + if (unshare(CLONE_NEWNET) < 0) { 614 + close(ipc_sockets[1]); 615 + exit(1); 616 + } 617 + 618 + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); 619 + if (sock_fd < 0) { 620 + close(ipc_sockets[1]); 621 + exit(1); 622 + } 623 + 624 + /* Send socket FD to parent via SCM_RIGHTS */ 625 + struct msghdr msg = {0}; 626 + struct iovec iov = {0}; 627 + char buf[1] = {'X'}; 628 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 629 + 630 + iov.iov_base = buf; 631 + iov.iov_len = 1; 632 + msg.msg_iov = &iov; 633 + msg.msg_iovlen = 1; 634 + msg.msg_control = cmsg_buf; 635 + msg.msg_controllen = sizeof(cmsg_buf); 636 + 637 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 638 + cmsg->cmsg_level = SOL_SOCKET; 639 + cmsg->cmsg_type = SCM_RIGHTS; 640 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 641 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 642 + 643 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 644 + close(sock_fd); 645 + close(ipc_sockets[1]); 646 + exit(1); 647 + } 648 + 649 + close(sock_fd); 650 + close(ipc_sockets[1]); 651 + exit(0); 652 + } 653 + 654 + /* Parent: receive socket FD */ 655 + close(ipc_sockets[1]); 656 + 657 + struct msghdr msg = {0}; 658 + struct iovec iov = {0}; 659 + char buf[1]; 660 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 661 + 662 + iov.iov_base = buf; 663 + iov.iov_len = 1; 664 + 
msg.msg_iov = &iov; 665 + msg.msg_iovlen = 1; 666 + msg.msg_control = cmsg_buf; 667 + msg.msg_controllen = sizeof(cmsg_buf); 668 + 669 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 670 + close(ipc_sockets[0]); 671 + ASSERT_EQ(n, 1); 672 + 673 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 674 + ASSERT_NE(cmsg, NULL); 675 + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); 676 + 677 + /* Wait for child to exit */ 678 + waitpid(pid, &status, 0); 679 + ASSERT_TRUE(WIFEXITED(status)); 680 + ASSERT_EQ(WEXITSTATUS(status), 0); 681 + 682 + /* Get network namespace from socket */ 683 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 684 + if (netns_fd < 0) { 685 + close(sock_fd); 686 + if (errno == ENOTTY || errno == EINVAL) 687 + SKIP(return, "SIOCGSKNS not supported"); 688 + ASSERT_GE(netns_fd, 0); 689 + } 690 + 691 + /* Get namespace ID */ 692 + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); 693 + if (ret < 0) { 694 + close(sock_fd); 695 + close(netns_fd); 696 + if (errno == ENOTTY || errno == EINVAL) 697 + SKIP(return, "NS_GET_ID not supported"); 698 + ASSERT_EQ(ret, 0); 699 + } 700 + 701 + /* Get owner user namespace */ 702 + owner_fd = ioctl(netns_fd, NS_GET_USERNS); 703 + if (owner_fd < 0) { 704 + close(sock_fd); 705 + close(netns_fd); 706 + if (errno == ENOTTY || errno == EINVAL) 707 + SKIP(return, "NS_GET_USERNS not supported"); 708 + ASSERT_GE(owner_fd, 0); 709 + } 710 + 711 + /* Get owner namespace ID */ 712 + ret = ioctl(owner_fd, NS_GET_ID, &owner_id); 713 + if (ret < 0) { 714 + close(owner_fd); 715 + close(sock_fd); 716 + close(netns_fd); 717 + ASSERT_EQ(ret, 0); 718 + } 719 + close(owner_fd); 720 + 721 + /* Namespace should appear in listns() output */ 722 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 723 + if (ret < 0) { 724 + close(sock_fd); 725 + close(netns_fd); 726 + if (errno == ENOSYS) 727 + SKIP(return, "listns() not supported"); 728 + TH_LOG("listns failed: %s", strerror(errno)); 729 + ASSERT_GE(ret, 0); 730 + } 731 + 732 + /* Search for our network 
namespace in the list */ 733 + for (i = 0; i < ret; i++) { 734 + if (ns_ids[i] == netns_id) { 735 + found_netns = true; 736 + break; 737 + } 738 + } 739 + 740 + ASSERT_TRUE(found_netns); 741 + TH_LOG("Found netns %llu in listns() output (kept alive by socket)", netns_id); 742 + 743 + /* Now verify with owner filtering */ 744 + req.user_ns_id = owner_id; 745 + found_netns = false; 746 + 747 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 748 + ASSERT_GE(ret, 0); 749 + 750 + for (i = 0; i < ret; i++) { 751 + if (ns_ids[i] == netns_id) { 752 + found_netns = true; 753 + break; 754 + } 755 + } 756 + 757 + ASSERT_TRUE(found_netns); 758 + TH_LOG("Found netns %llu owned by userns %llu", netns_id, owner_id); 759 + 760 + /* Close socket - namespace should become inactive and disappear from listns() */ 761 + close(sock_fd); 762 + close(netns_fd); 763 + 764 + /* Verify it's no longer in listns() output */ 765 + req.user_ns_id = 0; 766 + found_netns = false; 767 + 768 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 769 + ASSERT_GE(ret, 0); 770 + 771 + for (i = 0; i < ret; i++) { 772 + if (ns_ids[i] == netns_id) { 773 + found_netns = true; 774 + break; 775 + } 776 + } 777 + 778 + ASSERT_FALSE(found_netns); 779 + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); 780 + } 781 + 782 + /* 783 + * Test that socket-kept netns can be reopened via file handle. 784 + * Verify that a network namespace kept alive by a socket FD can be 785 + * reopened using file handles even after the creating process exits. 
786 + */ 787 + TEST(siocgskns_file_handle) 788 + { 789 + int sock_fd, netns_fd, reopened_fd; 790 + int ipc_sockets[2]; 791 + pid_t pid; 792 + int status; 793 + struct stat st1, st2; 794 + ino_t netns_ino; 795 + __u64 netns_id; 796 + struct file_handle *handle; 797 + struct nsfs_file_handle *nsfs_fh; 798 + int ret; 799 + 800 + /* Allocate file_handle structure for nsfs */ 801 + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); 802 + ASSERT_NE(handle, NULL); 803 + handle->handle_bytes = sizeof(struct nsfs_file_handle); 804 + handle->handle_type = FILEID_NSFS; 805 + 806 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 807 + 808 + pid = fork(); 809 + ASSERT_GE(pid, 0); 810 + 811 + if (pid == 0) { 812 + /* Child: create new netns and socket */ 813 + close(ipc_sockets[0]); 814 + 815 + if (unshare(CLONE_NEWNET) < 0) { 816 + close(ipc_sockets[1]); 817 + exit(1); 818 + } 819 + 820 + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); 821 + if (sock_fd < 0) { 822 + close(ipc_sockets[1]); 823 + exit(1); 824 + } 825 + 826 + /* Send socket FD to parent via SCM_RIGHTS */ 827 + struct msghdr msg = {0}; 828 + struct iovec iov = {0}; 829 + char buf[1] = {'X'}; 830 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 831 + 832 + iov.iov_base = buf; 833 + iov.iov_len = 1; 834 + msg.msg_iov = &iov; 835 + msg.msg_iovlen = 1; 836 + msg.msg_control = cmsg_buf; 837 + msg.msg_controllen = sizeof(cmsg_buf); 838 + 839 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 840 + cmsg->cmsg_level = SOL_SOCKET; 841 + cmsg->cmsg_type = SCM_RIGHTS; 842 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 843 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 844 + 845 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 846 + close(sock_fd); 847 + close(ipc_sockets[1]); 848 + exit(1); 849 + } 850 + 851 + close(sock_fd); 852 + close(ipc_sockets[1]); 853 + exit(0); 854 + } 855 + 856 + /* Parent: receive socket FD */ 857 + close(ipc_sockets[1]); 858 + 859 + struct msghdr msg = {0}; 860 
+ struct iovec iov = {0}; 861 + char buf[1]; 862 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 863 + 864 + iov.iov_base = buf; 865 + iov.iov_len = 1; 866 + msg.msg_iov = &iov; 867 + msg.msg_iovlen = 1; 868 + msg.msg_control = cmsg_buf; 869 + msg.msg_controllen = sizeof(cmsg_buf); 870 + 871 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 872 + close(ipc_sockets[0]); 873 + ASSERT_EQ(n, 1); 874 + 875 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 876 + ASSERT_NE(cmsg, NULL); 877 + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); 878 + 879 + /* Wait for child to exit */ 880 + waitpid(pid, &status, 0); 881 + ASSERT_TRUE(WIFEXITED(status)); 882 + ASSERT_EQ(WEXITSTATUS(status), 0); 883 + 884 + /* Get network namespace from socket */ 885 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 886 + if (netns_fd < 0) { 887 + free(handle); 888 + close(sock_fd); 889 + if (errno == ENOTTY || errno == EINVAL) 890 + SKIP(return, "SIOCGSKNS not supported"); 891 + ASSERT_GE(netns_fd, 0); 892 + } 893 + 894 + ASSERT_EQ(fstat(netns_fd, &st1), 0); 895 + netns_ino = st1.st_ino; 896 + 897 + /* Get namespace ID */ 898 + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); 899 + if (ret < 0) { 900 + free(handle); 901 + close(sock_fd); 902 + close(netns_fd); 903 + if (errno == ENOTTY || errno == EINVAL) 904 + SKIP(return, "NS_GET_ID not supported"); 905 + ASSERT_EQ(ret, 0); 906 + } 907 + 908 + /* Construct file handle from namespace ID */ 909 + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; 910 + nsfs_fh->ns_id = netns_id; 911 + nsfs_fh->ns_type = 0; /* Type field not needed for reopening */ 912 + nsfs_fh->ns_inum = 0; /* Inum field not needed for reopening */ 913 + 914 + TH_LOG("Constructed file handle for netns %lu (id=%llu)", netns_ino, netns_id); 915 + 916 + /* Reopen namespace using file handle (while socket still keeps it alive) */ 917 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 918 + if (reopened_fd < 0) { 919 + free(handle); 920 + close(sock_fd); 921 + if (errno == EOPNOTSUPP || 
errno == ENOSYS || errno == EBADF) 922 + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); 923 + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); 924 + ASSERT_GE(reopened_fd, 0); 925 + } 926 + 927 + /* Verify it's the same namespace */ 928 + ASSERT_EQ(fstat(reopened_fd, &st2), 0); 929 + ASSERT_EQ(st1.st_ino, st2.st_ino); 930 + ASSERT_EQ(st1.st_dev, st2.st_dev); 931 + 932 + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); 933 + 934 + close(reopened_fd); 935 + 936 + /* Close the netns FD */ 937 + close(netns_fd); 938 + 939 + /* Try to reopen via file handle - should fail since namespace is now inactive */ 940 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 941 + ASSERT_LT(reopened_fd, 0); 942 + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); 943 + 944 + /* Get network namespace from socket */ 945 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 946 + if (netns_fd < 0) { 947 + free(handle); 948 + close(sock_fd); 949 + if (errno == ENOTTY || errno == EINVAL) 950 + SKIP(return, "SIOCGSKNS not supported"); 951 + ASSERT_GE(netns_fd, 0); 952 + } 953 + 954 + /* Reopen namespace using file handle (while socket still keeps it alive) */ 955 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 956 + if (reopened_fd < 0) { 957 + free(handle); 958 + close(sock_fd); 959 + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) 960 + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); 961 + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); 962 + ASSERT_GE(reopened_fd, 0); 963 + } 964 + 965 + /* Verify it's the same namespace */ 966 + ASSERT_EQ(fstat(reopened_fd, &st2), 0); 967 + ASSERT_EQ(st1.st_ino, st2.st_ino); 968 + ASSERT_EQ(st1.st_dev, st2.st_dev); 969 + 970 + TH_LOG("Successfully reopened netns %lu via file handle", netns_ino); 971 + 972 + /* Close socket - namespace should become inactive */ 973 + close(sock_fd); 974 + free(handle); 975 + } 976 + 977 + 
/* 978 + * Test combined listns() and file handle operations with socket-kept netns. 979 + * Create a netns, keep it alive with a socket, verify it appears in listns(), 980 + * then reopen it via file handle obtained from listns() entry. 981 + */ 982 + TEST(siocgskns_listns_and_file_handle) 983 + { 984 + int sock_fd, netns_fd, userns_fd, reopened_fd; 985 + int ipc_sockets[2]; 986 + pid_t pid; 987 + int status; 988 + struct stat st; 989 + ino_t netns_ino; 990 + __u64 netns_id, userns_id; 991 + struct ns_id_req req = { 992 + .size = sizeof(req), 993 + .spare = 0, 994 + .ns_id = 0, 995 + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, 996 + .spare2 = 0, 997 + .user_ns_id = 0, 998 + }; 999 + __u64 ns_ids[256]; 1000 + int ret, i; 1001 + bool found_netns = false, found_userns = false; 1002 + struct file_handle *handle; 1003 + struct nsfs_file_handle *nsfs_fh; 1004 + 1005 + /* Allocate file_handle structure for nsfs */ 1006 + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); 1007 + ASSERT_NE(handle, NULL); 1008 + handle->handle_bytes = sizeof(struct nsfs_file_handle); 1009 + handle->handle_type = FILEID_NSFS; 1010 + 1011 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 1012 + 1013 + pid = fork(); 1014 + ASSERT_GE(pid, 0); 1015 + 1016 + if (pid == 0) { 1017 + /* Child: create new userns and netns with socket */ 1018 + close(ipc_sockets[0]); 1019 + 1020 + if (setup_userns() < 0) { 1021 + close(ipc_sockets[1]); 1022 + exit(1); 1023 + } 1024 + 1025 + if (unshare(CLONE_NEWNET) < 0) { 1026 + close(ipc_sockets[1]); 1027 + exit(1); 1028 + } 1029 + 1030 + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); 1031 + if (sock_fd < 0) { 1032 + close(ipc_sockets[1]); 1033 + exit(1); 1034 + } 1035 + 1036 + /* Send socket FD to parent via SCM_RIGHTS */ 1037 + struct msghdr msg = {0}; 1038 + struct iovec iov = {0}; 1039 + char buf[1] = {'X'}; 1040 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1041 + 1042 + iov.iov_base = buf; 1043 + iov.iov_len = 1; 
1044 + msg.msg_iov = &iov; 1045 + msg.msg_iovlen = 1; 1046 + msg.msg_control = cmsg_buf; 1047 + msg.msg_controllen = sizeof(cmsg_buf); 1048 + 1049 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1050 + cmsg->cmsg_level = SOL_SOCKET; 1051 + cmsg->cmsg_type = SCM_RIGHTS; 1052 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 1053 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 1054 + 1055 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 1056 + close(sock_fd); 1057 + close(ipc_sockets[1]); 1058 + exit(1); 1059 + } 1060 + 1061 + close(sock_fd); 1062 + close(ipc_sockets[1]); 1063 + exit(0); 1064 + } 1065 + 1066 + /* Parent: receive socket FD */ 1067 + close(ipc_sockets[1]); 1068 + 1069 + struct msghdr msg = {0}; 1070 + struct iovec iov = {0}; 1071 + char buf[1]; 1072 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1073 + 1074 + iov.iov_base = buf; 1075 + iov.iov_len = 1; 1076 + msg.msg_iov = &iov; 1077 + msg.msg_iovlen = 1; 1078 + msg.msg_control = cmsg_buf; 1079 + msg.msg_controllen = sizeof(cmsg_buf); 1080 + 1081 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 1082 + close(ipc_sockets[0]); 1083 + ASSERT_EQ(n, 1); 1084 + 1085 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1086 + ASSERT_NE(cmsg, NULL); 1087 + memcpy(&sock_fd, CMSG_DATA(cmsg), sizeof(int)); 1088 + 1089 + /* Wait for child to exit */ 1090 + waitpid(pid, &status, 0); 1091 + ASSERT_TRUE(WIFEXITED(status)); 1092 + ASSERT_EQ(WEXITSTATUS(status), 0); 1093 + 1094 + /* Get network namespace from socket */ 1095 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 1096 + if (netns_fd < 0) { 1097 + free(handle); 1098 + close(sock_fd); 1099 + if (errno == ENOTTY || errno == EINVAL) 1100 + SKIP(return, "SIOCGSKNS not supported"); 1101 + ASSERT_GE(netns_fd, 0); 1102 + } 1103 + 1104 + ASSERT_EQ(fstat(netns_fd, &st), 0); 1105 + netns_ino = st.st_ino; 1106 + 1107 + /* Get namespace ID */ 1108 + ret = ioctl(netns_fd, NS_GET_ID, &netns_id); 1109 + if (ret < 0) { 1110 + free(handle); 1111 + close(sock_fd); 1112 + close(netns_fd); 1113 + if (errno == 
ENOTTY || errno == EINVAL) 1114 + SKIP(return, "NS_GET_ID not supported"); 1115 + ASSERT_EQ(ret, 0); 1116 + } 1117 + 1118 + /* Get owner user namespace */ 1119 + userns_fd = ioctl(netns_fd, NS_GET_USERNS); 1120 + if (userns_fd < 0) { 1121 + free(handle); 1122 + close(sock_fd); 1123 + close(netns_fd); 1124 + if (errno == ENOTTY || errno == EINVAL) 1125 + SKIP(return, "NS_GET_USERNS not supported"); 1126 + ASSERT_GE(userns_fd, 0); 1127 + } 1128 + 1129 + /* Get owner namespace ID */ 1130 + ret = ioctl(userns_fd, NS_GET_ID, &userns_id); 1131 + if (ret < 0) { 1132 + close(userns_fd); 1133 + free(handle); 1134 + close(sock_fd); 1135 + close(netns_fd); 1136 + ASSERT_EQ(ret, 0); 1137 + } 1138 + close(userns_fd); 1139 + 1140 + TH_LOG("Testing netns %lu (id=%llu) owned by userns id=%llu", netns_ino, netns_id, userns_id); 1141 + 1142 + /* Verify namespace appears in listns() */ 1143 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1144 + if (ret < 0) { 1145 + free(handle); 1146 + close(sock_fd); 1147 + close(netns_fd); 1148 + if (errno == ENOSYS) 1149 + SKIP(return, "listns() not supported"); 1150 + TH_LOG("listns failed: %s", strerror(errno)); 1151 + ASSERT_GE(ret, 0); 1152 + } 1153 + 1154 + found_netns = false; 1155 + found_userns = false; 1156 + for (i = 0; i < ret; i++) { 1157 + if (ns_ids[i] == netns_id) 1158 + found_netns = true; 1159 + if (ns_ids[i] == userns_id) 1160 + found_userns = true; 1161 + } 1162 + ASSERT_TRUE(found_netns); 1163 + ASSERT_TRUE(found_userns); 1164 + TH_LOG("Found netns %llu in listns() output", netns_id); 1165 + 1166 + /* Construct file handle from namespace ID */ 1167 + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; 1168 + nsfs_fh->ns_id = netns_id; 1169 + nsfs_fh->ns_type = 0; 1170 + nsfs_fh->ns_inum = 0; 1171 + 1172 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1173 + if (reopened_fd < 0) { 1174 + free(handle); 1175 + close(sock_fd); 1176 + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) 1177 
+ SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); 1178 + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); 1179 + ASSERT_GE(reopened_fd, 0); 1180 + } 1181 + 1182 + struct stat reopened_st; 1183 + ASSERT_EQ(fstat(reopened_fd, &reopened_st), 0); 1184 + ASSERT_EQ(reopened_st.st_ino, netns_ino); 1185 + 1186 + TH_LOG("Successfully reopened netns %lu via file handle (socket-kept)", netns_ino); 1187 + 1188 + close(reopened_fd); 1189 + close(netns_fd); 1190 + 1191 + /* Try to reopen via file handle - should fail since namespace is now inactive */ 1192 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1193 + ASSERT_LT(reopened_fd, 0); 1194 + TH_LOG("Correctly failed to reopen inactive netns: %s", strerror(errno)); 1195 + 1196 + /* Get network namespace from socket */ 1197 + netns_fd = ioctl(sock_fd, SIOCGSKNS); 1198 + if (netns_fd < 0) { 1199 + free(handle); 1200 + close(sock_fd); 1201 + if (errno == ENOTTY || errno == EINVAL) 1202 + SKIP(return, "SIOCGSKNS not supported"); 1203 + ASSERT_GE(netns_fd, 0); 1204 + } 1205 + 1206 + /* Verify namespace appears in listns() */ 1207 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1208 + if (ret < 0) { 1209 + free(handle); 1210 + close(sock_fd); 1211 + close(netns_fd); 1212 + if (errno == ENOSYS) 1213 + SKIP(return, "listns() not supported"); 1214 + TH_LOG("listns failed: %s", strerror(errno)); 1215 + ASSERT_GE(ret, 0); 1216 + } 1217 + 1218 + found_netns = false; 1219 + found_userns = false; 1220 + for (i = 0; i < ret; i++) { 1221 + if (ns_ids[i] == netns_id) 1222 + found_netns = true; 1223 + if (ns_ids[i] == userns_id) 1224 + found_userns = true; 1225 + } 1226 + ASSERT_TRUE(found_netns); 1227 + ASSERT_TRUE(found_userns); 1228 + TH_LOG("Found netns %llu in listns() output", netns_id); 1229 + 1230 + close(netns_fd); 1231 + 1232 + /* Verify namespace appears in listns() */ 1233 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1234 + if (ret < 0) { 1235 + free(handle); 1236 + 
close(sock_fd); 1237 + close(netns_fd); 1238 + if (errno == ENOSYS) 1239 + SKIP(return, "listns() not supported"); 1240 + TH_LOG("listns failed: %s", strerror(errno)); 1241 + ASSERT_GE(ret, 0); 1242 + } 1243 + 1244 + found_netns = false; 1245 + found_userns = false; 1246 + for (i = 0; i < ret; i++) { 1247 + if (ns_ids[i] == netns_id) 1248 + found_netns = true; 1249 + if (ns_ids[i] == userns_id) 1250 + found_userns = true; 1251 + } 1252 + ASSERT_FALSE(found_netns); 1253 + ASSERT_FALSE(found_userns); 1254 + TH_LOG("Netns %llu correctly disappeared from listns() after socket closed", netns_id); 1255 + 1256 + close(sock_fd); 1257 + free(handle); 1258 + } 1259 + 1260 + /* 1261 + * Test multi-level namespace resurrection across three user namespace levels. 1262 + * 1263 + * This test creates a complex namespace hierarchy with three levels of user 1264 + * namespaces and a network namespace at the deepest level. It verifies that 1265 + * the resurrection semantics work correctly when SIOCGSKNS is called on a 1266 + * socket from an inactive namespace tree, and that listns() and 1267 + * open_by_handle_at() correctly respect visibility rules. 1268 + * 1269 + * Hierarchy after child processes exit (all with 0 active refcount): 1270 + * 1271 + * net_L3A (0) <- Level 3 network namespace 1272 + * | 1273 + * + 1274 + * userns_L3 (0) <- Level 3 user namespace 1275 + * | 1276 + * + 1277 + * userns_L2 (0) <- Level 2 user namespace 1278 + * | 1279 + * + 1280 + * userns_L1 (0) <- Level 1 user namespace 1281 + * | 1282 + * x 1283 + * init_user_ns 1284 + * 1285 + * The test verifies: 1286 + * 1. SIOCGSKNS on a socket from inactive net_L3A resurrects the entire chain 1287 + * 2. After resurrection, all namespaces are visible in listns() 1288 + * 3. Resurrected namespaces can be reopened via file handles 1289 + * 4. Closing the netns FD cascades down: the entire ownership chain 1290 + * (userns_L3 -> userns_L2 -> userns_L1) becomes inactive again 1291 + * 5. 
Inactive namespaces disappear from listns() and cannot be reopened 1292 + * 6. Calling SIOCGSKNS again on the same socket resurrects the tree again 1293 + * 7. After second resurrection, namespaces are visible and can be reopened 1294 + */ 1295 + TEST(siocgskns_multilevel_resurrection) 1296 + { 1297 + int ipc_sockets[2]; 1298 + pid_t pid_l1, pid_l2, pid_l3; 1299 + int status; 1300 + 1301 + /* Namespace file descriptors to be received from child */ 1302 + int sock_L3A_fd = -1; 1303 + int netns_L3A_fd = -1; 1304 + __u64 netns_L3A_id; 1305 + __u64 userns_L1_id, userns_L2_id, userns_L3_id; 1306 + 1307 + /* For listns() and file handle testing */ 1308 + struct ns_id_req req = { 1309 + .size = sizeof(req), 1310 + .spare = 0, 1311 + .ns_id = 0, 1312 + .ns_type = CLONE_NEWNET | CLONE_NEWUSER, 1313 + .spare2 = 0, 1314 + .user_ns_id = 0, 1315 + }; 1316 + __u64 ns_ids[256]; 1317 + int ret, i; 1318 + struct file_handle *handle; 1319 + struct nsfs_file_handle *nsfs_fh; 1320 + int reopened_fd; 1321 + 1322 + /* Allocate file handle for testing */ 1323 + handle = malloc(sizeof(struct file_handle) + sizeof(struct nsfs_file_handle)); 1324 + ASSERT_NE(handle, NULL); 1325 + handle->handle_bytes = sizeof(struct nsfs_file_handle); 1326 + handle->handle_type = FILEID_NSFS; 1327 + 1328 + EXPECT_EQ(socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_sockets), 0); 1329 + 1330 + /* 1331 + * Fork level 1 child that creates userns_L1 1332 + */ 1333 + pid_l1 = fork(); 1334 + ASSERT_GE(pid_l1, 0); 1335 + 1336 + if (pid_l1 == 0) { 1337 + /* Level 1 child */ 1338 + int ipc_L2[2]; 1339 + close(ipc_sockets[0]); 1340 + 1341 + /* Create userns_L1 */ 1342 + if (setup_userns() < 0) { 1343 + close(ipc_sockets[1]); 1344 + exit(1); 1345 + } 1346 + 1347 + /* Create socketpair for communicating with L2 child */ 1348 + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L2) < 0) { 1349 + close(ipc_sockets[1]); 1350 + exit(1); 1351 + } 1352 + 1353 + /* 1354 + * Fork level 2 child that creates 
userns_L2 1355 + */ 1356 + pid_l2 = fork(); 1357 + if (pid_l2 < 0) { 1358 + close(ipc_sockets[1]); 1359 + close(ipc_L2[0]); 1360 + close(ipc_L2[1]); 1361 + exit(1); 1362 + } 1363 + 1364 + if (pid_l2 == 0) { 1365 + /* Level 2 child */ 1366 + int ipc_L3[2]; 1367 + close(ipc_L2[0]); 1368 + 1369 + /* Create userns_L2 (nested inside userns_L1) */ 1370 + if (setup_userns() < 0) { 1371 + close(ipc_L2[1]); 1372 + exit(1); 1373 + } 1374 + 1375 + /* Create socketpair for communicating with L3 child */ 1376 + if (socketpair(AF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, ipc_L3) < 0) { 1377 + close(ipc_L2[1]); 1378 + exit(1); 1379 + } 1380 + 1381 + /* 1382 + * Fork level 3 child that creates userns_L3 and network namespaces 1383 + */ 1384 + pid_l3 = fork(); 1385 + if (pid_l3 < 0) { 1386 + close(ipc_L2[1]); 1387 + close(ipc_L3[0]); 1388 + close(ipc_L3[1]); 1389 + exit(1); 1390 + } 1391 + 1392 + if (pid_l3 == 0) { 1393 + /* Level 3 child - the deepest level */ 1394 + int sock_fd; 1395 + close(ipc_L3[0]); 1396 + 1397 + /* Create userns_L3 (nested inside userns_L2) */ 1398 + if (setup_userns() < 0) { 1399 + close(ipc_L3[1]); 1400 + exit(1); 1401 + } 1402 + 1403 + /* Create network namespace at level 3 */ 1404 + if (unshare(CLONE_NEWNET) < 0) { 1405 + close(ipc_L3[1]); 1406 + exit(1); 1407 + } 1408 + 1409 + /* Create socket in net_L3A */ 1410 + sock_fd = socket(AF_INET, SOCK_DGRAM, 0); 1411 + if (sock_fd < 0) { 1412 + close(ipc_L3[1]); 1413 + exit(1); 1414 + } 1415 + 1416 + /* Send socket FD to L2 parent */ 1417 + struct msghdr msg = {0}; 1418 + struct iovec iov = {0}; 1419 + char buf[1] = {'X'}; 1420 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1421 + 1422 + iov.iov_base = buf; 1423 + iov.iov_len = 1; 1424 + msg.msg_iov = &iov; 1425 + msg.msg_iovlen = 1; 1426 + msg.msg_control = cmsg_buf; 1427 + msg.msg_controllen = sizeof(cmsg_buf); 1428 + 1429 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1430 + cmsg->cmsg_level = SOL_SOCKET; 1431 + cmsg->cmsg_type = SCM_RIGHTS; 1432 + cmsg->cmsg_len = 
CMSG_LEN(sizeof(int)); 1433 + memcpy(CMSG_DATA(cmsg), &sock_fd, sizeof(int)); 1434 + 1435 + if (sendmsg(ipc_L3[1], &msg, 0) < 0) { 1436 + close(sock_fd); 1437 + close(ipc_L3[1]); 1438 + exit(1); 1439 + } 1440 + 1441 + close(sock_fd); 1442 + close(ipc_L3[1]); 1443 + exit(0); 1444 + } 1445 + 1446 + /* Level 2 child - receive from L3 and forward to L1 */ 1447 + close(ipc_L3[1]); 1448 + 1449 + struct msghdr msg = {0}; 1450 + struct iovec iov = {0}; 1451 + char buf[1]; 1452 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1453 + int received_fd; 1454 + 1455 + iov.iov_base = buf; 1456 + iov.iov_len = 1; 1457 + msg.msg_iov = &iov; 1458 + msg.msg_iovlen = 1; 1459 + msg.msg_control = cmsg_buf; 1460 + msg.msg_controllen = sizeof(cmsg_buf); 1461 + 1462 + ssize_t n = recvmsg(ipc_L3[0], &msg, 0); 1463 + close(ipc_L3[0]); 1464 + 1465 + if (n != 1) { 1466 + close(ipc_L2[1]); 1467 + waitpid(pid_l3, NULL, 0); 1468 + exit(1); 1469 + } 1470 + 1471 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1472 + if (!cmsg) { 1473 + close(ipc_L2[1]); 1474 + waitpid(pid_l3, NULL, 0); 1475 + exit(1); 1476 + } 1477 + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); 1478 + 1479 + /* Wait for L3 child */ 1480 + waitpid(pid_l3, NULL, 0); 1481 + 1482 + /* Forward the socket FD to L1 parent */ 1483 + memset(&msg, 0, sizeof(msg)); 1484 + buf[0] = 'Y'; 1485 + iov.iov_base = buf; 1486 + iov.iov_len = 1; 1487 + msg.msg_iov = &iov; 1488 + msg.msg_iovlen = 1; 1489 + msg.msg_control = cmsg_buf; 1490 + msg.msg_controllen = sizeof(cmsg_buf); 1491 + 1492 + cmsg = CMSG_FIRSTHDR(&msg); 1493 + cmsg->cmsg_level = SOL_SOCKET; 1494 + cmsg->cmsg_type = SCM_RIGHTS; 1495 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 1496 + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); 1497 + 1498 + if (sendmsg(ipc_L2[1], &msg, 0) < 0) { 1499 + close(received_fd); 1500 + close(ipc_L2[1]); 1501 + exit(1); 1502 + } 1503 + 1504 + close(received_fd); 1505 + close(ipc_L2[1]); 1506 + exit(0); 1507 + } 1508 + 1509 + /* Level 1 child - receive from L2 
and forward to parent */ 1510 + close(ipc_L2[1]); 1511 + 1512 + struct msghdr msg = {0}; 1513 + struct iovec iov = {0}; 1514 + char buf[1]; 1515 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1516 + int received_fd; 1517 + 1518 + iov.iov_base = buf; 1519 + iov.iov_len = 1; 1520 + msg.msg_iov = &iov; 1521 + msg.msg_iovlen = 1; 1522 + msg.msg_control = cmsg_buf; 1523 + msg.msg_controllen = sizeof(cmsg_buf); 1524 + 1525 + ssize_t n = recvmsg(ipc_L2[0], &msg, 0); 1526 + close(ipc_L2[0]); 1527 + 1528 + if (n != 1) { 1529 + close(ipc_sockets[1]); 1530 + waitpid(pid_l2, NULL, 0); 1531 + exit(1); 1532 + } 1533 + 1534 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1535 + if (!cmsg) { 1536 + close(ipc_sockets[1]); 1537 + waitpid(pid_l2, NULL, 0); 1538 + exit(1); 1539 + } 1540 + memcpy(&received_fd, CMSG_DATA(cmsg), sizeof(int)); 1541 + 1542 + /* Wait for L2 child */ 1543 + waitpid(pid_l2, NULL, 0); 1544 + 1545 + /* Forward the socket FD to parent */ 1546 + memset(&msg, 0, sizeof(msg)); 1547 + buf[0] = 'Z'; 1548 + iov.iov_base = buf; 1549 + iov.iov_len = 1; 1550 + msg.msg_iov = &iov; 1551 + msg.msg_iovlen = 1; 1552 + msg.msg_control = cmsg_buf; 1553 + msg.msg_controllen = sizeof(cmsg_buf); 1554 + 1555 + cmsg = CMSG_FIRSTHDR(&msg); 1556 + cmsg->cmsg_level = SOL_SOCKET; 1557 + cmsg->cmsg_type = SCM_RIGHTS; 1558 + cmsg->cmsg_len = CMSG_LEN(sizeof(int)); 1559 + memcpy(CMSG_DATA(cmsg), &received_fd, sizeof(int)); 1560 + 1561 + if (sendmsg(ipc_sockets[1], &msg, 0) < 0) { 1562 + close(received_fd); 1563 + close(ipc_sockets[1]); 1564 + exit(1); 1565 + } 1566 + 1567 + close(received_fd); 1568 + close(ipc_sockets[1]); 1569 + exit(0); 1570 + } 1571 + 1572 + /* Parent - receive the socket from the deepest level */ 1573 + close(ipc_sockets[1]); 1574 + 1575 + struct msghdr msg = {0}; 1576 + struct iovec iov = {0}; 1577 + char buf[1]; 1578 + char cmsg_buf[CMSG_SPACE(sizeof(int))]; 1579 + 1580 + iov.iov_base = buf; 1581 + iov.iov_len = 1; 1582 + msg.msg_iov = &iov; 1583 + msg.msg_iovlen = 1; 1584 
+ msg.msg_control = cmsg_buf; 1585 + msg.msg_controllen = sizeof(cmsg_buf); 1586 + 1587 + ssize_t n = recvmsg(ipc_sockets[0], &msg, 0); 1588 + close(ipc_sockets[0]); 1589 + 1590 + if (n != 1) { 1591 + free(handle); 1592 + waitpid(pid_l1, NULL, 0); 1593 + SKIP(return, "Failed to receive socket from child"); 1594 + } 1595 + 1596 + struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg); 1597 + if (!cmsg) { 1598 + free(handle); 1599 + waitpid(pid_l1, NULL, 0); 1600 + SKIP(return, "Failed to receive socket from child"); 1601 + } 1602 + memcpy(&sock_L3A_fd, CMSG_DATA(cmsg), sizeof(int)); 1603 + 1604 + /* Wait for L1 child */ 1605 + waitpid(pid_l1, &status, 0); 1606 + ASSERT_TRUE(WIFEXITED(status)); 1607 + ASSERT_EQ(WEXITSTATUS(status), 0); 1608 + 1609 + /* 1610 + * At this point, all child processes have exited. The socket itself 1611 + * doesn't keep the namespace active - we need to call SIOCGSKNS which 1612 + * will resurrect the entire namespace tree by taking active references. 1613 + */ 1614 + 1615 + /* Get network namespace from socket - this resurrects the tree */ 1616 + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); 1617 + if (netns_L3A_fd < 0) { 1618 + free(handle); 1619 + close(sock_L3A_fd); 1620 + if (errno == ENOTTY || errno == EINVAL) 1621 + SKIP(return, "SIOCGSKNS not supported"); 1622 + ASSERT_GE(netns_L3A_fd, 0); 1623 + } 1624 + 1625 + /* Get namespace ID for net_L3A */ 1626 + ret = ioctl(netns_L3A_fd, NS_GET_ID, &netns_L3A_id); 1627 + if (ret < 0) { 1628 + free(handle); 1629 + close(sock_L3A_fd); 1630 + close(netns_L3A_fd); 1631 + if (errno == ENOTTY || errno == EINVAL) 1632 + SKIP(return, "NS_GET_ID not supported"); 1633 + ASSERT_EQ(ret, 0); 1634 + } 1635 + 1636 + /* Get owner user namespace chain: userns_L3 -> userns_L2 -> userns_L1 */ 1637 + int userns_L3_fd = ioctl(netns_L3A_fd, NS_GET_USERNS); 1638 + if (userns_L3_fd < 0) { 1639 + free(handle); 1640 + close(sock_L3A_fd); 1641 + close(netns_L3A_fd); 1642 + if (errno == ENOTTY || errno == EINVAL) 1643 + 
SKIP(return, "NS_GET_USERNS not supported"); 1644 + ASSERT_GE(userns_L3_fd, 0); 1645 + } 1646 + 1647 + ret = ioctl(userns_L3_fd, NS_GET_ID, &userns_L3_id); 1648 + ASSERT_EQ(ret, 0); 1649 + 1650 + int userns_L2_fd = ioctl(userns_L3_fd, NS_GET_USERNS); 1651 + ASSERT_GE(userns_L2_fd, 0); 1652 + ret = ioctl(userns_L2_fd, NS_GET_ID, &userns_L2_id); 1653 + ASSERT_EQ(ret, 0); 1654 + 1655 + int userns_L1_fd = ioctl(userns_L2_fd, NS_GET_USERNS); 1656 + ASSERT_GE(userns_L1_fd, 0); 1657 + ret = ioctl(userns_L1_fd, NS_GET_ID, &userns_L1_id); 1658 + ASSERT_EQ(ret, 0); 1659 + 1660 + close(userns_L1_fd); 1661 + close(userns_L2_fd); 1662 + close(userns_L3_fd); 1663 + 1664 + TH_LOG("Multi-level hierarchy: net_L3A (id=%llu) -> userns_L3 (id=%llu) -> userns_L2 (id=%llu) -> userns_L1 (id=%llu)", 1665 + netns_L3A_id, userns_L3_id, userns_L2_id, userns_L1_id); 1666 + 1667 + /* 1668 + * Test 1: Verify net_L3A is visible in listns() after resurrection. 1669 + * The entire ownership chain should be resurrected and visible. 
1670 + */ 1671 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1672 + if (ret < 0) { 1673 + free(handle); 1674 + close(sock_L3A_fd); 1675 + close(netns_L3A_fd); 1676 + if (errno == ENOSYS) 1677 + SKIP(return, "listns() not supported"); 1678 + ASSERT_GE(ret, 0); 1679 + } 1680 + 1681 + bool found_netns_L3A = false; 1682 + bool found_userns_L1 = false; 1683 + bool found_userns_L2 = false; 1684 + bool found_userns_L3 = false; 1685 + 1686 + for (i = 0; i < ret; i++) { 1687 + if (ns_ids[i] == netns_L3A_id) 1688 + found_netns_L3A = true; 1689 + if (ns_ids[i] == userns_L1_id) 1690 + found_userns_L1 = true; 1691 + if (ns_ids[i] == userns_L2_id) 1692 + found_userns_L2 = true; 1693 + if (ns_ids[i] == userns_L3_id) 1694 + found_userns_L3 = true; 1695 + } 1696 + 1697 + ASSERT_TRUE(found_netns_L3A); 1698 + ASSERT_TRUE(found_userns_L1); 1699 + ASSERT_TRUE(found_userns_L2); 1700 + ASSERT_TRUE(found_userns_L3); 1701 + TH_LOG("Resurrection verified: all namespaces in hierarchy visible in listns()"); 1702 + 1703 + /* 1704 + * Test 2: Verify net_L3A can be reopened via file handle. 1705 + */ 1706 + nsfs_fh = (struct nsfs_file_handle *)handle->f_handle; 1707 + nsfs_fh->ns_id = netns_L3A_id; 1708 + nsfs_fh->ns_type = 0; 1709 + nsfs_fh->ns_inum = 0; 1710 + 1711 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1712 + if (reopened_fd < 0) { 1713 + free(handle); 1714 + close(sock_L3A_fd); 1715 + close(netns_L3A_fd); 1716 + if (errno == EOPNOTSUPP || errno == ENOSYS || errno == EBADF) 1717 + SKIP(return, "open_by_handle_at with FD_NSFS_ROOT not supported"); 1718 + TH_LOG("open_by_handle_at failed: %s", strerror(errno)); 1719 + ASSERT_GE(reopened_fd, 0); 1720 + } 1721 + 1722 + close(reopened_fd); 1723 + TH_LOG("File handle test passed: net_L3A can be reopened"); 1724 + 1725 + /* 1726 + * Test 3: Verify that when we close the netns FD (dropping the last 1727 + * active reference), the entire tree becomes inactive and disappears 1728 + * from listns(). 
The cascade goes: net_L3A drops -> userns_L3 drops -> 1729 + * userns_L2 drops -> userns_L1 drops. 1730 + */ 1731 + close(netns_L3A_fd); 1732 + 1733 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1734 + ASSERT_GE(ret, 0); 1735 + 1736 + found_netns_L3A = false; 1737 + found_userns_L1 = false; 1738 + found_userns_L2 = false; 1739 + found_userns_L3 = false; 1740 + 1741 + for (i = 0; i < ret; i++) { 1742 + if (ns_ids[i] == netns_L3A_id) 1743 + found_netns_L3A = true; 1744 + if (ns_ids[i] == userns_L1_id) 1745 + found_userns_L1 = true; 1746 + if (ns_ids[i] == userns_L2_id) 1747 + found_userns_L2 = true; 1748 + if (ns_ids[i] == userns_L3_id) 1749 + found_userns_L3 = true; 1750 + } 1751 + 1752 + ASSERT_FALSE(found_netns_L3A); 1753 + ASSERT_FALSE(found_userns_L1); 1754 + ASSERT_FALSE(found_userns_L2); 1755 + ASSERT_FALSE(found_userns_L3); 1756 + TH_LOG("Cascade test passed: all namespaces disappeared after netns FD closed"); 1757 + 1758 + /* 1759 + * Test 4: Verify file handle no longer works for inactive namespace. 1760 + */ 1761 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1762 + if (reopened_fd >= 0) { 1763 + close(reopened_fd); 1764 + free(handle); 1765 + ASSERT_TRUE(false); /* Should have failed */ 1766 + } 1767 + TH_LOG("Inactive namespace correctly cannot be reopened via file handle"); 1768 + 1769 + /* 1770 + * Test 5: Verify that calling SIOCGSKNS again resurrects the tree again. 1771 + * The socket is still valid, so we can call SIOCGSKNS on it to resurrect 1772 + * the namespace tree once more. 
1773 + */ 1774 + netns_L3A_fd = ioctl(sock_L3A_fd, SIOCGSKNS); 1775 + ASSERT_GE(netns_L3A_fd, 0); 1776 + 1777 + TH_LOG("Called SIOCGSKNS again to resurrect the namespace tree"); 1778 + 1779 + /* Verify the namespace tree is resurrected and visible in listns() */ 1780 + ret = sys_listns(&req, ns_ids, ARRAY_SIZE(ns_ids), 0); 1781 + ASSERT_GE(ret, 0); 1782 + 1783 + found_netns_L3A = false; 1784 + found_userns_L1 = false; 1785 + found_userns_L2 = false; 1786 + found_userns_L3 = false; 1787 + 1788 + for (i = 0; i < ret; i++) { 1789 + if (ns_ids[i] == netns_L3A_id) 1790 + found_netns_L3A = true; 1791 + if (ns_ids[i] == userns_L1_id) 1792 + found_userns_L1 = true; 1793 + if (ns_ids[i] == userns_L2_id) 1794 + found_userns_L2 = true; 1795 + if (ns_ids[i] == userns_L3_id) 1796 + found_userns_L3 = true; 1797 + } 1798 + 1799 + ASSERT_TRUE(found_netns_L3A); 1800 + ASSERT_TRUE(found_userns_L1); 1801 + ASSERT_TRUE(found_userns_L2); 1802 + ASSERT_TRUE(found_userns_L3); 1803 + TH_LOG("Second resurrection verified: all namespaces in hierarchy visible in listns() again"); 1804 + 1805 + /* Verify we can reopen via file handle again */ 1806 + reopened_fd = open_by_handle_at(FD_NSFS_ROOT, handle, O_RDONLY); 1807 + if (reopened_fd < 0) { 1808 + free(handle); 1809 + close(sock_L3A_fd); 1810 + close(netns_L3A_fd); 1811 + TH_LOG("open_by_handle_at failed after second resurrection: %s", strerror(errno)); 1812 + ASSERT_GE(reopened_fd, 0); 1813 + } 1814 + 1815 + close(reopened_fd); 1816 + TH_LOG("File handle test passed: net_L3A can be reopened after second resurrection"); 1817 + 1818 + /* Final cleanup */ 1819 + close(sock_L3A_fd); 1820 + close(netns_L3A_fd); 1821 + free(handle); 1822 + } 1823 + 1824 + TEST_HARNESS_MAIN

+626

tools/testing/selftests/namespaces/stress_test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + #define _GNU_SOURCE 3 + #include <errno.h> 4 + #include <fcntl.h> 5 + #include <limits.h> 6 + #include <sched.h> 7 + #include <stdio.h> 8 + #include <stdlib.h> 9 + #include <string.h> 10 + #include <sys/ioctl.h> 11 + #include <sys/socket.h> 12 + #include <sys/stat.h> 13 + #include <sys/syscall.h> 14 + #include <sys/types.h> 15 + #include <sys/wait.h> 16 + #include <unistd.h> 17 + #include <linux/nsfs.h> 18 + #include "../kselftest_harness.h" 19 + #include "../filesystems/utils.h" 20 + #include "wrappers.h" 21 + 22 + /* 23 + * Stress tests for namespace active reference counting. 24 + * 25 + * These tests validate that the active reference counting system can handle 26 + * high load scenarios including rapid namespace creation/destruction, large 27 + * numbers of concurrent namespaces, and various edge cases under stress. 28 + */ 29 + 30 + /* 31 + * Test rapid creation and destruction of user namespaces. 32 + * Create and destroy namespaces in quick succession to stress the 33 + * active reference tracking and ensure no leaks occur. 
34 + */ 35 + TEST(rapid_namespace_creation_destruction) 36 + { 37 + struct ns_id_req req = { 38 + .size = sizeof(req), 39 + .spare = 0, 40 + .ns_id = 0, 41 + .ns_type = CLONE_NEWUSER, 42 + .spare2 = 0, 43 + .user_ns_id = 0, 44 + }; 45 + __u64 ns_ids_before[256], ns_ids_after[256]; 46 + ssize_t ret_before, ret_after; 47 + int i; 48 + 49 + /* Get baseline count of active user namespaces */ 50 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 51 + if (ret_before < 0) { 52 + if (errno == ENOSYS) 53 + SKIP(return, "listns() not supported"); 54 + ASSERT_GE(ret_before, 0); 55 + } 56 + 57 + TH_LOG("Baseline: %zd active user namespaces", ret_before); 58 + 59 + /* Rapidly create and destroy 100 user namespaces */ 60 + for (i = 0; i < 100; i++) { 61 + pid_t pid = fork(); 62 + ASSERT_GE(pid, 0); 63 + 64 + if (pid == 0) { 65 + /* Child: create user namespace and immediately exit */ 66 + if (setup_userns() < 0) 67 + exit(1); 68 + exit(0); 69 + } 70 + 71 + /* Parent: wait for child */ 72 + int status; 73 + waitpid(pid, &status, 0); 74 + ASSERT_TRUE(WIFEXITED(status)); 75 + ASSERT_EQ(WEXITSTATUS(status), 0); 76 + } 77 + 78 + /* Verify we're back to baseline (no leaked namespaces) */ 79 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 80 + ASSERT_GE(ret_after, 0); 81 + 82 + TH_LOG("After 100 rapid create/destroy cycles: %zd active user namespaces", ret_after); 83 + ASSERT_EQ(ret_before, ret_after); 84 + } 85 + 86 + /* 87 + * Test creating many concurrent namespaces. 88 + * Verify that listns() correctly tracks all of them and that they all 89 + * become inactive after processes exit. 
90 + */ 91 + TEST(many_concurrent_namespaces) 92 + { 93 + struct ns_id_req req = { 94 + .size = sizeof(req), 95 + .spare = 0, 96 + .ns_id = 0, 97 + .ns_type = CLONE_NEWUSER, 98 + .spare2 = 0, 99 + .user_ns_id = 0, 100 + }; 101 + __u64 ns_ids_before[512], ns_ids_during[512], ns_ids_after[512]; 102 + ssize_t ret_before, ret_during, ret_after; 103 + pid_t pids[50]; 104 + int num_children = 50; 105 + int i; 106 + int sv[2]; 107 + 108 + /* Get baseline */ 109 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 110 + if (ret_before < 0) { 111 + if (errno == ENOSYS) 112 + SKIP(return, "listns() not supported"); 113 + ASSERT_GE(ret_before, 0); 114 + } 115 + 116 + TH_LOG("Baseline: %zd active user namespaces", ret_before); 117 + 118 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 119 + 120 + /* Create many children, each with their own user namespace */ 121 + for (i = 0; i < num_children; i++) { 122 + pids[i] = fork(); 123 + ASSERT_GE(pids[i], 0); 124 + 125 + if (pids[i] == 0) { 126 + /* Child: create user namespace and wait for parent signal */ 127 + char c; 128 + 129 + close(sv[0]); 130 + 131 + if (setup_userns() < 0) { 132 + close(sv[1]); 133 + exit(1); 134 + } 135 + 136 + /* Signal parent we're ready */ 137 + if (write(sv[1], &c, 1) != 1) { 138 + close(sv[1]); 139 + exit(1); 140 + } 141 + 142 + /* Wait for parent signal to exit */ 143 + if (read(sv[1], &c, 1) != 1) { 144 + close(sv[1]); 145 + exit(1); 146 + } 147 + 148 + close(sv[1]); 149 + exit(0); 150 + } 151 + } 152 + 153 + close(sv[1]); 154 + 155 + /* Wait for all children to signal ready */ 156 + for (i = 0; i < num_children; i++) { 157 + char c; 158 + if (read(sv[0], &c, 1) != 1) { 159 + /* If we fail to read, kill all children and exit */ 160 + close(sv[0]); 161 + for (int j = 0; j < num_children; j++) 162 + kill(pids[j], SIGKILL); 163 + for (int j = 0; j < num_children; j++) 164 + waitpid(pids[j], NULL, 0); 165 + ASSERT_TRUE(false); 166 + } 167 + } 168 + 169 + /* List 
namespaces while all children are running */ 170 + ret_during = sys_listns(&req, ns_ids_during, ARRAY_SIZE(ns_ids_during), 0); 171 + ASSERT_GE(ret_during, 0); 172 + 173 + TH_LOG("With %d children running: %zd active user namespaces", num_children, ret_during); 174 + 175 + /* Should have at least num_children more namespaces than baseline */ 176 + ASSERT_GE(ret_during, ret_before + num_children); 177 + 178 + /* Signal all children to exit */ 179 + for (i = 0; i < num_children; i++) { 180 + char c = 'X'; 181 + if (write(sv[0], &c, 1) != 1) { 182 + /* If we fail to write, kill remaining children */ 183 + close(sv[0]); 184 + for (int j = i; j < num_children; j++) 185 + kill(pids[j], SIGKILL); 186 + for (int j = 0; j < num_children; j++) 187 + waitpid(pids[j], NULL, 0); 188 + ASSERT_TRUE(false); 189 + } 190 + } 191 + 192 + close(sv[0]); 193 + 194 + /* Wait for all children */ 195 + for (i = 0; i < num_children; i++) { 196 + int status; 197 + waitpid(pids[i], &status, 0); 198 + ASSERT_TRUE(WIFEXITED(status)); 199 + } 200 + 201 + /* Verify we're back to baseline */ 202 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 203 + ASSERT_GE(ret_after, 0); 204 + 205 + TH_LOG("After all children exit: %zd active user namespaces", ret_after); 206 + ASSERT_EQ(ret_before, ret_after); 207 + } 208 + 209 + /* 210 + * Test rapid namespace creation with different namespace types. 211 + * Create multiple types of namespaces rapidly to stress the tracking system. 
212 + */ 213 + TEST(rapid_mixed_namespace_creation) 214 + { 215 + struct ns_id_req req = { 216 + .size = sizeof(req), 217 + .spare = 0, 218 + .ns_id = 0, 219 + .ns_type = 0, /* All types */ 220 + .spare2 = 0, 221 + .user_ns_id = 0, 222 + }; 223 + __u64 ns_ids_before[512], ns_ids_after[512]; 224 + ssize_t ret_before, ret_after; 225 + int i; 226 + 227 + /* Get baseline count */ 228 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 229 + if (ret_before < 0) { 230 + if (errno == ENOSYS) 231 + SKIP(return, "listns() not supported"); 232 + ASSERT_GE(ret_before, 0); 233 + } 234 + 235 + TH_LOG("Baseline: %zd active namespaces (all types)", ret_before); 236 + 237 + /* Rapidly create and destroy namespaces with multiple types */ 238 + for (i = 0; i < 50; i++) { 239 + pid_t pid = fork(); 240 + ASSERT_GE(pid, 0); 241 + 242 + if (pid == 0) { 243 + /* Child: create multiple namespace types */ 244 + if (setup_userns() < 0) 245 + exit(1); 246 + 247 + /* Create additional namespace types */ 248 + if (unshare(CLONE_NEWNET) < 0) 249 + exit(1); 250 + if (unshare(CLONE_NEWUTS) < 0) 251 + exit(1); 252 + if (unshare(CLONE_NEWIPC) < 0) 253 + exit(1); 254 + 255 + exit(0); 256 + } 257 + 258 + /* Parent: wait for child */ 259 + int status; 260 + waitpid(pid, &status, 0); 261 + ASSERT_TRUE(WIFEXITED(status)); 262 + } 263 + 264 + /* Verify we're back to baseline */ 265 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 266 + ASSERT_GE(ret_after, 0); 267 + 268 + TH_LOG("After 50 rapid mixed namespace cycles: %zd active namespaces", ret_after); 269 + ASSERT_EQ(ret_before, ret_after); 270 + } 271 + 272 + /* 273 + * Test nested namespace creation under stress. 274 + * Create deeply nested namespace hierarchies and verify proper cleanup. 
275 + */ 276 + TEST(nested_namespace_stress) 277 + { 278 + struct ns_id_req req = { 279 + .size = sizeof(req), 280 + .spare = 0, 281 + .ns_id = 0, 282 + .ns_type = CLONE_NEWUSER, 283 + .spare2 = 0, 284 + .user_ns_id = 0, 285 + }; 286 + __u64 ns_ids_before[512], ns_ids_after[512]; 287 + ssize_t ret_before, ret_after; 288 + int i; 289 + 290 + /* Get baseline */ 291 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 292 + if (ret_before < 0) { 293 + if (errno == ENOSYS) 294 + SKIP(return, "listns() not supported"); 295 + ASSERT_GE(ret_before, 0); 296 + } 297 + 298 + TH_LOG("Baseline: %zd active user namespaces", ret_before); 299 + 300 + /* Create 20 processes, each with nested user namespaces */ 301 + for (i = 0; i < 20; i++) { 302 + pid_t pid = fork(); 303 + ASSERT_GE(pid, 0); 304 + 305 + if (pid == 0) { 306 + int userns_fd; 307 + uid_t orig_uid = getuid(); 308 + int depth; 309 + 310 + /* Create nested user namespaces (up to 5 levels) */ 311 + for (depth = 0; depth < 5; depth++) { 312 + userns_fd = get_userns_fd(0, (depth == 0) ? orig_uid : 0, 1); 313 + if (userns_fd < 0) 314 + exit(1); 315 + 316 + if (setns(userns_fd, CLONE_NEWUSER) < 0) { 317 + close(userns_fd); 318 + exit(1); 319 + } 320 + close(userns_fd); 321 + } 322 + 323 + exit(0); 324 + } 325 + 326 + /* Parent: wait for child */ 327 + int status; 328 + waitpid(pid, &status, 0); 329 + ASSERT_TRUE(WIFEXITED(status)); 330 + } 331 + 332 + /* Verify we're back to baseline */ 333 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 334 + ASSERT_GE(ret_after, 0); 335 + 336 + TH_LOG("After 20 nested namespace hierarchies: %zd active user namespaces", ret_after); 337 + ASSERT_EQ(ret_before, ret_after); 338 + } 339 + 340 + /* 341 + * Test listns() pagination under stress. 342 + * Create many namespaces and verify pagination works correctly. 
343 + */ 344 + TEST(listns_pagination_stress) 345 + { 346 + struct ns_id_req req = { 347 + .size = sizeof(req), 348 + .spare = 0, 349 + .ns_id = 0, 350 + .ns_type = CLONE_NEWUSER, 351 + .spare2 = 0, 352 + .user_ns_id = 0, 353 + }; 354 + pid_t pids[30]; 355 + int num_children = 30; 356 + int i; 357 + int sv[2]; 358 + __u64 all_ns_ids[512]; 359 + int total_found = 0; 360 + 361 + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, sv), 0); 362 + 363 + /* Create many children with user namespaces */ 364 + for (i = 0; i < num_children; i++) { 365 + pids[i] = fork(); 366 + ASSERT_GE(pids[i], 0); 367 + 368 + if (pids[i] == 0) { 369 + char c; 370 + close(sv[0]); 371 + 372 + if (setup_userns() < 0) { 373 + close(sv[1]); 374 + exit(1); 375 + } 376 + 377 + /* Signal parent we're ready */ 378 + if (write(sv[1], &c, 1) != 1) { 379 + close(sv[1]); 380 + exit(1); 381 + } 382 + 383 + /* Wait for parent signal to exit */ 384 + if (read(sv[1], &c, 1) != 1) { 385 + close(sv[1]); 386 + exit(1); 387 + } 388 + 389 + close(sv[1]); 390 + exit(0); 391 + } 392 + } 393 + 394 + close(sv[1]); 395 + 396 + /* Wait for all children to signal ready */ 397 + for (i = 0; i < num_children; i++) { 398 + char c; 399 + if (read(sv[0], &c, 1) != 1) { 400 + /* If we fail to read, kill all children and exit */ 401 + close(sv[0]); 402 + for (int j = 0; j < num_children; j++) 403 + kill(pids[j], SIGKILL); 404 + for (int j = 0; j < num_children; j++) 405 + waitpid(pids[j], NULL, 0); 406 + ASSERT_TRUE(false); 407 + } 408 + } 409 + 410 + /* Paginate through all namespaces using small batch sizes */ 411 + req.ns_id = 0; 412 + while (1) { 413 + __u64 batch[5]; /* Small batch size to force pagination */ 414 + ssize_t ret; 415 + 416 + ret = sys_listns(&req, batch, ARRAY_SIZE(batch), 0); 417 + if (ret < 0) { 418 + if (errno == ENOSYS) { 419 + close(sv[0]); 420 + for (i = 0; i < num_children; i++) 421 + kill(pids[i], SIGKILL); 422 + for (i = 0; i < num_children; i++) 423 + waitpid(pids[i], NULL, 0); 424 + SKIP(return, 
"listns() not supported"); 425 + } 426 + ASSERT_GE(ret, 0); 427 + } 428 + 429 + if (ret == 0) 430 + break; 431 + 432 + /* Store results */ 433 + for (i = 0; i < ret && total_found < 512; i++) { 434 + all_ns_ids[total_found++] = batch[i]; 435 + } 436 + 437 + /* Update cursor for next batch */ 438 + if (ret == ARRAY_SIZE(batch)) 439 + req.ns_id = batch[ret - 1]; 440 + else 441 + break; 442 + } 443 + 444 + TH_LOG("Paginated through %d user namespaces", total_found); 445 + 446 + /* Verify no duplicates in pagination */ 447 + for (i = 0; i < total_found; i++) { 448 + for (int j = i + 1; j < total_found; j++) { 449 + if (all_ns_ids[i] == all_ns_ids[j]) { 450 + TH_LOG("Found duplicate ns_id: %llu at positions %d and %d", 451 + (unsigned long long)all_ns_ids[i], i, j); 452 + ASSERT_TRUE(false); 453 + } 454 + } 455 + } 456 + 457 + /* Signal all children to exit */ 458 + for (i = 0; i < num_children; i++) { 459 + char c = 'X'; 460 + if (write(sv[0], &c, 1) != 1) { 461 + close(sv[0]); 462 + for (int j = i; j < num_children; j++) 463 + kill(pids[j], SIGKILL); 464 + for (int j = 0; j < num_children; j++) 465 + waitpid(pids[j], NULL, 0); 466 + ASSERT_TRUE(false); 467 + } 468 + } 469 + 470 + close(sv[0]); 471 + 472 + /* Wait for all children */ 473 + for (i = 0; i < num_children; i++) { 474 + int status; 475 + waitpid(pids[i], &status, 0); 476 + } 477 + } 478 + 479 + /* 480 + * Test concurrent namespace operations. 481 + * Multiple processes creating, querying, and destroying namespaces concurrently. 
482 + */ 483 + TEST(concurrent_namespace_operations) 484 + { 485 + struct ns_id_req req = { 486 + .size = sizeof(req), 487 + .spare = 0, 488 + .ns_id = 0, 489 + .ns_type = 0, 490 + .spare2 = 0, 491 + .user_ns_id = 0, 492 + }; 493 + __u64 ns_ids_before[512], ns_ids_after[512]; 494 + ssize_t ret_before, ret_after; 495 + pid_t pids[20]; 496 + int num_workers = 20; 497 + int i; 498 + 499 + /* Get baseline */ 500 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 501 + if (ret_before < 0) { 502 + if (errno == ENOSYS) 503 + SKIP(return, "listns() not supported"); 504 + ASSERT_GE(ret_before, 0); 505 + } 506 + 507 + TH_LOG("Baseline: %zd active namespaces", ret_before); 508 + 509 + /* Create worker processes that do concurrent operations */ 510 + for (i = 0; i < num_workers; i++) { 511 + pids[i] = fork(); 512 + ASSERT_GE(pids[i], 0); 513 + 514 + if (pids[i] == 0) { 515 + /* Each worker: create namespaces, list them, repeat */ 516 + int iterations; 517 + 518 + for (iterations = 0; iterations < 10; iterations++) { 519 + int userns_fd; 520 + __u64 temp_ns_ids[100]; 521 + ssize_t ret; 522 + 523 + /* Create a user namespace */ 524 + userns_fd = get_userns_fd(0, getuid(), 1); 525 + if (userns_fd < 0) 526 + continue; 527 + 528 + /* List namespaces */ 529 + ret = sys_listns(&req, temp_ns_ids, ARRAY_SIZE(temp_ns_ids), 0); 530 + (void)ret; 531 + 532 + close(userns_fd); 533 + 534 + /* Small delay */ 535 + usleep(1000); 536 + } 537 + 538 + exit(0); 539 + } 540 + } 541 + 542 + /* Wait for all workers */ 543 + for (i = 0; i < num_workers; i++) { 544 + int status; 545 + waitpid(pids[i], &status, 0); 546 + ASSERT_TRUE(WIFEXITED(status)); 547 + ASSERT_EQ(WEXITSTATUS(status), 0); 548 + } 549 + 550 + /* Verify we're back to baseline */ 551 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 552 + ASSERT_GE(ret_after, 0); 553 + 554 + TH_LOG("After concurrent operations: %zd active namespaces", ret_after); 555 + ASSERT_EQ(ret_before, 
ret_after); 556 + } 557 + 558 + /* 559 + * Test namespace churn - continuous creation and destruction. 560 + * Simulates high-churn scenarios like container orchestration. 561 + */ 562 + TEST(namespace_churn) 563 + { 564 + struct ns_id_req req = { 565 + .size = sizeof(req), 566 + .spare = 0, 567 + .ns_id = 0, 568 + .ns_type = CLONE_NEWUSER | CLONE_NEWNET | CLONE_NEWUTS, 569 + .spare2 = 0, 570 + .user_ns_id = 0, 571 + }; 572 + __u64 ns_ids_before[512], ns_ids_after[512]; 573 + ssize_t ret_before, ret_after; 574 + int cycle; 575 + 576 + /* Get baseline */ 577 + ret_before = sys_listns(&req, ns_ids_before, ARRAY_SIZE(ns_ids_before), 0); 578 + if (ret_before < 0) { 579 + if (errno == ENOSYS) 580 + SKIP(return, "listns() not supported"); 581 + ASSERT_GE(ret_before, 0); 582 + } 583 + 584 + TH_LOG("Baseline: %zd active namespaces", ret_before); 585 + 586 + /* Simulate churn: batches of namespaces created and destroyed */ 587 + for (cycle = 0; cycle < 10; cycle++) { 588 + pid_t batch_pids[10]; 589 + int i; 590 + 591 + /* Create batch */ 592 + for (i = 0; i < 10; i++) { 593 + batch_pids[i] = fork(); 594 + ASSERT_GE(batch_pids[i], 0); 595 + 596 + if (batch_pids[i] == 0) { 597 + /* Create multiple namespace types */ 598 + if (setup_userns() < 0) 599 + exit(1); 600 + if (unshare(CLONE_NEWNET) < 0) 601 + exit(1); 602 + if (unshare(CLONE_NEWUTS) < 0) 603 + exit(1); 604 + 605 + /* Keep namespaces alive briefly */ 606 + usleep(10000); 607 + exit(0); 608 + } 609 + } 610 + 611 + /* Wait for batch to complete */ 612 + for (i = 0; i < 10; i++) { 613 + int status; 614 + waitpid(batch_pids[i], &status, 0); 615 + } 616 + } 617 + 618 + /* Verify we're back to baseline */ 619 + ret_after = sys_listns(&req, ns_ids_after, ARRAY_SIZE(ns_ids_after), 0); 620 + ASSERT_GE(ret_after, 0); 621 + 622 + TH_LOG("After 10 churn cycles (100 namespace sets): %zd active namespaces", ret_after); 623 + ASSERT_EQ(ret_before, ret_after); 624 + } 625 + 626 + TEST_HARNESS_MAIN

+35

tools/testing/selftests/namespaces/wrappers.h

/* SPDX-License-Identifier: GPL-2.0 */

#include <linux/nsfs.h>
#include <linux/types.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __SELFTESTS_NAMESPACES_WRAPPERS_H__
#define __SELFTESTS_NAMESPACES_WRAPPERS_H__

/*
 * Fallback syscall number for listns() when the installed headers do not
 * define __NR_listns yet: 580 on alpha, 470 plus the per-ABI base on MIPS
 * (o32/n32/n64), and 470 everywhere else.
 */
#if !defined(__NR_listns)
#  if defined(__alpha__)
#    define __NR_listns 580
#  elif defined(_MIPS_SIM)
#    if _MIPS_SIM == _MIPS_SIM_ABI32		/* o32 */
#      define __NR_listns 4470
#    elif _MIPS_SIM == _MIPS_SIM_NABI32		/* n32 */
#      define __NR_listns 6470
#    elif _MIPS_SIM == _MIPS_SIM_ABI64		/* n64 */
#      define __NR_listns 5470
#    endif
#  else
#    define __NR_listns 470
#  endif
#endif

/*
 * Thin wrapper that issues the raw listns() system call.
 *
 * @req:       request passed through to the kernel unchanged
 * @ns_ids:    caller-provided buffer handed to the kernel
 * @nr_ns_ids: number of __u64 slots available in @ns_ids
 * @flags:     flags argument passed through unchanged
 *
 * Returns whatever syscall() returns, converted to int; the tests that use
 * this wrapper treat a negative value as failure and consult errno.
 */
static inline int sys_listns(const struct ns_id_req *req, __u64 *ns_ids,
			     size_t nr_ns_ids, unsigned int flags)
{
	long rc = syscall(__NR_listns, req, ns_ids, nr_ns_ids, flags);

	return rc;
}

#endif /* __SELFTESTS_NAMESPACES_WRAPPERS_H__ */

Configure Feed

Configure Feed