Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

+11

Documentation/admin-guide/cgroup-v1/cpusets.rst

··· 223 223 automatically tracks the value of node_states[N_MEMORY]--i.e., 224 224 nodes with memory--using the cpuset_track_online_nodes() hook. 225 225 226 + The cpuset.effective_cpus and cpuset.effective_mems files are 227 + normally read-only copies of cpuset.cpus and cpuset.mems files 228 + respectively. If the cpuset cgroup filesystem is mounted with the 229 + special "cpuset_v2_mode" option, the behavior of these files will become 230 + similar to the corresponding files in cpuset v2. In other words, hotplug 231 + events will not change cpuset.cpus and cpuset.mems. Those events will 232 + only affect cpuset.effective_cpus and cpuset.effective_mems which show 233 + the actual cpus and memory nodes that are currently used by this cpuset. 234 + See Documentation/admin-guide/cgroup-v2.rst for more information about 235 + cpuset v2 behavior. 236 + 226 237 227 238 1.4 What are exclusive cpusets ? 228 239 --------------------------------

+90 -1

fs/kernfs/inode.c

··· 53 53 kn->iattr->ia_ctime = kn->iattr->ia_atime; 54 54 55 55 simple_xattrs_init(&kn->iattr->xattrs); 56 + atomic_set(&kn->iattr->nr_user_xattrs, 0); 57 + atomic_set(&kn->iattr->user_xattr_size, 0); 56 58 out_unlock: 57 59 ret = kn->iattr; 58 60 mutex_unlock(&iattr_mutex); ··· 305 303 if (!attrs) 306 304 return -ENOMEM; 307 305 308 - return simple_xattr_set(&attrs->xattrs, name, value, size, flags); 306 + return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); 309 307 } 310 308 311 309 static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, ··· 329 327 return kernfs_xattr_set(kn, name, value, size, flags); 330 328 } 331 329 330 + static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, 331 + const char *full_name, 332 + struct simple_xattrs *xattrs, 333 + const void *value, size_t size, int flags) 334 + { 335 + atomic_t *sz = &kn->iattr->user_xattr_size; 336 + atomic_t *nr = &kn->iattr->nr_user_xattrs; 337 + ssize_t removed_size; 338 + int ret; 339 + 340 + if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { 341 + ret = -ENOSPC; 342 + goto dec_count_out; 343 + } 344 + 345 + if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { 346 + ret = -ENOSPC; 347 + goto dec_size_out; 348 + } 349 + 350 + ret = simple_xattr_set(xattrs, full_name, value, size, flags, 351 + &removed_size); 352 + 353 + if (!ret && removed_size >= 0) 354 + size = removed_size; 355 + else if (!ret) 356 + return 0; 357 + dec_size_out: 358 + atomic_sub(size, sz); 359 + dec_count_out: 360 + atomic_dec(nr); 361 + return ret; 362 + } 363 + 364 + static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, 365 + const char *full_name, 366 + struct simple_xattrs *xattrs, 367 + const void *value, size_t size, int flags) 368 + { 369 + atomic_t *sz = &kn->iattr->user_xattr_size; 370 + atomic_t *nr = &kn->iattr->nr_user_xattrs; 371 + ssize_t removed_size; 372 + int ret; 373 + 374 + ret = simple_xattr_set(xattrs, full_name, value, size, flags, 375 + 
&removed_size); 376 + 377 + if (removed_size >= 0) { 378 + atomic_sub(removed_size, sz); 379 + atomic_dec(nr); 380 + } 381 + 382 + return ret; 383 + } 384 + 385 + static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, 386 + struct dentry *unused, struct inode *inode, 387 + const char *suffix, const void *value, 388 + size_t size, int flags) 389 + { 390 + const char *full_name = xattr_full_name(handler, suffix); 391 + struct kernfs_node *kn = inode->i_private; 392 + struct kernfs_iattrs *attrs; 393 + 394 + if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) 395 + return -EOPNOTSUPP; 396 + 397 + attrs = kernfs_iattrs(kn); 398 + if (!attrs) 399 + return -ENOMEM; 400 + 401 + if (value) 402 + return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, 403 + value, size, flags); 404 + else 405 + return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, 406 + value, size, flags); 407 + 408 + } 409 + 332 410 static const struct xattr_handler kernfs_trusted_xattr_handler = { 333 411 .prefix = XATTR_TRUSTED_PREFIX, 334 412 .get = kernfs_vfs_xattr_get, ··· 421 339 .set = kernfs_vfs_xattr_set, 422 340 }; 423 341 342 + static const struct xattr_handler kernfs_user_xattr_handler = { 343 + .prefix = XATTR_USER_PREFIX, 344 + .get = kernfs_vfs_xattr_get, 345 + .set = kernfs_vfs_user_xattr_set, 346 + }; 347 + 424 348 const struct xattr_handler *kernfs_xattr_handlers[] = { 425 349 &kernfs_trusted_xattr_handler, 426 350 &kernfs_security_xattr_handler, 351 + &kernfs_user_xattr_handler, 427 352 NULL 428 353 };

+2

fs/kernfs/kernfs-internal.h

··· 26 26 struct timespec64 ia_ctime; 27 27 28 28 struct simple_xattrs xattrs; 29 + atomic_t nr_user_xattrs; 30 + atomic_t user_xattr_size; 29 31 }; 30 32 31 33 /* +1 to avoid triggering overflow warning when negating it */

+13 -4

fs/xattr.c

··· 817 817 if (len < sizeof(*new_xattr)) 818 818 return NULL; 819 819 820 - new_xattr = kmalloc(len, GFP_KERNEL); 820 + new_xattr = kvmalloc(len, GFP_KERNEL); 821 821 if (!new_xattr) 822 822 return NULL; 823 823 ··· 860 860 * @value: value of the xattr. If %NULL, will remove the attribute. 861 861 * @size: size of the new xattr 862 862 * @flags: %XATTR_{CREATE|REPLACE} 863 + * @removed_size: returns size of the removed xattr, -1 if none removed 863 864 * 864 865 * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails 865 866 * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; ··· 869 868 * Returns 0 on success, -errno on failure. 870 869 */ 871 870 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, 872 - const void *value, size_t size, int flags) 871 + const void *value, size_t size, int flags, 872 + ssize_t *removed_size) 873 873 { 874 874 struct simple_xattr *xattr; 875 875 struct simple_xattr *new_xattr = NULL; ··· 884 882 885 883 new_xattr->name = kstrdup(name, GFP_KERNEL); 886 884 if (!new_xattr->name) { 887 - kfree(new_xattr); 885 + kvfree(new_xattr); 888 886 return -ENOMEM; 889 887 } 890 888 } ··· 897 895 err = -EEXIST; 898 896 } else if (new_xattr) { 899 897 list_replace(&xattr->list, &new_xattr->list); 898 + if (removed_size) 899 + *removed_size = xattr->size; 900 900 } else { 901 901 list_del(&xattr->list); 902 + if (removed_size) 903 + *removed_size = xattr->size; 902 904 } 903 905 goto out; 904 906 } ··· 914 908 list_add(&new_xattr->list, &xattrs->head); 915 909 xattr = NULL; 916 910 } 911 + 912 + if (removed_size) 913 + *removed_size = -1; 917 914 out: 918 915 spin_unlock(&xattrs->lock); 919 916 if (xattr) { 920 917 kfree(xattr->name); 921 - kfree(xattr); 918 + kvfree(xattr); 922 919 } 923 920 return err; 924 921

+3 -2

include/linux/cgroup-defs.h

··· 633 633 void (*cancel_attach)(struct cgroup_taskset *tset); 634 634 void (*attach)(struct cgroup_taskset *tset); 635 635 void (*post_attach)(void); 636 - int (*can_fork)(struct task_struct *task); 637 - void (*cancel_fork)(struct task_struct *task); 636 + int (*can_fork)(struct task_struct *task, 637 + struct css_set *cset); 638 + void (*cancel_fork)(struct task_struct *task, struct css_set *cset); 638 639 void (*fork)(struct task_struct *task); 639 640 void (*exit)(struct task_struct *task); 640 641 void (*release)(struct task_struct *task);

+14 -9

include/linux/cgroup.h

··· 27 27 28 28 #include <linux/cgroup-defs.h> 29 29 30 + struct kernel_clone_args; 31 + 30 32 #ifdef CONFIG_CGROUPS 31 33 32 34 /* ··· 60 58 struct list_head *tcset_head; 61 59 62 60 struct list_head *task_pos; 63 - struct list_head *tasks_head; 64 - struct list_head *mg_tasks_head; 65 - struct list_head *dying_tasks_head; 66 61 67 62 struct list_head *cur_tasks_head; 68 63 struct css_set *cur_cset; ··· 121 122 struct pid *pid, struct task_struct *tsk); 122 123 123 124 void cgroup_fork(struct task_struct *p); 124 - extern int cgroup_can_fork(struct task_struct *p); 125 - extern void cgroup_cancel_fork(struct task_struct *p); 126 - extern void cgroup_post_fork(struct task_struct *p); 125 + extern int cgroup_can_fork(struct task_struct *p, 126 + struct kernel_clone_args *kargs); 127 + extern void cgroup_cancel_fork(struct task_struct *p, 128 + struct kernel_clone_args *kargs); 129 + extern void cgroup_post_fork(struct task_struct *p, 130 + struct kernel_clone_args *kargs); 127 131 void cgroup_exit(struct task_struct *p); 128 132 void cgroup_release(struct task_struct *p); 129 133 void cgroup_free(struct task_struct *p); ··· 710 708 struct dentry *dentry) { return -EINVAL; } 711 709 712 710 static inline void cgroup_fork(struct task_struct *p) {} 713 - static inline int cgroup_can_fork(struct task_struct *p) { return 0; } 714 - static inline void cgroup_cancel_fork(struct task_struct *p) {} 715 - static inline void cgroup_post_fork(struct task_struct *p) {} 711 + static inline int cgroup_can_fork(struct task_struct *p, 712 + struct kernel_clone_args *kargs) { return 0; } 713 + static inline void cgroup_cancel_fork(struct task_struct *p, 714 + struct kernel_clone_args *kargs) {} 715 + static inline void cgroup_post_fork(struct task_struct *p, 716 + struct kernel_clone_args *kargs) {} 716 717 static inline void cgroup_exit(struct task_struct *p) {} 717 718 static inline void cgroup_release(struct task_struct *p) {} 718 719 static inline void cgroup_free(struct 
task_struct *p) {}

+9 -2

include/linux/kernfs.h

··· 37 37 KERNFS_LINK = 0x0004, 38 38 }; 39 39 40 - #define KERNFS_TYPE_MASK 0x000f 41 - #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK 40 + #define KERNFS_TYPE_MASK 0x000f 41 + #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK 42 + #define KERNFS_MAX_USER_XATTRS 128 43 + #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) 42 44 43 45 enum kernfs_node_flag { 44 46 KERNFS_ACTIVATED = 0x0010, ··· 80 78 * fhandle to access nodes of the fs. 81 79 */ 82 80 KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, 81 + 82 + /* 83 + * Support user xattrs to be written to nodes rooted at this root. 84 + */ 85 + KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, 83 86 }; 84 87 85 88 /* type-specific structures for kernfs_node union members */

+4

include/linux/sched/task.h

··· 13 13 struct task_struct; 14 14 struct rusage; 15 15 union thread_union; 16 + struct css_set; 16 17 17 18 /* All the bits taken by the old clone syscall. */ 18 19 #define CLONE_LEGACY_FLAGS 0xffffffffULL ··· 30 29 pid_t *set_tid; 31 30 /* Number of elements in *set_tid */ 32 31 size_t set_tid_size; 32 + int cgroup; 33 + struct cgroup *cgrp; 34 + struct css_set *cset; 33 35 }; 34 36 35 37 /*

+2 -1

include/linux/xattr.h

··· 102 102 int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, 103 103 void *buffer, size_t size); 104 104 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, 105 - const void *value, size_t size, int flags); 105 + const void *value, size_t size, int flags, 106 + ssize_t *removed_size); 106 107 ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, 107 108 size_t size); 108 109 void simple_xattr_list_add(struct simple_xattrs *xattrs,

+5

include/uapi/linux/sched.h

··· 35 35 36 36 /* Flags for the clone3() syscall. */ 37 37 #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */ 38 + #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ 38 39 39 40 /* 40 41 * cloning flags intersect with CSIGNAL so can be used with unshare and clone3 ··· 82 81 * @set_tid_size: This defines the size of the array referenced 83 82 * in @set_tid. This cannot be larger than the 84 83 * kernel's limit of nested PID namespaces. 84 + * @cgroup: If CLONE_INTO_CGROUP is specified set this to 85 + * a file descriptor for the cgroup. 85 86 * 86 87 * The structure is versioned by size and thus extensible. 87 88 * New struct members must go at the end of the struct and ··· 100 97 __aligned_u64 tls; 101 98 __aligned_u64 set_tid; 102 99 __aligned_u64 set_tid_size; 100 + __aligned_u64 cgroup; 103 101 }; 104 102 #endif 105 103 106 104 #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ 107 105 #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ 106 + #define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ 108 107 109 108 /* 110 109 * Scheduling policies

+17 -17

kernel/cgroup/cgroup-v1.c

··· 38 38 */ 39 39 static struct workqueue_struct *cgroup_pidlist_destroy_wq; 40 40 41 - /* 42 - * Protects cgroup_subsys->release_agent_path. Modifying it also requires 43 - * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. 44 - */ 41 + /* protects cgroup_subsys->release_agent_path */ 45 42 static DEFINE_SPINLOCK(release_agent_path_lock); 46 43 47 44 bool cgroup1_ssid_disabled(int ssid) ··· 772 775 { 773 776 struct cgroup *cgrp = 774 777 container_of(work, struct cgroup, release_agent_work); 775 - char *pathbuf = NULL, *agentbuf = NULL; 778 + char *pathbuf, *agentbuf; 776 779 char *argv[3], *envp[3]; 777 780 int ret; 778 781 779 - mutex_lock(&cgroup_mutex); 782 + /* snoop agent path and exit early if empty */ 783 + if (!cgrp->root->release_agent_path[0]) 784 + return; 780 785 786 + /* prepare argument buffers */ 781 787 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 782 - agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 783 - if (!pathbuf || !agentbuf || !strlen(agentbuf)) 784 - goto out; 788 + agentbuf = kmalloc(PATH_MAX, GFP_KERNEL); 789 + if (!pathbuf || !agentbuf) 790 + goto out_free; 785 791 786 - spin_lock_irq(&css_set_lock); 787 - ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); 788 - spin_unlock_irq(&css_set_lock); 792 + spin_lock(&release_agent_path_lock); 793 + strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX); 794 + spin_unlock(&release_agent_path_lock); 795 + if (!agentbuf[0]) 796 + goto out_free; 797 + 798 + ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); 789 799 if (ret < 0 || ret >= PATH_MAX) 790 - goto out; 800 + goto out_free; 791 801 792 802 argv[0] = agentbuf; 793 803 argv[1] = pathbuf; ··· 805 801 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 806 802 envp[2] = NULL; 807 803 808 - mutex_unlock(&cgroup_mutex); 809 804 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 810 - goto out_free; 811 - out: 812 - mutex_unlock(&cgroup_mutex); 813 805 out_free: 814 806 
kfree(agentbuf); 815 807 kfree(pathbuf);

+269 -90

kernel/cgroup/cgroup.c

··· 1966 1966 1967 1967 root->kf_root = kernfs_create_root(kf_sops, 1968 1968 KERNFS_ROOT_CREATE_DEACTIVATED | 1969 - KERNFS_ROOT_SUPPORT_EXPORTOP, 1969 + KERNFS_ROOT_SUPPORT_EXPORTOP | 1970 + KERNFS_ROOT_SUPPORT_USER_XATTR, 1970 1971 root_cgrp); 1971 1972 if (IS_ERR(root->kf_root)) { 1972 1973 ret = PTR_ERR(root->kf_root); ··· 2727 2726 { 2728 2727 DEFINE_CGROUP_MGCTX(mgctx); 2729 2728 struct task_struct *task; 2730 - int ret; 2731 - 2732 - ret = cgroup_migrate_vet_dst(dst_cgrp); 2733 - if (ret) 2734 - return ret; 2729 + int ret = 0; 2735 2730 2736 2731 /* look up all src csets */ 2737 2732 spin_lock_irq(&css_set_lock); ··· 4157 4160 } else if (likely(!(pos->flags & CSS_RELEASED))) { 4158 4161 next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling); 4159 4162 } else { 4160 - list_for_each_entry_rcu(next, &parent->children, sibling) 4163 + list_for_each_entry_rcu(next, &parent->children, sibling, 4164 + lockdep_is_held(&cgroup_mutex)) 4161 4165 if (next->serial_nr > pos->serial_nr) 4162 4166 break; 4163 4167 } ··· 4401 4403 4402 4404 lockdep_assert_held(&css_set_lock); 4403 4405 4404 - /* Advance to the next non-empty css_set */ 4405 - do { 4406 - cset = css_task_iter_next_css_set(it); 4407 - if (!cset) { 4408 - it->task_pos = NULL; 4409 - return; 4406 + /* Advance to the next non-empty css_set and find first non-empty tasks list*/ 4407 + while ((cset = css_task_iter_next_css_set(it))) { 4408 + if (!list_empty(&cset->tasks)) { 4409 + it->cur_tasks_head = &cset->tasks; 4410 + break; 4411 + } else if (!list_empty(&cset->mg_tasks)) { 4412 + it->cur_tasks_head = &cset->mg_tasks; 4413 + break; 4414 + } else if (!list_empty(&cset->dying_tasks)) { 4415 + it->cur_tasks_head = &cset->dying_tasks; 4416 + break; 4410 4417 } 4411 - } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); 4412 - 4413 - if (!list_empty(&cset->tasks)) { 4414 - it->task_pos = cset->tasks.next; 4415 - it->cur_tasks_head = &cset->tasks; 4416 - } else if 
(!list_empty(&cset->mg_tasks)) { 4417 - it->task_pos = cset->mg_tasks.next; 4418 - it->cur_tasks_head = &cset->mg_tasks; 4419 - } else { 4420 - it->task_pos = cset->dying_tasks.next; 4421 - it->cur_tasks_head = &cset->dying_tasks; 4422 4418 } 4423 - 4424 - it->tasks_head = &cset->tasks; 4425 - it->mg_tasks_head = &cset->mg_tasks; 4426 - it->dying_tasks_head = &cset->dying_tasks; 4419 + if (!cset) { 4420 + it->task_pos = NULL; 4421 + return; 4422 + } 4423 + it->task_pos = it->cur_tasks_head->next; 4427 4424 4428 4425 /* 4429 4426 * We don't keep css_sets locked across iteration steps and thus ··· 4463 4470 repeat: 4464 4471 if (it->task_pos) { 4465 4472 /* 4466 - * Advance iterator to find next entry. cset->tasks is 4467 - * consumed first and then ->mg_tasks. After ->mg_tasks, 4468 - * we move onto the next cset. 4473 + * Advance iterator to find next entry. We go through cset 4474 + * tasks, mg_tasks and dying_tasks, when consumed we move onto 4475 + * the next cset. 4469 4476 */ 4470 4477 if (it->flags & CSS_TASK_ITER_SKIPPED) 4471 4478 it->flags &= ~CSS_TASK_ITER_SKIPPED; 4472 4479 else 4473 4480 it->task_pos = it->task_pos->next; 4474 4481 4475 - if (it->task_pos == it->tasks_head) { 4476 - it->task_pos = it->mg_tasks_head->next; 4477 - it->cur_tasks_head = it->mg_tasks_head; 4482 + if (it->task_pos == &it->cur_cset->tasks) { 4483 + it->cur_tasks_head = &it->cur_cset->mg_tasks; 4484 + it->task_pos = it->cur_tasks_head->next; 4478 4485 } 4479 - if (it->task_pos == it->mg_tasks_head) { 4480 - it->task_pos = it->dying_tasks_head->next; 4481 - it->cur_tasks_head = it->dying_tasks_head; 4486 + if (it->task_pos == &it->cur_cset->mg_tasks) { 4487 + it->cur_tasks_head = &it->cur_cset->dying_tasks; 4488 + it->task_pos = it->cur_tasks_head->next; 4482 4489 } 4483 - if (it->task_pos == it->dying_tasks_head) 4490 + if (it->task_pos == &it->cur_cset->dying_tasks) 4484 4491 css_task_iter_advance_css_set(it); 4485 4492 } else { 4486 4493 /* called from start, proceed to the 
first cset */ ··· 4498 4505 goto repeat; 4499 4506 4500 4507 /* and dying leaders w/o live member threads */ 4501 - if (it->cur_tasks_head == it->dying_tasks_head && 4508 + if (it->cur_tasks_head == &it->cur_cset->dying_tasks && 4502 4509 !atomic_read(&task->signal->live)) 4503 4510 goto repeat; 4504 4511 } else { 4505 4512 /* skip all dying ones */ 4506 - if (it->cur_tasks_head == it->dying_tasks_head) 4513 + if (it->cur_tasks_head == &it->cur_cset->dying_tasks) 4507 4514 goto repeat; 4508 4515 } 4509 4516 } ··· 4667 4674 return 0; 4668 4675 } 4669 4676 4677 + static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) 4678 + { 4679 + int ret; 4680 + struct inode *inode; 4681 + 4682 + lockdep_assert_held(&cgroup_mutex); 4683 + 4684 + inode = kernfs_get_inode(sb, cgrp->procs_file.kn); 4685 + if (!inode) 4686 + return -ENOMEM; 4687 + 4688 + ret = inode_permission(inode, MAY_WRITE); 4689 + iput(inode); 4690 + return ret; 4691 + } 4692 + 4670 4693 static int cgroup_procs_write_permission(struct cgroup *src_cgrp, 4671 4694 struct cgroup *dst_cgrp, 4672 4695 struct super_block *sb) 4673 4696 { 4674 4697 struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 4675 4698 struct cgroup *com_cgrp = src_cgrp; 4676 - struct inode *inode; 4677 4699 int ret; 4678 4700 4679 4701 lockdep_assert_held(&cgroup_mutex); ··· 4698 4690 com_cgrp = cgroup_parent(com_cgrp); 4699 4691 4700 4692 /* %current should be authorized to migrate to the common ancestor */ 4701 - inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn); 4702 - if (!inode) 4703 - return -ENOMEM; 4704 - 4705 - ret = inode_permission(inode, MAY_WRITE); 4706 - iput(inode); 4693 + ret = cgroup_may_write(com_cgrp, sb); 4707 4694 if (ret) 4708 4695 return ret; 4709 4696 ··· 4712 4709 return -ENOENT; 4713 4710 4714 4711 return 0; 4712 + } 4713 + 4714 + static int cgroup_attach_permissions(struct cgroup *src_cgrp, 4715 + struct cgroup *dst_cgrp, 4716 + struct super_block *sb, bool threadgroup) 4717 + { 4718 
+ int ret = 0; 4719 + 4720 + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); 4721 + if (ret) 4722 + return ret; 4723 + 4724 + ret = cgroup_migrate_vet_dst(dst_cgrp); 4725 + if (ret) 4726 + return ret; 4727 + 4728 + if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)) 4729 + ret = -EOPNOTSUPP; 4730 + 4731 + return ret; 4715 4732 } 4716 4733 4717 4734 static ssize_t cgroup_procs_write(struct kernfs_open_file *of, ··· 4756 4733 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 4757 4734 spin_unlock_irq(&css_set_lock); 4758 4735 4759 - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, 4760 - of->file->f_path.dentry->d_sb); 4736 + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, 4737 + of->file->f_path.dentry->d_sb, true); 4761 4738 if (ret) 4762 4739 goto out_finish; 4763 4740 ··· 4801 4778 spin_unlock_irq(&css_set_lock); 4802 4779 4803 4780 /* thread migrations follow the cgroup.procs delegation rule */ 4804 - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, 4805 - of->file->f_path.dentry->d_sb); 4781 + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, 4782 + of->file->f_path.dentry->d_sb, false); 4806 4783 if (ret) 4807 - goto out_finish; 4808 - 4809 - /* and must be contained in the same domain */ 4810 - ret = -EOPNOTSUPP; 4811 - if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp) 4812 4784 goto out_finish; 4813 4785 4814 4786 ret = cgroup_attach_task(dst_cgrp, task, false); ··· 5894 5876 * @child: pointer to task_struct of forking parent process. 5895 5877 * 5896 5878 * A task is associated with the init_css_set until cgroup_post_fork() 5897 - * attaches it to the parent's css_set. Empty cg_list indicates that 5898 - * @child isn't holding reference to its css_set. 5879 + * attaches it to the target css_set. 
5899 5880 */ 5900 5881 void cgroup_fork(struct task_struct *child) 5901 5882 { ··· 5902 5885 INIT_LIST_HEAD(&child->cg_list); 5903 5886 } 5904 5887 5888 + static struct cgroup *cgroup_get_from_file(struct file *f) 5889 + { 5890 + struct cgroup_subsys_state *css; 5891 + struct cgroup *cgrp; 5892 + 5893 + css = css_tryget_online_from_dir(f->f_path.dentry, NULL); 5894 + if (IS_ERR(css)) 5895 + return ERR_CAST(css); 5896 + 5897 + cgrp = css->cgroup; 5898 + if (!cgroup_on_dfl(cgrp)) { 5899 + cgroup_put(cgrp); 5900 + return ERR_PTR(-EBADF); 5901 + } 5902 + 5903 + return cgrp; 5904 + } 5905 + 5906 + /** 5907 + * cgroup_css_set_fork - find or create a css_set for a child process 5908 + * @kargs: the arguments passed to create the child process 5909 + * 5910 + * This function finds or creates a new css_set which the child 5911 + * process will be attached to in cgroup_post_fork(). By default, 5912 + * the child process will be given the same css_set as its parent. 5913 + * 5914 + * If CLONE_INTO_CGROUP is specified this function will try to find an 5915 + * existing css_set which includes the requested cgroup and if not create 5916 + * a new css_set that the child will be attached to later. If this function 5917 + * succeeds it will hold cgroup_threadgroup_rwsem on return. If 5918 + * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex 5919 + * before grabbing cgroup_threadgroup_rwsem and will hold a reference 5920 + * to the target cgroup. 
5921 + */ 5922 + static int cgroup_css_set_fork(struct kernel_clone_args *kargs) 5923 + __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem) 5924 + { 5925 + int ret; 5926 + struct cgroup *dst_cgrp = NULL; 5927 + struct css_set *cset; 5928 + struct super_block *sb; 5929 + struct file *f; 5930 + 5931 + if (kargs->flags & CLONE_INTO_CGROUP) 5932 + mutex_lock(&cgroup_mutex); 5933 + 5934 + cgroup_threadgroup_change_begin(current); 5935 + 5936 + spin_lock_irq(&css_set_lock); 5937 + cset = task_css_set(current); 5938 + get_css_set(cset); 5939 + spin_unlock_irq(&css_set_lock); 5940 + 5941 + if (!(kargs->flags & CLONE_INTO_CGROUP)) { 5942 + kargs->cset = cset; 5943 + return 0; 5944 + } 5945 + 5946 + f = fget_raw(kargs->cgroup); 5947 + if (!f) { 5948 + ret = -EBADF; 5949 + goto err; 5950 + } 5951 + sb = f->f_path.dentry->d_sb; 5952 + 5953 + dst_cgrp = cgroup_get_from_file(f); 5954 + if (IS_ERR(dst_cgrp)) { 5955 + ret = PTR_ERR(dst_cgrp); 5956 + dst_cgrp = NULL; 5957 + goto err; 5958 + } 5959 + 5960 + if (cgroup_is_dead(dst_cgrp)) { 5961 + ret = -ENODEV; 5962 + goto err; 5963 + } 5964 + 5965 + /* 5966 + * Verify that the target cgroup is writable for us. This is 5967 + * usually done by the vfs layer but since we're not going through 5968 + * the vfs layer here we need to do it "manually". 
5969 + */ 5970 + ret = cgroup_may_write(dst_cgrp, sb); 5971 + if (ret) 5972 + goto err; 5973 + 5974 + ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, 5975 + !(kargs->flags & CLONE_THREAD)); 5976 + if (ret) 5977 + goto err; 5978 + 5979 + kargs->cset = find_css_set(cset, dst_cgrp); 5980 + if (!kargs->cset) { 5981 + ret = -ENOMEM; 5982 + goto err; 5983 + } 5984 + 5985 + put_css_set(cset); 5986 + fput(f); 5987 + kargs->cgrp = dst_cgrp; 5988 + return ret; 5989 + 5990 + err: 5991 + cgroup_threadgroup_change_end(current); 5992 + mutex_unlock(&cgroup_mutex); 5993 + if (f) 5994 + fput(f); 5995 + if (dst_cgrp) 5996 + cgroup_put(dst_cgrp); 5997 + put_css_set(cset); 5998 + if (kargs->cset) 5999 + put_css_set(kargs->cset); 6000 + return ret; 6001 + } 6002 + 6003 + /** 6004 + * cgroup_css_set_put_fork - drop references we took during fork 6005 + * @kargs: the arguments passed to create the child process 6006 + * 6007 + * Drop references to the prepared css_set and target cgroup if 6008 + * CLONE_INTO_CGROUP was requested. 6009 + */ 6010 + static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs) 6011 + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) 6012 + { 6013 + cgroup_threadgroup_change_end(current); 6014 + 6015 + if (kargs->flags & CLONE_INTO_CGROUP) { 6016 + struct cgroup *cgrp = kargs->cgrp; 6017 + struct css_set *cset = kargs->cset; 6018 + 6019 + mutex_unlock(&cgroup_mutex); 6020 + 6021 + if (cset) { 6022 + put_css_set(cset); 6023 + kargs->cset = NULL; 6024 + } 6025 + 6026 + if (cgrp) { 6027 + cgroup_put(cgrp); 6028 + kargs->cgrp = NULL; 6029 + } 6030 + } 6031 + } 6032 + 5905 6033 /** 5906 6034 * cgroup_can_fork - called on a new task before the process is exposed 5907 - * @child: the task in question. 6035 + * @child: the child process 5908 6036 * 5909 - * This calls the subsystem can_fork() callbacks. If the can_fork() callback 5910 - * returns an error, the fork aborts with that error code. 
This allows for 5911 - a cgroup subsystem to conditionally allow or deny new forks. 6037 + * This prepares a new css_set for the child process which the child will 6038 + * be attached to in cgroup_post_fork(). 6039 + * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork() 6040 + * callback returns an error, the fork aborts with that error code. This 6041 + * allows for a cgroup subsystem to conditionally allow or deny new forks. 5912 6042 */ 5913 - int cgroup_can_fork(struct task_struct *child) 6043 + int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs) 5914 6044 { 5915 6045 struct cgroup_subsys *ss; 5916 6046 int i, j, ret; 5917 6047 6048 + ret = cgroup_css_set_fork(kargs); 6049 + if (ret) 6050 + return ret; 6051 + 5918 6052 do_each_subsys_mask(ss, i, have_canfork_callback) { 5919 - ret = ss->can_fork(child); 6053 + ret = ss->can_fork(child, kargs->cset); 5920 6054 if (ret) 5921 6055 goto out_revert; 5922 6056 } while_each_subsys_mask(); ··· 6079 5911 if (j >= i) 6080 5912 break; 6081 5913 if (ss->cancel_fork) 6082 - ss->cancel_fork(child); 5914 + ss->cancel_fork(child, kargs->cset); 6083 5915 } 5916 + 5917 + cgroup_css_set_put_fork(kargs); 6084 5918 6085 5919 return ret; 6086 5920 } 6087 5921 6088 5922 /** 6089 5923 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork() 6090 - * @child: the task in question 5924 + * @child: the child process 5925 + * @kargs: the arguments passed to create the child process 6091 5926 * 6092 5927 * This calls the cancel_fork() callbacks if a fork failed *after* 6093 - * cgroup_can_fork() succeded. 5928 + * cgroup_can_fork() succeeded and cleans up references we took to 5929 + * prepare a new css_set for the child process in cgroup_can_fork(). 
6094 5930 */ 6095 - void cgroup_cancel_fork(struct task_struct *child) 5931 + void cgroup_cancel_fork(struct task_struct *child, 5932 + struct kernel_clone_args *kargs) 6096 5933 { 6097 5934 struct cgroup_subsys *ss; 6098 5935 int i; 6099 5936 6100 5937 for_each_subsys(ss, i) 6101 5938 if (ss->cancel_fork) 6102 - ss->cancel_fork(child); 5939 + ss->cancel_fork(child, kargs->cset); 5940 + 5941 + cgroup_css_set_put_fork(kargs); 6103 5942 } 6104 5943 6105 5944 /** 6106 - * cgroup_post_fork - called on a new task after adding it to the task list 6107 - * @child: the task in question 5945 + * cgroup_post_fork - finalize cgroup setup for the child process 5946 + * @child: the child process 6108 5947 * 6109 - * Adds the task to the list running through its css_set if necessary and 6110 - * call the subsystem fork() callbacks. Has to be after the task is 6111 - * visible on the task list in case we race with the first call to 6112 - * cgroup_task_iter_start() - to guarantee that the new task ends up on its 6113 - * list. 5948 + * Attach the child process to its css_set calling the subsystem fork() 5949 + * callbacks. 
6114 5950 */ 6115 - void cgroup_post_fork(struct task_struct *child) 5951 + void cgroup_post_fork(struct task_struct *child, 5952 + struct kernel_clone_args *kargs) 5953 + __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) 6116 5954 { 6117 5955 struct cgroup_subsys *ss; 6118 5956 struct css_set *cset; 6119 5957 int i; 5958 + 5959 + cset = kargs->cset; 5960 + kargs->cset = NULL; 6120 5961 6121 5962 spin_lock_irq(&css_set_lock); 6122 5963 6123 5964 /* init tasks are special, only link regular threads */ 6124 5965 if (likely(child->pid)) { 6125 5966 WARN_ON_ONCE(!list_empty(&child->cg_list)); 6126 - cset = task_css_set(current); /* current is @child's parent */ 6127 - get_css_set(cset); 6128 5967 cset->nr_tasks++; 6129 5968 css_set_move_task(child, NULL, cset, false); 5969 + } else { 5970 + put_css_set(cset); 5971 + cset = NULL; 6130 5972 } 6131 5973 6132 5974 /* ··· 6168 5990 do_each_subsys_mask(ss, i, have_fork_callback) { 6169 5991 ss->fork(child); 6170 5992 } while_each_subsys_mask(); 5993 + 5994 + /* Make the new cset the root_cset of the new cgroup namespace. 
*/ 5995 + if (kargs->flags & CLONE_NEWCGROUP) { 5996 + struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset; 5997 + 5998 + get_css_set(cset); 5999 + child->nsproxy->cgroup_ns->root_cset = cset; 6000 + put_css_set(rcset); 6001 + } 6002 + 6003 + cgroup_css_set_put_fork(kargs); 6171 6004 } 6172 6005 6173 6006 /** ··· 6365 6176 */ 6366 6177 struct cgroup *cgroup_get_from_fd(int fd) 6367 6178 { 6368 - struct cgroup_subsys_state *css; 6369 6179 struct cgroup *cgrp; 6370 6180 struct file *f; 6371 6181 ··· 6372 6184 if (!f) 6373 6185 return ERR_PTR(-EBADF); 6374 6186 6375 - css = css_tryget_online_from_dir(f->f_path.dentry, NULL); 6187 + cgrp = cgroup_get_from_file(f); 6376 6188 fput(f); 6377 - if (IS_ERR(css)) 6378 - return ERR_CAST(css); 6379 - 6380 - cgrp = css->cgroup; 6381 - if (!cgroup_on_dfl(cgrp)) { 6382 - cgroup_put(cgrp); 6383 - return ERR_PTR(-EBADF); 6384 - } 6385 - 6386 6189 return cgrp; 6387 6190 } 6388 6191 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);

+6 -2

kernel/cgroup/cpuset.c

··· 358 358 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); 359 359 360 360 /* 361 - * Cgroup v2 behavior is used when on default hierarchy or the 362 - * cgroup_v2_mode flag is set. 361 + * Cgroup v2 behavior is used on the "cpus" and "mems" control files when 362 + * on default hierarchy or when the cpuset_v2_mode flag is set by mounting 363 + * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option. 364 + * With v2 behavior, "cpus" and "mems" are always what the users have 365 + * requested and won't be changed by hotplug events. Only the effective 366 + * cpus or mems will be affected. 363 367 */ 364 368 static inline bool is_in_v2_mode(void) 365 369 {

+11 -4

kernel/cgroup/pids.c

··· 33 33 #include <linux/atomic.h> 34 34 #include <linux/cgroup.h> 35 35 #include <linux/slab.h> 36 + #include <linux/sched/task.h> 36 37 37 38 #define PIDS_MAX (PID_MAX_LIMIT + 1ULL) 38 39 #define PIDS_MAX_STR "max" ··· 215 214 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies 216 215 * on cgroup_threadgroup_change_begin() held by the copy_process(). 217 216 */ 218 - static int pids_can_fork(struct task_struct *task) 217 + static int pids_can_fork(struct task_struct *task, struct css_set *cset) 219 218 { 220 219 struct cgroup_subsys_state *css; 221 220 struct pids_cgroup *pids; 222 221 int err; 223 222 224 - css = task_css_check(current, pids_cgrp_id, true); 223 + if (cset) 224 + css = cset->subsys[pids_cgrp_id]; 225 + else 226 + css = task_css_check(current, pids_cgrp_id, true); 225 227 pids = css_pids(css); 226 228 err = pids_try_charge(pids, 1); 227 229 if (err) { ··· 239 235 return err; 240 236 } 241 237 242 - static void pids_cancel_fork(struct task_struct *task) 238 + static void pids_cancel_fork(struct task_struct *task, struct css_set *cset) 243 239 { 244 240 struct cgroup_subsys_state *css; 245 241 struct pids_cgroup *pids; 246 242 247 - css = task_css_check(current, pids_cgrp_id, true); 243 + if (cset) 244 + css = cset->subsys[pids_cgrp_id]; 245 + else 246 + css = task_css_check(current, pids_cgrp_id, true); 248 247 pids = css_pids(css); 249 248 pids_uncharge(pids, 1); 250 249 }

+10 -9

kernel/fork.c

··· 2176 2176 INIT_LIST_HEAD(&p->thread_group); 2177 2177 p->task_works = NULL; 2178 2178 2179 - cgroup_threadgroup_change_begin(current); 2180 2179 /* 2181 2180 * Ensure that the cgroup subsystem policies allow the new process to be 2182 2181 * forked. It should be noted that the new process's css_set can be changed 2183 2182 * between here and cgroup_post_fork() if an organisation operation is in 2184 2183 * progress. 2185 2184 */ 2186 - retval = cgroup_can_fork(p); 2185 + retval = cgroup_can_fork(p, args); 2187 2186 if (retval) 2188 - goto bad_fork_cgroup_threadgroup_change_end; 2187 + goto bad_fork_put_pidfd; 2189 2188 2190 2189 /* 2191 2190 * From this point on we must avoid any synchronous user-space ··· 2289 2290 write_unlock_irq(&tasklist_lock); 2290 2291 2291 2292 proc_fork_connector(p); 2292 - cgroup_post_fork(p); 2293 - cgroup_threadgroup_change_end(current); 2293 + cgroup_post_fork(p, args); 2294 2294 perf_event_fork(p); 2295 2295 2296 2296 trace_task_newtask(p, clone_flags); ··· 2300 2302 bad_fork_cancel_cgroup: 2301 2303 spin_unlock(&current->sighand->siglock); 2302 2304 write_unlock_irq(&tasklist_lock); 2303 - cgroup_cancel_fork(p); 2304 - bad_fork_cgroup_threadgroup_change_end: 2305 - cgroup_threadgroup_change_end(current); 2305 + cgroup_cancel_fork(p, args); 2306 2306 bad_fork_put_pidfd: 2307 2307 if (clone_flags & CLONE_PIDFD) { 2308 2308 fput(pidfile); ··· 2629 2633 !valid_signal(args.exit_signal))) 2630 2634 return -EINVAL; 2631 2635 2636 + if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0) 2637 + return -EINVAL; 2638 + 2632 2639 *kargs = (struct kernel_clone_args){ 2633 2640 .flags = args.flags, 2634 2641 .pidfd = u64_to_user_ptr(args.pidfd), ··· 2642 2643 .stack_size = args.stack_size, 2643 2644 .tls = args.tls, 2644 2645 .set_tid_size = args.set_tid_size, 2646 + .cgroup = args.cgroup, 2645 2647 }; 2646 2648 2647 2649 if (args.set_tid && ··· 2686 2686 static bool clone3_args_valid(struct kernel_clone_args *kargs) 2687 2687 { 2688 2688 /* 
Verify that no unknown flags are passed along. */ 2689 - if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND)) 2689 + if (kargs->flags & 2690 + ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) 2690 2691 return false; 2691 2692 2692 2693 /*

+1 -1

mm/shmem.c

··· 3243 3243 struct shmem_inode_info *info = SHMEM_I(inode); 3244 3244 3245 3245 name = xattr_full_name(handler, name); 3246 - return simple_xattr_set(&info->xattrs, name, value, size, flags); 3246 + return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); 3247 3247 } 3248 3248 3249 3249 static const struct xattr_handler shmem_security_xattr_handler = {

+3 -3

tools/testing/selftests/cgroup/Makefile

··· 11 11 12 12 include ../lib.mk 13 13 14 - $(OUTPUT)/test_memcontrol: cgroup_util.c 15 - $(OUTPUT)/test_core: cgroup_util.c 16 - $(OUTPUT)/test_freezer: cgroup_util.c 14 + $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h 15 + $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h 16 + $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h

+126

tools/testing/selftests/cgroup/cgroup_util.c

··· 15 15 #include <unistd.h> 16 16 17 17 #include "cgroup_util.h" 18 + #include "../clone3/clone3_selftests.h" 18 19 19 20 static ssize_t read_text(const char *path, char *buf, size_t max_len) 20 21 { ··· 332 331 } 333 332 } 334 333 334 + pid_t clone_into_cgroup(int cgroup_fd) 335 + { 336 + #ifdef CLONE_ARGS_SIZE_VER2 337 + pid_t pid; 338 + 339 + struct clone_args args = { 340 + .flags = CLONE_INTO_CGROUP, 341 + .exit_signal = SIGCHLD, 342 + .cgroup = cgroup_fd, 343 + }; 344 + 345 + pid = sys_clone3(&args, sizeof(struct clone_args)); 346 + /* 347 + * Verify that this is a genuine test failure: 348 + * ENOSYS -> clone3() not available 349 + * E2BIG -> CLONE_INTO_CGROUP not available 350 + */ 351 + if (pid < 0 && (errno == ENOSYS || errno == E2BIG)) 352 + goto pretend_enosys; 353 + 354 + return pid; 355 + 356 + pretend_enosys: 357 + #endif 358 + errno = ENOSYS; 359 + return -ENOSYS; 360 + } 361 + 362 + int clone_reap(pid_t pid, int options) 363 + { 364 + int ret; 365 + siginfo_t info = { 366 + .si_signo = 0, 367 + }; 368 + 369 + again: 370 + ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD); 371 + if (ret < 0) { 372 + if (errno == EINTR) 373 + goto again; 374 + return -1; 375 + } 376 + 377 + if (options & WEXITED) { 378 + if (WIFEXITED(info.si_status)) 379 + return WEXITSTATUS(info.si_status); 380 + } 381 + 382 + if (options & WSTOPPED) { 383 + if (WIFSTOPPED(info.si_status)) 384 + return WSTOPSIG(info.si_status); 385 + } 386 + 387 + if (options & WCONTINUED) { 388 + if (WIFCONTINUED(info.si_status)) 389 + return 0; 390 + } 391 + 392 + return -1; 393 + } 394 + 395 + int dirfd_open_opath(const char *dir) 396 + { 397 + return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH); 398 + } 399 + 400 + #define close_prot_errno(fd) \ 401 + if (fd >= 0) { \ 402 + int _e_ = errno; \ 403 + close(fd); \ 404 + errno = _e_; \ 405 + } 406 + 407 + static int clone_into_cgroup_run_nowait(const char *cgroup, 408 + int (*fn)(const char *cgroup, void *arg), 409 + 
void *arg) 410 + { 411 + int cgroup_fd; 412 + pid_t pid; 413 + 414 + cgroup_fd = dirfd_open_opath(cgroup); 415 + if (cgroup_fd < 0) 416 + return -1; 417 + 418 + pid = clone_into_cgroup(cgroup_fd); 419 + close_prot_errno(cgroup_fd); 420 + if (pid == 0) 421 + exit(fn(cgroup, arg)); 422 + 423 + return pid; 424 + } 425 + 335 426 int cg_run_nowait(const char *cgroup, 336 427 int (*fn)(const char *cgroup, void *arg), 337 428 void *arg) 338 429 { 339 430 int pid; 431 + 432 + pid = clone_into_cgroup_run_nowait(cgroup, fn, arg); 433 + if (pid > 0) 434 + return pid; 435 + 436 + /* Genuine test failure. */ 437 + if (pid < 0 && errno != ENOSYS) 438 + return -1; 340 439 341 440 pid = fork(); 342 441 if (pid == 0) { ··· 550 449 return -1; 551 450 552 451 return strstr(buf, needle) ? 0 : -1; 452 + } 453 + 454 + int clone_into_cgroup_run_wait(const char *cgroup) 455 + { 456 + int cgroup_fd; 457 + pid_t pid; 458 + 459 + cgroup_fd = dirfd_open_opath(cgroup); 460 + if (cgroup_fd < 0) 461 + return -1; 462 + 463 + pid = clone_into_cgroup(cgroup_fd); 464 + close_prot_errno(cgroup_fd); 465 + if (pid < 0) 466 + return -1; 467 + 468 + if (pid == 0) 469 + exit(EXIT_SUCCESS); 470 + 471 + /* 472 + * We don't care whether this fails. We only care whether the initial 473 + * clone succeeded. 474 + */ 475 + (void)clone_reap(pid, WEXITED); 476 + return 0; 553 477 }

+4

tools/testing/selftests/cgroup/cgroup_util.h

··· 50 50 extern int cg_killall(const char *cgroup); 51 51 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size); 52 52 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle); 53 + extern pid_t clone_into_cgroup(int cgroup_fd); 54 + extern int clone_reap(pid_t pid, int options); 55 + extern int clone_into_cgroup_run_wait(const char *cgroup); 56 + extern int dirfd_open_opath(const char *dir);

+177

tools/testing/selftests/cgroup/test_core.c

··· 2 2 3 3 #include <linux/limits.h> 4 4 #include <sys/types.h> 5 + #include <sys/mman.h> 6 + #include <sys/wait.h> 5 7 #include <unistd.h> 8 + #include <fcntl.h> 6 9 #include <stdio.h> 7 10 #include <errno.h> 8 11 #include <signal.h> ··· 14 11 15 12 #include "../kselftest.h" 16 13 #include "cgroup_util.h" 14 + 15 + static int touch_anon(char *buf, size_t size) 16 + { 17 + int fd; 18 + char *pos = buf; 19 + 20 + fd = open("/dev/urandom", O_RDONLY); 21 + if (fd < 0) 22 + return -1; 23 + 24 + while (size > 0) { 25 + ssize_t ret = read(fd, pos, size); 26 + 27 + if (ret < 0) { 28 + if (errno != EINTR) { 29 + close(fd); 30 + return -1; 31 + } 32 + } else { 33 + pos += ret; 34 + size -= ret; 35 + } 36 + } 37 + close(fd); 38 + 39 + return 0; 40 + } 41 + 42 + static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg) 43 + { 44 + int ppid = getppid(); 45 + size_t size = (size_t)arg; 46 + void *buf; 47 + 48 + buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, 49 + 0, 0); 50 + if (buf == MAP_FAILED) 51 + return -1; 52 + 53 + if (touch_anon((char *)buf, size)) { 54 + munmap(buf, size); 55 + return -1; 56 + } 57 + 58 + while (getppid() == ppid) 59 + sleep(1); 60 + 61 + munmap(buf, size); 62 + return 0; 63 + } 64 + 65 + /* 66 + * Create a child process that allocates and touches 100MB, then waits to be 67 + * killed. Wait until the child is attached to the cgroup, kill all processes 68 + * in that cgroup and wait until "cgroup.procs" is empty. At this point try to 69 + * destroy the empty cgroup. The test helps detect race conditions between 70 + * dying processes leaving the cgroup and cgroup destruction path. 
71 + */ 72 + static int test_cgcore_destroy(const char *root) 73 + { 74 + int ret = KSFT_FAIL; 75 + char *cg_test = NULL; 76 + int child_pid; 77 + char buf[PAGE_SIZE]; 78 + 79 + cg_test = cg_name(root, "cg_test"); 80 + 81 + if (!cg_test) 82 + goto cleanup; 83 + 84 + for (int i = 0; i < 10; i++) { 85 + if (cg_create(cg_test)) 86 + goto cleanup; 87 + 88 + child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit, 89 + (void *) MB(100)); 90 + 91 + if (child_pid < 0) 92 + goto cleanup; 93 + 94 + /* wait for the child to enter cgroup */ 95 + if (cg_wait_for_proc_count(cg_test, 1)) 96 + goto cleanup; 97 + 98 + if (cg_killall(cg_test)) 99 + goto cleanup; 100 + 101 + /* wait for cgroup to be empty */ 102 + while (1) { 103 + if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf))) 104 + goto cleanup; 105 + if (buf[0] == '\0') 106 + break; 107 + usleep(1000); 108 + } 109 + 110 + if (rmdir(cg_test)) 111 + goto cleanup; 112 + 113 + if (waitpid(child_pid, NULL, 0) < 0) 114 + goto cleanup; 115 + } 116 + ret = KSFT_PASS; 117 + cleanup: 118 + if (cg_test) 119 + cg_destroy(cg_test); 120 + free(cg_test); 121 + return ret; 122 + } 17 123 18 124 /* 19 125 * A(0) - B(0) - C(1) ··· 137 25 static int test_cgcore_populated(const char *root) 138 26 { 139 27 int ret = KSFT_FAIL; 28 + int err; 140 29 char *cg_test_a = NULL, *cg_test_b = NULL; 141 30 char *cg_test_c = NULL, *cg_test_d = NULL; 31 + int cgroup_fd = -EBADF; 32 + pid_t pid; 142 33 143 34 cg_test_a = cg_name(root, "cg_test_a"); 144 35 cg_test_b = cg_name(root, "cg_test_a/cg_test_b"); ··· 193 78 if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n")) 194 79 goto cleanup; 195 80 81 + /* Test that we can directly clone into a new cgroup. 
*/ 82 + cgroup_fd = dirfd_open_opath(cg_test_d); 83 + if (cgroup_fd < 0) 84 + goto cleanup; 85 + 86 + pid = clone_into_cgroup(cgroup_fd); 87 + if (pid < 0) { 88 + if (errno == ENOSYS) 89 + goto cleanup_pass; 90 + goto cleanup; 91 + } 92 + 93 + if (pid == 0) { 94 + if (raise(SIGSTOP)) 95 + exit(EXIT_FAILURE); 96 + exit(EXIT_SUCCESS); 97 + } 98 + 99 + err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n"); 100 + 101 + (void)clone_reap(pid, WSTOPPED); 102 + (void)kill(pid, SIGCONT); 103 + (void)clone_reap(pid, WEXITED); 104 + 105 + if (err) 106 + goto cleanup; 107 + 108 + if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n")) 109 + goto cleanup; 110 + 111 + /* Remove cgroup. */ 112 + if (cg_test_d) { 113 + cg_destroy(cg_test_d); 114 + free(cg_test_d); 115 + cg_test_d = NULL; 116 + } 117 + 118 + pid = clone_into_cgroup(cgroup_fd); 119 + if (pid < 0) 120 + goto cleanup_pass; 121 + if (pid == 0) 122 + exit(EXIT_SUCCESS); 123 + (void)clone_reap(pid, WEXITED); 124 + goto cleanup; 125 + 126 + cleanup_pass: 196 127 ret = KSFT_PASS; 197 128 198 129 cleanup: ··· 254 93 free(cg_test_c); 255 94 free(cg_test_b); 256 95 free(cg_test_a); 96 + if (cgroup_fd >= 0) 97 + close(cgroup_fd); 257 98 return ret; 258 99 } 259 100 ··· 299 136 if (errno != EOPNOTSUPP) 300 137 goto cleanup; 301 138 139 + if (!clone_into_cgroup_run_wait(child)) 140 + goto cleanup; 141 + 142 + if (errno == ENOSYS) 143 + goto cleanup_pass; 144 + 145 + if (errno != EOPNOTSUPP) 146 + goto cleanup; 147 + 148 + cleanup_pass: 302 149 ret = KSFT_PASS; 303 150 304 151 cleanup: ··· 518 345 if (!cg_enter_current(parent)) 519 346 goto cleanup; 520 347 348 + if (!clone_into_cgroup_run_wait(parent)) 349 + goto cleanup; 350 + 521 351 ret = KSFT_PASS; 522 352 523 353 cleanup: ··· 688 512 T(test_cgcore_populated), 689 513 T(test_cgcore_proc_migration), 690 514 T(test_cgcore_thread_migration), 515 + T(test_cgcore_destroy), 691 516 }; 692 517 #undef T 693 518

+17 -2

tools/testing/selftests/clone3/clone3_selftests.h

··· 5 5 6 6 #define _GNU_SOURCE 7 7 #include <sched.h> 8 + #include <linux/sched.h> 9 + #include <linux/types.h> 8 10 #include <stdint.h> 9 11 #include <syscall.h> 10 - #include <linux/types.h> 12 + #include <sys/wait.h> 13 + 14 + #include "../kselftest.h" 11 15 12 16 #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) 17 + 18 + #ifndef CLONE_INTO_CGROUP 19 + #define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */ 20 + #endif 21 + 22 + #ifndef CLONE_ARGS_SIZE_VER0 23 + #define CLONE_ARGS_SIZE_VER0 64 24 + #endif 13 25 14 26 #ifndef __NR_clone3 15 27 #define __NR_clone3 -1 ··· 34 22 __aligned_u64 stack; 35 23 __aligned_u64 stack_size; 36 24 __aligned_u64 tls; 25 + #define CLONE_ARGS_SIZE_VER1 80 37 26 __aligned_u64 set_tid; 38 27 __aligned_u64 set_tid_size; 28 + #define CLONE_ARGS_SIZE_VER2 88 29 + __aligned_u64 cgroup; 39 30 }; 40 - #endif 31 + #endif /* __NR_clone3 */ 41 32 42 33 static pid_t sys_clone3(struct clone_args *args, size_t size) 43 34 {

Configure Feed

Configure Feed