Merge branch 'for-5.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

+19

kernel/cgroup/cgroup-internal.h

··· 65 65 return container_of(kfc, struct cgroup_fs_context, kfc); 66 66 } 67 67 68 + struct cgroup_pidlist; 69 + 70 + struct cgroup_file_ctx { 71 + struct cgroup_namespace *ns; 72 + 73 + struct { 74 + void *trigger; 75 + } psi; 76 + 77 + struct { 78 + bool started; 79 + struct css_task_iter iter; 80 + } procs; 81 + 82 + struct { 83 + struct cgroup_pidlist *pidlist; 84 + } procs1; 85 + }; 86 + 68 87 /* 69 88 * A cgroup can be associated with multiple css_sets as different tasks may 70 89 * belong to different cgroups on different hierarchies. In the other

+18 -15

kernel/cgroup/cgroup-v1.c

··· 394 394 * next pid to display, if any 395 395 */ 396 396 struct kernfs_open_file *of = s->private; 397 + struct cgroup_file_ctx *ctx = of->priv; 397 398 struct cgroup *cgrp = seq_css(s)->cgroup; 398 399 struct cgroup_pidlist *l; 399 400 enum cgroup_filetype type = seq_cft(s)->private; ··· 404 403 mutex_lock(&cgrp->pidlist_mutex); 405 404 406 405 /* 407 - * !NULL @of->priv indicates that this isn't the first start() 408 - * after open. If the matching pidlist is around, we can use that. 409 - * Look for it. Note that @of->priv can't be used directly. It 410 - * could already have been destroyed. 406 + * !NULL @ctx->procs1.pidlist indicates that this isn't the first 407 + * start() after open. If the matching pidlist is around, we can use 408 + * that. Look for it. Note that @ctx->procs1.pidlist can't be used 409 + * directly. It could already have been destroyed. 411 410 */ 412 - if (of->priv) 413 - of->priv = cgroup_pidlist_find(cgrp, type); 411 + if (ctx->procs1.pidlist) 412 + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); 414 413 415 414 /* 416 415 * Either this is the first start() after open or the matching 417 416 * pidlist has been destroyed inbetween. Create a new one. 418 417 */ 419 - if (!of->priv) { 420 - ret = pidlist_array_load(cgrp, type, 421 - (struct cgroup_pidlist **)&of->priv); 418 + if (!ctx->procs1.pidlist) { 419 + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); 422 420 if (ret) 423 421 return ERR_PTR(ret); 424 422 } 425 - l = of->priv; 423 + l = ctx->procs1.pidlist; 426 424 427 425 if (pid) { 428 426 int end = l->length; ··· 449 449 static void cgroup_pidlist_stop(struct seq_file *s, void *v) 450 450 { 451 451 struct kernfs_open_file *of = s->private; 452 - struct cgroup_pidlist *l = of->priv; 452 + struct cgroup_file_ctx *ctx = of->priv; 453 + struct cgroup_pidlist *l = ctx->procs1.pidlist; 453 454 454 455 if (l) 455 456 mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, ··· 461 460 static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 462 461 { 463 462 struct kernfs_open_file *of = s->private; 464 - struct cgroup_pidlist *l = of->priv; 463 + struct cgroup_file_ctx *ctx = of->priv; 464 + struct cgroup_pidlist *l = ctx->procs1.pidlist; 465 465 pid_t *p = v; 466 466 pid_t *end = l->list + l->length; 467 467 /* ··· 506 504 goto out_unlock; 507 505 508 506 /* 509 - * Even if we're attaching all tasks in the thread group, we only 510 - * need to check permissions on one of them. 507 + * Even if we're attaching all tasks in the thread group, we only need 508 + * to check permissions on one of them. Check permissions using the 509 + * credentials from file open to protect against inherited fd attacks. 511 510 */ 512 - cred = current_cred(); 511 + cred = of->file->f_cred; 513 512 tcred = get_task_cred(task); 514 513 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 515 514 !uid_eq(cred->euid, tcred->uid) &&

+60 -28

kernel/cgroup/cgroup.c

··· 3630 3630 static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, 3631 3631 size_t nbytes, enum psi_res res) 3632 3632 { 3633 + struct cgroup_file_ctx *ctx = of->priv; 3633 3634 struct psi_trigger *new; 3634 3635 struct cgroup *cgrp; 3635 3636 struct psi_group *psi; ··· 3649 3648 return PTR_ERR(new); 3650 3649 } 3651 3650 3652 - psi_trigger_replace(&of->priv, new); 3651 + psi_trigger_replace(&ctx->psi.trigger, new); 3653 3652 3654 3653 cgroup_put(cgrp); 3655 3654 ··· 3680 3679 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, 3681 3680 poll_table *pt) 3682 3681 { 3683 - return psi_trigger_poll(&of->priv, of->file, pt); 3682 + struct cgroup_file_ctx *ctx = of->priv; 3683 + 3684 + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); 3684 3685 } 3685 3686 3686 3687 static void cgroup_pressure_release(struct kernfs_open_file *of) 3687 3688 { 3688 - psi_trigger_replace(&of->priv, NULL); 3689 + struct cgroup_file_ctx *ctx = of->priv; 3690 + 3691 + psi_trigger_replace(&ctx->psi.trigger, NULL); 3689 3692 } 3690 3693 3691 3694 bool cgroup_psi_enabled(void) ··· 3816 3811 static int cgroup_file_open(struct kernfs_open_file *of) 3817 3812 { 3818 3813 struct cftype *cft = of_cft(of); 3814 + struct cgroup_file_ctx *ctx; 3815 + int ret; 3819 3816 3820 - if (cft->open) 3821 - return cft->open(of); 3822 - return 0; 3817 + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 3818 + if (!ctx) 3819 + return -ENOMEM; 3820 + 3821 + ctx->ns = current->nsproxy->cgroup_ns; 3822 + get_cgroup_ns(ctx->ns); 3823 + of->priv = ctx; 3824 + 3825 + if (!cft->open) 3826 + return 0; 3827 + 3828 + ret = cft->open(of); 3829 + if (ret) { 3830 + put_cgroup_ns(ctx->ns); 3831 + kfree(ctx); 3832 + } 3833 + return ret; 3823 3834 } 3824 3835 3825 3836 static void cgroup_file_release(struct kernfs_open_file *of) 3826 3837 { 3827 3838 struct cftype *cft = of_cft(of); 3839 + struct cgroup_file_ctx *ctx = of->priv; 3828 3840 3829 3841 if (cft->release) 3830 3842 cft->release(of); 3843 + put_cgroup_ns(ctx->ns); 3844 + kfree(ctx); 3831 3845 } 3832 3846 3833 3847 static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, 3834 3848 size_t nbytes, loff_t off) 3835 3849 { 3836 - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 3850 + struct cgroup_file_ctx *ctx = of->priv; 3837 3851 struct cgroup *cgrp = of->kn->parent->priv; 3838 3852 struct cftype *cft = of_cft(of); 3839 3853 struct cgroup_subsys_state *css; ··· 3869 3845 */ 3870 3846 if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && 3871 3847 !(cft->flags & CFTYPE_NS_DELEGATABLE) && 3872 - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) 3848 + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) 3873 3849 return -EPERM; 3874 3850 3875 3851 if (cft->write) ··· 4775 4751 4776 4752 static void cgroup_procs_release(struct kernfs_open_file *of) 4777 4753 { 4778 - if (of->priv) { 4779 - css_task_iter_end(of->priv); 4780 - kfree(of->priv); 4781 - } 4754 + struct cgroup_file_ctx *ctx = of->priv; 4755 + 4756 + if (ctx->procs.started) 4757 + css_task_iter_end(&ctx->procs.iter); 4782 4758 } 4783 4759 4784 4760 static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) 4785 4761 { 4786 4762 struct kernfs_open_file *of = s->private; 4787 - struct css_task_iter *it = of->priv; 4763 + struct cgroup_file_ctx *ctx = of->priv; 4788 4764 4789 4765 if (pos) 4790 4766 (*pos)++; 4791 4767 4792 - return css_task_iter_next(it); 4768 + return css_task_iter_next(&ctx->procs.iter); 4793 4769 } 4794 4770 4795 4771 static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, ··· 4797 4773 { 4798 4774 struct kernfs_open_file *of = s->private; 4799 4775 struct cgroup *cgrp = seq_css(s)->cgroup; 4800 - struct css_task_iter *it = of->priv; 4776 + struct cgroup_file_ctx *ctx = of->priv; 4777 + struct css_task_iter *it = &ctx->procs.iter; 4801 4778 4802 4779 /* 4803 4780 * When a seq_file is seeked, it's always traversed sequentially 4804 4781 * from position 0, so we can simply keep iterating on !0 *pos. 4805 4782 */ 4806 - if (!it) { 4783 + if (!ctx->procs.started) { 4807 4784 if (WARN_ON_ONCE((*pos))) 4808 4785 return ERR_PTR(-EINVAL); 4809 - 4810 - it = kzalloc(sizeof(*it), GFP_KERNEL); 4811 - if (!it) 4812 - return ERR_PTR(-ENOMEM); 4813 - of->priv = it; 4814 4786 css_task_iter_start(&cgrp->self, iter_flags, it); 4787 + ctx->procs.started = true; 4815 4788 } else if (!(*pos)) { 4816 4789 css_task_iter_end(it); 4817 4790 css_task_iter_start(&cgrp->self, iter_flags, it); ··· 4859 4838 4860 4839 static int cgroup_procs_write_permission(struct cgroup *src_cgrp, 4861 4840 struct cgroup *dst_cgrp, 4862 - struct super_block *sb) 4841 + struct super_block *sb, 4842 + struct cgroup_namespace *ns) 4863 4843 { 4864 - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; 4865 4844 struct cgroup *com_cgrp = src_cgrp; 4866 4845 int ret; 4867 4846 ··· 4890 4869 4891 4870 static int cgroup_attach_permissions(struct cgroup *src_cgrp, 4892 4871 struct cgroup *dst_cgrp, 4893 - struct super_block *sb, bool threadgroup) 4872 + struct super_block *sb, bool threadgroup, 4873 + struct cgroup_namespace *ns) 4894 4874 { 4895 4875 int ret = 0; 4896 4876 4897 - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); 4877 + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); 4898 4878 if (ret) 4899 4879 return ret; 4900 4880 ··· 4912 4890 static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, 4913 4891 bool threadgroup) 4914 4892 { 4893 + struct cgroup_file_ctx *ctx = of->priv; 4915 4894 struct cgroup *src_cgrp, *dst_cgrp; 4916 4895 struct task_struct *task; 4896 + const struct cred *saved_cred; 4917 4897 ssize_t ret; 4918 4898 bool locked; 4919 4899 ··· 4933 4909 src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); 4934 4910 spin_unlock_irq(&css_set_lock); 4935 4911 4936 - /* process and thread migrations follow same delegation rule */ 4912 + /* 4913 + * Process and thread migrations follow same delegation rule. Check 4914 + * permissions using the credentials from file open to protect against 4915 + * inherited fd attacks. 4916 + */ 4917 + saved_cred = override_creds(of->file->f_cred); 4937 4918 ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, 4938 - of->file->f_path.dentry->d_sb, threadgroup); 4919 + of->file->f_path.dentry->d_sb, 4920 + threadgroup, ctx->ns); 4921 + revert_creds(saved_cred); 4939 4922 if (ret) 4940 4923 goto out_finish; 4941 4924 ··· 6161 6130 goto err; 6162 6131 6163 6132 ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, 6164 - !(kargs->flags & CLONE_THREAD)); 6133 + !(kargs->flags & CLONE_THREAD), 6134 + current->nsproxy->cgroup_ns); 6165 6135 if (ret) 6166 6136 goto err; 6167 6137

+1 -1

tools/testing/selftests/cgroup/cgroup_util.c

··· 221 221 222 222 int cg_create(const char *cgroup) 223 223 { 224 - return mkdir(cgroup, 0644); 224 + return mkdir(cgroup, 0755); 225 225 } 226 226 227 227 int cg_wait_for_proc_count(const char *cgroup, int count)

+165

tools/testing/selftests/cgroup/test_core.c

··· 1 1 /* SPDX-License-Identifier: GPL-2.0 */ 2 2 3 + #define _GNU_SOURCE 3 4 #include <linux/limits.h> 5 + #include <linux/sched.h> 4 6 #include <sys/types.h> 5 7 #include <sys/mman.h> 6 8 #include <sys/wait.h> 7 9 #include <unistd.h> 8 10 #include <fcntl.h> 11 + #include <sched.h> 9 12 #include <stdio.h> 10 13 #include <errno.h> 11 14 #include <signal.h> ··· 677 674 return ret; 678 675 } 679 676 677 + /* 678 + * cgroup migration permission check should be performed based on the 679 + * credentials at the time of open instead of write. 680 + */ 681 + static int test_cgcore_lesser_euid_open(const char *root) 682 + { 683 + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ 684 + int ret = KSFT_FAIL; 685 + char *cg_test_a = NULL, *cg_test_b = NULL; 686 + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; 687 + int cg_test_b_procs_fd = -1; 688 + uid_t saved_uid; 689 + 690 + cg_test_a = cg_name(root, "cg_test_a"); 691 + cg_test_b = cg_name(root, "cg_test_b"); 692 + 693 + if (!cg_test_a || !cg_test_b) 694 + goto cleanup; 695 + 696 + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); 697 + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); 698 + 699 + if (!cg_test_a_procs || !cg_test_b_procs) 700 + goto cleanup; 701 + 702 + if (cg_create(cg_test_a) || cg_create(cg_test_b)) 703 + goto cleanup; 704 + 705 + if (cg_enter_current(cg_test_a)) 706 + goto cleanup; 707 + 708 + if (chown(cg_test_a_procs, test_euid, -1) || 709 + chown(cg_test_b_procs, test_euid, -1)) 710 + goto cleanup; 711 + 712 + saved_uid = geteuid(); 713 + if (seteuid(test_euid)) 714 + goto cleanup; 715 + 716 + cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR); 717 + 718 + if (seteuid(saved_uid)) 719 + goto cleanup; 720 + 721 + if (cg_test_b_procs_fd < 0) 722 + goto cleanup; 723 + 724 + if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES) 725 + goto cleanup; 726 + 727 + ret = KSFT_PASS; 728 + 729 + cleanup: 730 + cg_enter_current(root); 731 + if (cg_test_b_procs_fd >= 0) 732 + close(cg_test_b_procs_fd); 733 + if (cg_test_b) 734 + cg_destroy(cg_test_b); 735 + if (cg_test_a) 736 + cg_destroy(cg_test_a); 737 + free(cg_test_b_procs); 738 + free(cg_test_a_procs); 739 + free(cg_test_b); 740 + free(cg_test_a); 741 + return ret; 742 + } 743 + 744 + struct lesser_ns_open_thread_arg { 745 + const char *path; 746 + int fd; 747 + int err; 748 + }; 749 + 750 + static int lesser_ns_open_thread_fn(void *arg) 751 + { 752 + struct lesser_ns_open_thread_arg *targ = arg; 753 + 754 + targ->fd = open(targ->path, O_RDWR); 755 + targ->err = errno; 756 + return 0; 757 + } 758 + 759 + /* 760 + * cgroup migration permission check should be performed based on the cgroup 761 + * namespace at the time of open instead of write. 762 + */ 763 + static int test_cgcore_lesser_ns_open(const char *root) 764 + { 765 + static char stack[65536]; 766 + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ 767 + int ret = KSFT_FAIL; 768 + char *cg_test_a = NULL, *cg_test_b = NULL; 769 + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; 770 + int cg_test_b_procs_fd = -1; 771 + struct lesser_ns_open_thread_arg targ = { .fd = -1 }; 772 + pid_t pid; 773 + int status; 774 + 775 + cg_test_a = cg_name(root, "cg_test_a"); 776 + cg_test_b = cg_name(root, "cg_test_b"); 777 + 778 + if (!cg_test_a || !cg_test_b) 779 + goto cleanup; 780 + 781 + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); 782 + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); 783 + 784 + if (!cg_test_a_procs || !cg_test_b_procs) 785 + goto cleanup; 786 + 787 + if (cg_create(cg_test_a) || cg_create(cg_test_b)) 788 + goto cleanup; 789 + 790 + if (cg_enter_current(cg_test_b)) 791 + goto cleanup; 792 + 793 + if (chown(cg_test_a_procs, test_euid, -1) || 794 + chown(cg_test_b_procs, test_euid, -1)) 795 + goto cleanup; 796 + 797 + targ.path = cg_test_b_procs; 798 + pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack), 799 + CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD, 800 + &targ); 801 + if (pid < 0) 802 + goto cleanup; 803 + 804 + if (waitpid(pid, &status, 0) < 0) 805 + goto cleanup; 806 + 807 + if (!WIFEXITED(status)) 808 + goto cleanup; 809 + 810 + cg_test_b_procs_fd = targ.fd; 811 + if (cg_test_b_procs_fd < 0) 812 + goto cleanup; 813 + 814 + if (cg_enter_current(cg_test_a)) 815 + goto cleanup; 816 + 817 + if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT) 818 + goto cleanup; 819 + 820 + ret = KSFT_PASS; 821 + 822 + cleanup: 823 + cg_enter_current(root); 824 + if (cg_test_b_procs_fd >= 0) 825 + close(cg_test_b_procs_fd); 826 + if (cg_test_b) 827 + cg_destroy(cg_test_b); 828 + if (cg_test_a) 829 + cg_destroy(cg_test_a); 830 + free(cg_test_b_procs); 831 + free(cg_test_a_procs); 832 + free(cg_test_b); 833 + free(cg_test_a); 834 + return ret; 835 + } 836 + 680 837 #define T(x) { x, #x } 681 838 struct corecg_test { 682 839 int (*fn)(const char *root); ··· 852 689 T(test_cgcore_proc_migration), 853 690 T(test_cgcore_thread_migration), 854 691 T(test_cgcore_destroy), 692 + T(test_cgcore_lesser_euid_open), 693 + T(test_cgcore_lesser_ns_open), 855 694 }; 856 695 #undef T 857 696

Configure Feed

Configure Feed