Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:

- Fix UAF race in psi pressure_write() against cgroup file release by
extending cgroup_mutex coverage and ordering the of->priv access after
cgroup_kn_lock_live()

- Fix integer overflow in rdmacg_try_charge() when usage equals INT_MAX
by performing the increment in s64

- Fix asymmetric DL bandwidth accounting on cpuset attach rollback by
recording the CPU used by dl_bw_alloc() so cancel_attach() returns
the reservation to the same root domain

- Fix nr_dying_subsys_* race that briefly showed 0 in cgroup.stat after
rmdir by incrementing from kill_css() instead of offline_css()

- Typo fix in cgroup-v2 documentation

* tag 'cgroup-for-7.1-rc1-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
docs: cgroup: fix typo 'protetion' -> 'protection'
cgroup: Increment nr_dying_subsys_* from rmdir context
cgroup/cpuset: record DL BW alloc CPU for attach rollback
cgroup/rdma: fix integer overflow in rdmacg_try_charge()
sched/psi: fix race between file release and pressure write

+44 -24
+1 -1
Documentation/admin-guide/cgroup-v2.rst
··· 220 220 memory_hugetlb_accounting 221 221 Count HugeTLB memory usage towards the cgroup's overall 222 222 memory usage for the memory controller (for the purpose of 223 - statistics reporting and memory protetion). This is a new 223 + statistics reporting and memory protection). This is a new 224 224 behavior that could regress existing setups, so it must be 225 225 explicitly opted in with this mount option. 226 226
+28 -18
kernel/cgroup/cgroup.c
··· 3934 3934 static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, 3935 3935 size_t nbytes, enum psi_res res) 3936 3936 { 3937 - struct cgroup_file_ctx *ctx = of->priv; 3937 + struct cgroup_file_ctx *ctx; 3938 3938 struct psi_trigger *new; 3939 3939 struct cgroup *cgrp; 3940 3940 struct psi_group *psi; 3941 + ssize_t ret = 0; 3941 3942 3942 3943 cgrp = cgroup_kn_lock_live(of->kn, false); 3943 3944 if (!cgrp) 3944 3945 return -ENODEV; 3945 3946 3946 - cgroup_get(cgrp); 3947 - cgroup_kn_unlock(of->kn); 3947 + ctx = of->priv; 3948 + if (!ctx) { 3949 + ret = -ENODEV; 3950 + goto out_unlock; 3951 + } 3948 3952 3949 3953 /* Allow only one trigger per file descriptor */ 3950 3954 if (ctx->psi.trigger) { 3951 - cgroup_put(cgrp); 3952 - return -EBUSY; 3955 + ret = -EBUSY; 3956 + goto out_unlock; 3953 3957 } 3954 3958 3955 3959 psi = cgroup_psi(cgrp); 3956 3960 new = psi_trigger_create(psi, buf, res, of->file, of); 3957 3961 if (IS_ERR(new)) { 3958 - cgroup_put(cgrp); 3959 - return PTR_ERR(new); 3962 + ret = PTR_ERR(new); 3963 + goto out_unlock; 3960 3964 } 3961 3965 3962 3966 smp_store_release(&ctx->psi.trigger, new); 3963 - cgroup_put(cgrp); 3967 + 3968 + out_unlock: 3969 + cgroup_kn_unlock(of->kn); 3970 + if (ret) 3971 + return ret; 3964 3972 3965 3973 return nbytes; 3966 3974 } ··· 5724 5716 RCU_INIT_POINTER(css->cgroup->subsys[ss->id], NULL); 5725 5717 5726 5718 wake_up_all(&css->cgroup->offline_waitq); 5727 - 5728 - css->cgroup->nr_dying_subsys[ss->id]++; 5729 - /* 5730 - * Parent css and cgroup cannot be freed until after the freeing 5731 - * of child css, see css_free_rwork_fn(). 
5732 - */ 5733 - while ((css = css->parent)) { 5734 - css->nr_descendants--; 5735 - css->cgroup->nr_dying_subsys[ss->id]++; 5736 - } 5737 5719 } 5738 5720 5739 5721 /** ··· 6036 6038 */ 6037 6039 static void kill_css(struct cgroup_subsys_state *css) 6038 6040 { 6041 + struct cgroup_subsys *ss = css->ss; 6042 + 6039 6043 lockdep_assert_held(&cgroup_mutex); 6040 6044 6041 6045 if (css->flags & CSS_DYING) ··· 6074 6074 * css is confirmed to be seen as killed on all CPUs. 6075 6075 */ 6076 6076 percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); 6077 + 6078 + css->cgroup->nr_dying_subsys[ss->id]++; 6079 + /* 6080 + * Parent css and cgroup cannot be freed until after the freeing 6081 + * of child css, see css_free_rwork_fn(). 6082 + */ 6083 + while ((css = css->parent)) { 6084 + css->nr_descendants--; 6085 + css->cgroup->nr_dying_subsys[ss->id]++; 6086 + } 6077 6087 } 6078 6088 6079 6089 /**
+5
kernel/cgroup/cpuset-internal.h
··· 168 168 int nr_deadline_tasks; 169 169 int nr_migrate_dl_tasks; 170 170 u64 sum_migrate_dl_bw; 171 + /* 172 + * CPU used for temporary DL bandwidth allocation during attach; 173 + * -1 if no DL bandwidth was allocated in the current attach. 174 + */ 175 + int dl_bw_cpu; 171 176 172 177 /* Invalid partition error code, not lock protected */ 173 178 enum prs_errcode prs_err;
+9 -4
kernel/cgroup/cpuset.c
··· 288 288 .flags = BIT(CS_CPU_EXCLUSIVE) | 289 289 BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), 290 290 .partition_root_state = PRS_ROOT, 291 + .dl_bw_cpu = -1, 291 292 }; 292 293 293 294 /** ··· 579 578 kzalloc_obj(*cs); 580 579 if (!trial) 581 580 return NULL; 581 + 582 + trial->dl_bw_cpu = -1; 582 583 583 584 /* Setup cpumask pointer array */ 584 585 cpumask_var_t *pmask[4] = { ··· 2983 2980 { 2984 2981 cs->nr_migrate_dl_tasks = 0; 2985 2982 cs->sum_migrate_dl_bw = 0; 2983 + cs->dl_bw_cpu = -1; 2986 2984 } 2987 2985 2988 2986 /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ ··· 3060 3056 reset_migrate_dl_data(cs); 3061 3057 goto out_unlock; 3062 3058 } 3059 + 3060 + cs->dl_bw_cpu = cpu; 3063 3061 } 3064 3062 3065 3063 out_success: ··· 3086 3080 mutex_lock(&cpuset_mutex); 3087 3081 dec_attach_in_progress_locked(cs); 3088 3082 3089 - if (cs->nr_migrate_dl_tasks) { 3090 - int cpu = cpumask_any(cs->effective_cpus); 3083 + if (cs->dl_bw_cpu >= 0) 3084 + dl_bw_free(cs->dl_bw_cpu, cs->sum_migrate_dl_bw); 3091 3085 3092 - dl_bw_free(cpu, cs->sum_migrate_dl_bw); 3086 + if (cs->nr_migrate_dl_tasks) 3093 3087 reset_migrate_dl_data(cs); 3094 - } 3095 3088 3096 3089 mutex_unlock(&cpuset_mutex); 3097 3090 }
+1 -1
kernel/cgroup/rdma.c
··· 283 283 ret = PTR_ERR(rpool); 284 284 goto err; 285 285 } else { 286 - new = rpool->resources[index].usage + 1; 286 + new = (s64)rpool->resources[index].usage + 1; 287 287 if (new > rpool->resources[index].max) { 288 288 ret = -EAGAIN; 289 289 goto err;