Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
"This has an unusually high density of tricky fixes:

- task_get_css() could deadlock when it races against a dying cgroup.

- cgroup.procs didn't list dying thread group leaders that still have live threads.

This could mislead readers into thinking that a cgroup is empty when
it's not. Fixed by making the PROCS iterator include dead tasks. I made
a couple of mistakes making this change, and this pull request contains
a couple of follow-up patches.

- When a cpuset runs out of online cpus, the cpumasks of its member
tasks are updated in bizarre ways. Joel improved the behavior significantly."

* 'for-5.2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cpuset: restore sanity to cpuset_cpus_allowed_fallback()
cgroup: Fix css_task_iter_advance_css_set() cset skip condition
cgroup: css_task_iter_skip()'d iterators must be advanced before accessed
cgroup: Include dying leaders with live threads in PROCS iterations
cgroup: Implement css_task_iter_skip()
cgroup: Call cgroup_release() before __exit_signal()
docs cgroups: add another example size for hugetlb
cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css()

+118 -44
+13 -9
Documentation/cgroup-v1/hugetlb.txt
··· 32 32 hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb 33 33 hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB limit 34 34 35 - For a system supporting two hugepage size (16M and 16G) the control 35 + For a system supporting three hugepage sizes (64k, 32M and 1G), the control 36 36 files include: 37 37 38 - hugetlb.16GB.limit_in_bytes 39 - hugetlb.16GB.max_usage_in_bytes 40 - hugetlb.16GB.usage_in_bytes 41 - hugetlb.16GB.failcnt 42 - hugetlb.16MB.limit_in_bytes 43 - hugetlb.16MB.max_usage_in_bytes 44 - hugetlb.16MB.usage_in_bytes 45 - hugetlb.16MB.failcnt 38 + hugetlb.1GB.limit_in_bytes 39 + hugetlb.1GB.max_usage_in_bytes 40 + hugetlb.1GB.usage_in_bytes 41 + hugetlb.1GB.failcnt 42 + hugetlb.64KB.limit_in_bytes 43 + hugetlb.64KB.max_usage_in_bytes 44 + hugetlb.64KB.usage_in_bytes 45 + hugetlb.64KB.failcnt 46 + hugetlb.32MB.limit_in_bytes 47 + hugetlb.32MB.max_usage_in_bytes 48 + hugetlb.32MB.usage_in_bytes 49 + hugetlb.32MB.failcnt
+1
include/linux/cgroup-defs.h
··· 221 221 */ 222 222 struct list_head tasks; 223 223 struct list_head mg_tasks; 224 + struct list_head dying_tasks; 224 225 225 226 /* all css_task_iters currently walking this cset */ 226 227 struct list_head task_iters;
+12 -2
include/linux/cgroup.h
··· 43 43 /* walk all threaded css_sets in the domain */ 44 44 #define CSS_TASK_ITER_THREADED (1U << 1) 45 45 46 + /* internal flags */ 47 + #define CSS_TASK_ITER_SKIPPED (1U << 16) 48 + 46 49 /* a css_task_iter should be treated as an opaque object */ 47 50 struct css_task_iter { 48 51 struct cgroup_subsys *ss; ··· 60 57 struct list_head *task_pos; 61 58 struct list_head *tasks_head; 62 59 struct list_head *mg_tasks_head; 60 + struct list_head *dying_tasks_head; 63 61 64 62 struct css_set *cur_cset; 65 63 struct css_set *cur_dcset; ··· 491 487 * 492 488 * Find the css for the (@task, @subsys_id) combination, increment a 493 489 * reference on and return it. This function is guaranteed to return a 494 - * valid css. 490 + * valid css. The returned css may already have been offlined. 495 491 */ 496 492 static inline struct cgroup_subsys_state * 497 493 task_get_css(struct task_struct *task, int subsys_id) ··· 501 497 rcu_read_lock(); 502 498 while (true) { 503 499 css = task_css(task, subsys_id); 504 - if (likely(css_tryget_online(css))) 500 + /* 501 + * Can't use css_tryget_online() here. A task which has 502 + * PF_EXITING set may stay associated with an offline css. 503 + * If such task calls this function, css_tryget_online() 504 + * will keep failing. 505 + */ 506 + if (likely(css_tryget(css))) 505 507 break; 506 508 cpu_relax(); 507 509 }
+77 -31
kernel/cgroup/cgroup.c
··· 215 215 216 216 static int cgroup_apply_control(struct cgroup *cgrp); 217 217 static void cgroup_finalize_control(struct cgroup *cgrp, int ret); 218 - static void css_task_iter_advance(struct css_task_iter *it); 218 + static void css_task_iter_skip(struct css_task_iter *it, 219 + struct task_struct *task); 219 220 static int cgroup_destroy_locked(struct cgroup *cgrp); 220 221 static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, 221 222 struct cgroup_subsys *ss); ··· 739 738 .dom_cset = &init_css_set, 740 739 .tasks = LIST_HEAD_INIT(init_css_set.tasks), 741 740 .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), 741 + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), 742 742 .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), 743 743 .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), 744 744 .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), ··· 845 843 cgroup_update_populated(link->cgrp, populated); 846 844 } 847 845 846 + /* 847 + * @task is leaving, advance task iterators which are pointing to it so 848 + * that they can resume at the next position. Advancing an iterator might 849 + * remove it from the list, use safe walk. See css_task_iter_skip() for 850 + * details. 851 + */ 852 + static void css_set_skip_task_iters(struct css_set *cset, 853 + struct task_struct *task) 854 + { 855 + struct css_task_iter *it, *pos; 856 + 857 + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) 858 + css_task_iter_skip(it, task); 859 + } 860 + 848 861 /** 849 862 * css_set_move_task - move a task from one css_set to another 850 863 * @task: task being moved ··· 885 868 css_set_update_populated(to_cset, true); 886 869 887 870 if (from_cset) { 888 - struct css_task_iter *it, *pos; 889 - 890 871 WARN_ON_ONCE(list_empty(&task->cg_list)); 891 872 892 - /* 893 - * @task is leaving, advance task iterators which are 894 - * pointing to it so that they can resume at the next 895 - * position. 
Advancing an iterator might remove it from 896 - * the list, use safe walk. See css_task_iter_advance*() 897 - * for details. 898 - */ 899 - list_for_each_entry_safe(it, pos, &from_cset->task_iters, 900 - iters_node) 901 - if (it->task_pos == &task->cg_list) 902 - css_task_iter_advance(it); 903 - 873 + css_set_skip_task_iters(from_cset, task); 904 874 list_del_init(&task->cg_list); 905 875 if (!css_set_populated(from_cset)) 906 876 css_set_update_populated(from_cset, false); ··· 1214 1210 cset->dom_cset = cset; 1215 1211 INIT_LIST_HEAD(&cset->tasks); 1216 1212 INIT_LIST_HEAD(&cset->mg_tasks); 1213 + INIT_LIST_HEAD(&cset->dying_tasks); 1217 1214 INIT_LIST_HEAD(&cset->task_iters); 1218 1215 INIT_LIST_HEAD(&cset->threaded_csets); 1219 1216 INIT_HLIST_NODE(&cset->hlist); ··· 4413 4408 it->task_pos = NULL; 4414 4409 return; 4415 4410 } 4416 - } while (!css_set_populated(cset)); 4411 + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); 4417 4412 4418 4413 if (!list_empty(&cset->tasks)) 4419 4414 it->task_pos = cset->tasks.next; 4420 - else 4415 + else if (!list_empty(&cset->mg_tasks)) 4421 4416 it->task_pos = cset->mg_tasks.next; 4417 + else 4418 + it->task_pos = cset->dying_tasks.next; 4422 4419 4423 4420 it->tasks_head = &cset->tasks; 4424 4421 it->mg_tasks_head = &cset->mg_tasks; 4422 + it->dying_tasks_head = &cset->dying_tasks; 4425 4423 4426 4424 /* 4427 4425 * We don't keep css_sets locked across iteration steps and thus ··· 4450 4442 list_add(&it->iters_node, &cset->task_iters); 4451 4443 } 4452 4444 4445 + static void css_task_iter_skip(struct css_task_iter *it, 4446 + struct task_struct *task) 4447 + { 4448 + lockdep_assert_held(&css_set_lock); 4449 + 4450 + if (it->task_pos == &task->cg_list) { 4451 + it->task_pos = it->task_pos->next; 4452 + it->flags |= CSS_TASK_ITER_SKIPPED; 4453 + } 4454 + } 4455 + 4453 4456 static void css_task_iter_advance(struct css_task_iter *it) 4454 4457 { 4455 - struct list_head *next; 4458 + struct task_struct 
*task; 4456 4459 4457 4460 lockdep_assert_held(&css_set_lock); 4458 4461 repeat: ··· 4473 4454 * consumed first and then ->mg_tasks. After ->mg_tasks, 4474 4455 * we move onto the next cset. 4475 4456 */ 4476 - next = it->task_pos->next; 4477 - 4478 - if (next == it->tasks_head) 4479 - next = it->mg_tasks_head->next; 4480 - 4481 - if (next == it->mg_tasks_head) 4482 - css_task_iter_advance_css_set(it); 4457 + if (it->flags & CSS_TASK_ITER_SKIPPED) 4458 + it->flags &= ~CSS_TASK_ITER_SKIPPED; 4483 4459 else 4484 - it->task_pos = next; 4460 + it->task_pos = it->task_pos->next; 4461 + 4462 + if (it->task_pos == it->tasks_head) 4463 + it->task_pos = it->mg_tasks_head->next; 4464 + if (it->task_pos == it->mg_tasks_head) 4465 + it->task_pos = it->dying_tasks_head->next; 4466 + if (it->task_pos == it->dying_tasks_head) 4467 + css_task_iter_advance_css_set(it); 4485 4468 } else { 4486 4469 /* called from start, proceed to the first cset */ 4487 4470 css_task_iter_advance_css_set(it); 4488 4471 } 4489 4472 4490 - /* if PROCS, skip over tasks which aren't group leaders */ 4491 - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && 4492 - !thread_group_leader(list_entry(it->task_pos, struct task_struct, 4493 - cg_list))) 4494 - goto repeat; 4473 + if (!it->task_pos) 4474 + return; 4475 + 4476 + task = list_entry(it->task_pos, struct task_struct, cg_list); 4477 + 4478 + if (it->flags & CSS_TASK_ITER_PROCS) { 4479 + /* if PROCS, skip over tasks which aren't group leaders */ 4480 + if (!thread_group_leader(task)) 4481 + goto repeat; 4482 + 4483 + /* and dying leaders w/o live member threads */ 4484 + if (!atomic_read(&task->signal->live)) 4485 + goto repeat; 4486 + } else { 4487 + /* skip all dying ones */ 4488 + if (task->flags & PF_EXITING) 4489 + goto repeat; 4490 + } 4495 4491 } 4496 4492 4497 4493 /** ··· 4561 4527 } 4562 4528 4563 4529 spin_lock_irq(&css_set_lock); 4530 + 4531 + /* @it may be half-advanced by skips, finish advancing */ 4532 + if (it->flags & 
CSS_TASK_ITER_SKIPPED) 4533 + css_task_iter_advance(it); 4564 4534 4565 4535 if (it->task_pos) { 4566 4536 it->cur_task = list_entry(it->task_pos, struct task_struct, ··· 6047 6009 if (!list_empty(&tsk->cg_list)) { 6048 6010 spin_lock_irq(&css_set_lock); 6049 6011 css_set_move_task(tsk, cset, NULL, false); 6012 + list_add_tail(&tsk->cg_list, &cset->dying_tasks); 6050 6013 cset->nr_tasks--; 6051 6014 6052 6015 WARN_ON_ONCE(cgroup_task_frozen(tsk)); ··· 6073 6034 do_each_subsys_mask(ss, ssid, have_release_callback) { 6074 6035 ss->release(task); 6075 6036 } while_each_subsys_mask(); 6037 + 6038 + if (use_task_css_set_links) { 6039 + spin_lock_irq(&css_set_lock); 6040 + css_set_skip_task_iters(task_css_set(task), task); 6041 + list_del_init(&task->cg_list); 6042 + spin_unlock_irq(&css_set_lock); 6043 + } 6076 6044 } 6077 6045 6078 6046 void cgroup_free(struct task_struct *task)
+14 -1
kernel/cgroup/cpuset.c
··· 3254 3254 spin_unlock_irqrestore(&callback_lock, flags); 3255 3255 } 3256 3256 3257 + /** 3258 + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. 3259 + * @tsk: pointer to task_struct with which the scheduler is struggling 3260 + * 3261 + * Description: In the case that the scheduler cannot find an allowed cpu in 3262 + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy 3263 + * mode however, this value is the same as task_cs(tsk)->effective_cpus, 3264 + * which will not contain a sane cpumask during cases such as cpu hotplugging. 3265 + * This is the absolute last resort for the scheduler and it is only used if 3266 + * _every_ other avenue has been traveled. 3267 + **/ 3268 + 3257 3269 void cpuset_cpus_allowed_fallback(struct task_struct *tsk) 3258 3270 { 3259 3271 rcu_read_lock(); 3260 - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); 3272 + do_set_cpus_allowed(tsk, is_in_v2_mode() ? 3273 + task_cs(tsk)->cpus_allowed : cpu_possible_mask); 3261 3274 rcu_read_unlock(); 3262 3275 3263 3276 /*
+1 -1
kernel/exit.c
··· 195 195 rcu_read_unlock(); 196 196 197 197 proc_flush_task(p); 198 + cgroup_release(p); 198 199 199 200 write_lock_irq(&tasklist_lock); 200 201 ptrace_release_task(p); ··· 221 220 } 222 221 223 222 write_unlock_irq(&tasklist_lock); 224 - cgroup_release(p); 225 223 release_thread(p); 226 224 call_rcu(&p->rcu, delayed_put_task_struct); 227 225