Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup fixes from Tejun Heo:
"Mostly fixes for the fallout from the recent cgroup core changes.

The decoupled nature of cgroup dynamic hierarchy management
(hierarchies are created dynamically on mount but may or may not be
reused once unmounted depending on remaining usages) led to more
ugliness being added to kernfs.

Hopefully, this is the last of it"

* 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cpuset: break kernfs active protection in cpuset_write_resmask()
cgroup: fix a race between cgroup_mount() and cgroup_kill_sb()
kernfs: introduce kernfs_pin_sb()
cgroup: fix mount failure in a corner case
cpuset,mempolicy: fix sleeping function called from invalid context
cgroup: fix broken css_has_online_children()

+100 -11
+30
fs/kernfs/mount.c
··· 211 211 kernfs_put(root_kn); 212 212 } 213 213 214 + /** 215 + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root 216 + * @kernfs_root: the kernfs_root in question 217 + * @ns: the namespace tag 218 + * 219 + * Pin the superblock so the superblock won't be destroyed in subsequent 220 + * operations. This can be used to block ->kill_sb() which may be useful 221 + * for kernfs users which dynamically manage superblocks. 222 + * 223 + * Returns NULL if there's no superblock associated to this kernfs_root, or 224 + * -EINVAL if the superblock is being freed. 225 + */ 226 + struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) 227 + { 228 + struct kernfs_super_info *info; 229 + struct super_block *sb = NULL; 230 + 231 + mutex_lock(&kernfs_mutex); 232 + list_for_each_entry(info, &root->supers, node) { 233 + if (info->ns == ns) { 234 + sb = info->sb; 235 + if (!atomic_inc_not_zero(&info->sb->s_active)) 236 + sb = ERR_PTR(-EINVAL); 237 + break; 238 + } 239 + } 240 + mutex_unlock(&kernfs_mutex); 241 + return sb; 242 + } 243 + 214 244 void __init kernfs_init(void) 215 245 { 216 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
+1
include/linux/kernfs.h
··· 305 305 struct kernfs_root *root, unsigned long magic, 306 306 bool *new_sb_created, const void *ns); 307 307 void kernfs_kill_sb(struct super_block *sb); 308 + struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); 308 309 309 310 void kernfs_init(void); 310 311
+50 -8
kernel/cgroup.c
··· 1648 1648 int flags, const char *unused_dev_name, 1649 1649 void *data) 1650 1650 { 1651 + struct super_block *pinned_sb = NULL; 1652 + struct cgroup_subsys *ss; 1651 1653 struct cgroup_root *root; 1652 1654 struct cgroup_sb_opts opts; 1653 1655 struct dentry *dentry; 1654 1656 int ret; 1657 + int i; 1655 1658 bool new_sb; 1656 1659 1657 1660 /* ··· 1678 1675 cgroup_get(&root->cgrp); 1679 1676 ret = 0; 1680 1677 goto out_unlock; 1678 + } 1679 + 1680 + /* 1681 + * Destruction of cgroup root is asynchronous, so subsystems may 1682 + * still be dying after the previous unmount. Let's drain the 1683 + * dying subsystems. We just need to ensure that the ones 1684 + * unmounted previously finish dying and don't care about new ones 1685 + * starting. Testing ref liveliness is good enough. 1686 + */ 1687 + for_each_subsys(ss, i) { 1688 + if (!(opts.subsys_mask & (1 << i)) || 1689 + ss->root == &cgrp_dfl_root) 1690 + continue; 1691 + 1692 + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { 1693 + mutex_unlock(&cgroup_mutex); 1694 + msleep(10); 1695 + ret = restart_syscall(); 1696 + goto out_free; 1697 + } 1698 + cgroup_put(&ss->root->cgrp); 1681 1699 } 1682 1700 1683 1701 for_each_root(root) { ··· 1741 1717 } 1742 1718 1743 1719 /* 1744 - * A root's lifetime is governed by its root cgroup. 1745 - * tryget_live failure indicate that the root is being 1746 - * destroyed. Wait for destruction to complete so that the 1747 - * subsystems are free. We can use wait_queue for the wait 1748 - * but this path is super cold. Let's just sleep for a bit 1749 - * and retry. 1720 + * We want to reuse @root whose lifetime is governed by its 1721 + * ->cgrp. Let's check whether @root is alive and keep it 1722 + * that way. As cgroup_kill_sb() can happen anytime, we 1723 + * want to block it by pinning the sb so that @root doesn't 1724 + * get killed before mount is complete. 
1725 + * 1726 + * With the sb pinned, tryget_live can reliably indicate 1727 + * whether @root can be reused. If it's being killed, 1728 + * drain it. We can use wait_queue for the wait but this 1729 + * path is super cold. Let's just sleep a bit and retry. 1750 1730 */ 1751 - if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1731 + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); 1732 + if (IS_ERR(pinned_sb) || 1733 + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1752 1734 mutex_unlock(&cgroup_mutex); 1735 + if (!IS_ERR_OR_NULL(pinned_sb)) 1736 + deactivate_super(pinned_sb); 1753 1737 msleep(10); 1754 1738 ret = restart_syscall(); 1755 1739 goto out_free; ··· 1802 1770 CGROUP_SUPER_MAGIC, &new_sb); 1803 1771 if (IS_ERR(dentry) || !new_sb) 1804 1772 cgroup_put(&root->cgrp); 1773 + 1774 + /* 1775 + * If @pinned_sb, we're reusing an existing root and holding an 1776 + * extra ref on its sb. Mount is complete. Put the extra ref. 1777 + */ 1778 + if (pinned_sb) { 1779 + WARN_ON(new_sb); 1780 + deactivate_super(pinned_sb); 1781 + } 1782 + 1805 1783 return dentry; 1806 1784 } 1807 1785 ··· 3370 3328 3371 3329 rcu_read_lock(); 3372 3330 css_for_each_child(child, css) { 3373 - if (css->flags & CSS_ONLINE) { 3331 + if (child->flags & CSS_ONLINE) { 3374 3332 ret = true; 3375 3333 break; 3376 3334 }
+19 -1
kernel/cpuset.c
··· 1181 1181 1182 1182 int current_cpuset_is_being_rebound(void) 1183 1183 { 1184 - return task_cs(current) == cpuset_being_rebound; 1184 + int ret; 1185 + 1186 + rcu_read_lock(); 1187 + ret = task_cs(current) == cpuset_being_rebound; 1188 + rcu_read_unlock(); 1189 + 1190 + return ret; 1185 1191 } 1186 1192 1187 1193 static int update_relax_domain_level(struct cpuset *cs, s64 val) ··· 1623 1617 * resources, wait for the previously scheduled operations before 1624 1618 * proceeding, so that we don't end up keep removing tasks added 1625 1619 * after execution capability is restored. 1620 + * 1621 + * cpuset_hotplug_work calls back into cgroup core via 1622 + * cgroup_transfer_tasks() and waiting for it from a cgroupfs 1623 + * operation like this one can lead to a deadlock through kernfs 1624 + * active_ref protection. Let's break the protection. Losing the 1625 + * protection is okay as we check whether @cs is online after 1626 + * grabbing cpuset_mutex anyway. This only happens on the legacy 1627 + * hierarchies. 1626 1628 */ 1629 + css_get(&cs->css); 1630 + kernfs_break_active_protection(of->kn); 1627 1631 flush_work(&cpuset_hotplug_work); 1628 1632 1629 1633 mutex_lock(&cpuset_mutex); ··· 1661 1645 free_trial_cpuset(trialcs); 1662 1646 out_unlock: 1663 1647 mutex_unlock(&cpuset_mutex); 1648 + kernfs_unbreak_active_protection(of->kn); 1649 + css_put(&cs->css); 1664 1650 return retval ?: nbytes; 1665 1651 } 1666 1652
-2
mm/mempolicy.c
··· 2139 2139 } else 2140 2140 *new = *old; 2141 2141 2142 - rcu_read_lock(); 2143 2142 if (current_cpuset_is_being_rebound()) { 2144 2143 nodemask_t mems = cpuset_mems_allowed(current); 2145 2144 if (new->flags & MPOL_F_REBINDING) ··· 2146 2147 else 2147 2148 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 2148 2149 } 2149 - rcu_read_unlock(); 2150 2150 atomic_set(&new->refcnt, 1); 2151 2151 return new; 2152 2152 }