Merge branch 'for-3.16-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

+30

fs/kernfs/mount.c

··· 211 211 kernfs_put(root_kn); 212 212 } 213 213 214 + /** 215 + * kernfs_pin_sb: try to pin the superblock associated with a kernfs_root 216 + * @root: the kernfs_root in question 217 + * @ns: the namespace tag 218 + * 219 + * Pin the superblock so the superblock won't be destroyed in subsequent 220 + * operations. This can be used to block ->kill_sb() which may be useful 221 + * for kernfs users which dynamically manage superblocks. 222 + * 223 + * Returns the pinned superblock, NULL if no superblock matches @root and @ns, 224 + * or ERR_PTR(-EINVAL) if the superblock is being freed. 225 + */ 226 + struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) 227 + { 228 + struct kernfs_super_info *info; 229 + struct super_block *sb = NULL; 230 + 231 + mutex_lock(&kernfs_mutex); 232 + list_for_each_entry(info, &root->supers, node) { 233 + if (info->ns == ns) { 234 + sb = info->sb; 235 + if (!atomic_inc_not_zero(&info->sb->s_active)) 236 + sb = ERR_PTR(-EINVAL); 237 + break; 238 + } 239 + } 240 + mutex_unlock(&kernfs_mutex); 241 + return sb; 242 + } 243 + 214 244 void __init kernfs_init(void) 215 245 { 216 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",

+1

include/linux/kernfs.h

··· 305 305 struct kernfs_root *root, unsigned long magic, 306 306 bool *new_sb_created, const void *ns); 307 307 void kernfs_kill_sb(struct super_block *sb); 308 + struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns); 308 309 309 310 void kernfs_init(void); 310 311

+50 -8

kernel/cgroup.c

··· 1648 1648 int flags, const char *unused_dev_name, 1649 1649 void *data) 1650 1650 { 1651 + struct super_block *pinned_sb = NULL; 1652 + struct cgroup_subsys *ss; 1651 1653 struct cgroup_root *root; 1652 1654 struct cgroup_sb_opts opts; 1653 1655 struct dentry *dentry; 1654 1656 int ret; 1657 + int i; 1655 1658 bool new_sb; 1656 1659 1657 1660 /* ··· 1678 1675 cgroup_get(&root->cgrp); 1679 1676 ret = 0; 1680 1677 goto out_unlock; 1678 + } 1679 + 1680 + /* 1681 + * Destruction of cgroup root is asynchronous, so subsystems may 1682 + * still be dying after the previous unmount. Let's drain the 1683 + * dying subsystems. We just need to ensure that the ones 1684 + * unmounted previously finish dying and don't care about new ones 1685 + * starting. Testing ref liveliness is good enough. 1686 + */ 1687 + for_each_subsys(ss, i) { 1688 + if (!(opts.subsys_mask & (1 << i)) || 1689 + ss->root == &cgrp_dfl_root) 1690 + continue; 1691 + 1692 + if (!percpu_ref_tryget_live(&ss->root->cgrp.self.refcnt)) { 1693 + mutex_unlock(&cgroup_mutex); 1694 + msleep(10); 1695 + ret = restart_syscall(); 1696 + goto out_free; 1697 + } 1698 + cgroup_put(&ss->root->cgrp); 1681 1699 } 1682 1700 1683 1701 for_each_root(root) { ··· 1741 1717 } 1742 1718 1743 1719 /* 1744 - * A root's lifetime is governed by its root cgroup. 1745 - * tryget_live failure indicate that the root is being 1746 - * destroyed. Wait for destruction to complete so that the 1747 - * subsystems are free. We can use wait_queue for the wait 1748 - * but this path is super cold. Let's just sleep for a bit 1749 - * and retry. 1720 + * We want to reuse @root whose lifetime is governed by its 1721 + * ->cgrp. Let's check whether @root is alive and keep it 1722 + * that way. As cgroup_kill_sb() can happen anytime, we 1723 + * want to block it by pinning the sb so that @root doesn't 1724 + * get killed before mount is complete. 
1725 + * 1726 + * With the sb pinned, tryget_live can reliably indicate 1727 + * whether @root can be reused. If it's being killed, 1728 + * drain it. We can use wait_queue for the wait but this 1729 + * path is super cold. Let's just sleep a bit and retry. 1750 1730 */ 1751 - if (!percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1731 + pinned_sb = kernfs_pin_sb(root->kf_root, NULL); 1732 + if (IS_ERR(pinned_sb) || 1733 + !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) { 1752 1734 mutex_unlock(&cgroup_mutex); 1735 + if (!IS_ERR_OR_NULL(pinned_sb)) 1736 + deactivate_super(pinned_sb); 1753 1737 msleep(10); 1754 1738 ret = restart_syscall(); 1755 1739 goto out_free; ··· 1802 1770 CGROUP_SUPER_MAGIC, &new_sb); 1803 1771 if (IS_ERR(dentry) || !new_sb) 1804 1772 cgroup_put(&root->cgrp); 1773 + 1774 + /* 1775 + * If @pinned_sb, we're reusing an existing root and holding an 1776 + * extra ref on its sb. Mount is complete. Put the extra ref. 1777 + */ 1778 + if (pinned_sb) { 1779 + WARN_ON(new_sb); 1780 + deactivate_super(pinned_sb); 1781 + } 1782 + 1805 1783 return dentry; 1806 1784 } 1807 1785 ··· 3370 3328 3371 3329 rcu_read_lock(); 3372 3330 css_for_each_child(child, css) { 3373 - if (css->flags & CSS_ONLINE) { 3331 + if (child->flags & CSS_ONLINE) { 3374 3332 ret = true; 3375 3333 break; 3376 3334 }

+19 -1

kernel/cpuset.c

··· 1181 1181 1182 1182 int current_cpuset_is_being_rebound(void) 1183 1183 { 1184 - return task_cs(current) == cpuset_being_rebound; 1184 + int ret; 1185 + 1186 + rcu_read_lock(); 1187 + ret = task_cs(current) == cpuset_being_rebound; 1188 + rcu_read_unlock(); 1189 + 1190 + return ret; 1185 1191 } 1186 1192 1187 1193 static int update_relax_domain_level(struct cpuset *cs, s64 val) ··· 1623 1617 * resources, wait for the previously scheduled operations before 1624 1618 * proceeding, so that we don't end up keep removing tasks added 1625 1619 * after execution capability is restored. 1620 + * 1621 + * cpuset_hotplug_work calls back into cgroup core via 1622 + * cgroup_transfer_tasks() and waiting for it from a cgroupfs 1623 + * operation like this one can lead to a deadlock through kernfs 1624 + * active_ref protection. Let's break the protection. Losing the 1625 + * protection is okay as we check whether @cs is online after 1626 + * grabbing cpuset_mutex anyway. This only happens on the legacy 1627 + * hierarchies. 1626 1628 */ 1629 + css_get(&cs->css); 1630 + kernfs_break_active_protection(of->kn); 1627 1631 flush_work(&cpuset_hotplug_work); 1628 1632 1629 1633 mutex_lock(&cpuset_mutex); ··· 1661 1645 free_trial_cpuset(trialcs); 1662 1646 out_unlock: 1663 1647 mutex_unlock(&cpuset_mutex); 1648 + kernfs_unbreak_active_protection(of->kn); 1649 + css_put(&cs->css); 1664 1650 return retval ?: nbytes; 1665 1651 } 1666 1652

-2

mm/mempolicy.c

··· 2139 2139 } else 2140 2140 *new = *old; 2141 2141 2142 - rcu_read_lock(); 2143 2142 if (current_cpuset_is_being_rebound()) { 2144 2143 nodemask_t mems = cpuset_mems_allowed(current); 2145 2144 if (new->flags & MPOL_F_REBINDING) ··· 2146 2147 else 2147 2148 mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); 2148 2149 } 2149 - rcu_read_unlock(); 2150 2150 atomic_set(&new->refcnt, 1); 2151 2151 return new; 2152 2152 }

Configure Feed

Configure Feed