Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

preparations to taking MNT_WRITE_HOLD out of ->mnt_flags

We have an unpleasant wart in accessibility rules for struct mount. There
are per-superblock lists of mounts, used by sb_prepare_remount_readonly()
to check if any of those is currently claimed for write access and to
block further attempts to get write access on those until we are done.

As soon as it is attached to a filesystem, mount becomes reachable
via that list. Only sb_prepare_remount_readonly() traverses it and
it only accesses a few members of struct mount. Unfortunately,
->mnt_flags is one of those and it is modified - MNT_WRITE_HOLD set
and then cleared. It is done under mount_lock, so from the locking
rules POV everything's fine.

However, it has easily overlooked implications - once mount has been
attached to a filesystem, it has to be treated as globally visible.
In particular, initializing ->mnt_flags *must* be done either prior
to that point or under mount_lock. All other members are still
private at that point.

Life gets simpler if we move that bit (and that's *all* that can get
touched by access via this list) out of ->mnt_flags. It's not even
hard to do - currently the list is implemented as list_head one,
anchored in super_block->s_mounts and linked via mount->mnt_instance.

As the first step, switch it to hlist-like open-coded structure -
address of the first mount in the set is stored in ->s_mounts
and ->mnt_instance replaced with ->mnt_next_for_sb and ->mnt_pprev_for_sb -
the former either NULL or pointing to the next mount in set, the
latter - address of either ->s_mounts or ->mnt_next_for_sb in the
previous element of the set.

In the next commit we'll steal the LSB of ->mnt_pprev_for_sb as
replacement for MNT_WRITE_HOLD.

Reviewed-by: Christian Brauner <brauner@kernel.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

Al Viro 09a1b33c 5d132cfa

+36 -13
+3 -1
fs/mount.h
··· 64 64 #endif 65 65 struct list_head mnt_mounts; /* list of children, anchored here */ 66 66 struct list_head mnt_child; /* and going through their mnt_child */ 67 - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ 67 + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ 68 + struct mount * __aligned(1) *mnt_pprev_for_sb; 69 + /* except that LSB of pprev will be stolen */ 68 70 const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ 69 71 struct list_head mnt_list; 70 72 struct list_head mnt_expire; /* link in fs-specific expiry list */
+29 -9
fs/namespace.c
··· 730 730 mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 731 731 } 732 732 733 + static inline void mnt_del_instance(struct mount *m) 734 + { 735 + struct mount **p = m->mnt_pprev_for_sb; 736 + struct mount *next = m->mnt_next_for_sb; 737 + 738 + if (next) 739 + next->mnt_pprev_for_sb = p; 740 + *p = next; 741 + } 742 + 743 + static inline void mnt_add_instance(struct mount *m, struct super_block *s) 744 + { 745 + struct mount *first = s->s_mounts; 746 + 747 + if (first) 748 + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; 749 + m->mnt_next_for_sb = first; 750 + m->mnt_pprev_for_sb = &s->s_mounts; 751 + s->s_mounts = m; 752 + } 753 + 733 754 static int mnt_make_readonly(struct mount *mnt) 734 755 { 735 756 int ret; ··· 764 743 765 744 int sb_prepare_remount_readonly(struct super_block *sb) 766 745 { 767 - struct mount *mnt; 768 746 int err = 0; 769 747 770 748 /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ ··· 771 751 return -EBUSY; 772 752 773 753 lock_mount_hash(); 774 - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 775 - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { 776 - err = mnt_hold_writers(mnt); 754 + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { 755 + if (!(m->mnt.mnt_flags & MNT_READONLY)) { 756 + err = mnt_hold_writers(m); 777 757 if (err) 778 758 break; 779 759 } ··· 783 763 784 764 if (!err) 785 765 sb_start_ro_state_change(sb); 786 - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { 787 - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) 788 - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 766 + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { 767 + if (m->mnt.mnt_flags & MNT_WRITE_HOLD) 768 + m->mnt.mnt_flags &= ~MNT_WRITE_HOLD; 789 769 } 790 770 unlock_mount_hash(); 791 771 ··· 1227 1207 m->mnt_parent = m; 1228 1208 1229 1209 lock_mount_hash(); 1230 - list_add_tail(&m->mnt_instance, &s->s_mounts); 1210 + mnt_add_instance(m, s); 1231 1211 unlock_mount_hash(); 1232 1212 } 1233 1213 ··· 1445 1425 mnt->mnt.mnt_flags |= MNT_DOOMED; 1446 1426 rcu_read_unlock(); 1447 1427 1448 - list_del(&mnt->mnt_instance); 1428 + mnt_del_instance(mnt); 1449 1429 if (unlikely(!list_empty(&mnt->mnt_expire))) 1450 1430 list_del(&mnt->mnt_expire); 1451 1431
+1 -2
fs/super.c
··· 323 323 if (!s) 324 324 return NULL; 325 325 326 - INIT_LIST_HEAD(&s->s_mounts); 327 326 s->s_user_ns = get_user_ns(user_ns); 328 327 init_rwsem(&s->s_umount); 329 328 lockdep_set_class(&s->s_umount, &type->s_umount_key); ··· 407 408 list_del_init(&s->s_list); 408 409 WARN_ON(s->s_dentry_lru.node); 409 410 WARN_ON(s->s_inode_lru.node); 410 - WARN_ON(!list_empty(&s->s_mounts)); 411 + WARN_ON(s->s_mounts); 411 412 call_rcu(&s->rcu, destroy_super_rcu); 412 413 } 413 414 }
+3 -1
include/linux/fs.h
··· 1324 1324 struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; 1325 1325 }; 1326 1326 1327 + struct mount; 1328 + 1327 1329 struct super_block { 1328 1330 struct list_head s_list; /* Keep this first */ 1329 1331 dev_t s_dev; /* search index; _not_ kdev_t */ ··· 1360 1358 __u16 s_encoding_flags; 1361 1359 #endif 1362 1360 struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ 1363 - struct list_head s_mounts; /* list of mounts; _not_ for fs use */ 1361 + struct mount *s_mounts; /* list of mounts; _not_ for fs use */ 1364 1362 struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ 1365 1363 struct file *s_bdev_file; 1366 1364 struct backing_dev_info *s_bdi;