Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'vfs-6.15-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:

- For some reason we went from zero to three maintainers for HFS/HFS+
in a matter of days. The lesson to learn from this might just be that
we need to threaten code removal more often!?

- Fix a regression introduced by enabling large folios for large logical
block sizes. This has caused issues for noref migration with large
folios due to sleeping while in an atomic context.

New sleeping variants of pagecache lookup helpers are introduced.
These helpers take the folio lock instead of the mapping's private
spinlock. The problematic users are converted to the sleeping
variants and serialize against noref migration. Atomic users will
bail on seeing the new BH_Migrate flag.

This also shrinks the critical region of the mapping's private lock
and the new blocking callers reduce contention on the spinlock for
bdev mappings.

- Fix two bugs in do_move_mount() when used with MOVE_MOUNT_BENEATH. The
first bug is using a mountpoint that is located on a mount we're not
holding a reference to. The second bug is putting the mountpoint
after we've called namespace_unlock() as it's no longer guaranteed
that it stays a mountpoint.

- Remove a pointless call to vfs_getattr_nosec() in the devtmpfs code
just to query i_mode instead of simply querying the inode directly.
This also avoids lifetime issues for the dm code caused by an earlier
bugfix this cycle that moved bdev_statx() handling into
vfs_getattr_nosec().

- Fix AT_FDCWD handling with getname_maybe_null() in the xattr code.

- Fix a performance regression for files when multiple callers issue a
close when it's not the last reference.

- Remove a duplicate noinline annotation from pipe_clear_nowait().

* tag 'vfs-6.15-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fs/xattr: Fix handling of AT_FDCWD in setxattrat(2) and getxattrat(2)
MAINTAINERS: hfs/hfsplus: add myself as maintainer
splice: remove duplicate noinline from pipe_clear_nowait
devtmpfs: don't use vfs_getattr_nosec to query i_mode
fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount()
fs: fall back to file_ref_put() for non-last reference
mm/migrate: fix sleep in atomic for large folios and buffer heads
fs/ext4: use sleeping version of sb_find_get_block()
fs/jbd2: use sleeping version of __find_get_block()
fs/ocfs2: use sleeping version of __find_get_block()
fs/buffer: use sleeping version of __find_get_block()
fs/buffer: introduce sleeping flavors for pagecache lookups
MAINTAINERS: add HFS/HFS+ maintainers
fs/buffer: split locking for pagecache lookups

+145 -96
+8 -2
MAINTAINERS
··· 10464 10464 F: drivers/infiniband/hw/hfi1 10465 10465 10466 10466 HFS FILESYSTEM 10467 + M: Viacheslav Dubeyko <slava@dubeyko.com> 10468 + M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> 10469 + M: Yangtao Li <frank.li@vivo.com> 10467 10470 L: linux-fsdevel@vger.kernel.org 10468 - S: Orphan 10471 + S: Maintained 10469 10472 F: Documentation/filesystems/hfs.rst 10470 10473 F: fs/hfs/ 10471 10474 10472 10475 HFSPLUS FILESYSTEM 10476 + M: Viacheslav Dubeyko <slava@dubeyko.com> 10477 + M: John Paul Adrian Glaubitz <glaubitz@physik.fu-berlin.de> 10478 + M: Yangtao Li <frank.li@vivo.com> 10473 10479 L: linux-fsdevel@vger.kernel.org 10474 - S: Orphan 10480 + S: Maintained 10475 10481 F: Documentation/filesystems/hfsplus.rst 10476 10482 F: fs/hfsplus/ 10477 10483
+9 -13
drivers/base/devtmpfs.c
··· 296 296 return err; 297 297 } 298 298 299 - static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) 299 + static int dev_mynode(struct device *dev, struct inode *inode) 300 300 { 301 301 /* did we create it */ 302 302 if (inode->i_private != &thread) ··· 304 304 305 305 /* does the dev_t match */ 306 306 if (is_blockdev(dev)) { 307 - if (!S_ISBLK(stat->mode)) 307 + if (!S_ISBLK(inode->i_mode)) 308 308 return 0; 309 309 } else { 310 - if (!S_ISCHR(stat->mode)) 310 + if (!S_ISCHR(inode->i_mode)) 311 311 return 0; 312 312 } 313 - if (stat->rdev != dev->devt) 313 + if (inode->i_rdev != dev->devt) 314 314 return 0; 315 315 316 316 /* ours */ ··· 321 321 { 322 322 struct path parent; 323 323 struct dentry *dentry; 324 - struct kstat stat; 325 - struct path p; 324 + struct inode *inode; 326 325 int deleted = 0; 327 - int err; 326 + int err = 0; 328 327 329 328 dentry = kern_path_locked(nodename, &parent); 330 329 if (IS_ERR(dentry)) 331 330 return PTR_ERR(dentry); 332 331 333 - p.mnt = parent.mnt; 334 - p.dentry = dentry; 335 - err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, 336 - AT_STATX_SYNC_AS_STAT); 337 - if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { 332 + inode = d_inode(dentry); 333 + if (dev_mynode(dev, inode)) { 338 334 struct iattr newattrs; 339 335 /* 340 336 * before unlinking this node, reset permissions ··· 338 342 */ 339 343 newattrs.ia_uid = GLOBAL_ROOT_UID; 340 344 newattrs.ia_gid = GLOBAL_ROOT_GID; 341 - newattrs.ia_mode = stat.mode & ~0777; 345 + newattrs.ia_mode = inode->i_mode & ~0777; 342 346 newattrs.ia_valid = 343 347 ATTR_UID|ATTR_GID|ATTR_MODE; 344 348 inode_lock(d_inode(dentry));
+54 -19
fs/buffer.c
··· 176 176 } 177 177 EXPORT_SYMBOL(end_buffer_write_sync); 178 178 179 - /* 180 - * Various filesystems appear to want __find_get_block to be non-blocking. 181 - * But it's the page lock which protects the buffers. To get around this, 182 - * we get exclusion from try_to_free_buffers with the blockdev mapping's 183 - * i_private_lock. 184 - * 185 - * Hack idea: for the blockdev mapping, i_private_lock contention 186 - * may be quite high. This code could TryLock the page, and if that 187 - * succeeds, there is no need to take i_private_lock. 188 - */ 189 179 static struct buffer_head * 190 - __find_get_block_slow(struct block_device *bdev, sector_t block) 180 + __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) 191 181 { 192 182 struct address_space *bd_mapping = bdev->bd_mapping; 193 183 const int blkbits = bd_mapping->host->i_blkbits; ··· 194 204 if (IS_ERR(folio)) 195 205 goto out; 196 206 197 - spin_lock(&bd_mapping->i_private_lock); 207 + /* 208 + * Folio lock protects the buffers. Callers that cannot block 209 + * will fallback to serializing vs try_to_free_buffers() via 210 + * the i_private_lock. 211 + */ 212 + if (atomic) 213 + spin_lock(&bd_mapping->i_private_lock); 214 + else 215 + folio_lock(folio); 216 + 198 217 head = folio_buffers(folio); 199 218 if (!head) 200 219 goto out_unlock; 220 + /* 221 + * Upon a noref migration, the folio lock serializes here; 222 + * otherwise bail. 
223 + */ 224 + if (test_bit_acquire(BH_Migrate, &head->b_state)) { 225 + WARN_ON(!atomic); 226 + goto out_unlock; 227 + } 228 + 201 229 bh = head; 202 230 do { 203 231 if (!buffer_mapped(bh)) ··· 244 236 1 << blkbits); 245 237 } 246 238 out_unlock: 247 - spin_unlock(&bd_mapping->i_private_lock); 239 + if (atomic) 240 + spin_unlock(&bd_mapping->i_private_lock); 241 + else 242 + folio_unlock(folio); 248 243 folio_put(folio); 249 244 out: 250 245 return ret; ··· 667 656 void write_boundary_block(struct block_device *bdev, 668 657 sector_t bblock, unsigned blocksize) 669 658 { 670 - struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); 659 + struct buffer_head *bh; 660 + 661 + bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); 671 662 if (bh) { 672 663 if (buffer_dirty(bh)) 673 664 write_dirty_buffer(bh, 0); ··· 1399 1386 /* 1400 1387 * Perform a pagecache lookup for the matching buffer. If it's there, refresh 1401 1388 * it in the LRU and mark it as accessed. If it is not present then return 1402 - * NULL 1389 + * NULL. Atomic context callers may also return NULL if the buffer is being 1390 + * migrated; similarly the page is not marked accessed either. 
1403 1391 */ 1404 - struct buffer_head * 1405 - __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1392 + static struct buffer_head * 1393 + find_get_block_common(struct block_device *bdev, sector_t block, 1394 + unsigned size, bool atomic) 1406 1395 { 1407 1396 struct buffer_head *bh = lookup_bh_lru(bdev, block, size); 1408 1397 1409 1398 if (bh == NULL) { 1410 1399 /* __find_get_block_slow will mark the page accessed */ 1411 - bh = __find_get_block_slow(bdev, block); 1400 + bh = __find_get_block_slow(bdev, block, atomic); 1412 1401 if (bh) 1413 1402 bh_lru_install(bh); 1414 1403 } else ··· 1418 1403 1419 1404 return bh; 1420 1405 } 1406 + 1407 + struct buffer_head * 1408 + __find_get_block(struct block_device *bdev, sector_t block, unsigned size) 1409 + { 1410 + return find_get_block_common(bdev, block, size, true); 1411 + } 1421 1412 EXPORT_SYMBOL(__find_get_block); 1413 + 1414 + /* same as __find_get_block() but allows sleeping contexts */ 1415 + struct buffer_head * 1416 + __find_get_block_nonatomic(struct block_device *bdev, sector_t block, 1417 + unsigned size) 1418 + { 1419 + return find_get_block_common(bdev, block, size, false); 1420 + } 1421 + EXPORT_SYMBOL(__find_get_block_nonatomic); 1422 1422 1423 1423 /** 1424 1424 * bdev_getblk - Get a buffer_head in a block device's buffer cache. ··· 1452 1422 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, 1453 1423 unsigned size, gfp_t gfp) 1454 1424 { 1455 - struct buffer_head *bh = __find_get_block(bdev, block, size); 1425 + struct buffer_head *bh; 1426 + 1427 + if (gfpflags_allow_blocking(gfp)) 1428 + bh = __find_get_block_nonatomic(bdev, block, size); 1429 + else 1430 + bh = __find_get_block(bdev, block, size); 1456 1431 1457 1432 might_alloc(gfp); 1458 1433 if (bh)
+2 -1
fs/ext4/ialloc.c
··· 691 691 if (!bh || !buffer_uptodate(bh)) 692 692 /* 693 693 * If the block is not in the buffer cache, then it 694 - * must have been written out. 694 + * must have been written out, or, most unlikely, is 695 + * being migrated - false failure should be OK here. 695 696 */ 696 697 goto out; 697 698
+2 -1
fs/ext4/mballoc.c
··· 6642 6642 for (i = 0; i < count; i++) { 6643 6643 cond_resched(); 6644 6644 if (is_metadata) 6645 - bh = sb_find_get_block(inode->i_sb, block + i); 6645 + bh = sb_find_get_block_nonatomic(inode->i_sb, 6646 + block + i); 6646 6647 ext4_forget(handle, is_metadata, inode, bh, block + i); 6647 6648 } 6648 6649 }
+1 -1
fs/file.c
··· 26 26 27 27 #include "internal.h" 28 28 29 - bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) 29 + static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) 30 30 { 31 31 /* 32 32 * If the reference count was already in the dead zone, then this
+9 -6
fs/jbd2/revoke.c
··· 345 345 bh = bh_in; 346 346 347 347 if (!bh) { 348 - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); 348 + bh = __find_get_block_nonatomic(bdev, blocknr, 349 + journal->j_blocksize); 349 350 if (bh) 350 351 BUFFER_TRACE(bh, "found on hash"); 351 352 } ··· 356 355 357 356 /* If there is a different buffer_head lying around in 358 357 * memory anywhere... */ 359 - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); 358 + bh2 = __find_get_block_nonatomic(bdev, blocknr, 359 + journal->j_blocksize); 360 360 if (bh2) { 361 361 /* ... and it has RevokeValid status... */ 362 362 if (bh2 != bh && buffer_revokevalid(bh2)) ··· 466 464 * state machine will get very upset later on. */ 467 465 if (need_cancel) { 468 466 struct buffer_head *bh2; 469 - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); 467 + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, 468 + bh->b_size); 470 469 if (bh2) { 471 470 if (bh2 != bh) 472 471 clear_buffer_revoked(bh2); ··· 495 492 struct jbd2_revoke_record_s *record; 496 493 struct buffer_head *bh; 497 494 record = (struct jbd2_revoke_record_s *)list_entry; 498 - bh = __find_get_block(journal->j_fs_dev, 499 - record->blocknr, 500 - journal->j_blocksize); 495 + bh = __find_get_block_nonatomic(journal->j_fs_dev, 496 + record->blocknr, 497 + journal->j_blocksize); 501 498 if (bh) { 502 499 clear_buffer_revoked(bh); 503 500 __brelse(bh);
+36 -33
fs/namespace.c
··· 2826 2826 struct vfsmount *mnt = path->mnt; 2827 2827 struct dentry *dentry; 2828 2828 struct mountpoint *mp = ERR_PTR(-ENOENT); 2829 + struct path under = {}; 2829 2830 2830 2831 for (;;) { 2831 - struct mount *m; 2832 + struct mount *m = real_mount(mnt); 2832 2833 2833 2834 if (beneath) { 2834 - m = real_mount(mnt); 2835 + path_put(&under); 2835 2836 read_seqlock_excl(&mount_lock); 2836 - dentry = dget(m->mnt_mountpoint); 2837 + under.mnt = mntget(&m->mnt_parent->mnt); 2838 + under.dentry = dget(m->mnt_mountpoint); 2837 2839 read_sequnlock_excl(&mount_lock); 2840 + dentry = under.dentry; 2838 2841 } else { 2839 2842 dentry = path->dentry; 2840 2843 } 2841 2844 2842 2845 inode_lock(dentry->d_inode); 2843 - if (unlikely(cant_mount(dentry))) { 2844 - inode_unlock(dentry->d_inode); 2845 - goto out; 2846 - } 2847 - 2848 2846 namespace_lock(); 2849 2847 2850 - if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { 2848 + if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) 2849 + break; // not to be mounted on 2850 + 2851 + if (beneath && unlikely(m->mnt_mountpoint != dentry || 2852 + &m->mnt_parent->mnt != under.mnt)) { 2851 2853 namespace_unlock(); 2852 2854 inode_unlock(dentry->d_inode); 2853 - goto out; 2855 + continue; // got moved 2854 2856 } 2855 2857 2856 2858 mnt = lookup_mnt(path); 2857 - if (likely(!mnt)) 2859 + if (unlikely(mnt)) { 2860 + namespace_unlock(); 2861 + inode_unlock(dentry->d_inode); 2862 + path_put(path); 2863 + path->mnt = mnt; 2864 + path->dentry = dget(mnt->mnt_root); 2865 + continue; // got overmounted 2866 + } 2867 + mp = get_mountpoint(dentry); 2868 + if (IS_ERR(mp)) 2858 2869 break; 2859 - 2860 - namespace_unlock(); 2861 - inode_unlock(dentry->d_inode); 2862 - if (beneath) 2863 - dput(dentry); 2864 - path_put(path); 2865 - path->mnt = mnt; 2866 - path->dentry = dget(mnt->mnt_root); 2870 + if (beneath) { 2871 + /* 2872 + * @under duplicates the references that will stay 2873 + * at least until namespace_unlock(), so the 
path_put() 2874 + * below is safe (and OK to do under namespace_lock - 2875 + * we are not dropping the final references here). 2876 + */ 2877 + path_put(&under); 2878 + } 2879 + return mp; 2867 2880 } 2868 - 2869 - mp = get_mountpoint(dentry); 2870 - if (IS_ERR(mp)) { 2871 - namespace_unlock(); 2872 - inode_unlock(dentry->d_inode); 2873 - } 2874 - 2875 - out: 2881 + namespace_unlock(); 2882 + inode_unlock(dentry->d_inode); 2876 2883 if (beneath) 2877 - dput(dentry); 2878 - 2884 + path_put(&under); 2879 2885 return mp; 2880 2886 } 2881 2887 ··· 2892 2886 2893 2887 static void unlock_mount(struct mountpoint *where) 2894 2888 { 2895 - struct dentry *dentry = where->m_dentry; 2896 - 2889 + inode_unlock(where->m_dentry->d_inode); 2897 2890 read_seqlock_excl(&mount_lock); 2898 2891 put_mountpoint(where); 2899 2892 read_sequnlock_excl(&mount_lock); 2900 - 2901 2893 namespace_unlock(); 2902 - inode_unlock(dentry->d_inode); 2903 2894 } 2904 2895 2905 2896 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
+1 -1
fs/ocfs2/journal.c
··· 1249 1249 } 1250 1250 1251 1251 for (i = 0; i < p_blocks; i++, p_blkno++) { 1252 - bh = __find_get_block(osb->sb->s_bdev, p_blkno, 1252 + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, 1253 1253 osb->sb->s_blocksize); 1254 1254 /* block not cached. */ 1255 1255 if (!bh)
+1 -1
fs/splice.c
··· 45 45 * here if set to avoid blocking other users of this pipe if splice is 46 46 * being done on it. 47 47 */ 48 - static noinline void noinline pipe_clear_nowait(struct file *file) 48 + static noinline void pipe_clear_nowait(struct file *file) 49 49 { 50 50 fmode_t fmode = READ_ONCE(file->f_mode); 51 51
+2 -2
fs/xattr.c
··· 703 703 return error; 704 704 705 705 filename = getname_maybe_null(pathname, at_flags); 706 - if (!filename) { 706 + if (!filename && dfd >= 0) { 707 707 CLASS(fd, f)(dfd); 708 708 if (fd_empty(f)) 709 709 error = -EBADF; ··· 847 847 return error; 848 848 849 849 filename = getname_maybe_null(pathname, at_flags); 850 - if (!filename) { 850 + if (!filename && dfd >= 0) { 851 851 CLASS(fd, f)(dfd); 852 852 if (fd_empty(f)) 853 853 return -EBADF;
+9
include/linux/buffer_head.h
··· 34 34 BH_Meta, /* Buffer contains metadata */ 35 35 BH_Prio, /* Buffer should be submitted with REQ_PRIO */ 36 36 BH_Defer_Completion, /* Defer AIO completion to workqueue */ 37 + BH_Migrate, /* Buffer is being migrated (norefs) */ 37 38 38 39 BH_PrivateStart,/* not a state bit, but the first bit available 39 40 * for private allocation by other entities ··· 223 222 wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); 224 223 struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, 225 224 unsigned size); 225 + struct buffer_head *__find_get_block_nonatomic(struct block_device *bdev, 226 + sector_t block, unsigned size); 226 227 struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, 227 228 unsigned size, gfp_t gfp); 228 229 void __brelse(struct buffer_head *); ··· 398 395 sb_find_get_block(struct super_block *sb, sector_t block) 399 396 { 400 397 return __find_get_block(sb->s_bdev, block, sb->s_blocksize); 398 + } 399 + 400 + static inline struct buffer_head * 401 + sb_find_get_block_nonatomic(struct super_block *sb, sector_t block) 402 + { 403 + return __find_get_block_nonatomic(sb->s_bdev, block, sb->s_blocksize); 401 404 } 402 405 403 406 static inline void
+6 -13
include/linux/file_ref.h
··· 61 61 atomic_long_set(&ref->refcnt, cnt - 1); 62 62 } 63 63 64 - bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); 65 64 bool __file_ref_put(file_ref_t *ref, unsigned long cnt); 66 65 67 66 /** ··· 177 178 */ 178 179 static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) 179 180 { 180 - long old, new; 181 + long old; 181 182 182 183 old = atomic_long_read(&ref->refcnt); 183 - do { 184 - if (unlikely(old < 0)) 185 - return __file_ref_put_badval(ref, old); 186 - 187 - if (old == FILE_REF_ONEREF) 188 - new = FILE_REF_DEAD; 189 - else 190 - new = old - 1; 191 - } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); 192 - 193 - return new == FILE_REF_DEAD; 184 + if (likely(old == FILE_REF_ONEREF)) { 185 + if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) 186 + return true; 187 + } 188 + return file_ref_put(ref); 194 189 } 195 190 196 191 /**
+5 -3
mm/migrate.c
··· 845 845 return -EAGAIN; 846 846 847 847 if (check_refs) { 848 - bool busy; 848 + bool busy, migrating; 849 849 bool invalidated = false; 850 850 851 + migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state); 852 + VM_WARN_ON_ONCE(migrating); 851 853 recheck_buffers: 852 854 busy = false; 853 855 spin_lock(&mapping->i_private_lock); ··· 861 859 } 862 860 bh = bh->b_this_page; 863 861 } while (bh != head); 862 + spin_unlock(&mapping->i_private_lock); 864 863 if (busy) { 865 864 if (invalidated) { 866 865 rc = -EAGAIN; 867 866 goto unlock_buffers; 868 867 } 869 - spin_unlock(&mapping->i_private_lock); 870 868 invalidate_bh_lrus(); 871 869 invalidated = true; 872 870 goto recheck_buffers; ··· 885 883 886 884 unlock_buffers: 887 885 if (check_refs) 888 - spin_unlock(&mapping->i_private_lock); 886 + clear_bit_unlock(BH_Migrate, &head->b_state); 889 887 bh = head; 890 888 do { 891 889 unlock_buffer(bh);