Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'vfs-6.7-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:

- Avoid calling back into LSMs from vfs_getattr_nosec() calls.

IMA used to query inode properties accessing raw inode fields without
dedicated helpers. That was finally fixed a few releases ago by
forcing IMA to use vfs_getattr_nosec() helpers.

The goal of the vfs_getattr_nosec() helper is to query for attributes
without calling into the LSM layer which would be quite problematic
because incredibly IMA is called from __fput()...

__fput()
  -> ima_file_free()

What it does is to call back into the filesystem to update the file's
IMA xattr. Querying the inode without using vfs_getattr_nosec() meant
that IMA didn't handle stacking filesystems such as overlayfs
correctly. So the switch to vfs_getattr_nosec() is quite correct. But
the switch to vfs_getattr_nosec() revealed another bug when used on
stacking filesystems:

__fput()
  -> ima_file_free()
    -> vfs_getattr_nosec()
      -> i_op->getattr::ovl_getattr()
        -> vfs_getattr()
          -> i_op->getattr::$WHATEVER_UNDERLYING_FS_getattr()
          -> security_inode_getattr() # calls back into LSMs

Now, if that __fput() happens from task_work_run() of an exiting task
current->fs and various other pointers could already be NULL. So
anything in the LSM layer relying on that not being NULL would be
quite surprised.

Fix that by passing the information that this is a security request
through to the stacking filesystem by adding a new internal
AT_GETATTR_NOSEC flag. Now the callchain becomes:

__fput()
  -> ima_file_free()
    -> vfs_getattr_nosec()
      -> i_op->getattr::ovl_getattr()
        -> if (AT_GETATTR_NOSEC)
             vfs_getattr_nosec()
           else
             vfs_getattr()
          -> i_op->getattr::$WHATEVER_UNDERLYING_FS_getattr()

- Fix a bug introduced with the iov_iter rework from last cycle.

This broke /proc/kcore by copying too much and without the correct
offset.

- Add a missing NULL check when allocating the root inode in
autofs_fill_super().

- Fix stable writes for multi-device filesystems (xfs, btrfs etc) and
the block device pseudo filesystem.

Stable writes used to be a superblock flag only, making it a
per-filesystem property. Add an additional AS_STABLE_WRITES mapping flag
to allow for fine-grained control.

- Ensure that offset_iterate_dir() returns 0 after reaching the end of
a directory so it adheres to getdents() convention.

* tag 'vfs-6.7-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
libfs: getdents() should return 0 after reaching EOD
xfs: respect the stable writes flag on the RT device
xfs: clean up FS_XFLAG_REALTIME handling in xfs_ioctl_setattr_xflags
block: update the stable_writes flag in bdev_add
filemap: add a per-mapping stable writes flag
autofs: add: new_inode check in autofs_fill_super()
iov_iter: fix copy_page_to_iter_nofault()
fs: Pass AT_GETATTR_NOSEC flag to getattr interface function

+123 -60
+2
block/bdev.c
··· 425 425 426 426 void bdev_add(struct block_device *bdev, dev_t dev) 427 427 { 428 + if (bdev_stable_writes(bdev)) 429 + mapping_set_stable_writes(bdev->bd_inode->i_mapping); 428 430 bdev->bd_dev = dev; 429 431 bdev->bd_inode->i_rdev = dev; 430 432 bdev->bd_inode->i_ino = dev;
+21 -35
fs/autofs/inode.c
··· 309 309 struct autofs_fs_context *ctx = fc->fs_private; 310 310 struct autofs_sb_info *sbi = s->s_fs_info; 311 311 struct inode *root_inode; 312 - struct dentry *root; 313 312 struct autofs_info *ino; 314 - int ret = -ENOMEM; 315 313 316 314 pr_debug("starting up, sbi = %p\n", sbi); 317 315 ··· 326 328 */ 327 329 ino = autofs_new_ino(sbi); 328 330 if (!ino) 329 - goto fail; 331 + return -ENOMEM; 330 332 331 333 root_inode = autofs_get_inode(s, S_IFDIR | 0755); 334 + if (!root_inode) 335 + return -ENOMEM; 336 + 332 337 root_inode->i_uid = ctx->uid; 333 338 root_inode->i_gid = ctx->gid; 339 + root_inode->i_fop = &autofs_root_operations; 340 + root_inode->i_op = &autofs_dir_inode_operations; 334 341 335 - root = d_make_root(root_inode); 336 - if (!root) 337 - goto fail_ino; 338 - 339 - root->d_fsdata = ino; 342 + s->s_root = d_make_root(root_inode); 343 + if (unlikely(!s->s_root)) { 344 + autofs_free_ino(ino); 345 + return -ENOMEM; 346 + } 347 + s->s_root->d_fsdata = ino; 340 348 341 349 if (ctx->pgrp_set) { 342 350 sbi->oz_pgrp = find_get_pid(ctx->pgrp); 343 - if (!sbi->oz_pgrp) { 344 - ret = invalf(fc, "Could not find process group %d", 345 - ctx->pgrp); 346 - goto fail_dput; 347 - } 348 - } else { 351 + if (!sbi->oz_pgrp) 352 + return invalf(fc, "Could not find process group %d", 353 + ctx->pgrp); 354 + } else 349 355 sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); 350 - } 351 356 352 357 if (autofs_type_trigger(sbi->type)) 353 - __managed_dentry_set_managed(root); 354 - 355 - root_inode->i_fop = &autofs_root_operations; 356 - root_inode->i_op = &autofs_dir_inode_operations; 358 + /* s->s_root won't be contended so there's little to 359 + * be gained by not taking the d_lock when setting 360 + * d_flags, even when a lot mounts are being done. 
361 + */ 362 + managed_dentry_set_managed(s->s_root); 357 363 358 364 pr_debug("pipe fd = %d, pgrp = %u\n", 359 365 sbi->pipefd, pid_nr(sbi->oz_pgrp)); 360 366 361 367 sbi->flags &= ~AUTOFS_SBI_CATATONIC; 362 - 363 - /* 364 - * Success! Install the root dentry now to indicate completion. 365 - */ 366 - s->s_root = root; 367 368 return 0; 368 - 369 - /* 370 - * Failure ... clean up. 371 - */ 372 - fail_dput: 373 - dput(root); 374 - goto fail; 375 - fail_ino: 376 - autofs_free_ino(ino); 377 - fail: 378 - return ret; 379 369 } 380 370 381 371 /*
+10 -2
fs/ecryptfs/inode.c
··· 998 998 return rc; 999 999 } 1000 1000 1001 + static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat, 1002 + u32 request_mask, unsigned int flags) 1003 + { 1004 + if (flags & AT_GETATTR_NOSEC) 1005 + return vfs_getattr_nosec(path, stat, request_mask, flags); 1006 + return vfs_getattr(path, stat, request_mask, flags); 1007 + } 1008 + 1001 1009 static int ecryptfs_getattr(struct mnt_idmap *idmap, 1002 1010 const struct path *path, struct kstat *stat, 1003 1011 u32 request_mask, unsigned int flags) ··· 1014 1006 struct kstat lower_stat; 1015 1007 int rc; 1016 1008 1017 - rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat, 1018 - request_mask, flags); 1009 + rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry), 1010 + &lower_stat, request_mask, flags); 1019 1011 if (!rc) { 1020 1012 fsstack_copy_attr_all(d_inode(dentry), 1021 1013 ecryptfs_inode_to_lower(d_inode(dentry)));
+2
fs/inode.c
··· 215 215 lockdep_set_class_and_name(&mapping->invalidate_lock, 216 216 &sb->s_type->invalidate_lock_key, 217 217 "mapping.invalidate_lock"); 218 + if (sb->s_iflags & SB_I_STABLE_WRITES) 219 + mapping_set_stable_writes(mapping); 218 220 inode->i_private = NULL; 219 221 inode->i_mapping = mapping; 220 222 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
+11 -3
fs/libfs.c
··· 399 399 return -EINVAL; 400 400 } 401 401 402 + /* In this case, ->private_data is protected by f_pos_lock */ 403 + file->private_data = NULL; 402 404 return vfs_setpos(file, offset, U32_MAX); 403 405 } 404 406 ··· 430 428 inode->i_ino, fs_umode_to_dtype(inode->i_mode)); 431 429 } 432 430 433 - static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) 431 + static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx) 434 432 { 435 433 struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); 436 434 XA_STATE(xas, &so_ctx->xa, ctx->pos); ··· 439 437 while (true) { 440 438 dentry = offset_find_next(&xas); 441 439 if (!dentry) 442 - break; 440 + return ERR_PTR(-ENOENT); 443 441 444 442 if (!offset_dir_emit(ctx, dentry)) { 445 443 dput(dentry); ··· 449 447 dput(dentry); 450 448 ctx->pos = xas.xa_index + 1; 451 449 } 450 + return NULL; 452 451 } 453 452 454 453 /** ··· 482 479 if (!dir_emit_dots(file, ctx)) 483 480 return 0; 484 481 485 - offset_iterate_dir(d_inode(dir), ctx); 482 + /* In this case, ->private_data is protected by f_pos_lock */ 483 + if (ctx->pos == 2) 484 + file->private_data = NULL; 485 + else if (file->private_data == ERR_PTR(-ENOENT)) 486 + return 0; 487 + file->private_data = offset_iterate_dir(d_inode(dir), ctx); 486 488 return 0; 487 489 } 488 490
+5 -5
fs/overlayfs/inode.c
··· 171 171 172 172 type = ovl_path_real(dentry, &realpath); 173 173 old_cred = ovl_override_creds(dentry->d_sb); 174 - err = vfs_getattr(&realpath, stat, request_mask, flags); 174 + err = ovl_do_getattr(&realpath, stat, request_mask, flags); 175 175 if (err) 176 176 goto out; 177 177 ··· 196 196 (!is_dir ? STATX_NLINK : 0); 197 197 198 198 ovl_path_lower(dentry, &realpath); 199 - err = vfs_getattr(&realpath, &lowerstat, 200 - lowermask, flags); 199 + err = ovl_do_getattr(&realpath, &lowerstat, lowermask, 200 + flags); 201 201 if (err) 202 202 goto out; 203 203 ··· 249 249 250 250 ovl_path_lowerdata(dentry, &realpath); 251 251 if (realpath.dentry) { 252 - err = vfs_getattr(&realpath, &lowerdatastat, 253 - lowermask, flags); 252 + err = ovl_do_getattr(&realpath, &lowerdatastat, 253 + lowermask, flags); 254 254 if (err) 255 255 goto out; 256 256 } else {
+8
fs/overlayfs/overlayfs.h
··· 408 408 return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); 409 409 } 410 410 411 + static inline int ovl_do_getattr(const struct path *path, struct kstat *stat, 412 + u32 request_mask, unsigned int flags) 413 + { 414 + if (flags & AT_GETATTR_NOSEC) 415 + return vfs_getattr_nosec(path, stat, request_mask, flags); 416 + return vfs_getattr(path, stat, request_mask, flags); 417 + } 418 + 411 419 /* util.c */ 412 420 int ovl_get_write_access(struct dentry *dentry); 413 421 void ovl_put_write_access(struct dentry *dentry);
+5 -1
fs/stat.c
··· 133 133 idmap = mnt_idmap(path->mnt); 134 134 if (inode->i_op->getattr) 135 135 return inode->i_op->getattr(idmap, path, stat, 136 - request_mask, query_flags); 136 + request_mask, 137 + query_flags | AT_GETATTR_NOSEC); 137 138 138 139 generic_fillattr(idmap, request_mask, inode, stat); 139 140 return 0; ··· 166 165 u32 request_mask, unsigned int query_flags) 167 166 { 168 167 int retval; 168 + 169 + if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC)) 170 + return -EPERM; 169 171 170 172 retval = security_inode_getattr(path); 171 173 if (retval)
+8
fs/xfs/xfs_inode.h
··· 569 569 extern void xfs_setup_iops(struct xfs_inode *ip); 570 570 extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); 571 571 572 + static inline void xfs_update_stable_writes(struct xfs_inode *ip) 573 + { 574 + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) 575 + mapping_set_stable_writes(VFS_I(ip)->i_mapping); 576 + else 577 + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); 578 + } 579 + 572 580 /* 573 581 * When setting up a newly allocated inode, we need to call 574 582 * xfs_finish_inode_setup() once the inode is fully instantiated at
+22 -12
fs/xfs/xfs_ioctl.c
··· 1121 1121 struct fileattr *fa) 1122 1122 { 1123 1123 struct xfs_mount *mp = ip->i_mount; 1124 + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); 1124 1125 uint64_t i_flags2; 1125 1126 1126 - /* Can't change realtime flag if any extents are allocated. */ 1127 - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && 1128 - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) 1129 - return -EINVAL; 1130 - 1131 - /* If realtime flag is set then must have realtime device */ 1132 - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { 1133 - if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || 1134 - xfs_extlen_to_rtxmod(mp, ip->i_extsize)) 1127 + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { 1128 + /* Can't change realtime flag if any extents are allocated. */ 1129 + if (ip->i_df.if_nextents || ip->i_delayed_blks) 1135 1130 return -EINVAL; 1136 1131 } 1137 1132 1138 - /* Clear reflink if we are actually able to set the rt flag. */ 1139 - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) 1140 - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1133 + if (rtflag) { 1134 + /* If realtime flag is set then must have realtime device */ 1135 + if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || 1136 + xfs_extlen_to_rtxmod(mp, ip->i_extsize)) 1137 + return -EINVAL; 1138 + 1139 + /* Clear reflink if we are actually able to set the rt flag. */ 1140 + if (xfs_is_reflink_inode(ip)) 1141 + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; 1142 + } 1141 1143 1142 1144 /* diflags2 only valid for v3 inodes. */ 1143 1145 i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); ··· 1150 1148 ip->i_diflags2 = i_flags2; 1151 1149 1152 1150 xfs_diflags_to_iflags(ip, false); 1151 + 1152 + /* 1153 + * Make the stable writes flag match that of the device the inode 1154 + * resides on when flipping the RT flag. 
1155 + */ 1156 + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) 1157 + xfs_update_stable_writes(ip); 1158 + 1153 1159 xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); 1154 1160 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 1155 1161 XFS_STATS_INC(mp, xs_ig_attrchg);
+7
fs/xfs/xfs_iops.c
··· 1299 1299 mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); 1300 1300 1301 1301 /* 1302 + * For real-time inodes update the stable write flags to that of the RT 1303 + * device instead of the data device. 1304 + */ 1305 + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) 1306 + xfs_update_stable_writes(ip); 1307 + 1308 + /* 1302 1309 * If there is no attribute fork no ACL can exist on this inode, 1303 1310 * and it can't have any file capabilities attached to it either. 1304 1311 */
+17
include/linux/pagemap.h
··· 204 204 AS_NO_WRITEBACK_TAGS = 5, 205 205 AS_LARGE_FOLIO_SUPPORT = 6, 206 206 AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ 207 + AS_STABLE_WRITES, /* must wait for writeback before modifying 208 + folio contents */ 207 209 }; 208 210 209 211 /** ··· 289 287 static inline void mapping_clear_release_always(struct address_space *mapping) 290 288 { 291 289 clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); 290 + } 291 + 292 + static inline bool mapping_stable_writes(const struct address_space *mapping) 293 + { 294 + return test_bit(AS_STABLE_WRITES, &mapping->flags); 295 + } 296 + 297 + static inline void mapping_set_stable_writes(struct address_space *mapping) 298 + { 299 + set_bit(AS_STABLE_WRITES, &mapping->flags); 300 + } 301 + 302 + static inline void mapping_clear_stable_writes(struct address_space *mapping) 303 + { 304 + clear_bit(AS_STABLE_WRITES, &mapping->flags); 292 305 } 293 306 294 307 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
+3
include/uapi/linux/fcntl.h
··· 116 116 #define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to 117 117 compare object identity and may not 118 118 be usable to open_by_handle_at(2) */ 119 + #if defined(__KERNEL__) 120 + #define AT_GETATTR_NOSEC 0x80000000 121 + #endif 119 122 120 123 #endif /* _UAPI_LINUX_FCNTL_H */
+1 -1
lib/iov_iter.c
··· 409 409 void *kaddr = kmap_local_page(page); 410 410 size_t n = min(bytes, (size_t)PAGE_SIZE - offset); 411 411 412 - n = iterate_and_advance(i, bytes, kaddr, 412 + n = iterate_and_advance(i, n, kaddr + offset, 413 413 copy_to_user_iter_nofault, 414 414 memcpy_to_iter); 415 415 kunmap_local(kaddr);
+1 -1
mm/page-writeback.c
··· 3107 3107 */ 3108 3108 void folio_wait_stable(struct folio *folio) 3109 3109 { 3110 - if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) 3110 + if (mapping_stable_writes(folio_mapping(folio))) 3111 3111 folio_wait_writeback(folio); 3112 3112 } 3113 3113 EXPORT_SYMBOL_GPL(folio_wait_stable);