Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'ext4_for_linux-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:

- Refactor code paths involved with partial block zero-out in
preparation for converting ext4 to use iomap for buffered writes

- Remove use of d_alloc() from ext4 in preparation for the deprecation
of this interface

- Replace some J_ASSERTS with a journal abort so we can avoid a kernel
panic for a localized file system error

- Simplify various code paths in mballoc, move_extent, and fast commit

- Fix rare deadlock in jbd2_journal_cancel_revoke() that can be
triggered by generic/013 when blocksize < pagesize

- Fix memory leak when releasing an extended attribute when its value
is stored in an ea_inode

- Fix various potential kunit test bugs in fs/ext4/extents.c

- Fix potential out-of-bounds access in check_xattr() with a corrupted
file system

- Make the jbd2_inode dirty range tracking safe for lockless reads

- Avoid a WARN_ON when writing back files on a corrupted file system;
we already print an ext4 warning indicating that data will be lost,
so the WARN_ON is not necessary and doesn't add any new information

* tag 'ext4_for_linux-7.0-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (37 commits)
jbd2: fix deadlock in jbd2_journal_cancel_revoke()
ext4: fix missing brelse() in ext4_xattr_inode_dec_ref_all()
ext4: fix possible null-ptr-deref in mbt_kunit_exit()
ext4: fix possible null-ptr-deref in extents_kunit_exit()
ext4: fix the error handling process in extents_kunit_init()
ext4: call deactivate_super() in extents_kunit_exit()
ext4: fix miss unlock 'sb->s_umount' in extents_kunit_init()
ext4: fix bounds check in check_xattrs() to prevent out-of-bounds access
ext4: zero post-EOF partial block before appending write
ext4: move pagecache_isize_extended() out of active handle
ext4: remove ctime/mtime update from ext4_alloc_file_blocks()
ext4: unify SYNC mode checks in fallocate paths
ext4: ensure zeroed partial blocks are persisted in SYNC mode
ext4: move zero partial block range functions out of active handle
ext4: pass allocate range as loff_t to ext4_alloc_file_blocks()
ext4: remove handle parameters from zero partial block functions
ext4: move ordered data handling out of ext4_block_do_zero_range()
ext4: rename ext4_block_zero_page_range() to ext4_block_zero_range()
ext4: factor out journalled block zeroing range
ext4: rename and extend ext4_block_truncate_page()
...

+633 -389
+6 -8
fs/ext4/ext4.h
··· 28 28 #include <linux/seqlock.h> 29 29 #include <linux/mutex.h> 30 30 #include <linux/timer.h> 31 - #include <linux/wait.h> 32 31 #include <linux/sched/signal.h> 33 32 #include <linux/blockgroup_lock.h> 34 33 #include <linux/percpu_counter.h> ··· 1080 1081 ext4_lblk_t i_fc_lblk_len; 1081 1082 1082 1083 spinlock_t i_raw_lock; /* protects updates to the raw inode */ 1083 - 1084 - /* Fast commit wait queue for this inode */ 1085 - wait_queue_head_t i_fc_wait; 1086 1084 1087 1085 /* 1088 1086 * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len ··· 2972 2976 void __ext4_fc_track_link(handle_t *handle, struct inode *inode, 2973 2977 struct dentry *dentry); 2974 2978 void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); 2975 - void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); 2979 + void ext4_fc_track_link(handle_t *handle, struct inode *inode, 2980 + struct dentry *dentry); 2976 2981 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, 2977 2982 struct dentry *dentry); 2978 2983 void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); ··· 3098 3101 extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); 3099 3102 extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 3100 3103 int pextents); 3101 - extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 3102 - loff_t lstart, loff_t lend); 3104 + extern int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end); 3105 + extern int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, 3106 + loff_t length, bool *did_zero); 3103 3107 extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); 3104 3108 extern qsize_t *ext4_get_reserved_space(struct inode *inode); 3105 3109 extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); ··· 3719 3721 extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, 3720 3722 struct inode *inode, struct dentry *dentry); 3721 3723 extern int __ext4_link(struct inode *dir, struct inode *inode, 3722 - struct dentry *dentry); 3724 + const struct qstr *d_name, struct dentry *dentry); 3723 3725 3724 3726 #define S_SHIFT 12 3725 3727 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
+45 -19
fs/ext4/extents-test.c
··· 142 142 143 143 static void extents_kunit_exit(struct kunit *test) 144 144 { 145 - struct super_block *sb = k_ctx.k_ei->vfs_inode.i_sb; 146 - struct ext4_sb_info *sbi = sb->s_fs_info; 145 + struct ext4_sb_info *sbi; 147 146 147 + if (!k_ctx.k_ei) 148 + return; 149 + 150 + sbi = k_ctx.k_ei->vfs_inode.i_sb->s_fs_info; 148 151 ext4_es_unregister_shrinker(sbi); 152 + deactivate_super(sbi->s_sb); 149 153 kfree(sbi); 150 154 kfree(k_ctx.k_ei); 151 155 kfree(k_ctx.k_data); ··· 228 224 (struct kunit_ext_test_param *)(test->param_value); 229 225 int err; 230 226 231 - sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); 232 - if (IS_ERR(sb)) 233 - return PTR_ERR(sb); 234 - 235 - sb->s_blocksize = 4096; 236 - sb->s_blocksize_bits = 12; 237 - 238 227 sbi = kzalloc_obj(struct ext4_sb_info); 239 228 if (sbi == NULL) 240 229 return -ENOMEM; 241 230 231 + sb = sget(&ext_fs_type, NULL, ext_set, 0, NULL); 232 + if (IS_ERR(sb)) { 233 + kfree(sbi); 234 + return PTR_ERR(sb); 235 + } 236 + 242 237 sbi->s_sb = sb; 243 238 sb->s_fs_info = sbi; 239 + 240 + sb->s_blocksize = 4096; 241 + sb->s_blocksize_bits = 12; 244 242 245 243 if (!param || !param->disable_zeroout) 246 244 sbi->s_extent_max_zeroout_kb = 32; 247 245 248 - /* setup the mock inode */ 249 - k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); 250 - if (k_ctx.k_ei == NULL) 251 - return -ENOMEM; 252 - ei = k_ctx.k_ei; 253 - inode = &ei->vfs_inode; 254 - 255 246 err = ext4_es_register_shrinker(sbi); 256 247 if (err) 257 - return err; 248 + goto out_deactivate; 249 + 250 + /* setup the mock inode */ 251 + k_ctx.k_ei = kzalloc_obj(struct ext4_inode_info); 252 + if (k_ctx.k_ei == NULL) { 253 + err = -ENOMEM; 254 + goto out; 255 + } 256 + ei = k_ctx.k_ei; 257 + inode = &ei->vfs_inode; 258 258 259 259 ext4_es_init_tree(&ei->i_es_tree); 260 260 rwlock_init(&ei->i_es_lock); ··· 274 266 inode->i_sb = sb; 275 267 276 268 k_ctx.k_data = kzalloc(EXT_DATA_LEN * 4096, GFP_KERNEL); 277 - if (k_ctx.k_data == NULL) 278 - return -ENOMEM; 269 + if (k_ctx.k_data == NULL) { 270 + err = -ENOMEM; 271 + goto out; 272 + } 279 273 280 274 /* 281 275 * set the data area to a junk value ··· 319 309 kunit_activate_static_stub(test, ext4_ext_zeroout, ext4_ext_zeroout_stub); 320 310 kunit_activate_static_stub(test, ext4_issue_zeroout, 321 311 ext4_issue_zeroout_stub); 312 + up_write(&sb->s_umount); 313 + 322 314 return 0; 315 + 316 + out: 317 + kfree(k_ctx.k_ei); 318 + k_ctx.k_ei = NULL; 319 + 320 + kfree(k_ctx.k_data); 321 + k_ctx.k_data = NULL; 322 + 323 + ext4_es_unregister_shrinker(sbi); 324 + out_deactivate: 325 + deactivate_locked_super(sb); 326 + kfree(sbi); 327 + 328 + return err; 323 329 } 324 330 325 331 /*
+86 -77
fs/ext4/extents.c
··· 4571 4571 return err; 4572 4572 } 4573 4573 4574 - static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, 4575 - ext4_lblk_t len, loff_t new_size, 4576 - int flags) 4574 + static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len, 4575 + loff_t new_size, int flags) 4577 4576 { 4578 4577 struct inode *inode = file_inode(file); 4579 4578 handle_t *handle; 4580 4579 int ret = 0, ret2 = 0, ret3 = 0; 4581 4580 int retries = 0; 4582 4581 int depth = 0; 4582 + ext4_lblk_t len_lblk; 4583 4583 struct ext4_map_blocks map; 4584 4584 unsigned int credits; 4585 - loff_t epos, old_size = i_size_read(inode); 4585 + loff_t epos = 0, old_size = i_size_read(inode); 4586 4586 unsigned int blkbits = inode->i_blkbits; 4587 4587 bool alloc_zero = false; 4588 4588 4589 4589 BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)); 4590 - map.m_lblk = offset; 4591 - map.m_len = len; 4590 + map.m_lblk = offset >> blkbits; 4591 + map.m_len = len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); 4592 4592 /* 4593 4593 * Don't normalize the request if it can fit in one extent so 4594 4594 * that it doesn't get unnecessarily split into multiple 4595 4595 * extents. 4596 4596 */ 4597 - if (len <= EXT_UNWRITTEN_MAX_LEN) 4597 + if (len_lblk <= EXT_UNWRITTEN_MAX_LEN) 4598 4598 flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; 4599 4599 4600 4600 /* ··· 4611 4611 /* 4612 4612 * credits to insert 1 extent into extent tree 4613 4613 */ 4614 - credits = ext4_chunk_trans_blocks(inode, len); 4614 + credits = ext4_chunk_trans_blocks(inode, len_lblk); 4615 4615 depth = ext_depth(inode); 4616 4616 4617 + /* Zero to the end of the block containing i_size */ 4618 + if (new_size > old_size) { 4619 + ret = ext4_block_zero_eof(inode, old_size, LLONG_MAX); 4620 + if (ret) 4621 + return ret; 4622 + } 4623 + 4617 4624 retry: 4618 - while (len) { 4625 + while (len_lblk) { 4619 4626 /* 4620 4627 * Recalculate credits when extent tree depth changes. 4621 4628 */ 4622 4629 if (depth != ext_depth(inode)) { 4623 - credits = ext4_chunk_trans_blocks(inode, len); 4630 + credits = ext4_chunk_trans_blocks(inode, len_lblk); 4624 4631 depth = ext_depth(inode); 4625 4632 } 4626 4633 ··· 4647 4640 ext4_journal_stop(handle); 4648 4641 break; 4649 4642 } 4643 + ext4_update_inode_fsync_trans(handle, inode, 1); 4644 + ret = ext4_journal_stop(handle); 4645 + if (unlikely(ret)) 4646 + break; 4647 + 4650 4648 /* 4651 4649 * allow a full retry cycle for any remaining allocations 4652 4650 */ 4653 4651 retries = 0; 4654 - epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); 4655 - inode_set_ctime_current(inode); 4656 - if (new_size) { 4657 - if (epos > new_size) 4658 - epos = new_size; 4659 - if (ext4_update_inode_size(inode, epos) & 0x1) 4660 - inode_set_mtime_to_ts(inode, 4661 - inode_get_ctime(inode)); 4662 - if (epos > old_size) { 4663 - pagecache_isize_extended(inode, old_size, epos); 4664 - ext4_zero_partial_blocks(handle, inode, 4665 - old_size, epos - old_size); 4666 - } 4667 - } 4668 - ret2 = ext4_mark_inode_dirty(handle, inode); 4669 - ext4_update_inode_fsync_trans(handle, inode, 1); 4670 - ret3 = ext4_journal_stop(handle); 4671 - ret2 = ret3 ? 
ret3 : ret2; 4672 - if (unlikely(ret2)) 4673 - break; 4674 4652 4675 4653 if (alloc_zero && 4676 4654 (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) { 4677 - ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, 4678 - map.m_len); 4679 - if (likely(!ret2)) 4680 - ret2 = ext4_convert_unwritten_extents(NULL, 4655 + ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk, 4656 + map.m_len); 4657 + if (likely(!ret)) 4658 + ret = ext4_convert_unwritten_extents(NULL, 4681 4659 inode, (loff_t)map.m_lblk << blkbits, 4682 4660 (loff_t)map.m_len << blkbits); 4683 - if (ret2) 4661 + if (ret) 4684 4662 break; 4685 4663 } 4686 4664 4687 - map.m_lblk += ret; 4688 - map.m_len = len = len - ret; 4665 + map.m_lblk += map.m_len; 4666 + map.m_len = len_lblk = len_lblk - map.m_len; 4667 + epos = EXT4_LBLK_TO_B(inode, map.m_lblk); 4689 4668 } 4669 + 4690 4670 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 4691 4671 goto retry; 4692 4672 4693 - return ret > 0 ? ret2 : ret; 4673 + if (!epos || !new_size) 4674 + return ret; 4675 + 4676 + /* 4677 + * Allocate blocks, update the file size to match the size of the 4678 + * already successfully allocated blocks. 4679 + */ 4680 + if (epos > new_size) 4681 + epos = new_size; 4682 + 4683 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 4684 + if (IS_ERR(handle)) 4685 + return ret ? ret : PTR_ERR(handle); 4686 + 4687 + ext4_update_inode_size(inode, epos); 4688 + ret2 = ext4_mark_inode_dirty(handle, inode); 4689 + ext4_update_inode_fsync_trans(handle, inode, 1); 4690 + ret3 = ext4_journal_stop(handle); 4691 + ret2 = ret3 ? ret3 : ret2; 4692 + 4693 + if (epos > old_size) 4694 + pagecache_isize_extended(inode, old_size, epos); 4695 + 4696 + return ret ? ret : ret2; 4694 4697 } 4695 4698 4696 4699 static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len); ··· 4712 4695 { 4713 4696 struct inode *inode = file_inode(file); 4714 4697 handle_t *handle = NULL; 4715 - loff_t new_size = 0; 4698 + loff_t align_start, align_end, new_size = 0; 4716 4699 loff_t end = offset + len; 4717 - ext4_lblk_t start_lblk, end_lblk; 4718 4700 unsigned int blocksize = i_blocksize(inode); 4719 - unsigned int blkbits = inode->i_blkbits; 4720 - int ret, flags, credits; 4701 + bool partial_zeroed = false; 4702 + int ret, flags; 4721 4703 4722 4704 trace_ext4_zero_range(inode, offset, len, mode); 4723 4705 WARN_ON_ONCE(!inode_is_locked(inode)); ··· 4736 4720 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4737 4721 /* Preallocate the range including the unaligned edges */ 4738 4722 if (!IS_ALIGNED(offset | end, blocksize)) { 4739 - ext4_lblk_t alloc_lblk = offset >> blkbits; 4740 - ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); 4741 - 4742 - ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, 4743 - new_size, flags); 4723 + ret = ext4_alloc_file_blocks(file, offset, len, new_size, 4724 + flags); 4744 4725 if (ret) 4745 4726 return ret; 4746 4727 } ··· 4752 4739 return ret; 4753 4740 4754 4741 /* Zero range excluding the unaligned edges */ 4755 - start_lblk = EXT4_B_TO_LBLK(inode, offset); 4756 - end_lblk = end >> blkbits; 4757 - if (end_lblk > start_lblk) { 4758 - ext4_lblk_t zero_blks = end_lblk - start_lblk; 4759 - 4742 + align_start = round_up(offset, blocksize); 4743 + align_end = round_down(end, blocksize); 4744 + if (align_end > align_start) { 4760 4745 if (mode & FALLOC_FL_WRITE_ZEROES) 4761 4746 flags = EXT4_GET_BLOCKS_CREATE_ZERO | EXT4_EX_NOCACHE; 4762 4747 else 4763 4748 flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4764 
4749 EXT4_EX_NOCACHE); 4765 - ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, 4766 - new_size, flags); 4750 + ret = ext4_alloc_file_blocks(file, align_start, 4751 + align_end - align_start, new_size, 4752 + flags); 4767 4753 if (ret) 4768 4754 return ret; 4769 4755 } ··· 4770 4758 if (IS_ALIGNED(offset | end, blocksize)) 4771 4759 return ret; 4772 4760 4773 - /* 4774 - * In worst case we have to writeout two nonadjacent unwritten 4775 - * blocks and update the inode 4776 - */ 4777 - credits = (2 * ext4_ext_index_trans_blocks(inode, 2)) + 1; 4778 - if (ext4_should_journal_data(inode)) 4779 - credits += 2; 4780 - handle = ext4_journal_start(inode, EXT4_HT_MISC, credits); 4761 + /* Zero out partial block at the edges of the range */ 4762 + ret = ext4_zero_partial_blocks(inode, offset, len, &partial_zeroed); 4763 + if (ret) 4764 + return ret; 4765 + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { 4766 + ret = filemap_write_and_wait_range(inode->i_mapping, offset, 4767 + end - 1); 4768 + if (ret) 4769 + return ret; 4770 + } 4771 + 4772 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 4781 4773 if (IS_ERR(handle)) { 4782 4774 ret = PTR_ERR(handle); 4783 4775 ext4_std_error(inode->i_sb, ret); 4784 4776 return ret; 4785 4777 } 4786 - 4787 - /* Zero out partial block at the edges of the range */ 4788 - ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4789 - if (ret) 4790 - goto out_handle; 4791 4778 4792 4779 if (new_size) 4793 4780 ext4_update_inode_size(inode, new_size); ··· 4795 4784 goto out_handle; 4796 4785 4797 4786 ext4_update_inode_fsync_trans(handle, inode, 1); 4798 - if (file->f_flags & O_SYNC) 4787 + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) 4799 4788 ext4_handle_sync(handle); 4800 4789 4801 4790 out_handle: ··· 4809 4798 struct inode *inode = file_inode(file); 4810 4799 loff_t end = offset + len; 4811 4800 loff_t new_size = 0; 4812 - ext4_lblk_t start_lblk, len_lblk; 4813 4801 int ret; 4814 4802 4815 4803 trace_ext4_fallocate_enter(inode, offset, len, mode); 4816 4804 WARN_ON_ONCE(!inode_is_locked(inode)); 4817 - 4818 - start_lblk = offset >> inode->i_blkbits; 4819 - len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); 4820 4805 4821 4806 /* We only support preallocation for extent-based files only. 
*/ 4822 4807 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ··· 4828 4821 goto out; 4829 4822 } 4830 4823 4831 - ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, 4824 + ret = ext4_alloc_file_blocks(file, offset, len, new_size, 4832 4825 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); 4833 4826 if (ret) 4834 4827 goto out; 4835 4828 4836 - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4829 + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && 4830 + EXT4_SB(inode->i_sb)->s_journal) { 4837 4831 ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 4838 4832 EXT4_I(inode)->i_sync_tid); 4839 4833 } 4840 4834 out: 4841 - trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); 4835 + trace_ext4_fallocate_exit(inode, offset, 4836 + EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits), ret); 4842 4837 return ret; 4843 4838 } 4844 4839 ··· 5607 5598 goto out_handle; 5608 5599 5609 5600 ext4_update_inode_fsync_trans(handle, inode, 1); 5610 - if (IS_SYNC(inode)) 5601 + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) 5611 5602 ext4_handle_sync(handle); 5612 5603 5613 5604 out_handle: ··· 5731 5722 goto out_handle; 5732 5723 5733 5724 ext4_update_inode_fsync_trans(handle, inode, 1); 5734 - if (IS_SYNC(inode)) 5725 + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) 5735 5726 ext4_handle_sync(handle); 5736 5727 5737 5728 out_handle:
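The extents.c hunks above change ext4_alloc_file_blocks() to take a byte range (loff_t offset and len) and derive the logical-block count internally, and they hoist the i_size/pagecache update out of the per-extent allocation loop into a single transaction after the loop. As a rough illustration of the byte-to-block conversion the new code performs (this mirrors what the EXT4_MAX_BLOCKS() macro computes; it is not the kernel macro itself):

```c
/*
 * Illustration only: how a byte range [offset, offset + len) maps to the
 * logical block count ext4_alloc_file_blocks() now computes internally.
 * "blkbits" plays the role of inode->i_blkbits.
 */
static unsigned long long range_to_block_count(unsigned long long offset,
					       unsigned long long len,
					       unsigned int blkbits)
{
	unsigned long long first = offset >> blkbits;		/* first block touched */
	unsigned long long last = (offset + len + (1ULL << blkbits) - 1)
					>> blkbits;		/* round the end up */

	return last - first;	/* blocks covering the byte range */
}
```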
+24 -65
fs/ext4/fast_commit.c
··· 13 13 #include "mballoc.h" 14 14 15 15 #include <linux/lockdep.h> 16 + #include <linux/wait_bit.h> 16 17 /* 17 18 * Ext4 Fast Commits 18 19 * ----------------- ··· 216 215 ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); 217 216 INIT_LIST_HEAD(&ei->i_fc_list); 218 217 INIT_LIST_HEAD(&ei->i_fc_dilist); 219 - init_waitqueue_head(&ei->i_fc_wait); 220 218 } 221 219 222 220 static bool ext4_fc_disabled(struct super_block *sb) 223 221 { 224 222 return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || 225 223 (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); 224 + } 225 + 226 + static bool ext4_fc_eligible(struct super_block *sb) 227 + { 228 + return !ext4_fc_disabled(sb) && 229 + !(ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)); 226 230 } 227 231 228 232 /* ··· 326 320 if (ext4_fc_disabled(sb)) 327 321 return; 328 322 329 - if (handle && !IS_ERR(handle)) 323 + if (!IS_ERR_OR_NULL(handle)) 330 324 tid = handle->h_transaction->t_tid; 331 325 else { 332 326 read_lock(&sbi->s_journal->j_state_lock); ··· 479 473 { 480 474 struct inode *inode = d_inode(dentry); 481 475 482 - if (ext4_fc_disabled(inode->i_sb)) 483 - return; 484 - 485 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 486 - return; 487 - 488 - __ext4_fc_track_unlink(handle, inode, dentry); 476 + if (ext4_fc_eligible(inode->i_sb)) 477 + __ext4_fc_track_unlink(handle, inode, dentry); 489 478 } 490 479 491 480 void __ext4_fc_track_link(handle_t *handle, ··· 497 496 trace_ext4_fc_track_link(handle, inode, dentry, ret); 498 497 } 499 498 500 - void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) 499 + void ext4_fc_track_link(handle_t *handle, struct inode *inode, 500 + struct dentry *dentry) 501 501 { 502 - struct inode *inode = d_inode(dentry); 503 - 504 - if (ext4_fc_disabled(inode->i_sb)) 505 - return; 506 - 507 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 508 - return; 509 - 510 - __ext4_fc_track_link(handle, inode, dentry); 502 + if (ext4_fc_eligible(inode->i_sb)) 503 + __ext4_fc_track_link(handle, inode, dentry); 511 504 } 512 505 513 506 void __ext4_fc_track_create(handle_t *handle, struct inode *inode, ··· 522 527 { 523 528 struct inode *inode = d_inode(dentry); 524 529 525 - if (ext4_fc_disabled(inode->i_sb)) 526 - return; 527 - 528 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 529 - return; 530 - 531 - __ext4_fc_track_create(handle, inode, dentry); 530 + if (ext4_fc_eligible(inode->i_sb)) 531 + __ext4_fc_track_create(handle, inode, dentry); 532 532 } 533 533 534 534 /* __track_fn for inode tracking */ ··· 547 557 if (S_ISDIR(inode->i_mode)) 548 558 return; 549 559 550 - if (ext4_fc_disabled(inode->i_sb)) 551 - return; 552 - 553 560 if (ext4_should_journal_data(inode)) { 554 561 ext4_fc_mark_ineligible(inode->i_sb, 555 562 EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); 556 563 return; 557 564 } 558 565 559 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 566 + if (!ext4_fc_eligible(inode->i_sb)) 560 567 return; 561 568 562 569 /* ··· 631 644 if (S_ISDIR(inode->i_mode)) 632 645 return; 633 646 634 - if (ext4_fc_disabled(inode->i_sb)) 635 - return; 636 - 637 - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) 647 + if (!ext4_fc_eligible(inode->i_sb)) 638 648 return; 639 649 640 650 if (ext4_has_inline_data(inode)) { ··· 1430 1446 struct inode *inode) 1431 1447 { 1432 1448 struct inode *dir = NULL; 1433 - struct dentry *dentry_dir = NULL, *dentry_inode = NULL; 1434 1449 struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); 1435 1450 int 
ret = 0; 1436 1451 ··· 1440 1457 goto out; 1441 1458 } 1442 1459 1443 - dentry_dir = d_obtain_alias(dir); 1444 - if (IS_ERR(dentry_dir)) { 1445 - ext4_debug("Failed to obtain dentry"); 1446 - dentry_dir = NULL; 1447 - goto out; 1448 - } 1449 - 1450 - dentry_inode = d_alloc(dentry_dir, &qstr_dname); 1451 - if (!dentry_inode) { 1452 - ext4_debug("Inode dentry not created."); 1453 - ret = -ENOMEM; 1454 - goto out; 1455 - } 1456 - 1457 - ret = __ext4_link(dir, inode, dentry_inode); 1460 + ret = __ext4_link(dir, inode, &qstr_dname, NULL); 1458 1461 /* 1459 1462 * It's possible that link already existed since data blocks 1460 1463 * for the dir in question got persisted before we crashed OR ··· 1454 1485 1455 1486 ret = 0; 1456 1487 out: 1457 - if (dentry_dir) { 1458 - d_drop(dentry_dir); 1459 - dput(dentry_dir); 1460 - } else if (dir) { 1488 + if (dir) 1461 1489 iput(dir); 1462 - } 1463 - if (dentry_inode) { 1464 - d_drop(dentry_inode); 1465 - dput(dentry_inode); 1466 - } 1467 1490 1468 1491 return ret; 1469 1492 } ··· 1720 1759 } 1721 1760 1722 1761 /* Replay add range tag */ 1723 - static int ext4_fc_replay_add_range(struct super_block *sb, 1724 - struct ext4_fc_tl_mem *tl, u8 *val) 1762 + static int ext4_fc_replay_add_range(struct super_block *sb, u8 *val) 1725 1763 { 1726 1764 struct ext4_fc_add_range fc_add_ex; 1727 1765 struct ext4_extent newex, *ex; ··· 1840 1880 1841 1881 /* Replay DEL_RANGE tag */ 1842 1882 static int 1843 - ext4_fc_replay_del_range(struct super_block *sb, 1844 - struct ext4_fc_tl_mem *tl, u8 *val) 1883 + ext4_fc_replay_del_range(struct super_block *sb, u8 *val) 1845 1884 { 1846 1885 struct inode *inode; 1847 1886 struct ext4_fc_del_range lrange; ··· 2210 2251 ret = ext4_fc_replay_unlink(sb, &tl, val); 2211 2252 break; 2212 2253 case EXT4_FC_TAG_ADD_RANGE: 2213 - ret = ext4_fc_replay_add_range(sb, &tl, val); 2254 + ret = ext4_fc_replay_add_range(sb, val); 2214 2255 break; 2215 2256 case EXT4_FC_TAG_CREAT: 2216 2257 ret = ext4_fc_replay_create(sb, &tl, val); 2217 2258 break; 2218 2259 case EXT4_FC_TAG_DEL_RANGE: 2219 - ret = ext4_fc_replay_del_range(sb, &tl, val); 2260 + ret = ext4_fc_replay_del_range(sb, val); 2220 2261 break; 2221 2262 case EXT4_FC_TAG_INODE: 2222 2263 ret = ext4_fc_replay_inode(sb, &tl, val);
+17
fs/ext4/file.c
··· 270 270 271 271 static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) 272 272 { 273 + struct inode *inode = file_inode(iocb->ki_filp); 274 + loff_t old_size = i_size_read(inode); 273 275 ssize_t ret, count; 274 276 275 277 count = ext4_generic_write_checks(iocb, from); ··· 281 279 ret = file_modified(iocb->ki_filp); 282 280 if (ret) 283 281 return ret; 282 + 283 + /* 284 + * If the position is beyond the EOF, it is necessary to zero out the 285 + * partial block that beyond the existing EOF, as it may contains 286 + * stale data written through mmap. 287 + */ 288 + if (iocb->ki_pos > old_size && !ext4_verity_in_progress(inode)) { 289 + if (iocb->ki_flags & IOCB_NOWAIT) 290 + return -EAGAIN; 291 + 292 + ret = ext4_block_zero_eof(inode, old_size, iocb->ki_pos); 293 + if (ret) 294 + return ret; 295 + } 296 + 284 297 return count; 285 298 } 286 299
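The file.c hunk makes buffered writes that start beyond EOF zero the tail block past i_size first, because that block can hold stale bytes written through a shared mapping. A hedged userspace sketch of the scenario this guards against (illustrative only, not a test case from this series; assumes a 4KiB block size and omits error handling):

```c
/* Illustrative reproducer sketch for the stale post-EOF data problem the
 * ext4_write_checks() hunk guards against.  Assumes 4KiB blocks; not a
 * test taken from this series, and error checking is omitted. */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char *p;

	pwrite(fd, "A", 1, 0);			/* i_size is now 1 */
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	memset(p + 1, 'X', 100);		/* dirty bytes beyond EOF, same block */
	munmap(p, 4096);

	pwrite(fd, "B", 1, 8192);		/* append write far past EOF */
	/* Without zeroing [1, 4096) before extending i_size, the 'X' bytes
	 * could surface as file data once the write above lands. */
	close(fd);
	return 0;
}
```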
+199 -109
fs/ext4/inode.c
··· 1468 1468 folio_unlock(folio); 1469 1469 folio_put(folio); 1470 1470 1471 - if (old_size < pos && !verity) { 1471 + if (old_size < pos && !verity) 1472 1472 pagecache_isize_extended(inode, old_size, pos); 1473 - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); 1474 - } 1473 + 1475 1474 /* 1476 1475 * Don't mark the inode dirty under folio lock. First, it unnecessarily 1477 1476 * makes the holding time of folio lock longer. Second, it forces lock ··· 1585 1586 folio_unlock(folio); 1586 1587 folio_put(folio); 1587 1588 1588 - if (old_size < pos && !verity) { 1589 + if (old_size < pos && !verity) 1589 1590 pagecache_isize_extended(inode, old_size, pos); 1590 - ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size); 1591 - } 1592 1591 1593 1592 if (size_changed) { 1594 1593 ret2 = ext4_mark_inode_dirty(handle, inode); ··· 1756 1759 BUG_ON(!folio_test_locked(folio)); 1757 1760 BUG_ON(folio_test_writeback(folio)); 1758 1761 if (invalidate) { 1759 - if (folio_mapped(folio)) 1762 + if (folio_mapped(folio)) { 1760 1763 folio_clear_dirty_for_io(folio); 1764 + /* 1765 + * Unmap folio from page 1766 + * tables to prevent 1767 + * subsequent accesses through 1768 + * stale PTEs. This ensures 1769 + * future accesses trigger new 1770 + * page faults rather than 1771 + * reusing the invalidated 1772 + * folio. 1773 + */ 1774 + unmap_mapping_pages(folio->mapping, 1775 + folio->index, 1776 + folio_nr_pages(folio), false); 1777 + } 1761 1778 block_invalidate_folio(folio, 0, 1762 1779 folio_size(folio)); 1763 1780 folio_clear_uptodate(folio); ··· 3054 3043 3055 3044 int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) 3056 3045 { 3046 + loff_t range_start, range_end; 3057 3047 struct writeback_control wbc = { 3058 3048 .sync_mode = WB_SYNC_ALL, 3059 3049 .nr_to_write = LONG_MAX, 3060 - .range_start = jinode->i_dirty_start, 3061 - .range_end = jinode->i_dirty_end, 3062 3050 }; 3063 3051 struct mpage_da_data mpd = { 3064 3052 .inode = jinode->i_vfs_inode, 3065 3053 .wbc = &wbc, 3066 3054 .can_map = 0, 3067 3055 }; 3056 + 3057 + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) 3058 + return 0; 3059 + 3060 + wbc.range_start = range_start; 3061 + wbc.range_end = range_end; 3062 + 3068 3063 return ext4_do_writepages(&mpd); 3069 3064 } 3070 3065 ··· 3225 3208 struct inode *inode = mapping->host; 3226 3209 loff_t old_size = inode->i_size; 3227 3210 bool disksize_changed = false; 3228 - loff_t new_i_size, zero_len = 0; 3211 + loff_t new_i_size; 3229 3212 handle_t *handle; 3230 3213 3231 3214 if (unlikely(!folio_buffers(folio))) { ··· 3269 3252 folio_unlock(folio); 3270 3253 folio_put(folio); 3271 3254 3272 - if (pos > old_size) { 3255 + if (pos > old_size) 3273 3256 pagecache_isize_extended(inode, old_size, pos); 3274 - zero_len = pos - old_size; 3275 - } 3276 3257 3277 - if (!disksize_changed && !zero_len) 3258 + if (!disksize_changed) 3278 3259 return copied; 3279 3260 3280 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 3261 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 3281 3262 if (IS_ERR(handle)) 3282 3263 return PTR_ERR(handle); 3283 - if (zero_len) 3284 - ext4_zero_partial_blocks(handle, inode, old_size, zero_len); 3285 3264 ext4_mark_inode_dirty(handle, inode); 3286 3265 ext4_journal_stop(handle); 3287 3266 ··· 4027 4014 * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a 4028 4015 * racing writeback can come later and flush the stale pagecache to disk. 
4029 4016 */ 4030 - static int __ext4_block_zero_page_range(handle_t *handle, 4031 - struct address_space *mapping, loff_t from, loff_t length) 4017 + static struct buffer_head *ext4_load_tail_bh(struct inode *inode, loff_t from) 4032 4018 { 4033 4019 unsigned int offset, blocksize, pos; 4034 4020 ext4_lblk_t iblock; 4035 - struct inode *inode = mapping->host; 4021 + struct address_space *mapping = inode->i_mapping; 4036 4022 struct buffer_head *bh; 4037 4023 struct folio *folio; 4038 4024 int err = 0; ··· 4040 4028 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, 4041 4029 mapping_gfp_constraint(mapping, ~__GFP_FS)); 4042 4030 if (IS_ERR(folio)) 4043 - return PTR_ERR(folio); 4031 + return ERR_CAST(folio); 4044 4032 4045 4033 blocksize = inode->i_sb->s_blocksize; 4046 4034 ··· 4092 4080 } 4093 4081 } 4094 4082 } 4095 - if (ext4_should_journal_data(inode)) { 4096 - BUFFER_TRACE(bh, "get write access"); 4097 - err = ext4_journal_get_write_access(handle, inode->i_sb, bh, 4098 - EXT4_JTR_NONE); 4099 - if (err) 4100 - goto unlock; 4101 - } 4102 - folio_zero_range(folio, offset, length); 4103 - BUFFER_TRACE(bh, "zeroed end of block"); 4104 - 4105 - if (ext4_should_journal_data(inode)) { 4106 - err = ext4_dirty_journalled_data(handle, bh); 4107 - } else { 4108 - mark_buffer_dirty(bh); 4109 - /* 4110 - * Only the written block requires ordered data to prevent 4111 - * exposing stale data. 4112 - */ 4113 - if (!buffer_unwritten(bh) && !buffer_delay(bh) && 4114 - ext4_should_order_data(inode)) 4115 - err = ext4_jbd2_inode_add_write(handle, inode, from, 4116 - length); 4117 - } 4083 + return bh; 4118 4084 4119 4085 unlock: 4120 4086 folio_unlock(folio); 4121 4087 folio_put(folio); 4088 + return err ? ERR_PTR(err) : NULL; 4089 + } 4090 + 4091 + static int ext4_block_do_zero_range(struct inode *inode, loff_t from, 4092 + loff_t length, bool *did_zero, 4093 + bool *zero_written) 4094 + { 4095 + struct buffer_head *bh; 4096 + struct folio *folio; 4097 + 4098 + bh = ext4_load_tail_bh(inode, from); 4099 + if (IS_ERR_OR_NULL(bh)) 4100 + return PTR_ERR_OR_ZERO(bh); 4101 + 4102 + folio = bh->b_folio; 4103 + folio_zero_range(folio, offset_in_folio(folio, from), length); 4104 + BUFFER_TRACE(bh, "zeroed end of block"); 4105 + 4106 + mark_buffer_dirty(bh); 4107 + if (did_zero) 4108 + *did_zero = true; 4109 + if (zero_written && !buffer_unwritten(bh) && !buffer_delay(bh)) 4110 + *zero_written = true; 4111 + 4112 + folio_unlock(folio); 4113 + folio_put(folio); 4114 + return 0; 4115 + } 4116 + 4117 + static int ext4_block_journalled_zero_range(struct inode *inode, loff_t from, 4118 + loff_t length, bool *did_zero) 4119 + { 4120 + struct buffer_head *bh; 4121 + struct folio *folio; 4122 + handle_t *handle; 4123 + int err; 4124 + 4125 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 4126 + if (IS_ERR(handle)) 4127 + return PTR_ERR(handle); 4128 + 4129 + bh = ext4_load_tail_bh(inode, from); 4130 + if (IS_ERR_OR_NULL(bh)) { 4131 + err = PTR_ERR_OR_ZERO(bh); 4132 + goto out_handle; 4133 + } 4134 + folio = bh->b_folio; 4135 + 4136 + BUFFER_TRACE(bh, "get write access"); 4137 + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, 4138 + EXT4_JTR_NONE); 4139 + if (err) 4140 + goto out; 4141 + 4142 + folio_zero_range(folio, offset_in_folio(folio, from), length); 4143 + BUFFER_TRACE(bh, "zeroed end of block"); 4144 + 4145 + err = ext4_dirty_journalled_data(handle, bh); 4146 + if (err) 4147 + goto out; 4148 + 4149 + if (did_zero) 4150 + *did_zero = true; 4151 + out: 4152 + folio_unlock(folio); 4153 + folio_put(folio); 4154 + 
out_handle: 4155 + ext4_journal_stop(handle); 4122 4156 return err; 4123 4157 } 4124 4158 4125 4159 /* 4126 - * ext4_block_zero_page_range() zeros out a mapping of length 'length' 4127 - * starting from file offset 'from'. The range to be zero'd must 4128 - * be contained with in one block. If the specified range exceeds 4129 - * the end of the block it will be shortened to end of the block 4130 - * that corresponds to 'from' 4160 + * Zeros out a mapping of length 'length' starting from file offset 4161 + * 'from'. The range to be zero'd must be contained with in one block. 4162 + * If the specified range exceeds the end of the block it will be 4163 + * shortened to end of the block that corresponds to 'from'. 4131 4164 */ 4132 - static int ext4_block_zero_page_range(handle_t *handle, 4133 - struct address_space *mapping, loff_t from, loff_t length) 4165 + static int ext4_block_zero_range(struct inode *inode, 4166 + loff_t from, loff_t length, bool *did_zero, 4167 + bool *zero_written) 4134 4168 { 4135 - struct inode *inode = mapping->host; 4136 4169 unsigned blocksize = inode->i_sb->s_blocksize; 4137 4170 unsigned int max = blocksize - (from & (blocksize - 1)); 4138 4171 ··· 4189 4132 length = max; 4190 4133 4191 4134 if (IS_DAX(inode)) { 4192 - return dax_zero_range(inode, from, length, NULL, 4135 + return dax_zero_range(inode, from, length, did_zero, 4193 4136 &ext4_iomap_ops); 4137 + } else if (ext4_should_journal_data(inode)) { 4138 + return ext4_block_journalled_zero_range(inode, from, length, 4139 + did_zero); 4194 4140 } 4195 - return __ext4_block_zero_page_range(handle, mapping, from, length); 4141 + return ext4_block_do_zero_range(inode, from, length, did_zero, 4142 + zero_written); 4196 4143 } 4197 4144 4198 4145 /* 4199 - * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 4200 - * up to the end of the block which corresponds to `from'. 4201 - * This required during truncate. We need to physically zero the tail end 4202 - * of that block so it doesn't yield old data if the file is later grown. 4146 + * Zero out a mapping from file offset 'from' up to the end of the block 4147 + * which corresponds to 'from' or to the given 'end' inside this block. 4148 + * This required during truncate up and performing append writes. We need 4149 + * to physically zero the tail end of that block so it doesn't yield old 4150 + * data if the file is grown. 
4203 4151 */ 4204 - static int ext4_block_truncate_page(handle_t *handle, 4205 - struct address_space *mapping, loff_t from) 4152 + int ext4_block_zero_eof(struct inode *inode, loff_t from, loff_t end) 4206 4153 { 4207 - unsigned length; 4208 - unsigned blocksize; 4209 - struct inode *inode = mapping->host; 4154 + unsigned int blocksize = i_blocksize(inode); 4155 + unsigned int offset; 4156 + loff_t length = end - from; 4157 + bool did_zero = false; 4158 + bool zero_written = false; 4159 + int err; 4210 4160 4161 + offset = from & (blocksize - 1); 4162 + if (!offset || from >= end) 4163 + return 0; 4211 4164 /* If we are processing an encrypted inode during orphan list handling */ 4212 4165 if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) 4213 4166 return 0; 4214 4167 4215 - blocksize = i_blocksize(inode); 4216 - length = blocksize - (from & (blocksize - 1)); 4168 + if (length > blocksize - offset) 4169 + length = blocksize - offset; 4217 4170 4218 - return ext4_block_zero_page_range(handle, mapping, from, length); 4171 + err = ext4_block_zero_range(inode, from, length, 4172 + &did_zero, &zero_written); 4173 + if (err) 4174 + return err; 4175 + /* 4176 + * It's necessary to order zeroed data before update i_disksize when 4177 + * truncating up or performing an append write, because there might be 4178 + * exposing stale on-disk data which may caused by concurrent post-EOF 4179 + * mmap write during folio writeback. 4180 + */ 4181 + if (ext4_should_order_data(inode) && 4182 + did_zero && zero_written && !IS_DAX(inode)) { 4183 + handle_t *handle; 4184 + 4185 + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); 4186 + if (IS_ERR(handle)) 4187 + return PTR_ERR(handle); 4188 + 4189 + err = ext4_jbd2_inode_add_write(handle, inode, from, length); 4190 + ext4_journal_stop(handle); 4191 + if (err) 4192 + return err; 4193 + } 4194 + 4195 + return 0; 4219 4196 } 4220 4197 4221 - int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, 4222 - loff_t lstart, loff_t length) 4198 + int ext4_zero_partial_blocks(struct inode *inode, loff_t lstart, loff_t length, 4199 + bool *did_zero) 4223 4200 { 4224 4201 struct super_block *sb = inode->i_sb; 4225 - struct address_space *mapping = inode->i_mapping; 4226 4202 unsigned partial_start, partial_end; 4227 4203 ext4_fsblk_t start, end; 4228 4204 loff_t byte_end = (lstart + length - 1); ··· 4270 4180 /* Handle partial zero within the single block */ 4271 4181 if (start == end && 4272 4182 (partial_start || (partial_end != sb->s_blocksize - 1))) { 4273 - err = ext4_block_zero_page_range(handle, mapping, 4274 - lstart, length); 4183 + err = ext4_block_zero_range(inode, lstart, length, did_zero, 4184 + NULL); 4275 4185 return err; 4276 4186 } 4277 4187 /* Handle partial zero out on the start of the range */ 4278 4188 if (partial_start) { 4279 - err = ext4_block_zero_page_range(handle, mapping, 4280 - lstart, sb->s_blocksize); 4189 + err = ext4_block_zero_range(inode, lstart, sb->s_blocksize, 4190 + did_zero, NULL); 4281 4191 if (err) 4282 4192 return err; 4283 4193 } 4284 4194 /* Handle partial zero out on the end of the range */ 4285 4195 if (partial_end != sb->s_blocksize - 1) 4286 - err = ext4_block_zero_page_range(handle, mapping, 4287 - byte_end - partial_end, 4288 - partial_end + 1); 4196 + err = ext4_block_zero_range(inode, byte_end - partial_end, 4197 + partial_end + 1, did_zero, NULL); 4289 4198 return err; 4290 4199 } 4291 4200 ··· 4433 4344 loff_t end = offset + length; 4434 4345 handle_t *handle; 4435 4346 unsigned int credits; 
4347 + bool partial_zeroed = false; 4436 4348 int ret; 4437 4349 4438 4350 trace_ext4_punch_hole(inode, offset, length, 0); ··· 4460 4370 end = max_end; 4461 4371 length = end - offset; 4462 4372 4463 - /* 4464 - * Attach jinode to inode for jbd2 if we do any zeroing of partial 4465 - * block. 4466 - */ 4467 - if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { 4468 - ret = ext4_inode_attach_jinode(inode); 4469 - if (ret < 0) 4470 - return ret; 4471 - } 4472 - 4473 - 4474 4373 ret = ext4_update_disksize_before_punch(inode, offset, length); 4475 4374 if (ret) 4476 4375 return ret; ··· 4469 4390 if (ret) 4470 4391 return ret; 4471 4392 4393 + ret = ext4_zero_partial_blocks(inode, offset, length, &partial_zeroed); 4394 + if (ret) 4395 + return ret; 4396 + if (((file->f_flags & O_SYNC) || IS_SYNC(inode)) && partial_zeroed) { 4397 + ret = filemap_write_and_wait_range(inode->i_mapping, offset, 4398 + end - 1); 4399 + if (ret) 4400 + return ret; 4401 + } 4402 + 4472 4403 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4473 - credits = ext4_chunk_trans_extent(inode, 2); 4404 + credits = ext4_chunk_trans_extent(inode, 0); 4474 4405 else 4475 4406 credits = ext4_blocks_for_truncate(inode); 4476 4407 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); ··· 4489 4400 ext4_std_error(sb, ret); 4490 4401 return ret; 4491 4402 } 4492 - 4493 - ret = ext4_zero_partial_blocks(handle, inode, offset, length); 4494 - if (ret) 4495 - goto out_handle; 4496 4403 4497 4404 /* If there are blocks to remove, do it */ 4498 4405 start_lblk = EXT4_B_TO_LBLK(inode, offset); ··· 4526 4441 goto out_handle; 4527 4442 4528 4443 ext4_update_inode_fsync_trans(handle, inode, 1); 4529 - if (IS_SYNC(inode)) 4444 + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) 4530 4445 ext4_handle_sync(handle); 4531 4446 out_handle: 4532 4447 ext4_journal_stop(handle); ··· 4597 4512 unsigned int credits; 4598 4513 int err = 0, err2; 4599 4514 handle_t *handle; 4600 - struct address_space *mapping = inode->i_mapping; 4601 4515 4602 4516 /* 4603 4517 * There is a possibility that we're either freeing the inode ··· 4626 4542 err = ext4_inode_attach_jinode(inode); 4627 4543 if (err) 4628 4544 goto out_trace; 4545 + 4546 + /* Zero to the end of the block containing i_size */ 4547 + err = ext4_block_zero_eof(inode, inode->i_size, LLONG_MAX); 4548 + if (err) 4549 + goto out_trace; 4629 4550 } 4630 4551 4631 4552 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ··· 4643 4554 err = PTR_ERR(handle); 4644 4555 goto out_trace; 4645 4556 } 4646 - 4647 - if (inode->i_size & (inode->i_sb->s_blocksize - 1)) 4648 - ext4_block_truncate_page(handle, mapping, inode->i_size); 4649 4557 4650 4558 /* 4651 4559 * We add the inode to the orphan list, so that if this ··· 6013 5927 goto out_mmap_sem; 6014 5928 } 6015 5929 6016 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 6017 - if (IS_ERR(handle)) { 6018 - error = PTR_ERR(handle); 6019 - goto out_mmap_sem; 6020 - } 6021 - if (ext4_handle_valid(handle) && shrink) { 6022 - error = ext4_orphan_add(handle, inode); 6023 - orphan = 1; 6024 - } 6025 5930 /* 6026 5931 * Update c/mtime and tail zero the EOF folio on 6027 5932 * truncate up. 
ext4_truncate() handles the shrink case ··· 6021 5944 if (!shrink) { 6022 5945 inode_set_mtime_to_ts(inode, 6023 5946 inode_set_ctime_current(inode)); 6024 - if (oldsize & (inode->i_sb->s_blocksize - 1)) 6025 - ext4_block_truncate_page(handle, 6026 - inode->i_mapping, oldsize); 5947 + if (oldsize & (inode->i_sb->s_blocksize - 1)) { 5948 + error = ext4_block_zero_eof(inode, 5949 + oldsize, LLONG_MAX); 5950 + if (error) 5951 + goto out_mmap_sem; 5952 + } 5953 + } 5954 + 5955 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); 5956 + if (IS_ERR(handle)) { 5957 + error = PTR_ERR(handle); 5958 + goto out_mmap_sem; 5959 + } 5960 + if (ext4_handle_valid(handle) && shrink) { 5961 + error = ext4_orphan_add(handle, inode); 5962 + orphan = 1; 6027 5963 } 6028 5964 6029 5965 if (shrink)
+5 -1
fs/ext4/mballoc-test.c
··· 362 362 return ret; 363 363 } 364 364 365 - test->priv = sb; 366 365 kunit_activate_static_stub(test, 367 366 ext4_read_block_bitmap_nowait, 368 367 ext4_read_block_bitmap_nowait_stub); ··· 382 383 return -ENOMEM; 383 384 } 384 385 386 + test->priv = sb; 387 + 385 388 return 0; 386 389 } 387 390 388 391 static void mbt_kunit_exit(struct kunit *test) 389 392 { 390 393 struct super_block *sb = (struct super_block *)test->priv; 394 + 395 + if (!sb) 396 + return; 391 397 392 398 mbt_mb_release(sb); 393 399 mbt_ctx_release(sb);
+10 -16
fs/ext4/mballoc.c
··· 2876 2876 EXT4_MB_GRP_NEED_INIT(grp) && 2877 2877 ext4_free_group_clusters(sb, gdp) > 0 ) { 2878 2878 bh = ext4_read_block_bitmap_nowait(sb, group, true); 2879 - if (bh && !IS_ERR(bh)) { 2879 + if (!IS_ERR_OR_NULL(bh)) { 2880 2880 if (!buffer_uptodate(bh) && cnt) 2881 2881 (*cnt)++; 2882 2882 brelse(bh); ··· 4561 4561 (req <= (size) || max <= (chunk_size)) 4562 4562 4563 4563 /* first, try to predict filesize */ 4564 - /* XXX: should this table be tunable? */ 4565 4564 start_off = 0; 4566 - if (size <= 16 * 1024) { 4567 - size = 16 * 1024; 4568 - } else if (size <= 32 * 1024) { 4569 - size = 32 * 1024; 4570 - } else if (size <= 64 * 1024) { 4571 - size = 64 * 1024; 4572 - } else if (size <= 128 * 1024) { 4573 - size = 128 * 1024; 4574 - } else if (size <= 256 * 1024) { 4575 - size = 256 * 1024; 4576 - } else if (size <= 512 * 1024) { 4577 - size = 512 * 1024; 4578 - } else if (size <= 1024 * 1024) { 4579 - size = 1024 * 1024; 4565 + if (size <= SZ_1M) { 4566 + /* 4567 + * For files up to 1MB, round up the preallocation size to 4568 + * the next power of two, with a minimum of 16KB. 4569 + */ 4570 + if (size <= (unsigned long)SZ_16K) 4571 + size = SZ_16K; 4572 + else 4573 + size = roundup_pow_of_two(size); 4580 4574 } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { 4581 4575 start_off = ((loff_t)ac->ac_o_ex.fe_logical >> 4582 4576 (21 - bsbits)) << 21;
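The mballoc.c hunk replaces the hard-coded size ladder for small preallocations with a single rule: requests up to 1MiB are rounded up to the next power of two, with a 16KiB floor (the kernel code uses roundup_pow_of_two() for this). A minimal sketch of the resulting sizing, outside the kernel:

```c
/*
 * Minimal sketch of the new small-request preallocation sizing: round the
 * request up to the next power of two, never below 16KiB.  The kernel hunk
 * uses roundup_pow_of_two(); this loop is just an illustration.
 */
static unsigned long small_prealloc_size(unsigned long size)
{
	unsigned long p = 16 * 1024;		/* 16KiB floor */

	while (p < size)
		p <<= 1;			/* 40KiB -> 64KiB, 300KiB -> 512KiB, ... */
	return p;				/* only used for size <= 1MiB */
}
```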
+2 -2
fs/ext4/move_extent.c
··· 224 224 } 225 225 226 226 /* Adjust the moving length according to the length of shorter folio. */ 227 - move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, 228 - folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); 227 + move_len = umin(folio_next_pos(folio[0]) - orig_pos, 228 + folio_next_pos(folio[1]) - donor_pos); 229 229 move_len >>= blkbits; 230 230 if (move_len < mext->orig_map.m_len) 231 231 mext->orig_map.m_len = move_len;
+29 -19
fs/ext4/namei.c
··· 647 647 /* Directory is not encrypted */ 648 648 (void) ext4fs_dirhash(dir, de->name, 649 649 de->name_len, &h); 650 - printk("%*.s:(U)%x.%u ", len, 650 + printk("%.*s:(U)%x.%u ", len, 651 651 name, h.hash, 652 652 (unsigned) ((char *) de 653 653 - base)); ··· 683 683 (void) ext4fs_dirhash(dir, 684 684 de->name, 685 685 de->name_len, &h); 686 - printk("%*.s:(E)%x.%u ", len, name, 686 + printk("%.*s:(E)%x.%u ", len, name, 687 687 h.hash, (unsigned) ((char *) de 688 688 - base)); 689 689 fscrypt_fname_free_buffer( ··· 694 694 char *name = de->name; 695 695 (void) ext4fs_dirhash(dir, de->name, 696 696 de->name_len, &h); 697 - printk("%*.s:%x.%u ", len, name, h.hash, 697 + printk("%.*s:%x.%u ", len, name, h.hash, 698 698 (unsigned) ((char *) de - base)); 699 699 #endif 700 700 } ··· 723 723 struct stats stats; 724 724 printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); 725 725 bh = ext4_bread(NULL,dir, block, 0); 726 - if (!bh || IS_ERR(bh)) 726 + if (IS_ERR_OR_NULL(bh)) 727 727 continue; 728 728 stats = levels? 729 729 dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): ··· 2353 2353 * may not sleep between calling this and putting something into 2354 2354 * the entry, as someone else might have used it while you slept. 2355 2355 */ 2356 - static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 2356 + static int __ext4_add_entry(handle_t *handle, struct inode *dir, 2357 + const struct qstr *d_name, 2357 2358 struct inode *inode) 2358 2359 { 2359 - struct inode *dir = d_inode(dentry->d_parent); 2360 2360 struct buffer_head *bh = NULL; 2361 2361 struct ext4_dir_entry_2 *de; 2362 2362 struct super_block *sb; ··· 2373 2373 sb = dir->i_sb; 2374 2374 blocksize = sb->s_blocksize; 2375 2375 2376 - if (fscrypt_is_nokey_name(dentry)) 2377 - return -ENOKEY; 2378 - 2379 - if (!generic_ci_validate_strict_name(dir, &dentry->d_name)) 2376 + if (!generic_ci_validate_strict_name(dir, d_name)) 2380 2377 return -EINVAL; 2381 2378 2382 - retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); 2379 + retval = ext4_fname_setup_filename(dir, d_name, 0, &fname); 2383 2380 if (retval) 2384 2381 return retval; 2385 2382 ··· 2455 2458 if (retval == 0) 2456 2459 ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); 2457 2460 return retval; 2461 + } 2462 + 2463 + static int ext4_add_entry(handle_t *handle, struct dentry *dentry, 2464 + struct inode *inode) 2465 + { 2466 + struct inode *dir = d_inode(dentry->d_parent); 2467 + 2468 + if (fscrypt_is_nokey_name(dentry)) 2469 + return -ENOKEY; 2470 + return __ext4_add_entry(handle, dir, &dentry->d_name, inode); 2458 2471 } 2459 2472 2460 2473 /* ··· 3452 3445 return err; 3453 3446 } 3454 3447 3455 - int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry) 3448 + int __ext4_link(struct inode *dir, struct inode *inode, 3449 + const struct qstr *d_name, struct dentry *dentry) 3456 3450 { 3457 3451 handle_t *handle; 3458 3452 int err, retries = 0; ··· 3469 3461 3470 3462 inode_set_ctime_current(inode); 3471 3463 ext4_inc_count(inode); 3472 - ihold(inode); 3473 3464 3474 - err = ext4_add_entry(handle, dentry, inode); 3465 + err = __ext4_add_entry(handle, dir, d_name, inode); 3475 3466 if (!err) { 3476 3467 err = ext4_mark_inode_dirty(handle, inode); 3477 3468 /* this can happen only for tmpfile being ··· 3478 3471 */ 3479 3472 if (inode->i_nlink == 1) 3480 3473 ext4_orphan_del(handle, inode); 3481 - d_instantiate(dentry, inode); 3482 - ext4_fc_track_link(handle, dentry); 3474 + if 
(dentry) 3475 + ext4_fc_track_link(handle, inode, dentry); 3483 3476 } else { 3484 3477 drop_nlink(inode); 3485 - iput(inode); 3486 3478 } 3487 3479 ext4_journal_stop(handle); 3488 3480 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) ··· 3510 3504 err = dquot_initialize(dir); 3511 3505 if (err) 3512 3506 return err; 3513 - return __ext4_link(dir, inode, dentry); 3507 + err = __ext4_link(dir, inode, &dentry->d_name, dentry); 3508 + if (!err) { 3509 + ihold(inode); 3510 + d_instantiate(dentry, inode); 3511 + } 3512 + return err; 3514 3513 } 3515 - 3516 3514 /* 3517 3515 * Try to find buffer head where contains the parent block. 3518 3516 * It should be the inode block if it is inlined or the 1st block
+11 -5
fs/ext4/super.c
··· 521 521 { 522 522 struct buffer_head *bh, *head; 523 523 struct journal_head *jh; 524 + transaction_t *trans = READ_ONCE(jinode->i_transaction); 524 525 525 526 bh = head = folio_buffers(folio); 526 527 do { ··· 540 539 */ 541 540 jh = bh2jh(bh); 542 541 if (buffer_dirty(bh) || 543 - (jh && (jh->b_transaction != jinode->i_transaction || 542 + (jh && (jh->b_transaction != trans || 544 543 jh->b_next_transaction))) 545 544 return true; 546 545 } while ((bh = bh->b_this_page) != head); ··· 551 550 static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) 552 551 { 553 552 struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 553 + loff_t range_start, range_end; 554 554 struct writeback_control wbc = { 555 - .sync_mode = WB_SYNC_ALL, 555 + .sync_mode = WB_SYNC_ALL, 556 556 .nr_to_write = LONG_MAX, 557 - .range_start = jinode->i_dirty_start, 558 - .range_end = jinode->i_dirty_end, 559 - }; 557 + }; 560 558 struct folio *folio = NULL; 561 559 int error; 560 + 561 + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) 562 + return 0; 563 + 564 + wbc.range_start = range_start; 565 + wbc.range_end = range_end; 562 566 563 567 /* 564 568 * writeback_iter() already checks for dirty pages and calls
+1 -1
fs/ext4/symlink.c
··· 92 92 93 93 if (!dentry) { 94 94 bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT); 95 - if (IS_ERR(bh) || !bh) 95 + if (IS_ERR_OR_NULL(bh)) 96 96 return ERR_PTR(-ECHILD); 97 97 if (!ext4_buffer_uptodate(bh)) { 98 98 brelse(bh);
+4 -2
fs/ext4/xattr.c
··· 226 226 /* Find the end of the names list */ 227 227 while (!IS_LAST_ENTRY(e)) { 228 228 struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); 229 - if ((void *)next >= end) { 229 + if ((void *)next + sizeof(u32) > end) { 230 230 err_str = "e_name out of bounds"; 231 231 goto errout; 232 232 } ··· 1165 1165 { 1166 1166 struct inode *ea_inode; 1167 1167 struct ext4_xattr_entry *entry; 1168 - struct ext4_iloc iloc; 1168 + struct ext4_iloc iloc = { .bh = NULL }; 1169 1169 bool dirty = false; 1170 1170 unsigned int ea_ino; 1171 1171 int err; ··· 1260 1260 ext4_warning_inode(parent, 1261 1261 "handle dirty metadata err=%d", err); 1262 1262 } 1263 + 1264 + brelse(iloc.bh); 1263 1265 } 1264 1266 1265 1267 /*
+42 -13
fs/jbd2/commit.c
··· 180 180 /* Send all the data buffers related to an inode */ 181 181 int jbd2_submit_inode_data(journal_t *journal, struct jbd2_inode *jinode) 182 182 { 183 - if (!jinode || !(jinode->i_flags & JI_WRITE_DATA)) 183 + unsigned long flags; 184 + 185 + if (!jinode) 186 + return 0; 187 + 188 + flags = READ_ONCE(jinode->i_flags); 189 + if (!(flags & JI_WRITE_DATA)) 184 190 return 0; 185 191 186 192 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); ··· 197 191 198 192 int jbd2_wait_inode_data(journal_t *journal, struct jbd2_inode *jinode) 199 193 { 200 - if (!jinode || !(jinode->i_flags & JI_WAIT_DATA) || 201 - !jinode->i_vfs_inode || !jinode->i_vfs_inode->i_mapping) 194 + struct address_space *mapping; 195 + struct inode *inode; 196 + unsigned long flags; 197 + loff_t start_byte, end_byte; 198 + 199 + if (!jinode) 200 + return 0; 201 + 202 + flags = READ_ONCE(jinode->i_flags); 203 + if (!(flags & JI_WAIT_DATA)) 204 + return 0; 205 + 206 + inode = jinode->i_vfs_inode; 207 + if (!inode) 208 + return 0; 209 + 210 + mapping = inode->i_mapping; 211 + if (!mapping) 212 + return 0; 213 + 214 + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 202 215 return 0; 203 216 return filemap_fdatawait_range_keep_errors( 204 - jinode->i_vfs_inode->i_mapping, jinode->i_dirty_start, 205 - jinode->i_dirty_end); 217 + mapping, start_byte, end_byte); 206 218 } 207 219 EXPORT_SYMBOL(jbd2_wait_inode_data); 208 220 ··· 242 218 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 243 219 if (!(jinode->i_flags & JI_WRITE_DATA)) 244 220 continue; 245 - jinode->i_flags |= JI_COMMIT_RUNNING; 221 + WRITE_ONCE(jinode->i_flags, 222 + jinode->i_flags | JI_COMMIT_RUNNING); 246 223 spin_unlock(&journal->j_list_lock); 247 224 /* submit the inode data buffers. */ 248 225 trace_jbd2_submit_inode_data(jinode->i_vfs_inode); ··· 254 229 } 255 230 spin_lock(&journal->j_list_lock); 256 231 J_ASSERT(jinode->i_transaction == commit_transaction); 257 - jinode->i_flags &= ~JI_COMMIT_RUNNING; 232 + WRITE_ONCE(jinode->i_flags, 233 + jinode->i_flags & ~JI_COMMIT_RUNNING); 258 234 smp_mb(); 259 235 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 260 236 } ··· 266 240 int jbd2_journal_finish_inode_data_buffers(struct jbd2_inode *jinode) 267 241 { 268 242 struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 243 + loff_t start_byte, end_byte; 244 + 245 + if (!jbd2_jinode_get_dirty_range(jinode, &start_byte, &end_byte)) 246 + return 0; 269 247 270 248 return filemap_fdatawait_range_keep_errors(mapping, 271 - jinode->i_dirty_start, 272 - jinode->i_dirty_end); 249 + start_byte, end_byte); 273 250 } 274 251 275 252 /* ··· 291 262 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { 292 263 if (!(jinode->i_flags & JI_WAIT_DATA)) 293 264 continue; 294 - jinode->i_flags |= JI_COMMIT_RUNNING; 265 + WRITE_ONCE(jinode->i_flags, jinode->i_flags | JI_COMMIT_RUNNING); 295 266 spin_unlock(&journal->j_list_lock); 296 267 /* wait for the inode data buffers writeout. 
*/ 297 268 if (journal->j_finish_inode_data_buffers) { ··· 301 272 } 302 273 cond_resched(); 303 274 spin_lock(&journal->j_list_lock); 304 - jinode->i_flags &= ~JI_COMMIT_RUNNING; 275 + WRITE_ONCE(jinode->i_flags, jinode->i_flags & ~JI_COMMIT_RUNNING); 305 276 smp_mb(); 306 277 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); 307 278 } ··· 317 288 &jinode->i_transaction->t_inode_list); 318 289 } else { 319 290 jinode->i_transaction = NULL; 320 - jinode->i_dirty_start = 0; 321 - jinode->i_dirty_end = 0; 291 + WRITE_ONCE(jinode->i_dirty_start_page, 0); 292 + WRITE_ONCE(jinode->i_dirty_end_page, 0); 322 293 } 323 294 } 324 295 spin_unlock(&journal->j_list_lock);
+2 -3
fs/jbd2/journal.c
··· 3018 3018 jinode->i_next_transaction = NULL; 3019 3019 jinode->i_vfs_inode = inode; 3020 3020 jinode->i_flags = 0; 3021 - jinode->i_dirty_start = 0; 3022 - jinode->i_dirty_end = 0; 3021 + jinode->i_dirty_start_page = 0; 3022 + jinode->i_dirty_end_page = 0; 3023 3023 INIT_LIST_HEAD(&jinode->i_list); 3024 3024 } 3025 3025 ··· 3176 3176 MODULE_LICENSE("GPL"); 3177 3177 module_init(journal_init); 3178 3178 module_exit(journal_exit); 3179 -
+5 -3
fs/jbd2/revoke.c
··· 428 428 journal_t *journal = handle->h_transaction->t_journal; 429 429 int need_cancel; 430 430 struct buffer_head *bh = jh2bh(jh); 431 + struct address_space *bh_mapping = bh->b_folio->mapping; 431 432 432 433 jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); 433 434 ··· 465 464 * buffer_head? If so, we'd better make sure we clear the 466 465 * revoked status on any hashed alias too, otherwise the revoke 467 466 * state machine will get very upset later on. */ 468 - if (need_cancel) { 467 + if (need_cancel && !sb_is_blkdev_sb(bh_mapping->host->i_sb)) { 469 468 struct buffer_head *bh2; 469 + 470 470 bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, 471 471 bh->b_size); 472 472 if (bh2) { 473 - if (bh2 != bh) 474 - clear_buffer_revoked(bh2); 473 + WARN_ON_ONCE(bh2 == bh); 474 + clear_buffer_revoked(bh2); 475 475 __brelse(bh2); 476 476 } 477 477 }
+107 -37
fs/jbd2/transaction.c
··· 474 474 return ERR_PTR(-EROFS); 475 475 476 476 if (handle) { 477 - J_ASSERT(handle->h_transaction->t_journal == journal); 477 + if (WARN_ON_ONCE(handle->h_transaction->t_journal != journal)) 478 + return ERR_PTR(-EINVAL); 478 479 handle->h_ref++; 479 480 return handle; 480 481 } ··· 1037 1036 */ 1038 1037 if (!jh->b_transaction) { 1039 1038 JBUFFER_TRACE(jh, "no transaction"); 1040 - J_ASSERT_JH(jh, !jh->b_next_transaction); 1039 + if (WARN_ON_ONCE(jh->b_next_transaction)) { 1040 + spin_unlock(&jh->b_state_lock); 1041 + unlock_buffer(bh); 1042 + error = -EINVAL; 1043 + jbd2_journal_abort(journal, error); 1044 + goto out; 1045 + } 1041 1046 JBUFFER_TRACE(jh, "file as BJ_Reserved"); 1042 1047 /* 1043 1048 * Make sure all stores to jh (b_modified, b_frozen_data) are ··· 1076 1069 */ 1077 1070 if (jh->b_frozen_data) { 1078 1071 JBUFFER_TRACE(jh, "has frozen data"); 1079 - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 1072 + if (WARN_ON_ONCE(jh->b_next_transaction)) { 1073 + spin_unlock(&jh->b_state_lock); 1074 + error = -EINVAL; 1075 + jbd2_journal_abort(journal, error); 1076 + goto out; 1077 + } 1080 1078 goto attach_next; 1081 1079 } 1082 1080 1083 1081 JBUFFER_TRACE(jh, "owned by older transaction"); 1084 - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 1085 - J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); 1082 + if (WARN_ON_ONCE(jh->b_next_transaction || 1083 + jh->b_transaction != 1084 + journal->j_committing_transaction)) { 1085 + pr_err("JBD2: %s: assertion failure: b_next_transaction=%p b_transaction=%p j_committing_transaction=%p\n", 1086 + journal->j_devname, jh->b_next_transaction, 1087 + jh->b_transaction, journal->j_committing_transaction); 1088 + spin_unlock(&jh->b_state_lock); 1089 + error = -EINVAL; 1090 + jbd2_journal_abort(journal, error); 1091 + goto out; 1092 + } 1086 1093 1087 1094 /* 1088 1095 * There is one case we have to be very careful about. 
If the ··· 1323 1302 goto out; 1324 1303 } 1325 1304 1326 - J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 1305 + if (WARN_ON_ONCE(!buffer_locked(jh2bh(jh)))) { 1306 + err = -EINVAL; 1307 + spin_unlock(&jh->b_state_lock); 1308 + jbd2_journal_abort(journal, err); 1309 + goto out; 1310 + } 1327 1311 1328 1312 if (jh->b_transaction == NULL) { 1329 1313 /* ··· 1517 1491 int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) 1518 1492 { 1519 1493 transaction_t *transaction = handle->h_transaction; 1520 - journal_t *journal; 1494 + journal_t *journal = transaction->t_journal; 1521 1495 struct journal_head *jh; 1522 1496 int ret = 0; 1523 1497 ··· 1541 1515 if (data_race(jh->b_transaction != transaction && 1542 1516 jh->b_next_transaction != transaction)) { 1543 1517 spin_lock(&jh->b_state_lock); 1544 - J_ASSERT_JH(jh, jh->b_transaction == transaction || 1545 - jh->b_next_transaction == transaction); 1518 + if (WARN_ON_ONCE(jh->b_transaction != transaction && 1519 + jh->b_next_transaction != transaction)) { 1520 + pr_err("JBD2: %s: assertion failure: b_transaction=%p transaction=%p b_next_transaction=%p\n", 1521 + journal->j_devname, jh->b_transaction, 1522 + transaction, jh->b_next_transaction); 1523 + ret = -EINVAL; 1524 + goto out_unlock_bh; 1525 + } 1546 1526 spin_unlock(&jh->b_state_lock); 1547 1527 } 1548 1528 if (data_race(jh->b_modified == 1)) { ··· 1556 1524 if (data_race(jh->b_transaction == transaction && 1557 1525 jh->b_jlist != BJ_Metadata)) { 1558 1526 spin_lock(&jh->b_state_lock); 1559 - if (jh->b_transaction == transaction && 1560 - jh->b_jlist != BJ_Metadata) 1561 - pr_err("JBD2: assertion failure: h_type=%u " 1562 - "h_line_no=%u block_no=%llu jlist=%u\n", 1527 + if (WARN_ON_ONCE(jh->b_transaction == transaction && 1528 + jh->b_jlist != BJ_Metadata)) { 1529 + pr_err("JBD2: assertion failure: h_type=%u h_line_no=%u block_no=%llu jlist=%u\n", 1563 1530 handle->h_type, handle->h_line_no, 1564 1531 (unsigned long long) bh->b_blocknr, 1565 1532 jh->b_jlist); 1566 - J_ASSERT_JH(jh, jh->b_transaction != transaction || 1567 - jh->b_jlist == BJ_Metadata); 1533 + ret = -EINVAL; 1534 + goto out_unlock_bh; 1535 + } 1568 1536 spin_unlock(&jh->b_state_lock); 1569 1537 } 1570 1538 goto out; ··· 1583 1551 ret = -EROFS; 1584 1552 goto out_unlock_bh; 1585 1553 } 1586 - 1587 - journal = transaction->t_journal; 1588 1554 1589 1555 if (jh->b_modified == 0) { 1590 1556 /* ··· 1661 1631 } 1662 1632 1663 1633 /* That test should have eliminated the following case: */ 1664 - J_ASSERT_JH(jh, jh->b_frozen_data == NULL); 1634 + if (WARN_ON_ONCE(jh->b_frozen_data)) { 1635 + ret = -EINVAL; 1636 + goto out_unlock_bh; 1637 + } 1665 1638 1666 1639 JBUFFER_TRACE(jh, "file as BJ_Metadata"); 1667 1640 spin_lock(&journal->j_list_lock); ··· 1703 1670 int err = 0; 1704 1671 int was_modified = 0; 1705 1672 int wait_for_writeback = 0; 1673 + int abort_journal = 0; 1706 1674 1707 1675 if (is_handle_aborted(handle)) 1708 1676 return -EROFS; ··· 1737 1703 jh->b_modified = 0; 1738 1704 1739 1705 if (jh->b_transaction == transaction) { 1740 - J_ASSERT_JH(jh, !jh->b_frozen_data); 1706 + if (WARN_ON_ONCE(jh->b_frozen_data)) { 1707 + err = -EINVAL; 1708 + abort_journal = 1; 1709 + goto drop; 1710 + } 1741 1711 1742 1712 /* If we are forgetting a buffer which is already part 1743 1713 * of this transaction, then we can just drop it from ··· 1780 1742 } 1781 1743 spin_unlock(&journal->j_list_lock); 1782 1744 } else if (jh->b_transaction) { 1783 - J_ASSERT_JH(jh, (jh->b_transaction == 1784 - 
journal->j_committing_transaction)); 1745 + if (WARN_ON_ONCE(jh->b_transaction != journal->j_committing_transaction)) { 1746 + err = -EINVAL; 1747 + abort_journal = 1; 1748 + goto drop; 1749 + } 1785 1750 /* However, if the buffer is still owned by a prior 1786 1751 * (committing) transaction, we can't drop it yet... */ 1787 1752 JBUFFER_TRACE(jh, "belongs to older transaction"); ··· 1802 1761 jh->b_next_transaction = transaction; 1803 1762 spin_unlock(&journal->j_list_lock); 1804 1763 } else { 1805 - J_ASSERT(jh->b_next_transaction == transaction); 1764 + if (WARN_ON_ONCE(jh->b_next_transaction != transaction)) { 1765 + err = -EINVAL; 1766 + abort_journal = 1; 1767 + goto drop; 1768 + } 1806 1769 1807 1770 /* 1808 1771 * only drop a reference if this transaction modified ··· 1852 1807 drop: 1853 1808 __brelse(bh); 1854 1809 spin_unlock(&jh->b_state_lock); 1810 + if (abort_journal) 1811 + jbd2_journal_abort(journal, err); 1855 1812 if (wait_for_writeback) 1856 1813 wait_on_buffer(bh); 1857 1814 jbd2_journal_put_journal_head(jh); ··· 2178 2131 struct buffer_head *bh; 2179 2132 bool ret = false; 2180 2133 2181 - J_ASSERT(folio_test_locked(folio)); 2134 + if (WARN_ON_ONCE(!folio_test_locked(folio))) 2135 + return false; 2182 2136 2183 2137 head = folio_buffers(folio); 2184 2138 bh = head; ··· 2694 2646 { 2695 2647 transaction_t *transaction = handle->h_transaction; 2696 2648 journal_t *journal; 2649 + pgoff_t start_page, end_page; 2650 + int err = 0; 2651 + int abort_transaction = 0; 2697 2652 2698 2653 if (is_handle_aborted(handle)) 2699 2654 return -EROFS; ··· 2705 2654 jbd2_debug(4, "Adding inode %llu, tid:%d\n", jinode->i_vfs_inode->i_ino, 2706 2655 transaction->t_tid); 2707 2656 2708 - spin_lock(&journal->j_list_lock); 2709 - jinode->i_flags |= flags; 2657 + start_page = (pgoff_t)(start_byte >> PAGE_SHIFT); 2658 + end_page = (pgoff_t)(end_byte >> PAGE_SHIFT) + 1; 2710 2659 2711 - if (jinode->i_dirty_end) { 2712 - jinode->i_dirty_start = min(jinode->i_dirty_start, start_byte); 2713 - jinode->i_dirty_end = max(jinode->i_dirty_end, end_byte); 2660 + spin_lock(&journal->j_list_lock); 2661 + WRITE_ONCE(jinode->i_flags, jinode->i_flags | flags); 2662 + 2663 + if (jinode->i_dirty_start_page != jinode->i_dirty_end_page) { 2664 + WRITE_ONCE(jinode->i_dirty_start_page, 2665 + min(jinode->i_dirty_start_page, start_page)); 2666 + WRITE_ONCE(jinode->i_dirty_end_page, 2667 + max(jinode->i_dirty_end_page, end_page)); 2714 2668 } else { 2715 - jinode->i_dirty_start = start_byte; 2716 - jinode->i_dirty_end = end_byte; 2669 + /* Publish a new non-empty range by making end visible first. */ 2670 + WRITE_ONCE(jinode->i_dirty_end_page, end_page); 2671 + WRITE_ONCE(jinode->i_dirty_start_page, start_page); 2717 2672 } 2718 2673 2719 2674 /* Is inode already attached where we need it? 
*/ ··· 2737 2680 /* On some different transaction's list - should be 2738 2681 * the committing one */ 2739 2682 if (jinode->i_transaction) { 2740 - J_ASSERT(jinode->i_next_transaction == NULL); 2741 - J_ASSERT(jinode->i_transaction == 2742 - journal->j_committing_transaction); 2683 + if (WARN_ON_ONCE(jinode->i_next_transaction || 2684 + jinode->i_transaction != 2685 + journal->j_committing_transaction)) { 2686 + pr_err("JBD2: %s: assertion failure: i_next_transaction=%p i_transaction=%p j_committing_transaction=%p\n", 2687 + journal->j_devname, jinode->i_next_transaction, 2688 + jinode->i_transaction, 2689 + journal->j_committing_transaction); 2690 + err = -EINVAL; 2691 + abort_transaction = 1; 2692 + goto done; 2693 + } 2743 2694 jinode->i_next_transaction = transaction; 2744 2695 goto done; 2745 2696 } 2746 2697 /* Not on any transaction list... */ 2747 - J_ASSERT(!jinode->i_next_transaction); 2698 + if (WARN_ON_ONCE(jinode->i_next_transaction)) { 2699 + err = -EINVAL; 2700 + abort_transaction = 1; 2701 + goto done; 2702 + } 2748 2703 jinode->i_transaction = transaction; 2749 2704 list_add(&jinode->i_list, &transaction->t_inode_list); 2750 2705 done: 2751 2706 spin_unlock(&journal->j_list_lock); 2752 - 2753 - return 0; 2707 + if (abort_transaction) 2708 + jbd2_journal_abort(journal, err); 2709 + return err; 2754 2710 } 2755 2711 2756 2712 int jbd2_journal_inode_ranged_write(handle_t *handle, ··· 2809 2739 int ret = 0; 2810 2740 2811 2741 /* This is a quick check to avoid locking if not necessary */ 2812 - if (!jinode->i_transaction) 2742 + if (!READ_ONCE(jinode->i_transaction)) 2813 2743 goto out; 2814 2744 /* Locks are here just to force reading of recent values, it is 2815 2745 * enough that the transaction was not committing before we started
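Editorial note: the dirty range fed into jbd2_journal_inode_ranged_write() is now recorded in whole pages rather than bytes. A worked example of the rounding used above, with illustrative names that are not from the patch: for a write covering bytes 4100..8191 on a 4 KiB-page system, start_page = 4100 >> PAGE_SHIFT = 1 and end_page = (8191 >> PAGE_SHIFT) + 1 = 2, i.e. the half-open page range [1, 2), which jbd2_jinode_get_dirty_range() later maps back to bytes 4096..8191.

#include <linux/pagemap.h>
#include <linux/types.h>

/* Illustrative helper (not in the series): byte range in, exclusive
 * page-granular range out, mirroring the conversion done above before
 * j_list_lock is taken. */
static void example_bytes_to_page_range(loff_t start_byte, loff_t end_byte,
					pgoff_t *start_page, pgoff_t *end_page)
{
	*start_page = (pgoff_t)(start_byte >> PAGE_SHIFT);	/* round down */
	*end_page = (pgoff_t)(end_byte >> PAGE_SHIFT) + 1;	/* round up, exclusive */
}

Rounding out to page boundaries can only widen the tracked range, so writeback triggered from the commit path still covers every byte the handle dirtied.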
+7 -2
fs/ocfs2/journal.c
··· 899 899 900 900 static int ocfs2_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) 901 901 { 902 - return filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, 903 - jinode->i_dirty_start, jinode->i_dirty_end); 902 + struct address_space *mapping = jinode->i_vfs_inode->i_mapping; 903 + loff_t range_start, range_end; 904 + 905 + if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end)) 906 + return 0; 907 + 908 + return filemap_fdatawrite_range(mapping, range_start, range_end); 904 909 } 905 910 906 911 int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty)
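Editorial note: the ocfs2 hunk only converts the submit callback; a filesystem's wait-side callback (j_finish_inode_data_buffers) would consume the same jbd2_jinode_get_dirty_range() helper (defined in the include/linux/jbd2.h hunk below). A hedged sketch of what such a counterpart could look like; the function name is hypothetical, and filemap_fdatawait_range_keep_errors() is assumed to be the appropriate wait primitive, as it is for jbd2's default callback in mainline.

#include <linux/fs.h>
#include <linux/jbd2.h>

/* Hypothetical wait-side counterpart to the submit callback above: an
 * empty tracked range means there is nothing to wait for. */
static int example_finish_inode_data_buffers(struct jbd2_inode *jinode)
{
	struct address_space *mapping = jinode->i_vfs_inode->i_mapping;
	loff_t range_start, range_end;

	if (!jbd2_jinode_get_dirty_range(jinode, &range_start, &range_end))
		return 0;

	return filemap_fdatawait_range_keep_errors(mapping, range_start,
						   range_end);
}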
+31 -7
include/linux/jbd2.h
··· 429 429 unsigned long i_flags; 430 430 431 431 /** 432 - * @i_dirty_start: 432 + * @i_dirty_start_page: 433 433 * 434 - * Offset in bytes where the dirty range for this inode starts. 434 + * Dirty range start in PAGE_SIZE units. 435 + * 436 + * The dirty range is empty if @i_dirty_start_page is greater than or 437 + * equal to @i_dirty_end_page. 438 + * 435 439 * [j_list_lock] 436 440 */ 437 - loff_t i_dirty_start; 441 + pgoff_t i_dirty_start_page; 438 442 439 443 /** 440 - * @i_dirty_end: 444 + * @i_dirty_end_page: 441 445 * 442 - * Inclusive offset in bytes where the dirty range for this inode 443 - * ends. [j_list_lock] 446 + * Dirty range end in PAGE_SIZE units (exclusive). 447 + * 448 + * [j_list_lock] 444 449 */ 445 - loff_t i_dirty_end; 450 + pgoff_t i_dirty_end_page; 446 451 }; 452 + 453 + /* 454 + * Lockless readers treat start_page >= end_page as an empty range. 455 + * Writers publish a new non-empty range by storing i_dirty_end_page before 456 + * i_dirty_start_page. 457 + */ 458 + static inline bool jbd2_jinode_get_dirty_range(const struct jbd2_inode *jinode, 459 + loff_t *start, loff_t *end) 460 + { 461 + pgoff_t start_page = READ_ONCE(jinode->i_dirty_start_page); 462 + pgoff_t end_page = READ_ONCE(jinode->i_dirty_end_page); 463 + 464 + if (start_page >= end_page) 465 + return false; 466 + 467 + *start = (loff_t)start_page << PAGE_SHIFT; 468 + *end = ((loff_t)end_page << PAGE_SHIFT) - 1; 469 + return true; 470 + } 447 471 448 472 struct jbd2_revoke_table_s; 449 473
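Editorial note: because readers use READ_ONCE() and treat start_page >= end_page as an empty range, the helper above can be called without holding j_list_lock, for instance to test whether a jinode is tracking anything at all. A minimal sketch under that assumption; the predicate name is illustrative and not part of the series.

#include <linux/jbd2.h>

/* Hypothetical lockless check: true when the jinode currently tracks a
 * non-empty dirty pagecache range.  The freshly initialised state and the
 * post-commit reset (both start_page == end_page == 0) read back as
 * empty. */
static bool example_jinode_tracks_dirty_range(const struct jbd2_inode *jinode)
{
	loff_t start, end;

	return jbd2_jinode_get_dirty_range(jinode, &start, &end);
}

A reader racing with an update may of course observe a range that is immediately stale; callers that need a stable view would still serialize on j_list_lock, per the locking annotations in the struct documentation above.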