Merge tag 'ext4-for_linus-6.15-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

+3 -4

Documentation/admin-guide/ext4.rst

··· 238 238 configured using tune2fs) 239 239 240 240 data_err=ignore(*) 241 - Just print an error message if an error occurs in a file data buffer in 242 - ordered mode. 241 + Just print an error message if an error occurs in a file data buffer. 242 + 243 243 data_err=abort 244 - Abort the journal if an error occurs in a file data buffer in ordered 245 - mode. 244 + Abort the journal if an error occurs in a file data buffer. 246 245 247 246 grpid | bsdgroups 248 247 New objects have the group ID of their parent.

+1 -3

Documentation/filesystems/journalling.rst

··· 111 111 so that you can do some of your own management. You ask the journalling 112 112 layer for calling the callback by simply setting 113 113 ``journal->j_commit_callback`` function pointer and that function is 114 - called after each transaction commit. You can also use 115 - ``transaction->t_private_list`` for attaching entries to a transaction 116 - that need processing when the transaction commits. 114 + called after each transaction commit. 117 115 118 116 JBD2 also provides a way to block all transaction updates via 119 117 jbd2_journal_lock_updates() /

+2 -2

fs/ext4/balloc.c

··· 649 649 /* Hm, nope. Are (enough) root reserved clusters available? */ 650 650 if (uid_eq(sbi->s_resuid, current_fsuid()) || 651 651 (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || 652 - capable(CAP_SYS_RESOURCE) || 653 - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { 652 + (flags & EXT4_MB_USE_ROOT_BLOCKS) || 653 + capable(CAP_SYS_RESOURCE)) { 654 654 655 655 if (free_clusters >= (nclusters + dirty_clusters + 656 656 resv_clusters))

+4 -4

fs/ext4/bitmap.c

··· 25 25 struct ext4_sb_info *sbi = EXT4_SB(sb); 26 26 int sz; 27 27 28 - if (!ext4_has_metadata_csum(sb)) 28 + if (!ext4_has_feature_metadata_csum(sb)) 29 29 return 1; 30 30 31 31 sz = EXT4_INODES_PER_GROUP(sb) >> 3; ··· 48 48 struct ext4_sb_info *sbi = EXT4_SB(sb); 49 49 int sz; 50 50 51 - if (!ext4_has_metadata_csum(sb)) 51 + if (!ext4_has_feature_metadata_csum(sb)) 52 52 return; 53 53 54 54 sz = EXT4_INODES_PER_GROUP(sb) >> 3; ··· 67 67 struct ext4_sb_info *sbi = EXT4_SB(sb); 68 68 int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; 69 69 70 - if (!ext4_has_metadata_csum(sb)) 70 + if (!ext4_has_feature_metadata_csum(sb)) 71 71 return 1; 72 72 73 73 provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); ··· 89 89 __u32 csum; 90 90 struct ext4_sb_info *sbi = EXT4_SB(sb); 91 91 92 - if (!ext4_has_metadata_csum(sb)) 92 + if (!ext4_has_feature_metadata_csum(sb)) 93 93 return; 94 94 95 95 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);

+5 -2

fs/ext4/dir.c

··· 86 86 dir->i_sb->s_blocksize); 87 87 const int next_offset = ((char *) de - buf) + rlen; 88 88 bool fake = is_fake_dir_entry(de); 89 - bool has_csum = ext4_has_metadata_csum(dir->i_sb); 89 + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); 90 90 91 91 if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) 92 92 error_msg = "rec_len is smaller than minimal"; ··· 104 104 else if (unlikely(le32_to_cpu(de->inode) > 105 105 le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) 106 106 error_msg = "inode out of bounds"; 107 + else if (unlikely(next_offset == size && de->name_len == 1 && 108 + de->name[0] == '.')) 109 + error_msg = "'.' directory cannot be the last in data block"; 107 110 else 108 111 return 0; 109 112 ··· 148 145 return err; 149 146 150 147 /* Can we just clear INDEX flag to ignore htree information? */ 151 - if (!ext4_has_metadata_csum(sb)) { 148 + if (!ext4_has_feature_metadata_csum(sb)) { 152 149 /* 153 150 * We don't set the inode dirty flag since it's not 154 151 * critical that it gets flushed back to the disk.

+53 -41

fs/ext4/ext4.h

··· 278 278 /* 279 279 * Flags for ext4_io_end->flags 280 280 */ 281 - #define EXT4_IO_END_UNWRITTEN 0x0001 281 + #define EXT4_IO_END_UNWRITTEN 0x0001 282 + #define EXT4_IO_END_FAILED 0x0002 283 + 284 + #define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) 282 285 283 286 struct ext4_io_end_vec { 284 287 struct list_head list; /* list of io_end_vec */ ··· 370 367 #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ 371 368 ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ 372 369 blkbits)) 370 + #define EXT4_B_TO_LBLK(inode, offset) \ 371 + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) 373 372 374 373 /* Translate a block number to a cluster number */ 375 374 #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) ··· 1063 1058 1064 1059 /* Number of ongoing updates on this inode */ 1065 1060 atomic_t i_fc_updates; 1066 - atomic_t i_unwritten; /* Nr. of inflight conversions pending */ 1061 + 1062 + spinlock_t i_raw_lock; /* protects updates to the raw inode */ 1067 1063 1068 1064 /* Fast commit wait queue for this inode */ 1069 1065 wait_queue_head_t i_fc_wait; ··· 1102 1096 struct rw_semaphore i_data_sem; 1103 1097 struct inode vfs_inode; 1104 1098 struct jbd2_inode *jinode; 1105 - 1106 - spinlock_t i_raw_lock; /* protects updates to the raw inode */ 1107 1099 1108 1100 /* 1109 1101 * File creation time. 
Its function is same as that of ··· 1145 1141 /* quota space reservation, managed internally by quota code */ 1146 1142 qsize_t i_reserved_quota; 1147 1143 #endif 1144 + spinlock_t i_block_reservation_lock; 1148 1145 1149 1146 /* Lock protecting lists below */ 1150 1147 spinlock_t i_completed_io_lock; ··· 1155 1150 */ 1156 1151 struct list_head i_rsv_conversion_list; 1157 1152 struct work_struct i_rsv_conversion_work; 1158 - 1159 - spinlock_t i_block_reservation_lock; 1160 1153 1161 1154 /* 1162 1155 * Transactions that contain inode's metadata needed to complete ··· 1609 1606 unsigned int s_mb_prefetch; 1610 1607 unsigned int s_mb_prefetch_limit; 1611 1608 unsigned int s_mb_best_avail_max_trim_order; 1609 + unsigned int s_sb_update_sec; 1610 + unsigned int s_sb_update_kb; 1612 1611 1613 1612 /* stats for buddy allocator */ 1614 1613 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ··· 1826 1821 */ 1827 1822 enum { 1828 1823 EXT4_MF_MNTDIR_SAMPLED, 1829 - EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ 1824 + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ 1825 + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ 1830 1826 }; 1831 1827 1832 1828 static inline void ext4_set_mount_flag(struct super_block *sb, int bit) ··· 2238 2232 /* 2239 2233 * Superblock flags 2240 2234 */ 2241 - #define EXT4_FLAGS_RESIZING 0 2242 - #define EXT4_FLAGS_SHUTDOWN 1 2243 - #define EXT4_FLAGS_BDEV_IS_DAX 2 2235 + enum { 2236 + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ 2237 + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ 2238 + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ 2239 + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ 2240 + }; 2244 2241 2245 2242 static inline int ext4_forced_shutdown(struct super_block *sb) 2246 2243 { 2247 2244 return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); 2245 + } 2246 + 2247 + static inline int ext4_emergency_ro(struct super_block 
*sb) 2248 + { 2249 + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); 2250 + } 2251 + 2252 + static inline int ext4_emergency_state(struct super_block *sb) 2253 + { 2254 + if (unlikely(ext4_forced_shutdown(sb))) 2255 + return -EIO; 2256 + if (unlikely(ext4_emergency_ro(sb))) 2257 + return -EROFS; 2258 + return 0; 2248 2259 } 2249 2260 2250 2261 /* ··· 2299 2276 */ 2300 2277 #define EXT4_DEF_MIN_BATCH_TIME 0 2301 2278 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ 2279 + 2280 + /* 2281 + * Default values for superblock update 2282 + */ 2283 + #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ 2284 + #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ 2285 + 2302 2286 2303 2287 /* 2304 2288 * Minimum number of groups in a flexgroup before we separate out ··· 2840 2810 struct ext4_dir_entry_2 *dirent, 2841 2811 struct fscrypt_str *ent_name); 2842 2812 extern void ext4_htree_free_dir_info(struct dir_private_info *p); 2843 - extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, 2844 - struct buffer_head *bh, 2813 + extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, 2845 2814 void *buf, int buf_size, 2846 2815 struct ext4_filename *fname, 2847 2816 struct ext4_dir_entry_2 **dest_de); ··· 3030 3001 extern int ext4_can_truncate(struct inode *inode); 3031 3002 extern int ext4_truncate(struct inode *); 3032 3003 extern int ext4_break_layouts(struct inode *); 3004 + extern int ext4_truncate_page_cache_block_range(struct inode *inode, 3005 + loff_t start, loff_t end); 3033 3006 extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); 3034 3007 extern void ext4_set_inode_flags(struct inode *, bool init); 3035 3008 extern int ext4_alloc_da_blocks(struct inode *inode); ··· 3290 3259 extern int ext4_register_li_request(struct super_block *sb, 3291 3260 ext4_group_t first_not_zeroed); 3292 3261 3293 - static inline int ext4_has_metadata_csum(struct super_block *sb) 
3294 - { 3295 - return ext4_has_feature_metadata_csum(sb); 3296 - } 3297 - 3298 3262 static inline int ext4_has_group_desc_csum(struct super_block *sb) 3299 3263 { 3300 - return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); 3264 + return ext4_has_feature_gdt_csum(sb) || 3265 + ext4_has_feature_metadata_csum(sb); 3301 3266 } 3302 3267 3303 3268 #define ext4_read_incompat_64bit_val(es, name) \ ··· 3573 3546 struct folio **foliop); 3574 3547 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, 3575 3548 unsigned copied, struct folio *folio); 3576 - extern int ext4_da_write_inline_data_begin(struct address_space *mapping, 3577 - struct inode *inode, 3578 - loff_t pos, unsigned len, 3579 - struct folio **foliop, 3580 - void **fsdata); 3549 + extern int ext4_generic_write_inline_data(struct address_space *mapping, 3550 + struct inode *inode, 3551 + loff_t pos, unsigned len, 3552 + struct folio **foliop, 3553 + void **fsdata, bool da); 3581 3554 extern int ext4_try_add_inline_entry(handle_t *handle, 3582 3555 struct ext4_filename *fname, 3583 3556 struct inode *dir, struct inode *inode); ··· 3812 3785 set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); 3813 3786 } 3814 3787 3815 - /* For ioend & aio unwritten conversion wait queues */ 3816 - #define EXT4_WQ_HASH_SZ 37 3817 - #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ 3818 - EXT4_WQ_HASH_SZ]) 3819 - extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 3820 - 3821 3788 extern int ext4_resize_begin(struct super_block *sb); 3822 3789 extern int ext4_resize_end(struct super_block *sb, bool update_backups); 3823 3790 3824 - static inline void ext4_set_io_unwritten_flag(struct inode *inode, 3825 - struct ext4_io_end *io_end) 3791 + static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) 3826 3792 { 3827 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 3793 + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) 3828 3794 io_end->flag |= EXT4_IO_END_UNWRITTEN; 
3829 - atomic_inc(&EXT4_I(inode)->i_unwritten); 3830 - } 3831 3795 } 3832 3796 3833 3797 static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) 3834 3798 { 3835 - struct inode *inode = io_end->inode; 3836 - 3837 - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 3799 + if (io_end->flag & EXT4_IO_END_UNWRITTEN) 3838 3800 io_end->flag &= ~EXT4_IO_END_UNWRITTEN; 3839 - /* Wake up anyone waiting on unwritten extent conversion */ 3840 - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 3841 - wake_up_all(ext4_ioend_wq(inode)); 3842 - } 3843 3801 } 3844 3802 3845 3803 extern const struct iomap_ops ext4_iomap_ops;

+8 -4

fs/ext4/ext4_jbd2.c

··· 63 63 */ 64 64 static int ext4_journal_check_start(struct super_block *sb) 65 65 { 66 + int ret; 66 67 journal_t *journal; 67 68 68 69 might_sleep(); 69 70 70 - if (unlikely(ext4_forced_shutdown(sb))) 71 - return -EIO; 71 + ret = ext4_emergency_state(sb); 72 + if (unlikely(ret)) 73 + return ret; 72 74 73 75 if (WARN_ON_ONCE(sb_rdonly(sb))) 74 76 return -EROFS; ··· 246 244 } 247 245 } else 248 246 ext4_check_bdev_write_error(sb); 249 - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) 247 + if (trigger_type == EXT4_JTR_NONE || 248 + !ext4_has_feature_metadata_csum(sb)) 250 249 return 0; 251 250 BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); 252 251 jbd2_journal_set_triggers(bh, ··· 334 331 err); 335 332 return err; 336 333 } 337 - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) 334 + if (trigger_type == EXT4_JTR_NONE || 335 + !ext4_has_feature_metadata_csum(sb)) 338 336 return 0; 339 337 BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); 340 338 jbd2_journal_set_triggers(bh,

+29 -84

fs/ext4/ext4_jbd2.h

··· 122 122 #define EXT4_HT_EXT_CONVERT 11 123 123 #define EXT4_HT_MAX 12 124 124 125 - /** 126 - * struct ext4_journal_cb_entry - Base structure for callback information. 127 - * 128 - * This struct is a 'seed' structure for a using with your own callback 129 - * structs. If you are using callbacks you must allocate one of these 130 - * or another struct of your own definition which has this struct 131 - * as it's first element and pass it to ext4_journal_callback_add(). 132 - */ 133 - struct ext4_journal_cb_entry { 134 - /* list information for other callbacks attached to the same handle */ 135 - struct list_head jce_list; 136 - 137 - /* Function to call with this callback structure */ 138 - void (*jce_func)(struct super_block *sb, 139 - struct ext4_journal_cb_entry *jce, int error); 140 - 141 - /* user data goes here */ 142 - }; 143 - 144 - /** 145 - * ext4_journal_callback_add: add a function to call after transaction commit 146 - * @handle: active journal transaction handle to register callback on 147 - * @func: callback function to call after the transaction has committed: 148 - * @sb: superblock of current filesystem for transaction 149 - * @jce: returned journal callback data 150 - * @rc: journal state at commit (0 = transaction committed properly) 151 - * @jce: journal callback data (internal and function private data struct) 152 - * 153 - * The registered function will be called in the context of the journal thread 154 - * after the transaction for which the handle was created has completed. 155 - * 156 - * No locks are held when the callback function is called, so it is safe to 157 - * call blocking functions from within the callback, but the callback should 158 - * not block or run for too long, or the filesystem will be blocked waiting for 159 - * the next transaction to commit. No journaling functions can be used, or 160 - * there is a risk of deadlock. 
161 - * 162 - * There is no guaranteed calling order of multiple registered callbacks on 163 - * the same transaction. 164 - */ 165 - static inline void _ext4_journal_callback_add(handle_t *handle, 166 - struct ext4_journal_cb_entry *jce) 167 - { 168 - /* Add the jce to transaction's private list */ 169 - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); 170 - } 171 - 172 - static inline void ext4_journal_callback_add(handle_t *handle, 173 - void (*func)(struct super_block *sb, 174 - struct ext4_journal_cb_entry *jce, 175 - int rc), 176 - struct ext4_journal_cb_entry *jce) 177 - { 178 - struct ext4_sb_info *sbi = 179 - EXT4_SB(handle->h_transaction->t_journal->j_private); 180 - 181 - /* Add the jce to transaction's private list */ 182 - jce->jce_func = func; 183 - spin_lock(&sbi->s_md_lock); 184 - _ext4_journal_callback_add(handle, jce); 185 - spin_unlock(&sbi->s_md_lock); 186 - } 187 - 188 - 189 - /** 190 - * ext4_journal_callback_del: delete a registered callback 191 - * @handle: active journal transaction handle on which callback was registered 192 - * @jce: registered journal callback entry to unregister 193 - * Return true if object was successfully removed 194 - */ 195 - static inline bool ext4_journal_callback_try_del(handle_t *handle, 196 - struct ext4_journal_cb_entry *jce) 197 - { 198 - bool deleted; 199 - struct ext4_sb_info *sbi = 200 - EXT4_SB(handle->h_transaction->t_journal->j_private); 201 - 202 - spin_lock(&sbi->s_md_lock); 203 - deleted = !list_empty(&jce->jce_list); 204 - list_del_init(&jce->jce_list); 205 - spin_unlock(&sbi->s_md_lock); 206 - return deleted; 207 - } 208 - 209 125 int 210 126 ext4_mark_iloc_dirty(handle_t *handle, 211 127 struct inode *inode, ··· 427 511 if (!test_opt(inode->i_sb, DELALLOC)) 428 512 return 0; 429 513 return 1; 514 + } 515 + 516 + /* 517 + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some 518 + * cases 519 + */ 520 + static inline int ext4_journal_destroy(struct 
ext4_sb_info *sbi, journal_t *journal) 521 + { 522 + int err = 0; 523 + 524 + /* 525 + * At this point only two things can be operating on the journal. 526 + * JBD2 thread performing transaction commit and s_sb_upd_work 527 + * issuing sb update through the journal. Once we set 528 + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not 529 + * queue s_sb_upd_work and ext4_force_commit() makes sure any 530 + * ext4_handle_error() calls from the running transaction commit are 531 + * finished. Hence no new s_sb_upd_work can be queued after we 532 + * flush it here. 533 + */ 534 + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); 535 + 536 + ext4_force_commit(sbi->s_sb); 537 + flush_work(&sbi->s_sb_upd_work); 538 + 539 + err = jbd2_journal_destroy(journal); 540 + sbi->s_journal = NULL; 541 + 542 + return err; 430 543 } 431 544 432 545 #endif /* _EXT4_JBD2_H */

+202 -335

fs/ext4/extents.c

··· 63 63 { 64 64 struct ext4_extent_tail *et; 65 65 66 - if (!ext4_has_metadata_csum(inode->i_sb)) 66 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 67 67 return 1; 68 68 69 69 et = find_ext4_extent_tail(eh); ··· 77 77 { 78 78 struct ext4_extent_tail *et; 79 79 80 - if (!ext4_has_metadata_csum(inode->i_sb)) 80 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 81 81 return; 82 82 83 83 et = find_ext4_extent_tail(eh); ··· 4568 4568 loff_t len, int mode) 4569 4569 { 4570 4570 struct inode *inode = file_inode(file); 4571 - struct address_space *mapping = file->f_mapping; 4572 4571 handle_t *handle = NULL; 4573 - unsigned int max_blocks; 4574 4572 loff_t new_size = 0; 4575 - int ret = 0; 4576 - int flags; 4577 - int credits; 4578 - int partial_begin, partial_end; 4579 - loff_t start, end; 4580 - ext4_lblk_t lblk; 4573 + loff_t end = offset + len; 4574 + ext4_lblk_t start_lblk, end_lblk; 4575 + unsigned int blocksize = i_blocksize(inode); 4581 4576 unsigned int blkbits = inode->i_blkbits; 4577 + int ret, flags, credits; 4582 4578 4583 4579 trace_ext4_zero_range(inode, offset, len, mode); 4580 + WARN_ON_ONCE(!inode_is_locked(inode)); 4584 4581 4585 - /* 4586 - * Round up offset. This is not fallocate, we need to zero out 4587 - * blocks, so convert interior block aligned part of the range to 4588 - * unwritten and possibly manually zero out unaligned parts of the 4589 - * range. Here, start and partial_begin are inclusive, end and 4590 - * partial_end are exclusive. 
4591 - */ 4592 - start = round_up(offset, 1 << blkbits); 4593 - end = round_down((offset + len), 1 << blkbits); 4594 - 4595 - if (start < offset || end > offset + len) 4596 - return -EINVAL; 4597 - partial_begin = offset & ((1 << blkbits) - 1); 4598 - partial_end = (offset + len) & ((1 << blkbits) - 1); 4599 - 4600 - lblk = start >> blkbits; 4601 - max_blocks = (end >> blkbits); 4602 - if (max_blocks < lblk) 4603 - max_blocks = 0; 4604 - else 4605 - max_blocks -= lblk; 4606 - 4607 - inode_lock(inode); 4608 - 4609 - /* 4610 - * Indirect files do not support unwritten extents 4611 - */ 4612 - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4613 - ret = -EOPNOTSUPP; 4614 - goto out_mutex; 4615 - } 4582 + /* Indirect files do not support unwritten extents */ 4583 + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4584 + return -EOPNOTSUPP; 4616 4585 4617 4586 if (!(mode & FALLOC_FL_KEEP_SIZE) && 4618 - (offset + len > inode->i_size || 4619 - offset + len > EXT4_I(inode)->i_disksize)) { 4620 - new_size = offset + len; 4587 + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { 4588 + new_size = end; 4621 4589 ret = inode_newsize_ok(inode, new_size); 4622 4590 if (ret) 4623 - goto out_mutex; 4591 + return ret; 4624 4592 } 4625 4593 4626 4594 flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4627 - 4628 - /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4629 - inode_dio_wait(inode); 4630 - 4631 - ret = file_modified(file); 4632 - if (ret) 4633 - goto out_mutex; 4634 - 4635 4595 /* Preallocate the range including the unaligned edges */ 4636 - if (partial_begin || partial_end) { 4637 - ret = ext4_alloc_file_blocks(file, 4638 - round_down(offset, 1 << blkbits) >> blkbits, 4639 - (round_up((offset + len), 1 << blkbits) - 4640 - round_down(offset, 1 << blkbits)) >> blkbits, 4641 - new_size, flags); 4642 - if (ret) 4643 - goto out_mutex; 4596 + if (!IS_ALIGNED(offset | end, blocksize)) { 4597 + ext4_lblk_t alloc_lblk = offset >> blkbits; 
4598 + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); 4644 4599 4600 + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, 4601 + new_size, flags); 4602 + if (ret) 4603 + return ret; 4645 4604 } 4605 + 4606 + ret = ext4_update_disksize_before_punch(inode, offset, len); 4607 + if (ret) 4608 + return ret; 4609 + 4610 + /* Now release the pages and zero block aligned part of pages */ 4611 + ret = ext4_truncate_page_cache_block_range(inode, offset, end); 4612 + if (ret) 4613 + return ret; 4646 4614 4647 4615 /* Zero range excluding the unaligned edges */ 4648 - if (max_blocks > 0) { 4649 - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | 4650 - EXT4_EX_NOCACHE); 4616 + start_lblk = EXT4_B_TO_LBLK(inode, offset); 4617 + end_lblk = end >> blkbits; 4618 + if (end_lblk > start_lblk) { 4619 + ext4_lblk_t zero_blks = end_lblk - start_lblk; 4651 4620 4652 - /* 4653 - * Prevent page faults from reinstantiating pages we have 4654 - * released from page cache. 4655 - */ 4656 - filemap_invalidate_lock(mapping); 4657 - 4658 - ret = ext4_break_layouts(inode); 4659 - if (ret) { 4660 - filemap_invalidate_unlock(mapping); 4661 - goto out_mutex; 4662 - } 4663 - 4664 - ret = ext4_update_disksize_before_punch(inode, offset, len); 4665 - if (ret) { 4666 - filemap_invalidate_unlock(mapping); 4667 - goto out_mutex; 4668 - } 4669 - 4670 - /* 4671 - * For journalled data we need to write (and checkpoint) pages 4672 - * before discarding page cache to avoid inconsitent data on 4673 - * disk in case of crash before zeroing trans is committed. 
4674 - */ 4675 - if (ext4_should_journal_data(inode)) { 4676 - ret = filemap_write_and_wait_range(mapping, start, 4677 - end - 1); 4678 - if (ret) { 4679 - filemap_invalidate_unlock(mapping); 4680 - goto out_mutex; 4681 - } 4682 - } 4683 - 4684 - /* Now release the pages and zero block aligned part of pages */ 4685 - truncate_pagecache_range(inode, start, end - 1); 4686 - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 4687 - 4688 - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, 4689 - flags); 4690 - filemap_invalidate_unlock(mapping); 4621 + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); 4622 + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, 4623 + new_size, flags); 4691 4624 if (ret) 4692 - goto out_mutex; 4625 + return ret; 4693 4626 } 4694 - if (!partial_begin && !partial_end) 4695 - goto out_mutex; 4627 + /* Finish zeroing out if it doesn't contain partial block */ 4628 + if (IS_ALIGNED(offset | end, blocksize)) 4629 + return ret; 4696 4630 4697 4631 /* 4698 4632 * In worst case we have to writeout two nonadjacent unwritten ··· 4639 4705 if (IS_ERR(handle)) { 4640 4706 ret = PTR_ERR(handle); 4641 4707 ext4_std_error(inode->i_sb, ret); 4642 - goto out_mutex; 4708 + return ret; 4643 4709 } 4644 4710 4645 - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 4711 + /* Zero out partial block at the edges of the range */ 4712 + ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4713 + if (ret) 4714 + goto out_handle; 4715 + 4646 4716 if (new_size) 4647 4717 ext4_update_inode_size(inode, new_size); 4648 4718 ret = ext4_mark_inode_dirty(handle, inode); 4649 4719 if (unlikely(ret)) 4650 4720 goto out_handle; 4651 - /* Zero out partial block at the edges of the range */ 4652 - ret = ext4_zero_partial_blocks(handle, inode, offset, len); 4653 - if (ret >= 0) 4654 - ext4_update_inode_fsync_trans(handle, inode, 1); 4655 4721 4722 + ext4_update_inode_fsync_trans(handle, inode, 1); 4656 4723 if 
(file->f_flags & O_SYNC) 4657 4724 ext4_handle_sync(handle); 4658 4725 4659 4726 out_handle: 4660 4727 ext4_journal_stop(handle); 4661 - out_mutex: 4662 - inode_unlock(inode); 4728 + return ret; 4729 + } 4730 + 4731 + static long ext4_do_fallocate(struct file *file, loff_t offset, 4732 + loff_t len, int mode) 4733 + { 4734 + struct inode *inode = file_inode(file); 4735 + loff_t end = offset + len; 4736 + loff_t new_size = 0; 4737 + ext4_lblk_t start_lblk, len_lblk; 4738 + int ret; 4739 + 4740 + trace_ext4_fallocate_enter(inode, offset, len, mode); 4741 + WARN_ON_ONCE(!inode_is_locked(inode)); 4742 + 4743 + start_lblk = offset >> inode->i_blkbits; 4744 + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); 4745 + 4746 + /* We only support preallocation for extent-based files only. */ 4747 + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4748 + ret = -EOPNOTSUPP; 4749 + goto out; 4750 + } 4751 + 4752 + if (!(mode & FALLOC_FL_KEEP_SIZE) && 4753 + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { 4754 + new_size = end; 4755 + ret = inode_newsize_ok(inode, new_size); 4756 + if (ret) 4757 + goto out; 4758 + } 4759 + 4760 + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, 4761 + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); 4762 + if (ret) 4763 + goto out; 4764 + 4765 + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4766 + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 4767 + EXT4_I(inode)->i_sync_tid); 4768 + } 4769 + out: 4770 + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); 4663 4771 return ret; 4664 4772 } 4665 4773 ··· 4715 4739 long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) 4716 4740 { 4717 4741 struct inode *inode = file_inode(file); 4718 - loff_t new_size = 0; 4719 - unsigned int max_blocks; 4720 - int ret = 0; 4721 - int flags; 4722 - ext4_lblk_t lblk; 4723 - unsigned int blkbits = inode->i_blkbits; 4742 + struct address_space *mapping = file->f_mapping; 4743 + int 
ret; 4724 4744 4725 4745 /* 4726 4746 * Encrypted inodes can't handle collapse range or insert ··· 4736 4764 4737 4765 inode_lock(inode); 4738 4766 ret = ext4_convert_inline_data(inode); 4739 - inode_unlock(inode); 4740 4767 if (ret) 4741 - goto exit; 4742 - 4743 - if (mode & FALLOC_FL_PUNCH_HOLE) { 4744 - ret = ext4_punch_hole(file, offset, len); 4745 - goto exit; 4746 - } 4747 - 4748 - if (mode & FALLOC_FL_COLLAPSE_RANGE) { 4749 - ret = ext4_collapse_range(file, offset, len); 4750 - goto exit; 4751 - } 4752 - 4753 - if (mode & FALLOC_FL_INSERT_RANGE) { 4754 - ret = ext4_insert_range(file, offset, len); 4755 - goto exit; 4756 - } 4757 - 4758 - if (mode & FALLOC_FL_ZERO_RANGE) { 4759 - ret = ext4_zero_range(file, offset, len, mode); 4760 - goto exit; 4761 - } 4762 - trace_ext4_fallocate_enter(inode, offset, len, mode); 4763 - lblk = offset >> blkbits; 4764 - 4765 - max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); 4766 - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; 4767 - 4768 - inode_lock(inode); 4769 - 4770 - /* 4771 - * We only support preallocation for extent-based files only 4772 - */ 4773 - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { 4774 - ret = -EOPNOTSUPP; 4775 - goto out; 4776 - } 4777 - 4778 - if (!(mode & FALLOC_FL_KEEP_SIZE) && 4779 - (offset + len > inode->i_size || 4780 - offset + len > EXT4_I(inode)->i_disksize)) { 4781 - new_size = offset + len; 4782 - ret = inode_newsize_ok(inode, new_size); 4783 - if (ret) 4784 - goto out; 4785 - } 4768 + goto out_inode_lock; 4786 4769 4787 4770 /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4788 4771 inode_dio_wait(inode); 4789 4772 4790 4773 ret = file_modified(file); 4791 4774 if (ret) 4792 - goto out; 4775 + goto out_inode_lock; 4793 4776 4794 - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); 4795 - if (ret) 4796 - goto out; 4797 - 4798 - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { 4799 - ret = 
ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, 4800 - EXT4_I(inode)->i_sync_tid); 4777 + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { 4778 + ret = ext4_do_fallocate(file, offset, len, mode); 4779 + goto out_inode_lock; 4801 4780 } 4802 - out: 4781 + 4782 + /* 4783 + * Follow-up operations will drop page cache, hold invalidate lock 4784 + * to prevent page faults from reinstantiating pages we have 4785 + * released from page cache. 4786 + */ 4787 + filemap_invalidate_lock(mapping); 4788 + 4789 + ret = ext4_break_layouts(inode); 4790 + if (ret) 4791 + goto out_invalidate_lock; 4792 + 4793 + if (mode & FALLOC_FL_PUNCH_HOLE) 4794 + ret = ext4_punch_hole(file, offset, len); 4795 + else if (mode & FALLOC_FL_COLLAPSE_RANGE) 4796 + ret = ext4_collapse_range(file, offset, len); 4797 + else if (mode & FALLOC_FL_INSERT_RANGE) 4798 + ret = ext4_insert_range(file, offset, len); 4799 + else if (mode & FALLOC_FL_ZERO_RANGE) 4800 + ret = ext4_zero_range(file, offset, len, mode); 4801 + else 4802 + ret = -EOPNOTSUPP; 4803 + 4804 + out_invalidate_lock: 4805 + filemap_invalidate_unlock(mapping); 4806 + out_inode_lock: 4803 4807 inode_unlock(inode); 4804 - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); 4805 - exit: 4806 4808 return ret; 4807 4809 } 4808 4810 ··· 5278 5332 struct inode *inode = file_inode(file); 5279 5333 struct super_block *sb = inode->i_sb; 5280 5334 struct address_space *mapping = inode->i_mapping; 5281 - ext4_lblk_t punch_start, punch_stop; 5335 + loff_t end = offset + len; 5336 + ext4_lblk_t start_lblk, end_lblk; 5282 5337 handle_t *handle; 5283 5338 unsigned int credits; 5284 - loff_t new_size, ioffset; 5339 + loff_t start, new_size; 5285 5340 int ret; 5286 5341 5287 - /* 5288 - * We need to test this early because xfstests assumes that a 5289 - * collapse range of (0, 1) will return EOPNOTSUPP if the file 5290 - * system does not support collapse range. 
5291 - */ 5342 + trace_ext4_collapse_range(inode, offset, len); 5343 + WARN_ON_ONCE(!inode_is_locked(inode)); 5344 + 5345 + /* Currently just for extent based files */ 5292 5346 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5293 5347 return -EOPNOTSUPP; 5294 - 5295 5348 /* Collapse range works only on fs cluster size aligned regions. */ 5296 5349 if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) 5297 5350 return -EINVAL; 5298 - 5299 - trace_ext4_collapse_range(inode, offset, len); 5300 - 5301 - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5302 - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); 5303 - 5304 - inode_lock(inode); 5305 5351 /* 5306 5352 * There is no need to overlap collapse range with EOF, in which case 5307 5353 * it is effectively a truncate operation 5308 5354 */ 5309 - if (offset + len >= inode->i_size) { 5310 - ret = -EINVAL; 5311 - goto out_mutex; 5312 - } 5313 - 5314 - /* Currently just for extent based files */ 5315 - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5316 - ret = -EOPNOTSUPP; 5317 - goto out_mutex; 5318 - } 5319 - 5320 - /* Wait for existing dio to complete */ 5321 - inode_dio_wait(inode); 5322 - 5323 - ret = file_modified(file); 5324 - if (ret) 5325 - goto out_mutex; 5355 + if (end >= inode->i_size) 5356 + return -EINVAL; 5326 5357 5327 5358 /* 5328 - * Prevent page faults from reinstantiating pages we have released from 5329 - * page cache. 5330 - */ 5331 - filemap_invalidate_lock(mapping); 5332 - 5333 - ret = ext4_break_layouts(inode); 5334 - if (ret) 5335 - goto out_mmap; 5336 - 5337 - /* 5359 + * Write tail of the last page before removed range and data that 5360 + * will be shifted since they will get removed from the page cache 5361 + * below. We are also protected from pages becoming dirty by 5362 + * i_rwsem and invalidate_lock. 5338 5363 * Need to round down offset to be aligned with page size boundary 5339 5364 * for page size > block size. 
5340 5365 */ 5341 - ioffset = round_down(offset, PAGE_SIZE); 5342 - /* 5343 - * Write tail of the last page before removed range since it will get 5344 - * removed from the page cache below. 5345 - */ 5346 - ret = filemap_write_and_wait_range(mapping, ioffset, offset); 5366 + start = round_down(offset, PAGE_SIZE); 5367 + ret = filemap_write_and_wait_range(mapping, start, offset); 5368 + if (!ret) 5369 + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); 5347 5370 if (ret) 5348 - goto out_mmap; 5349 - /* 5350 - * Write data that will be shifted to preserve them when discarding 5351 - * page cache below. We are also protected from pages becoming dirty 5352 - * by i_rwsem and invalidate_lock. 5353 - */ 5354 - ret = filemap_write_and_wait_range(mapping, offset + len, 5355 - LLONG_MAX); 5356 - if (ret) 5357 - goto out_mmap; 5358 - truncate_pagecache(inode, ioffset); 5371 + return ret; 5372 + 5373 + truncate_pagecache(inode, start); 5359 5374 5360 5375 credits = ext4_writepage_trans_blocks(inode); 5361 5376 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5362 - if (IS_ERR(handle)) { 5363 - ret = PTR_ERR(handle); 5364 - goto out_mmap; 5365 - } 5377 + if (IS_ERR(handle)) 5378 + return PTR_ERR(handle); 5379 + 5366 5380 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); 5381 + 5382 + start_lblk = offset >> inode->i_blkbits; 5383 + end_lblk = (offset + len) >> inode->i_blkbits; 5367 5384 5368 5385 down_write(&EXT4_I(inode)->i_data_sem); 5369 5386 ext4_discard_preallocations(inode); 5370 - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); 5387 + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); 5371 5388 5372 - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); 5389 + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); 5373 5390 if (ret) { 5374 5391 up_write(&EXT4_I(inode)->i_data_sem); 5375 - goto out_stop; 5392 + goto out_handle; 5376 5393 } 5377 5394 
ext4_discard_preallocations(inode); 5378 5395 5379 - ret = ext4_ext_shift_extents(inode, handle, punch_stop, 5380 - punch_stop - punch_start, SHIFT_LEFT); 5396 + ret = ext4_ext_shift_extents(inode, handle, end_lblk, 5397 + end_lblk - start_lblk, SHIFT_LEFT); 5381 5398 if (ret) { 5382 5399 up_write(&EXT4_I(inode)->i_data_sem); 5383 - goto out_stop; 5400 + goto out_handle; 5384 5401 } 5385 5402 5386 5403 new_size = inode->i_size - len; ··· 5351 5442 EXT4_I(inode)->i_disksize = new_size; 5352 5443 5353 5444 up_write(&EXT4_I(inode)->i_data_sem); 5445 + ret = ext4_mark_inode_dirty(handle, inode); 5446 + if (ret) 5447 + goto out_handle; 5448 + 5449 + ext4_update_inode_fsync_trans(handle, inode, 1); 5354 5450 if (IS_SYNC(inode)) 5355 5451 ext4_handle_sync(handle); 5356 - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 5357 - ret = ext4_mark_inode_dirty(handle, inode); 5358 - ext4_update_inode_fsync_trans(handle, inode, 1); 5359 5452 5360 - out_stop: 5453 + out_handle: 5361 5454 ext4_journal_stop(handle); 5362 - out_mmap: 5363 - filemap_invalidate_unlock(mapping); 5364 - out_mutex: 5365 - inode_unlock(inode); 5366 5455 return ret; 5367 5456 } 5368 5457 ··· 5380 5473 handle_t *handle; 5381 5474 struct ext4_ext_path *path; 5382 5475 struct ext4_extent *extent; 5383 - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; 5476 + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; 5384 5477 unsigned int credits, ee_len; 5385 - int ret = 0, depth, split_flag = 0; 5386 - loff_t ioffset; 5478 + int ret, depth, split_flag = 0; 5479 + loff_t start; 5387 5480 5388 - /* 5389 - * We need to test this early because xfstests assumes that an 5390 - * insert range of (0, 1) will return EOPNOTSUPP if the file 5391 - * system does not support insert range. 
5392 - */ 5481 + trace_ext4_insert_range(inode, offset, len); 5482 + WARN_ON_ONCE(!inode_is_locked(inode)); 5483 + 5484 + /* Currently just for extent based files */ 5393 5485 if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 5394 5486 return -EOPNOTSUPP; 5395 - 5396 5487 /* Insert range works only on fs cluster size aligned regions. */ 5397 5488 if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) 5398 5489 return -EINVAL; 5399 - 5400 - trace_ext4_insert_range(inode, offset, len); 5401 - 5402 - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); 5403 - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); 5404 - 5405 - inode_lock(inode); 5406 - /* Currently just for extent based files */ 5407 - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { 5408 - ret = -EOPNOTSUPP; 5409 - goto out_mutex; 5410 - } 5411 - 5412 - /* Check whether the maximum file size would be exceeded */ 5413 - if (len > inode->i_sb->s_maxbytes - inode->i_size) { 5414 - ret = -EFBIG; 5415 - goto out_mutex; 5416 - } 5417 - 5418 5490 /* Offset must be less than i_size */ 5419 - if (offset >= inode->i_size) { 5420 - ret = -EINVAL; 5421 - goto out_mutex; 5422 - } 5423 - 5424 - /* Wait for existing dio to complete */ 5425 - inode_dio_wait(inode); 5426 - 5427 - ret = file_modified(file); 5428 - if (ret) 5429 - goto out_mutex; 5491 + if (offset >= inode->i_size) 5492 + return -EINVAL; 5493 + /* Check whether the maximum file size would be exceeded */ 5494 + if (len > inode->i_sb->s_maxbytes - inode->i_size) 5495 + return -EFBIG; 5430 5496 5431 5497 /* 5432 - * Prevent page faults from reinstantiating pages we have released from 5433 - * page cache. 5498 + * Write out all dirty pages. Need to round down to align start offset 5499 + * to page size boundary for page size > block size. 
5434 5500 */ 5435 - filemap_invalidate_lock(mapping); 5436 - 5437 - ret = ext4_break_layouts(inode); 5501 + start = round_down(offset, PAGE_SIZE); 5502 + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); 5438 5503 if (ret) 5439 - goto out_mmap; 5504 + return ret; 5440 5505 5441 - /* 5442 - * Need to round down to align start offset to page size boundary 5443 - * for page size > block size. 5444 - */ 5445 - ioffset = round_down(offset, PAGE_SIZE); 5446 - /* Write out all dirty pages */ 5447 - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, 5448 - LLONG_MAX); 5449 - if (ret) 5450 - goto out_mmap; 5451 - truncate_pagecache(inode, ioffset); 5506 + truncate_pagecache(inode, start); 5452 5507 5453 5508 credits = ext4_writepage_trans_blocks(inode); 5454 5509 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5455 - if (IS_ERR(handle)) { 5456 - ret = PTR_ERR(handle); 5457 - goto out_mmap; 5458 - } 5510 + if (IS_ERR(handle)) 5511 + return PTR_ERR(handle); 5512 + 5459 5513 ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); 5460 5514 5461 5515 /* Expand file to avoid data loss if there is error while shifting */ 5462 5516 inode->i_size += len; 5463 5517 EXT4_I(inode)->i_disksize += len; 5464 - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 5465 5518 ret = ext4_mark_inode_dirty(handle, inode); 5466 5519 if (ret) 5467 - goto out_stop; 5520 + goto out_handle; 5521 + 5522 + start_lblk = offset >> inode->i_blkbits; 5523 + len_lblk = len >> inode->i_blkbits; 5468 5524 5469 5525 down_write(&EXT4_I(inode)->i_data_sem); 5470 5526 ext4_discard_preallocations(inode); 5471 5527 5472 - path = ext4_find_extent(inode, offset_lblk, NULL, 0); 5528 + path = ext4_find_extent(inode, start_lblk, NULL, 0); 5473 5529 if (IS_ERR(path)) { 5474 5530 up_write(&EXT4_I(inode)->i_data_sem); 5475 5531 ret = PTR_ERR(path); 5476 - goto out_stop; 5532 + goto out_handle; 5477 5533 } 5478 5534 5479 5535 depth = ext_depth(inode); ··· 5446 5576 
ee_len = ext4_ext_get_actual_len(extent); 5447 5577 5448 5578 /* 5449 - * If offset_lblk is not the starting block of extent, split 5450 - * the extent @offset_lblk 5579 + * If start_lblk is not the starting block of extent, split 5580 + * the extent @start_lblk 5451 5581 */ 5452 - if ((offset_lblk > ee_start_lblk) && 5453 - (offset_lblk < (ee_start_lblk + ee_len))) { 5582 + if ((start_lblk > ee_start_lblk) && 5583 + (start_lblk < (ee_start_lblk + ee_len))) { 5454 5584 if (ext4_ext_is_unwritten(extent)) 5455 5585 split_flag = EXT4_EXT_MARK_UNWRIT1 | 5456 5586 EXT4_EXT_MARK_UNWRIT2; 5457 5587 path = ext4_split_extent_at(handle, inode, path, 5458 - offset_lblk, split_flag, 5588 + start_lblk, split_flag, 5459 5589 EXT4_EX_NOCACHE | 5460 5590 EXT4_GET_BLOCKS_PRE_IO | 5461 5591 EXT4_GET_BLOCKS_METADATA_NOFAIL); ··· 5464 5594 if (IS_ERR(path)) { 5465 5595 up_write(&EXT4_I(inode)->i_data_sem); 5466 5596 ret = PTR_ERR(path); 5467 - goto out_stop; 5597 + goto out_handle; 5468 5598 } 5469 5599 } 5470 5600 5471 5601 ext4_free_ext_path(path); 5472 - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); 5602 + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); 5473 5603 5474 5604 /* 5475 - * if offset_lblk lies in a hole which is at start of file, use 5605 + * if start_lblk lies in a hole which is at start of file, use 5476 5606 * ee_start_lblk to shift extents 5477 5607 */ 5478 5608 ret = ext4_ext_shift_extents(inode, handle, 5479 - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); 5480 - 5609 + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); 5481 5610 up_write(&EXT4_I(inode)->i_data_sem); 5611 + if (ret) 5612 + goto out_handle; 5613 + 5614 + ext4_update_inode_fsync_trans(handle, inode, 1); 5482 5615 if (IS_SYNC(inode)) 5483 5616 ext4_handle_sync(handle); 5484 - if (ret >= 0) 5485 - ext4_update_inode_fsync_trans(handle, inode, 1); 5486 5617 5487 - out_stop: 5618 + out_handle: 5488 5619 ext4_journal_stop(handle); 5489 - 
out_mmap: 5490 - filemap_invalidate_unlock(mapping); 5491 - out_mutex: 5492 - inode_unlock(inode); 5493 5620 return ret; 5494 5621 } 5495 5622

-1

fs/ext4/extents_status.c

··· 1551 1551 1552 1552 ext4_es_print_tree(inode); 1553 1553 ext4_da_release_space(inode, reserved); 1554 - return; 1555 1554 } 1556 1555 1557 1556 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,

+19 -8

fs/ext4/file.c

··· 688 688 static ssize_t 689 689 ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 690 690 { 691 + int ret; 691 692 struct inode *inode = file_inode(iocb->ki_filp); 692 693 693 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 694 - return -EIO; 694 + ret = ext4_emergency_state(inode->i_sb); 695 + if (unlikely(ret)) 696 + return ret; 695 697 696 698 #ifdef CONFIG_FS_DAX 697 699 if (IS_DAX(inode)) ··· 702 700 703 701 if (iocb->ki_flags & IOCB_ATOMIC) { 704 702 size_t len = iov_iter_count(from); 705 - int ret; 706 703 707 704 if (len < EXT4_SB(inode->i_sb)->s_awu_min || 708 705 len > EXT4_SB(inode->i_sb)->s_awu_max) ··· 801 800 802 801 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 803 802 { 803 + int ret; 804 804 struct inode *inode = file->f_mapping->host; 805 805 struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 806 806 807 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 808 - return -EIO; 807 + if (file->f_mode & FMODE_WRITE) 808 + ret = ext4_emergency_state(inode->i_sb); 809 + else 810 + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; 811 + if (unlikely(ret)) 812 + return ret; 809 813 810 814 /* 811 815 * We don't support synchronous mappings for non-DAX files and ··· 841 835 if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) 842 836 return 0; 843 837 844 - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) 838 + if (ext4_emergency_state(sb) || sb_rdonly(sb) || 839 + !sb_start_intwrite_trylock(sb)) 845 840 return 0; 846 841 847 842 ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); ··· 885 878 { 886 879 int ret; 887 880 888 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 889 - return -EIO; 881 + if (filp->f_mode & FMODE_WRITE) 882 + ret = ext4_emergency_state(inode->i_sb); 883 + else 884 + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; 885 + if (unlikely(ret)) 886 + return ret; 890 887 891 888 ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); 892 889 if (ret)

+4 -8

fs/ext4/fsync.c

··· 132 132 bool needs_barrier = false; 133 133 struct inode *inode = file->f_mapping->host; 134 134 135 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 136 - return -EIO; 135 + ret = ext4_emergency_state(inode->i_sb); 136 + if (unlikely(ret)) 137 + return ret; 137 138 138 139 ASSERT(ext4_journal_current_handle() == NULL); 139 140 140 141 trace_ext4_sync_file_enter(file, datasync); 141 142 142 - if (sb_rdonly(inode->i_sb)) { 143 - /* Make sure that we read updated s_ext4_flags value */ 144 - smp_rmb(); 145 - if (ext4_forced_shutdown(inode->i_sb)) 146 - ret = -EROFS; 143 + if (sb_rdonly(inode->i_sb)) 147 144 goto out; 148 - } 149 145 150 146 if (!EXT4_SB(inode->i_sb)->s_journal) { 151 147 ret = ext4_fsync_nojournal(file, start, end, datasync,

+1 -1

fs/ext4/hash.c

··· 302 302 303 303 if (len && IS_CASEFOLDED(dir) && 304 304 (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { 305 - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); 305 + buff = kzalloc(PATH_MAX, GFP_KERNEL); 306 306 if (!buff) 307 307 return -ENOMEM; 308 308

+5 -4

fs/ext4/ialloc.c

··· 951 951 sb = dir->i_sb; 952 952 sbi = EXT4_SB(sb); 953 953 954 - if (unlikely(ext4_forced_shutdown(sb))) 955 - return ERR_PTR(-EIO); 954 + ret2 = ext4_emergency_state(sb); 955 + if (unlikely(ret2)) 956 + return ERR_PTR(ret2); 956 957 957 958 ngroups = ext4_get_groups_count(sb); 958 959 trace_ext4_request_inode(dir, mode); ··· 1283 1282 inode->i_generation = get_random_u32(); 1284 1283 1285 1284 /* Precompute checksum seed for inode metadata */ 1286 - if (ext4_has_metadata_csum(sb)) { 1285 + if (ext4_has_feature_metadata_csum(sb)) { 1287 1286 __u32 csum; 1288 1287 __le32 inum = cpu_to_le32(inode->i_ino); 1289 1288 __le32 gen = cpu_to_le32(inode->i_generation); ··· 1299 1298 ei->i_extra_isize = sbi->s_want_extra_isize; 1300 1299 ei->i_inline_off = 0; 1301 1300 if (ext4_has_feature_inline_data(sb) && 1302 - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) 1301 + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) 1303 1302 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1304 1303 ret = inode; 1305 1304 err = dquot_alloc_inode(inode);

+100 -165

fs/ext4/inline.c

··· 20 20 #define EXT4_INLINE_DOTDOT_OFFSET 2 21 21 #define EXT4_INLINE_DOTDOT_SIZE 4 22 22 23 + 24 + static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, 25 + struct inode *inode, 26 + void **fsdata); 27 + 23 28 static int ext4_get_inline_size(struct inode *inode) 24 29 { 25 30 if (EXT4_I(inode)->i_inline_off) ··· 233 228 struct ext4_inode *raw_inode; 234 229 int cp_len = 0; 235 230 236 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 231 + if (unlikely(ext4_emergency_state(inode->i_sb))) 237 232 return; 238 233 239 234 BUG_ON(!EXT4_I(inode)->i_inline_off); ··· 658 653 } 659 654 660 655 /* 656 + * Prepare the write for the inline data. 657 + * If the data can be written into the inode, we just read 658 + * the page and make it uptodate, and start the journal. 659 + * Otherwise read the page, makes it dirty so that it can be 660 + * handle in writepages(the i_disksize update is left to the 661 + * normal ext4_da_write_end). 662 + */ 663 + int ext4_generic_write_inline_data(struct address_space *mapping, 664 + struct inode *inode, 665 + loff_t pos, unsigned len, 666 + struct folio **foliop, 667 + void **fsdata, bool da) 668 + { 669 + int ret; 670 + handle_t *handle; 671 + struct folio *folio; 672 + struct ext4_iloc iloc; 673 + int retries = 0; 674 + 675 + ret = ext4_get_inode_loc(inode, &iloc); 676 + if (ret) 677 + return ret; 678 + 679 + retry_journal: 680 + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 681 + if (IS_ERR(handle)) { 682 + ret = PTR_ERR(handle); 683 + goto out_release_bh; 684 + } 685 + 686 + ret = ext4_prepare_inline_data(handle, inode, pos + len); 687 + if (ret && ret != -ENOSPC) 688 + goto out_stop_journal; 689 + 690 + if (ret == -ENOSPC) { 691 + ext4_journal_stop(handle); 692 + if (!da) { 693 + brelse(iloc.bh); 694 + /* Retry inside */ 695 + return ext4_convert_inline_data_to_extent(mapping, inode); 696 + } 697 + 698 + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); 699 + if (ret == -ENOSPC 
&& 700 + ext4_should_retry_alloc(inode->i_sb, &retries)) 701 + goto retry_journal; 702 + goto out_release_bh; 703 + } 704 + 705 + folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 706 + mapping_gfp_mask(mapping)); 707 + if (IS_ERR(folio)) { 708 + ret = PTR_ERR(folio); 709 + goto out_stop_journal; 710 + } 711 + 712 + down_read(&EXT4_I(inode)->xattr_sem); 713 + /* Someone else had converted it to extent */ 714 + if (!ext4_has_inline_data(inode)) { 715 + ret = 0; 716 + goto out_release_folio; 717 + } 718 + 719 + if (!folio_test_uptodate(folio)) { 720 + ret = ext4_read_inline_folio(inode, folio); 721 + if (ret < 0) 722 + goto out_release_folio; 723 + } 724 + 725 + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); 726 + if (ret) 727 + goto out_release_folio; 728 + *foliop = folio; 729 + up_read(&EXT4_I(inode)->xattr_sem); 730 + brelse(iloc.bh); 731 + return 1; 732 + 733 + out_release_folio: 734 + up_read(&EXT4_I(inode)->xattr_sem); 735 + folio_unlock(folio); 736 + folio_put(folio); 737 + out_stop_journal: 738 + ext4_journal_stop(handle); 739 + out_release_bh: 740 + brelse(iloc.bh); 741 + return ret; 742 + } 743 + 744 + /* 661 745 * Try to write data in the inode. 662 746 * If the inode has inline data, check whether the new write can be 663 747 * in the inode also. If not, create the page the handle, move the data ··· 757 663 loff_t pos, unsigned len, 758 664 struct folio **foliop) 759 665 { 760 - int ret; 761 - handle_t *handle; 762 - struct folio *folio; 763 - struct ext4_iloc iloc; 764 - 765 666 if (pos + len > ext4_get_max_inline_size(inode)) 766 - goto convert; 767 - 768 - ret = ext4_get_inode_loc(inode, &iloc); 769 - if (ret) 770 - return ret; 771 - 772 - /* 773 - * The possible write could happen in the inode, 774 - * so try to reserve the space in inode first. 
775 - */ 776 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 777 - if (IS_ERR(handle)) { 778 - ret = PTR_ERR(handle); 779 - handle = NULL; 780 - goto out; 781 - } 782 - 783 - ret = ext4_prepare_inline_data(handle, inode, pos + len); 784 - if (ret && ret != -ENOSPC) 785 - goto out; 786 - 787 - /* We don't have space in inline inode, so convert it to extent. */ 788 - if (ret == -ENOSPC) { 789 - ext4_journal_stop(handle); 790 - brelse(iloc.bh); 791 - goto convert; 792 - } 793 - 794 - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, 795 - EXT4_JTR_NONE); 796 - if (ret) 797 - goto out; 798 - 799 - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 800 - mapping_gfp_mask(mapping)); 801 - if (IS_ERR(folio)) { 802 - ret = PTR_ERR(folio); 803 - goto out; 804 - } 805 - 806 - *foliop = folio; 807 - down_read(&EXT4_I(inode)->xattr_sem); 808 - if (!ext4_has_inline_data(inode)) { 809 - ret = 0; 810 - folio_unlock(folio); 811 - folio_put(folio); 812 - goto out_up_read; 813 - } 814 - 815 - if (!folio_test_uptodate(folio)) { 816 - ret = ext4_read_inline_folio(inode, folio); 817 - if (ret < 0) { 818 - folio_unlock(folio); 819 - folio_put(folio); 820 - goto out_up_read; 821 - } 822 - } 823 - 824 - ret = 1; 825 - handle = NULL; 826 - out_up_read: 827 - up_read(&EXT4_I(inode)->xattr_sem); 828 - out: 829 - if (handle && (ret != 1)) 830 - ext4_journal_stop(handle); 831 - brelse(iloc.bh); 832 - return ret; 833 - convert: 834 - return ext4_convert_inline_data_to_extent(mapping, inode); 667 + return ext4_convert_inline_data_to_extent(mapping, inode); 668 + return ext4_generic_write_inline_data(mapping, inode, pos, len, 669 + foliop, NULL, false); 835 670 } 836 671 837 672 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ··· 904 881 return ret; 905 882 } 906 883 907 - /* 908 - * Prepare the write for the inline data. 
909 - * If the data can be written into the inode, we just read 910 - * the page and make it uptodate, and start the journal. 911 - * Otherwise read the page, makes it dirty so that it can be 912 - * handle in writepages(the i_disksize update is left to the 913 - * normal ext4_da_write_end). 914 - */ 915 - int ext4_da_write_inline_data_begin(struct address_space *mapping, 916 - struct inode *inode, 917 - loff_t pos, unsigned len, 918 - struct folio **foliop, 919 - void **fsdata) 920 - { 921 - int ret; 922 - handle_t *handle; 923 - struct folio *folio; 924 - struct ext4_iloc iloc; 925 - int retries = 0; 926 - 927 - ret = ext4_get_inode_loc(inode, &iloc); 928 - if (ret) 929 - return ret; 930 - 931 - retry_journal: 932 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); 933 - if (IS_ERR(handle)) { 934 - ret = PTR_ERR(handle); 935 - goto out; 936 - } 937 - 938 - ret = ext4_prepare_inline_data(handle, inode, pos + len); 939 - if (ret && ret != -ENOSPC) 940 - goto out_journal; 941 - 942 - if (ret == -ENOSPC) { 943 - ext4_journal_stop(handle); 944 - ret = ext4_da_convert_inline_data_to_extent(mapping, 945 - inode, 946 - fsdata); 947 - if (ret == -ENOSPC && 948 - ext4_should_retry_alloc(inode->i_sb, &retries)) 949 - goto retry_journal; 950 - goto out; 951 - } 952 - 953 - /* 954 - * We cannot recurse into the filesystem as the transaction 955 - * is already started. 
956 - */ 957 - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, 958 - mapping_gfp_mask(mapping)); 959 - if (IS_ERR(folio)) { 960 - ret = PTR_ERR(folio); 961 - goto out_journal; 962 - } 963 - 964 - down_read(&EXT4_I(inode)->xattr_sem); 965 - if (!ext4_has_inline_data(inode)) { 966 - ret = 0; 967 - goto out_release_page; 968 - } 969 - 970 - if (!folio_test_uptodate(folio)) { 971 - ret = ext4_read_inline_folio(inode, folio); 972 - if (ret < 0) 973 - goto out_release_page; 974 - } 975 - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, 976 - EXT4_JTR_NONE); 977 - if (ret) 978 - goto out_release_page; 979 - 980 - up_read(&EXT4_I(inode)->xattr_sem); 981 - *foliop = folio; 982 - brelse(iloc.bh); 983 - return 1; 984 - out_release_page: 985 - up_read(&EXT4_I(inode)->xattr_sem); 986 - folio_unlock(folio); 987 - folio_put(folio); 988 - out_journal: 989 - ext4_journal_stop(handle); 990 - out: 991 - brelse(iloc.bh); 992 - return ret; 993 - } 994 - 995 884 #ifdef INLINE_DIR_DEBUG 996 885 void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, 997 886 void *inline_start, int inline_size) ··· 947 1012 int err; 948 1013 struct ext4_dir_entry_2 *de; 949 1014 950 - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, 1015 + err = ext4_find_dest_de(dir, iloc->bh, inline_start, 951 1016 inline_size, fname, &de); 952 1017 if (err) 953 1018 return err; ··· 1081 1146 memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, 1082 1147 inline_size - EXT4_INLINE_DOTDOT_SIZE); 1083 1148 1084 - if (ext4_has_metadata_csum(inode->i_sb)) 1149 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 1085 1150 csum_size = sizeof(struct ext4_dir_entry_tail); 1086 1151 1087 1152 inode->i_size = inode->i_sb->s_blocksize;

+164 -124

fs/ext4/inode.c

··· 31 31 #include <linux/writeback.h> 32 32 #include <linux/pagevec.h> 33 33 #include <linux/mpage.h> 34 + #include <linux/rmap.h> 34 35 #include <linux/namei.h> 35 36 #include <linux/uio.h> 36 37 #include <linux/bio.h> ··· 94 93 95 94 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 96 95 cpu_to_le32(EXT4_OS_LINUX) || 97 - !ext4_has_metadata_csum(inode->i_sb)) 96 + !ext4_has_feature_metadata_csum(inode->i_sb)) 98 97 return 1; 99 98 100 99 provided = le16_to_cpu(raw->i_checksum_lo); ··· 115 114 116 115 if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != 117 116 cpu_to_le32(EXT4_OS_LINUX) || 118 - !ext4_has_metadata_csum(inode->i_sb)) 117 + !ext4_has_feature_metadata_csum(inode->i_sb)) 119 118 return; 120 119 121 120 csum = ext4_inode_csum(inode, raw, ei); ··· 752 751 flags &= EXT4_MAP_FLAGS; 753 752 754 753 /* Dummy buffer_head? Set non-atomically. */ 755 - if (!bh->b_page) { 754 + if (!bh->b_folio) { 756 755 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; 757 756 return; 758 757 } ··· 1150 1149 pgoff_t index; 1151 1150 unsigned from, to; 1152 1151 1153 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 1154 - return -EIO; 1152 + ret = ext4_emergency_state(inode->i_sb); 1153 + if (unlikely(ret)) 1154 + return ret; 1155 1155 1156 1156 trace_ext4_write_begin(inode, pos, len); 1157 1157 /* ··· 2227 2225 mpd->io_submit.io_end->handle = handle->h_rsv_handle; 2228 2226 handle->h_rsv_handle = NULL; 2229 2227 } 2230 - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); 2228 + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); 2231 2229 } 2232 2230 2233 2231 BUG_ON(map->m_len == 0); ··· 2275 2273 if (err < 0) { 2276 2274 struct super_block *sb = inode->i_sb; 2277 2275 2278 - if (ext4_forced_shutdown(sb)) 2276 + if (ext4_emergency_state(sb)) 2279 2277 goto invalidate_dirty_pages; 2280 2278 /* 2281 2279 * Let the upper layers retry transient errors. ··· 2601 2599 * *never* be called, so if that ever happens, we would want 2602 2600 * the stack trace. 
2603 2601 */ 2604 - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { 2605 - ret = -EROFS; 2602 + ret = ext4_emergency_state(mapping->host->i_sb); 2603 + if (unlikely(ret)) 2606 2604 goto out_writepages; 2607 - } 2608 2605 2609 2606 /* 2610 2607 * If we have inline data and arrive here, it means that ··· 2818 2817 int ret; 2819 2818 int alloc_ctx; 2820 2819 2821 - if (unlikely(ext4_forced_shutdown(sb))) 2822 - return -EIO; 2820 + ret = ext4_emergency_state(sb); 2821 + if (unlikely(ret)) 2822 + return ret; 2823 2823 2824 2824 alloc_ctx = ext4_writepages_down_read(sb); 2825 2825 ret = ext4_do_writepages(&mpd); ··· 2860 2858 struct inode *inode = mapping->host; 2861 2859 int alloc_ctx; 2862 2860 2863 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 2864 - return -EIO; 2861 + ret = ext4_emergency_state(inode->i_sb); 2862 + if (unlikely(ret)) 2863 + return ret; 2865 2864 2866 2865 alloc_ctx = ext4_writepages_down_read(inode->i_sb); 2867 2866 trace_ext4_writepages(inode, wbc); ··· 2918 2915 pgoff_t index; 2919 2916 struct inode *inode = mapping->host; 2920 2917 2921 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 2922 - return -EIO; 2918 + ret = ext4_emergency_state(inode->i_sb); 2919 + if (unlikely(ret)) 2920 + return ret; 2923 2921 2924 2922 index = pos >> PAGE_SHIFT; 2925 2923 ··· 2933 2929 trace_ext4_da_write_begin(inode, pos, len); 2934 2930 2935 2931 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2936 - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, 2937 - foliop, fsdata); 2932 + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, 2933 + foliop, fsdata, true); 2938 2934 if (ret < 0) 2939 2935 return ret; 2940 2936 if (ret == 1) ··· 3910 3906 return ret; 3911 3907 } 3912 3908 3909 + static inline void ext4_truncate_folio(struct inode *inode, 3910 + loff_t start, loff_t end) 3911 + { 3912 + unsigned long blocksize = i_blocksize(inode); 3913 + struct folio *folio; 3914 + 3915 + /* Nothing to be done if no 
complete block needs to be truncated. */ 3916 + if (round_up(start, blocksize) >= round_down(end, blocksize)) 3917 + return; 3918 + 3919 + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); 3920 + if (IS_ERR(folio)) 3921 + return; 3922 + 3923 + if (folio_mkclean(folio)) 3924 + folio_mark_dirty(folio); 3925 + folio_unlock(folio); 3926 + folio_put(folio); 3927 + } 3928 + 3929 + int ext4_truncate_page_cache_block_range(struct inode *inode, 3930 + loff_t start, loff_t end) 3931 + { 3932 + unsigned long blocksize = i_blocksize(inode); 3933 + int ret; 3934 + 3935 + /* 3936 + * For journalled data we need to write (and checkpoint) pages 3937 + * before discarding page cache to avoid inconsistent data on disk 3938 + * in case of crash before freeing or unwritten converting trans 3939 + * is committed. 3940 + */ 3941 + if (ext4_should_journal_data(inode)) { 3942 + ret = filemap_write_and_wait_range(inode->i_mapping, start, 3943 + end - 1); 3944 + if (ret) 3945 + return ret; 3946 + goto truncate_pagecache; 3947 + } 3948 + 3949 + /* 3950 + * If the block size is less than the page size, the file's mapped 3951 + * blocks within one page could be freed or converted to unwritten. 3952 + * So it's necessary to remove writable userspace mappings, and then 3953 + * ext4_page_mkwrite() can be called during subsequent write access 3954 + * to these partial folios. 
3955 + */ 3956 + if (!IS_ALIGNED(start | end, PAGE_SIZE) && 3957 + blocksize < PAGE_SIZE && start < inode->i_size) { 3958 + loff_t page_boundary = round_up(start, PAGE_SIZE); 3959 + 3960 + ext4_truncate_folio(inode, start, min(page_boundary, end)); 3961 + if (end > page_boundary) 3962 + ext4_truncate_folio(inode, 3963 + round_down(end, PAGE_SIZE), end); 3964 + } 3965 + 3966 + truncate_pagecache: 3967 + truncate_pagecache_range(inode, start, end - 1); 3968 + return 0; 3969 + } 3970 + 3913 3971 static void ext4_wait_dax_page(struct inode *inode) 3914 3972 { 3915 3973 filemap_invalidate_unlock(inode->i_mapping); ··· 4016 3950 { 4017 3951 struct inode *inode = file_inode(file); 4018 3952 struct super_block *sb = inode->i_sb; 4019 - ext4_lblk_t first_block, stop_block; 4020 - struct address_space *mapping = inode->i_mapping; 4021 - loff_t first_block_offset, last_block_offset, max_length; 4022 - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 3953 + ext4_lblk_t start_lblk, end_lblk; 3954 + loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; 3955 + loff_t end = offset + length; 4023 3956 handle_t *handle; 4024 3957 unsigned int credits; 4025 - int ret = 0, ret2 = 0; 3958 + int ret; 4026 3959 4027 3960 trace_ext4_punch_hole(inode, offset, length, 0); 4028 - 4029 - /* 4030 - * Write out all dirty pages to avoid race conditions 4031 - * Then release them. 
4032 - */ 4033 - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { 4034 - ret = filemap_write_and_wait_range(mapping, offset, 4035 - offset + length - 1); 4036 - if (ret) 4037 - return ret; 4038 - } 4039 - 4040 - inode_lock(inode); 3961 + WARN_ON_ONCE(!inode_is_locked(inode)); 4041 3962 4042 3963 /* No need to punch hole beyond i_size */ 4043 3964 if (offset >= inode->i_size) 4044 - goto out_mutex; 3965 + return 0; 4045 3966 4046 3967 /* 4047 - * If the hole extends beyond i_size, set the hole 4048 - * to end after the page that contains i_size 3968 + * If the hole extends beyond i_size, set the hole to end after 3969 + * the page that contains i_size, and also make sure that the hole 3970 + * within one block before last range. 4049 3971 */ 4050 - if (offset + length > inode->i_size) { 4051 - length = inode->i_size + 4052 - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - 4053 - offset; 4054 - } 3972 + if (end > inode->i_size) 3973 + end = round_up(inode->i_size, PAGE_SIZE); 3974 + if (end > max_end) 3975 + end = max_end; 3976 + length = end - offset; 4055 3977 4056 3978 /* 4057 - * For punch hole the length + offset needs to be within one block 4058 - * before last range. Adjust the length if it goes beyond that limit. 3979 + * Attach jinode to inode for jbd2 if we do any zeroing of partial 3980 + * block. 
4059 3981 */ 4060 - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; 4061 - if (offset + length > max_length) 4062 - length = max_length - offset; 4063 - 4064 - if (offset & (sb->s_blocksize - 1) || 4065 - (offset + length) & (sb->s_blocksize - 1)) { 4066 - /* 4067 - * Attach jinode to inode for jbd2 if we do any zeroing of 4068 - * partial block 4069 - */ 3982 + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { 4070 3983 ret = ext4_inode_attach_jinode(inode); 4071 3984 if (ret < 0) 4072 - goto out_mutex; 4073 - 3985 + return ret; 4074 3986 } 4075 3987 4076 - /* Wait all existing dio workers, newcomers will block on i_rwsem */ 4077 - inode_dio_wait(inode); 4078 3988 4079 - ret = file_modified(file); 3989 + ret = ext4_update_disksize_before_punch(inode, offset, length); 4080 3990 if (ret) 4081 - goto out_mutex; 4082 - 4083 - /* 4084 - * Prevent page faults from reinstantiating pages we have released from 4085 - * page cache. 4086 - */ 4087 - filemap_invalidate_lock(mapping); 4088 - 4089 - ret = ext4_break_layouts(inode); 4090 - if (ret) 4091 - goto out_dio; 4092 - 4093 - first_block_offset = round_up(offset, sb->s_blocksize); 4094 - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; 3991 + return ret; 4095 3992 4096 3993 /* Now release the pages and zero block aligned part of pages*/ 4097 - if (last_block_offset > first_block_offset) { 4098 - ret = ext4_update_disksize_before_punch(inode, offset, length); 4099 - if (ret) 4100 - goto out_dio; 4101 - truncate_pagecache_range(inode, first_block_offset, 4102 - last_block_offset); 4103 - } 3994 + ret = ext4_truncate_page_cache_block_range(inode, offset, end); 3995 + if (ret) 3996 + return ret; 4104 3997 4105 3998 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4106 3999 credits = ext4_writepage_trans_blocks(inode); ··· 4069 4044 if (IS_ERR(handle)) { 4070 4045 ret = PTR_ERR(handle); 4071 4046 ext4_std_error(sb, ret); 4072 - goto out_dio; 4047 + return ret; 4073 4048 } 4074 4049 
4075 - ret = ext4_zero_partial_blocks(handle, inode, offset, 4076 - length); 4050 + ret = ext4_zero_partial_blocks(handle, inode, offset, length); 4077 4051 if (ret) 4078 - goto out_stop; 4079 - 4080 - first_block = (offset + sb->s_blocksize - 1) >> 4081 - EXT4_BLOCK_SIZE_BITS(sb); 4082 - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); 4052 + goto out_handle; 4083 4053 4084 4054 /* If there are blocks to remove, do it */ 4085 - if (stop_block > first_block) { 4086 - ext4_lblk_t hole_len = stop_block - first_block; 4055 + start_lblk = EXT4_B_TO_LBLK(inode, offset); 4056 + end_lblk = end >> inode->i_blkbits; 4057 + 4058 + if (end_lblk > start_lblk) { 4059 + ext4_lblk_t hole_len = end_lblk - start_lblk; 4087 4060 4088 4061 down_write(&EXT4_I(inode)->i_data_sem); 4089 4062 ext4_discard_preallocations(inode); 4090 4063 4091 - ext4_es_remove_extent(inode, first_block, hole_len); 4064 + ext4_es_remove_extent(inode, start_lblk, hole_len); 4092 4065 4093 4066 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4094 - ret = ext4_ext_remove_space(inode, first_block, 4095 - stop_block - 1); 4067 + ret = ext4_ext_remove_space(inode, start_lblk, 4068 + end_lblk - 1); 4096 4069 else 4097 - ret = ext4_ind_remove_space(handle, inode, first_block, 4098 - stop_block); 4070 + ret = ext4_ind_remove_space(handle, inode, start_lblk, 4071 + end_lblk); 4072 + if (ret) { 4073 + up_write(&EXT4_I(inode)->i_data_sem); 4074 + goto out_handle; 4075 + } 4099 4076 4100 - ext4_es_insert_extent(inode, first_block, hole_len, ~0, 4077 + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, 4101 4078 EXTENT_STATUS_HOLE, 0); 4102 4079 up_write(&EXT4_I(inode)->i_data_sem); 4103 4080 } 4104 - ext4_fc_track_range(handle, inode, first_block, stop_block); 4081 + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); 4082 + 4083 + ret = ext4_mark_inode_dirty(handle, inode); 4084 + if (unlikely(ret)) 4085 + goto out_handle; 4086 + 4087 + ext4_update_inode_fsync_trans(handle, inode, 1); 4105 
4088 if (IS_SYNC(inode)) 4106 4089 ext4_handle_sync(handle); 4107 - 4108 - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); 4109 - ret2 = ext4_mark_inode_dirty(handle, inode); 4110 - if (unlikely(ret2)) 4111 - ret = ret2; 4112 - if (ret >= 0) 4113 - ext4_update_inode_fsync_trans(handle, inode, 1); 4114 - out_stop: 4090 + out_handle: 4115 4091 ext4_journal_stop(handle); 4116 - out_dio: 4117 - filemap_invalidate_unlock(mapping); 4118 - out_mutex: 4119 - inode_unlock(inode); 4120 4092 return ret; 4121 4093 } 4122 4094 ··· 4700 4678 *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { 4701 4679 int err; 4702 4680 4681 + err = xattr_check_inode(inode, IHDR(inode, raw_inode), 4682 + ITAIL(inode, raw_inode)); 4683 + if (err) 4684 + return err; 4685 + 4703 4686 ext4_set_inode_state(inode, EXT4_STATE_XATTR); 4704 4687 err = ext4_find_inline_data_nolock(inode); 4705 4688 if (!err && ext4_has_inline_data(inode)) ··· 4831 4804 ei->i_extra_isize = 0; 4832 4805 4833 4806 /* Precompute checksum seed for inode metadata */ 4834 - if (ext4_has_metadata_csum(sb)) { 4807 + if (ext4_has_feature_metadata_csum(sb)) { 4835 4808 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 4836 4809 __u32 csum; 4837 4810 __le32 inum = cpu_to_le32(inode->i_ino); ··· 4918 4891 * we'd normally treat htree data as empty space. But with metadata 4919 4892 * checksumming that corrupts checksums so forbid that. 
4920 4893 */ 4921 - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && 4894 + if (!ext4_has_feature_dir_index(sb) && 4895 + ext4_has_feature_metadata_csum(sb) && 4922 4896 ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { 4923 4897 ext4_error_inode(inode, function, line, 0, 4924 4898 "iget: Dir with htree data on filesystem without dir_index feature."); ··· 5039 5011 inode->i_op = &ext4_encrypted_symlink_inode_operations; 5040 5012 } else if (ext4_inode_is_fast_symlink(inode)) { 5041 5013 inode->i_op = &ext4_fast_symlink_inode_operations; 5042 - nd_terminate_link(ei->i_data, inode->i_size, 5043 - sizeof(ei->i_data) - 1); 5014 + if (inode->i_size == 0 || 5015 + inode->i_size >= sizeof(ei->i_data) || 5016 + strnlen((char *)ei->i_data, inode->i_size + 1) != 5017 + inode->i_size) { 5018 + ext4_error_inode(inode, function, line, 0, 5019 + "invalid fast symlink length %llu", 5020 + (unsigned long long)inode->i_size); 5021 + ret = -EFSCORRUPTED; 5022 + goto bad_inode; 5023 + } 5044 5024 inode_set_cached_link(inode, (char *)ei->i_data, 5045 5025 inode->i_size); 5046 5026 } else { ··· 5268 5232 if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) 5269 5233 return 0; 5270 5234 5271 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 5272 - return -EIO; 5235 + err = ext4_emergency_state(inode->i_sb); 5236 + if (unlikely(err)) 5237 + return err; 5273 5238 5274 5239 if (EXT4_SB(inode->i_sb)->s_journal) { 5275 5240 if (ext4_journal_current_handle()) { ··· 5392 5355 const unsigned int ia_valid = attr->ia_valid; 5393 5356 bool inc_ivers = true; 5394 5357 5395 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 5396 - return -EIO; 5358 + error = ext4_emergency_state(inode->i_sb); 5359 + if (unlikely(error)) 5360 + return error; 5397 5361 5398 5362 if (unlikely(IS_IMMUTABLE(inode))) 5399 5363 return -EPERM; ··· 5506 5468 oldsize & (inode->i_sb->s_blocksize - 1)) { 5507 5469 error = ext4_inode_attach_jinode(inode); 5508 5470 if (error) 5509 - goto err_out; 5471 + goto 
out_mmap_sem; 5510 5472 } 5511 5473 5512 5474 handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); ··· 5838 5800 { 5839 5801 int err = 0; 5840 5802 5841 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { 5803 + err = ext4_emergency_state(inode->i_sb); 5804 + if (unlikely(err)) { 5842 5805 put_bh(iloc->bh); 5843 - return -EIO; 5806 + return err; 5844 5807 } 5845 5808 ext4_fc_track_inode(handle, inode); 5846 5809 ··· 5865 5826 { 5866 5827 int err; 5867 5828 5868 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 5869 - return -EIO; 5829 + err = ext4_emergency_state(inode->i_sb); 5830 + if (unlikely(err)) 5831 + return err; 5870 5832 5871 5833 err = ext4_get_inode_loc(inode, iloc); 5872 5834 if (!err) {

+7 -6

fs/ext4/ioctl.c

··· 142 142 143 143 es = (struct ext4_super_block *) (bh->b_data + offset); 144 144 lock_buffer(bh); 145 - if (ext4_has_metadata_csum(sb) && 145 + if (ext4_has_feature_metadata_csum(sb) && 146 146 es->s_checksum != ext4_superblock_csum(sb, es)) { 147 147 ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " 148 148 "superblock %llu", sb_block); ··· 150 150 goto out_bh; 151 151 } 152 152 func(es, arg); 153 - if (ext4_has_metadata_csum(sb)) 153 + if (ext4_has_feature_metadata_csum(sb)) 154 154 es->s_checksum = ext4_superblock_csum(sb, es); 155 155 set_buffer_uptodate(bh); 156 156 unlock_buffer(bh); ··· 351 351 __le32 gen = cpu_to_le32(inode->i_generation); 352 352 __u32 csum; 353 353 354 - if (!ext4_has_metadata_csum(inode->i_sb)) 354 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 355 355 return; 356 356 357 357 csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ··· 1205 1205 * If any checksums (group descriptors or metadata) are being used 1206 1206 * then the checksum seed feature is required to change the UUID. 1207 1207 */ 1208 - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) 1208 + if (((ext4_has_feature_gdt_csum(sb) || 1209 + ext4_has_feature_metadata_csum(sb)) 1209 1210 && !ext4_has_feature_csum_seed(sb)) 1210 1211 || ext4_has_feature_stable_inodes(sb)) 1211 1212 return -EOPNOTSUPP; ··· 1254 1253 if (!inode_owner_or_capable(idmap, inode)) 1255 1254 return -EPERM; 1256 1255 1257 - if (ext4_has_metadata_csum(inode->i_sb)) { 1256 + if (ext4_has_feature_metadata_csum(inode->i_sb)) { 1258 1257 ext4_warning(sb, "Setting inode version is not " 1259 1258 "supported with metadata_csum enabled."); 1260 1259 return -ENOTTY; ··· 1706 1705 { 1707 1706 struct ext4_sb_info *sbi = EXT4_SB(sb); 1708 1707 1709 - if (sb_rdonly(sb)) 1708 + if (ext4_emergency_state(sb) || sb_rdonly(sb)) 1710 1709 return 0; 1711 1710 if (!force && 1712 1711 (sbi->s_overhead == 0 ||

+2

fs/ext4/mballoc-test.c

··· 796 796 KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); 797 797 grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, 798 798 bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); 799 + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); 799 800 800 801 ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); 801 802 KUNIT_ASSERT_EQ(test, ret, 0); ··· 861 860 KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); 862 861 grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, 863 862 bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); 863 + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); 864 864 865 865 ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); 866 866 KUNIT_ASSERT_EQ(test, ret, 0);

+4 -4

fs/ext4/mballoc.c

··· 187 187 * /sys/fs/ext4/<partition>/mb_min_to_scan 188 188 * /sys/fs/ext4/<partition>/mb_max_to_scan 189 189 * /sys/fs/ext4/<partition>/mb_order2_req 190 - * /sys/fs/ext4/<partition>/mb_linear_limit 190 + * /sys/fs/ext4/<partition>/mb_max_linear_groups 191 191 * 192 192 * The regular allocator uses buddy scan only if the request len is power of 193 193 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The ··· 209 209 * get traversed linearly. That may result in subsequent allocations being not 210 210 * close to each other. And so, the underlying device may get filled up in a 211 211 * non-linear fashion. While that may not matter on non-rotational devices, for 212 - * rotational devices that may result in higher seek times. "mb_linear_limit" 212 + * rotational devices that may result in higher seek times. "mb_max_linear_groups" 213 213 * tells mballoc how many groups mballoc should search linearly before 214 214 * performing consulting above data structures for more efficient lookups. For 215 215 * non rotational devices, this value defaults to 0 and for rotational devices ··· 5653 5653 { 5654 5654 ext4_group_t i, ngroups; 5655 5655 5656 - if (ext4_forced_shutdown(sb)) 5656 + if (ext4_emergency_state(sb)) 5657 5657 return; 5658 5658 5659 5659 ngroups = ext4_get_groups_count(sb); ··· 5687 5687 { 5688 5688 struct super_block *sb = ac->ac_sb; 5689 5689 5690 - if (ext4_forced_shutdown(sb)) 5690 + if (ext4_emergency_state(sb)) 5691 5691 return; 5692 5692 5693 5693 mb_debug(sb, "Can't allocate:"

+3 -3

fs/ext4/mmp.c

··· 21 21 22 22 static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) 23 23 { 24 - if (!ext4_has_metadata_csum(sb)) 24 + if (!ext4_has_feature_metadata_csum(sb)) 25 25 return 1; 26 26 27 27 return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); ··· 29 29 30 30 static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) 31 31 { 32 - if (!ext4_has_metadata_csum(sb)) 32 + if (!ext4_has_feature_metadata_csum(sb)) 33 33 return; 34 34 35 35 mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); ··· 162 162 memcpy(mmp->mmp_nodename, init_utsname()->nodename, 163 163 sizeof(mmp->mmp_nodename)); 164 164 165 - while (!kthread_should_stop() && !ext4_forced_shutdown(sb)) { 165 + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { 166 166 if (!ext4_has_feature_mmp(sb)) { 167 167 ext4_warning(sb, "kmmpd being stopped since MMP feature" 168 168 " has been disabled.");

+52 -65

fs/ext4/namei.c

··· 176 176 brelse(bh); 177 177 return ERR_PTR(-EFSCORRUPTED); 178 178 } 179 - if (!ext4_has_metadata_csum(inode->i_sb) || 179 + if (!ext4_has_feature_metadata_csum(inode->i_sb) || 180 180 buffer_verified(bh)) 181 181 return bh; 182 182 ··· 291 291 __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ 292 292 }; 293 293 294 - static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); 295 - static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); 296 - static inline unsigned dx_get_hash(struct dx_entry *entry); 297 - static void dx_set_hash(struct dx_entry *entry, unsigned value); 298 - static unsigned dx_get_count(struct dx_entry *entries); 299 - static unsigned dx_get_limit(struct dx_entry *entries); 300 - static void dx_set_count(struct dx_entry *entries, unsigned value); 301 - static void dx_set_limit(struct dx_entry *entries, unsigned value); 302 - static unsigned dx_root_limit(struct inode *dir, unsigned infosize); 303 - static unsigned dx_node_limit(struct inode *dir); 304 - static struct dx_frame *dx_probe(struct ext4_filename *fname, 305 - struct inode *dir, 306 - struct dx_hash_info *hinfo, 307 - struct dx_frame *frame); 308 - static void dx_release(struct dx_frame *frames); 309 - static int dx_make_map(struct inode *dir, struct buffer_head *bh, 310 - struct dx_hash_info *hinfo, 311 - struct dx_map_entry *map_tail); 312 - static void dx_sort_map(struct dx_map_entry *map, unsigned count); 313 - static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, 314 - char *to, struct dx_map_entry *offsets, 315 - int count, unsigned int blocksize); 316 - static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, 317 - unsigned int blocksize); 318 - static void dx_insert_block(struct dx_frame *frame, 319 - u32 hash, ext4_lblk_t block); 320 - static int ext4_htree_next_block(struct inode *dir, __u32 hash, 321 - struct dx_frame *frame, 322 - struct dx_frame *frames, 323 - __u32 *start_hash); 324 294 static struct 
buffer_head * ext4_dx_find_entry(struct inode *dir, 325 295 struct ext4_filename *fname, 326 296 struct ext4_dir_entry_2 **res_dir); ··· 368 398 { 369 399 struct ext4_dir_entry_tail *t; 370 400 371 - if (!ext4_has_metadata_csum(inode->i_sb)) 401 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 372 402 return 1; 373 403 374 404 t = get_dirent_tail(inode, bh); ··· 389 419 { 390 420 struct ext4_dir_entry_tail *t; 391 421 392 - if (!ext4_has_metadata_csum(inode->i_sb)) 422 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 393 423 return; 394 424 395 425 t = get_dirent_tail(inode, bh); ··· 464 494 struct dx_tail *t; 465 495 int count_offset, limit, count; 466 496 467 - if (!ext4_has_metadata_csum(inode->i_sb)) 497 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 468 498 return 1; 469 499 470 500 c = get_dx_countlimit(inode, dirent, &count_offset); ··· 493 523 struct dx_tail *t; 494 524 int count_offset, limit, count; 495 525 496 - if (!ext4_has_metadata_csum(inode->i_sb)) 526 + if (!ext4_has_feature_metadata_csum(inode->i_sb)) 497 527 return; 498 528 499 529 c = get_dx_countlimit(inode, dirent, &count_offset); ··· 582 612 ext4_dir_rec_len(1, NULL) - 583 613 ext4_dir_rec_len(2, NULL) - infosize; 584 614 585 - if (ext4_has_metadata_csum(dir->i_sb)) 615 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 586 616 entry_space -= sizeof(struct dx_tail); 587 617 return entry_space / sizeof(struct dx_entry); 588 618 } ··· 592 622 unsigned int entry_space = dir->i_sb->s_blocksize - 593 623 ext4_dir_rec_len(0, dir); 594 624 595 - if (ext4_has_metadata_csum(dir->i_sb)) 625 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 596 626 entry_space -= sizeof(struct dx_tail); 597 627 return entry_space / sizeof(struct dx_entry); 598 628 } ··· 1046 1076 struct ext4_dir_entry_2 *de, *top; 1047 1077 int err = 0, count = 0; 1048 1078 struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; 1049 - int csum = ext4_has_metadata_csum(dir->i_sb); 1079 + int csum = 
ext4_has_feature_metadata_csum(dir->i_sb); 1050 1080 1051 1081 dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", 1052 1082 (unsigned long)block)); ··· 1290 1320 struct dx_hash_info h = *hinfo; 1291 1321 int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); 1292 1322 1293 - if (ext4_has_metadata_csum(dir->i_sb)) 1323 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 1294 1324 buflen -= sizeof(struct ext4_dir_entry_tail); 1295 1325 1296 1326 while ((char *) de < base + buflen) { ··· 1432 1462 * sure cf_name was properly initialized before 1433 1463 * considering the calculated hash. 1434 1464 */ 1435 - if (IS_ENCRYPTED(parent) && fname->cf_name.name && 1465 + if (sb_no_casefold_compat_fallback(parent->i_sb) && 1466 + IS_ENCRYPTED(parent) && fname->cf_name.name && 1436 1467 (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || 1437 1468 fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) 1438 1469 return false; ··· 1566 1595 * return. Otherwise, fall back to doing a search the 1567 1596 * old fashioned way. 
1568 1597 */ 1569 - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) 1598 + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) 1599 + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1600 + "falling back\n")); 1601 + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && 1602 + *res_dir == NULL && IS_CASEFOLDED(dir)) 1603 + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " 1604 + "failed, falling back\n")); 1605 + else 1570 1606 goto cleanup_and_exit; 1571 - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " 1572 - "falling back\n")); 1573 1607 ret = NULL; 1574 1608 } 1575 1609 nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); ··· 1921 1945 int csum_size = 0; 1922 1946 int err = 0, i; 1923 1947 1924 - if (ext4_has_metadata_csum(dir->i_sb)) 1948 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 1925 1949 csum_size = sizeof(struct ext4_dir_entry_tail); 1926 1950 1927 1951 bh2 = ext4_append(handle, dir, &newblock); ··· 2036 2060 return ERR_PTR(err); 2037 2061 } 2038 2062 2039 - int ext4_find_dest_de(struct inode *dir, struct inode *inode, 2040 - struct buffer_head *bh, 2063 + int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, 2041 2064 void *buf, int buf_size, 2042 2065 struct ext4_filename *fname, 2043 2066 struct ext4_dir_entry_2 **dest_de) ··· 2118 2143 int csum_size = 0; 2119 2144 int err, err2; 2120 2145 2121 - if (ext4_has_metadata_csum(inode->i_sb)) 2146 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 2122 2147 csum_size = sizeof(struct ext4_dir_entry_tail); 2123 2148 2124 2149 if (!de) { 2125 - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, 2150 + err = ext4_find_dest_de(dir, bh, bh->b_data, 2126 2151 blocksize - csum_size, fname, &de); 2127 2152 if (err) 2128 2153 return err; ··· 2227 2252 struct fake_dirent *fde; 2228 2253 int csum_size = 0; 2229 2254 2230 - if (ext4_has_metadata_csum(inode->i_sb)) 2255 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 2231 2256 csum_size = sizeof(struct ext4_dir_entry_tail); 2232 
2257 2233 2258 blocksize = dir->i_sb->s_blocksize; ··· 2371 2396 ext4_lblk_t block, blocks; 2372 2397 int csum_size = 0; 2373 2398 2374 - if (ext4_has_metadata_csum(inode->i_sb)) 2399 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 2375 2400 csum_size = sizeof(struct ext4_dir_entry_tail); 2376 2401 2377 2402 sb = dir->i_sb; ··· 2402 2427 if (!retval || (retval != ERR_BAD_DX_DIR)) 2403 2428 goto out; 2404 2429 /* Can we just ignore htree data? */ 2405 - if (ext4_has_metadata_csum(sb)) { 2430 + if (ext4_has_feature_metadata_csum(sb)) { 2406 2431 EXT4_ERROR_INODE(dir, 2407 2432 "Directory has corrupted htree index."); 2408 2433 retval = -EFSCORRUPTED; ··· 2552 2577 BUFFER_TRACE(frame->bh, "get_write_access"); 2553 2578 err = ext4_journal_get_write_access(handle, sb, frame->bh, 2554 2579 EXT4_JTR_NONE); 2555 - if (err) 2580 + if (err) { 2581 + brelse(bh2); 2556 2582 goto journal_error; 2583 + } 2557 2584 if (!add_level) { 2558 2585 unsigned icount1 = icount/2, icount2 = icount - icount1; 2559 2586 unsigned hash2 = dx_get_hash(entries + icount1); ··· 2566 2589 err = ext4_journal_get_write_access(handle, sb, 2567 2590 (frame - 1)->bh, 2568 2591 EXT4_JTR_NONE); 2569 - if (err) 2592 + if (err) { 2593 + brelse(bh2); 2570 2594 goto journal_error; 2595 + } 2571 2596 2572 2597 memcpy((char *) entries2, (char *) (entries + icount1), 2573 2598 icount2 * sizeof(struct dx_entry)); ··· 2588 2609 dxtrace(dx_show_index("node", 2589 2610 ((struct dx_node *) bh2->b_data)->entries)); 2590 2611 err = ext4_handle_dirty_dx_node(handle, dir, bh2); 2591 - if (err) 2612 + if (err) { 2613 + brelse(bh2); 2592 2614 goto journal_error; 2615 + } 2593 2616 brelse (bh2); 2594 2617 err = ext4_handle_dirty_dx_node(handle, dir, 2595 2618 (frame - 1)->bh); ··· 2616 2635 "Creating %d level index...\n", 2617 2636 dxroot->info.indirect_levels)); 2618 2637 err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); 2619 - if (err) 2638 + if (err) { 2639 + brelse(bh2); 2620 2640 goto journal_error; 2641 + } 
2621 2642 err = ext4_handle_dirty_dx_node(handle, dir, bh2); 2622 2643 brelse(bh2); 2623 2644 restart = 1; ··· 2716 2733 return err; 2717 2734 } 2718 2735 2719 - if (ext4_has_metadata_csum(dir->i_sb)) 2736 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 2720 2737 csum_size = sizeof(struct ext4_dir_entry_tail); 2721 2738 2722 2739 BUFFER_TRACE(bh, "get_write_access"); ··· 2956 2973 int csum_size = 0; 2957 2974 int err; 2958 2975 2959 - if (ext4_has_metadata_csum(dir->i_sb)) 2976 + if (ext4_has_feature_metadata_csum(dir->i_sb)) 2960 2977 csum_size = sizeof(struct ext4_dir_entry_tail); 2961 2978 2962 2979 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ··· 3134 3151 struct ext4_dir_entry_2 *de; 3135 3152 handle_t *handle = NULL; 3136 3153 3137 - if (unlikely(ext4_forced_shutdown(dir->i_sb))) 3138 - return -EIO; 3154 + retval = ext4_emergency_state(dir->i_sb); 3155 + if (unlikely(retval)) 3156 + return retval; 3139 3157 3140 3158 /* Initialize quotas before so that eventual writes go in 3141 3159 * separate transaction */ ··· 3293 3309 { 3294 3310 int retval; 3295 3311 3296 - if (unlikely(ext4_forced_shutdown(dir->i_sb))) 3297 - return -EIO; 3312 + retval = ext4_emergency_state(dir->i_sb); 3313 + if (unlikely(retval)) 3314 + return retval; 3298 3315 3299 3316 trace_ext4_unlink_enter(dir, dentry); 3300 3317 /* ··· 3361 3376 struct fscrypt_str disk_link; 3362 3377 int retries = 0; 3363 3378 3364 - if (unlikely(ext4_forced_shutdown(dir->i_sb))) 3365 - return -EIO; 3379 + err = ext4_emergency_state(dir->i_sb); 3380 + if (unlikely(err)) 3381 + return err; 3366 3382 3367 3383 err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, 3368 3384 &disk_link); ··· 4185 4199 { 4186 4200 int err; 4187 4201 4188 - if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) 4189 - return -EIO; 4202 + err = ext4_emergency_state(old_dir->i_sb); 4203 + if (unlikely(err)) 4204 + return err; 4190 4205 4191 4206 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | 
RENAME_WHITEOUT)) 4192 4207 return -EINVAL;

+1 -1

fs/ext4/orphan.c

··· 537 537 struct ext4_orphan_block_tail *ot; 538 538 __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); 539 539 540 - if (!ext4_has_metadata_csum(sb)) 540 + if (!ext4_has_feature_metadata_csum(sb)) 541 541 return 1; 542 542 543 543 ot = ext4_orphan_block_tail(sb, bh);

+51 -24

fs/ext4/page-io.c

··· 164 164 } 165 165 166 166 /* 167 - * Check a range of space and convert unwritten extents to written. Note that 167 + * On successful IO, check a range of space and convert unwritten extents to 168 + * written. On IO failure, check if journal abort is needed. Note that 168 169 * we are protected from truncate touching same part of extent tree by the 169 170 * fact that truncate code waits for all DIO to finish (thus exclusion from 170 171 * direct IO is achieved) and also waits for PageWriteback bits. Thus we ··· 176 175 { 177 176 struct inode *inode = io_end->inode; 178 177 handle_t *handle = io_end->handle; 178 + struct super_block *sb = inode->i_sb; 179 179 int ret = 0; 180 180 181 181 ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," 182 182 "list->prev 0x%p\n", 183 183 io_end, inode->i_ino, io_end->list.next, io_end->list.prev); 184 184 185 - io_end->handle = NULL; /* Following call will use up the handle */ 186 - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); 187 - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { 188 - ext4_msg(inode->i_sb, KERN_EMERG, 185 + /* 186 + * Do not convert the unwritten extents if data writeback fails, 187 + * or stale data may be exposed. 188 + */ 189 + io_end->handle = NULL; /* Following call will use up the handle */ 190 + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { 191 + ret = -EIO; 192 + if (handle) 193 + jbd2_journal_free_reserved(handle); 194 + 195 + if (test_opt(sb, DATA_ERR_ABORT)) 196 + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); 197 + } else { 198 + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); 199 + } 200 + if (ret < 0 && !ext4_emergency_state(sb) && 201 + io_end->flag & EXT4_IO_END_UNWRITTEN) { 202 + ext4_msg(sb, KERN_EMERG, 189 203 "failed to convert unwritten extents to written " 190 204 "extents -- potential data loss! 
" 191 205 "(inode %lu, error %d)", inode->i_ino, ret); 192 206 } 207 + 193 208 ext4_clear_io_unwritten_flag(io_end); 194 209 ext4_release_io_end(io_end); 195 210 return ret; ··· 234 217 #endif 235 218 } 236 219 220 + static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) 221 + { 222 + if (io_end->flag & EXT4_IO_END_UNWRITTEN) 223 + return true; 224 + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && 225 + io_end->flag & EXT4_IO_END_FAILED) 226 + return true; 227 + return false; 228 + } 229 + 237 230 /* Add the io_end to per-inode completed end_io list. */ 238 231 static void ext4_add_complete_io(ext4_io_end_t *io_end) 239 232 { ··· 252 225 struct workqueue_struct *wq; 253 226 unsigned long flags; 254 227 255 - /* Only reserved conversions from writeback should enter here */ 256 - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 257 - WARN_ON(!io_end->handle && sbi->s_journal); 228 + /* Only reserved conversions or pending IO errors will enter here. */ 229 + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); 230 + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && 231 + !io_end->handle && sbi->s_journal); 232 + 258 233 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 259 234 wq = sbi->rsv_conversion_wq; 260 235 if (list_empty(&ei->i_rsv_conversion_list)) ··· 281 252 282 253 while (!list_empty(&unwritten)) { 283 254 io_end = list_entry(unwritten.next, ext4_io_end_t, list); 284 - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 255 + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); 285 256 list_del_init(&io_end->list); 286 257 287 258 err = ext4_end_io_end(io_end); ··· 292 263 } 293 264 294 265 /* 295 - * work on completed IO, to convert unwritten extents to extents 266 + * Used to convert unwritten extents to written extents upon IO completion, 267 + * or used to abort the journal upon IO errors. 
296 268 */ 297 269 void ext4_end_io_rsv_work(struct work_struct *work) 298 270 { ··· 318 288 void ext4_put_io_end_defer(ext4_io_end_t *io_end) 319 289 { 320 290 if (refcount_dec_and_test(&io_end->count)) { 321 - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || 322 - list_empty(&io_end->list_vec)) { 323 - ext4_release_io_end(io_end); 291 + if (io_end->flag & EXT4_IO_END_FAILED || 292 + (io_end->flag & EXT4_IO_END_UNWRITTEN && 293 + !list_empty(&io_end->list_vec))) { 294 + ext4_add_complete_io(io_end); 324 295 return; 325 296 } 326 - ext4_add_complete_io(io_end); 297 + ext4_release_io_end(io_end); 327 298 } 328 299 } 329 300 330 301 int ext4_put_io_end(ext4_io_end_t *io_end) 331 302 { 332 - int err = 0; 333 - 334 303 if (refcount_dec_and_test(&io_end->count)) { 335 - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 336 - err = ext4_convert_unwritten_io_end_vec(io_end->handle, 337 - io_end); 338 - io_end->handle = NULL; 339 - ext4_clear_io_unwritten_flag(io_end); 340 - } 304 + if (ext4_io_end_defer_completion(io_end)) 305 + return ext4_end_io_end(io_end); 306 + 341 307 ext4_release_io_end(io_end); 342 308 } 343 - return err; 309 + return 0; 344 310 } 345 311 346 312 ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) ··· 370 344 bio->bi_status, inode->i_ino, 371 345 (unsigned long long) 372 346 bi_sector >> (inode->i_blkbits - 9)); 347 + io_end->flag |= EXT4_IO_END_FAILED; 373 348 mapping_set_error(inode->i_mapping, 374 349 blk_status_to_errno(bio->bi_status)); 375 350 } 376 351 377 - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { 352 + if (ext4_io_end_defer_completion(io_end)) { 378 353 /* 379 354 * Link bio into list hanging from io_end. We have to do it 380 355 * atomically as bio completions can be racing against each

+2 -2

fs/ext4/resize.c

··· 1118 1118 struct ext4_super_block *es = (struct ext4_super_block *) data; 1119 1119 1120 1120 es->s_block_group_nr = cpu_to_le16(group); 1121 - if (ext4_has_metadata_csum(sb)) 1121 + if (ext4_has_feature_metadata_csum(sb)) 1122 1122 es->s_checksum = ext4_superblock_csum(sb, es); 1123 1123 } 1124 1124 ··· 1315 1315 { 1316 1316 struct buffer_head *bh; 1317 1317 1318 - if (!ext4_has_metadata_csum(sb)) 1318 + if (!ext4_has_feature_metadata_csum(sb)) 1319 1319 return 0; 1320 1320 1321 1321 bh = ext4_get_bitmap(sb, group_data->inode_bitmap);

+143 -124

fs/ext4/super.c

··· 79 79 static int ext4_freeze(struct super_block *sb); 80 80 static inline int ext2_feature_set_ok(struct super_block *sb); 81 81 static inline int ext3_feature_set_ok(struct super_block *sb); 82 - static void ext4_destroy_lazyinit_thread(void); 83 82 static void ext4_unregister_li_request(struct super_block *sb); 84 83 static void ext4_clear_request_list(void); 85 84 static struct inode *ext4_get_journal_inode(struct super_block *sb, ··· 301 302 static int ext4_superblock_csum_verify(struct super_block *sb, 302 303 struct ext4_super_block *es) 303 304 { 304 - if (!ext4_has_metadata_csum(sb)) 305 + if (!ext4_has_feature_metadata_csum(sb)) 305 306 return 1; 306 307 307 308 return es->s_checksum == ext4_superblock_csum(sb, es); ··· 311 312 { 312 313 struct ext4_super_block *es = EXT4_SB(sb)->s_es; 313 314 314 - if (!ext4_has_metadata_csum(sb)) 315 + if (!ext4_has_feature_metadata_csum(sb)) 315 316 return; 316 317 317 318 es->s_checksum = ext4_superblock_csum(sb, es); ··· 447 448 #define ext4_get_tstamp(es, tstamp) \ 448 449 __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) 449 450 450 - #define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ 451 - #define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ 452 - 453 451 /* 454 452 * The ext4_maybe_update_superblock() function checks and updates the 455 453 * superblock if needed. ··· 454 458 * This function is designed to update the on-disk superblock only under 455 459 * certain conditions to prevent excessive disk writes and unnecessary 456 460 * waking of the disk from sleep. The superblock will be updated if: 457 - * 1. More than an hour has passed since the last superblock update, and 458 - * 2. More than 16MB have been written since the last superblock update. 461 + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last 462 + * superblock update 463 + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the 464 + * last superblock update. 
459 465 * 460 466 * @sb: The superblock 461 467 */ ··· 471 473 __u64 lifetime_write_kbytes; 472 474 __u64 diff_size; 473 475 474 - if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || 475 - !journal || (journal->j_flags & JBD2_UNMOUNT)) 476 + if (ext4_emergency_state(sb) || sb_rdonly(sb) || 477 + !(sb->s_flags & SB_ACTIVE) || !journal || 478 + journal->j_flags & JBD2_UNMOUNT) 476 479 return; 477 480 478 481 now = ktime_get_real_seconds(); 479 482 last_update = ext4_get_tstamp(es, s_wtime); 480 483 481 - if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) 484 + if (likely(now - last_update < sbi->s_sb_update_sec)) 482 485 return; 483 486 484 487 lifetime_write_kbytes = sbi->s_kbytes_written + ··· 494 495 */ 495 496 diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); 496 497 497 - if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) 498 + if (diff_size > sbi->s_sb_update_kb) 498 499 schedule_work(&EXT4_SB(sb)->s_sb_upd_work); 499 500 } 500 501 501 502 static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) 502 503 { 503 504 struct super_block *sb = journal->j_private; 504 - struct ext4_sb_info *sbi = EXT4_SB(sb); 505 - int error = is_journal_aborted(journal); 506 - struct ext4_journal_cb_entry *jce; 507 505 508 506 BUG_ON(txn->t_state == T_FINISHED); 509 507 510 508 ext4_process_freed_data(sb, txn->t_tid); 511 509 ext4_maybe_update_superblock(sb); 512 - 513 - spin_lock(&sbi->s_md_lock); 514 - while (!list_empty(&txn->t_private_list)) { 515 - jce = list_entry(txn->t_private_list.next, 516 - struct ext4_journal_cb_entry, jce_list); 517 - list_del_init(&jce->jce_list); 518 - spin_unlock(&sbi->s_md_lock); 519 - jce->jce_func(sb, jce, error); 520 - spin_lock(&sbi->s_md_lock); 521 - } 522 - spin_unlock(&sbi->s_md_lock); 523 510 } 524 511 525 512 /* ··· 692 707 if (test_opt(sb, WARN_ON_ERROR)) 693 708 WARN_ON_ONCE(1); 694 709 695 - if (!continue_fs && !sb_rdonly(sb)) { 696 - set_bit(EXT4_FLAGS_SHUTDOWN, 
&EXT4_SB(sb)->s_ext4_flags); 697 - if (journal) 698 - jbd2_journal_abort(journal, -EIO); 699 - } 710 + if (!continue_fs && !ext4_emergency_ro(sb) && journal) 711 + jbd2_journal_abort(journal, -EIO); 700 712 701 713 if (!bdev_read_only(sb->s_bdev)) { 702 714 save_error_info(sb, error, ino, block, func, line); ··· 701 719 * In case the fs should keep running, we need to writeout 702 720 * superblock through the journal. Due to lock ordering 703 721 * constraints, it may not be safe to do it right here so we 704 - * defer superblock flushing to a workqueue. 722 + * defer superblock flushing to a workqueue. We just need to be 723 + * careful when the journal is already shutting down. If we get 724 + * here in that case, just update the sb directly as the last 725 + * transaction won't commit anyway. 705 726 */ 706 - if (continue_fs && journal) 727 + if (continue_fs && journal && 728 + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) 707 729 schedule_work(&EXT4_SB(sb)->s_sb_upd_work); 708 730 else 709 731 ext4_commit_super(sb); ··· 723 737 sb->s_id); 724 738 } 725 739 726 - if (sb_rdonly(sb) || continue_fs) 740 + if (ext4_emergency_ro(sb) || continue_fs) 727 741 return; 728 742 729 743 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 730 744 /* 731 - * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem 732 - * modifications. We don't set SB_RDONLY because that requires 733 - * sb->s_umount semaphore and setting it without proper remount 734 - * procedure is confusing code such as freeze_super() leading to 735 - * deadlocks and other problems. 745 + * We don't set SB_RDONLY because that requires sb->s_umount 746 + * semaphore and setting it without proper remount procedure is 747 + * confusing code such as freeze_super() leading to deadlocks 748 + * and other problems. 
736 749 */ 750 + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); 737 751 } 738 752 739 753 static void update_super_work(struct work_struct *work) ··· 751 765 * We use directly jbd2 functions here to avoid recursing back into 752 766 * ext4 error handling code during handling of previous errors. 753 767 */ 754 - if (!sb_rdonly(sbi->s_sb) && journal) { 768 + if (!ext4_emergency_state(sbi->s_sb) && 769 + !sb_rdonly(sbi->s_sb) && journal) { 755 770 struct buffer_head *sbh = sbi->s_sbh; 756 771 bool call_notify_err = false; 757 772 ··· 806 819 struct va_format vaf; 807 820 va_list args; 808 821 809 - if (unlikely(ext4_forced_shutdown(sb))) 822 + if (unlikely(ext4_emergency_state(sb))) 810 823 return; 811 824 812 825 trace_ext4_error(sb, function, line); ··· 831 844 va_list args; 832 845 struct va_format vaf; 833 846 834 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 847 + if (unlikely(ext4_emergency_state(inode->i_sb))) 835 848 return; 836 849 837 850 trace_ext4_error(inode->i_sb, function, line); ··· 866 879 struct inode *inode = file_inode(file); 867 880 char pathname[80], *path; 868 881 869 - if (unlikely(ext4_forced_shutdown(inode->i_sb))) 882 + if (unlikely(ext4_emergency_state(inode->i_sb))) 870 883 return; 871 884 872 885 trace_ext4_error(inode->i_sb, function, line); ··· 946 959 char nbuf[16]; 947 960 const char *errstr; 948 961 949 - if (unlikely(ext4_forced_shutdown(sb))) 962 + if (unlikely(ext4_emergency_state(sb))) 950 963 return; 951 964 952 965 /* Special case: if the error is EROFS, and we're not already ··· 1040 1053 struct va_format vaf; 1041 1054 va_list args; 1042 1055 1043 - if (unlikely(ext4_forced_shutdown(sb))) 1056 + if (unlikely(ext4_emergency_state(sb))) 1044 1057 return; 1045 1058 1046 1059 trace_ext4_error(sb, function, line); ··· 1293 1306 ext4_unregister_li_request(sb); 1294 1307 ext4_quotas_off(sb, EXT4_MAXQUOTAS); 1295 1308 1296 - flush_work(&sbi->s_sb_upd_work); 1297 1309 destroy_workqueue(sbi->rsv_conversion_wq); 1298 
1310 ext4_release_orphan_info(sb); 1299 1311 1300 1312 if (sbi->s_journal) { 1301 1313 aborted = is_journal_aborted(sbi->s_journal); 1302 - err = jbd2_journal_destroy(sbi->s_journal); 1303 - sbi->s_journal = NULL; 1314 + err = ext4_journal_destroy(sbi, sbi->s_journal); 1304 1315 if ((err < 0) && !aborted) { 1305 1316 ext4_abort(sb, -err, "Couldn't clean up the journal"); 1306 1317 } 1307 - } 1318 + } else 1319 + flush_work(&sbi->s_sb_upd_work); 1308 1320 1309 1321 ext4_es_unregister_shrinker(sbi); 1310 1322 timer_shutdown_sync(&sbi->s_err_report); ··· 1311 1325 ext4_mb_release(sb); 1312 1326 ext4_ext_release(sb); 1313 1327 1314 - if (!sb_rdonly(sb) && !aborted) { 1315 - ext4_clear_feature_journal_needs_recovery(sb); 1316 - ext4_clear_feature_orphan_present(sb); 1317 - es->s_state = cpu_to_le16(sbi->s_mount_state); 1318 - } 1319 - if (!sb_rdonly(sb)) 1328 + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { 1329 + if (!aborted) { 1330 + ext4_clear_feature_journal_needs_recovery(sb); 1331 + ext4_clear_feature_orphan_present(sb); 1332 + es->s_state = cpu_to_le16(sbi->s_mount_state); 1333 + } 1320 1334 ext4_commit_super(sb); 1335 + } 1321 1336 1322 1337 ext4_group_desc_free(sbi); 1323 1338 ext4_flex_groups_free(sbi); ··· 1413 1426 spin_lock_init(&ei->i_completed_io_lock); 1414 1427 ei->i_sync_tid = 0; 1415 1428 ei->i_datasync_tid = 0; 1416 - atomic_set(&ei->i_unwritten, 0); 1417 1429 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); 1418 1430 ext4_fc_init_inode(&ei->vfs_inode); 1419 1431 mutex_init(&ei->i_fc_lock); ··· 2771 2785 } 2772 2786 2773 2787 if (is_remount) { 2788 + if (!sbi->s_journal && 2789 + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { 2790 + ext4_msg(NULL, KERN_WARNING, 2791 + "Remounting fs w/o journal so ignoring data_err option"); 2792 + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); 2793 + } 2794 + 2774 2795 if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && 2775 2796 (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) 
{ 2776 2797 ext4_msg(NULL, KERN_ERR, "can't mount with " ··· 3031 3038 if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) 3032 3039 SEQ_OPTS_PUTS("prefetch_block_bitmaps"); 3033 3040 3041 + if (ext4_emergency_ro(sb)) 3042 + SEQ_OPTS_PUTS("emergency_ro"); 3043 + 3044 + if (ext4_forced_shutdown(sb)) 3045 + SEQ_OPTS_PUTS("shutdown"); 3046 + 3034 3047 ext4_show_quota_options(seq, sb); 3035 3048 return 0; 3036 3049 } ··· 3204 3205 __le32 le_group = cpu_to_le32(block_group); 3205 3206 struct ext4_sb_info *sbi = EXT4_SB(sb); 3206 3207 3207 - if (ext4_has_metadata_csum(sbi->s_sb)) { 3208 + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { 3208 3209 /* Use new metadata_csum algorithm */ 3209 3210 __u32 csum32; 3210 3211 __u16 dummy_csum = 0; ··· 3692 3693 if (group >= elr->lr_next_group) { 3693 3694 ret = 1; 3694 3695 if (elr->lr_first_not_zeroed != ngroups && 3695 - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { 3696 + !ext4_emergency_state(sb) && !sb_rdonly(sb) && 3697 + test_opt(sb, INIT_INODE_TABLE)) { 3696 3698 elr->lr_next_group = elr->lr_first_not_zeroed; 3697 3699 elr->lr_mode = EXT4_LI_MODE_ITABLE; 3698 3700 ret = 0; ··· 3998 3998 goto out; 3999 3999 } 4000 4000 4001 - if (sb_rdonly(sb) || 4001 + if (ext4_emergency_state(sb) || sb_rdonly(sb) || 4002 4002 (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && 4003 4003 (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) 4004 4004 goto out; ··· 4061 4061 int compat, incompat; 4062 4062 struct ext4_sb_info *sbi = EXT4_SB(sb); 4063 4063 4064 - if (ext4_has_metadata_csum(sb)) { 4064 + if (ext4_has_feature_metadata_csum(sb)) { 4065 4065 /* journal checksum v3 */ 4066 4066 compat = 0; 4067 4067 incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; ··· 4349 4349 if (ext4_has_feature_fast_commit(sb)) 4350 4350 set_opt2(sb, JOURNAL_FAST_COMMIT); 4351 4351 /* don't forget to enable journal_csum when metadata_csum is enabled. 
*/ 4352 - if (ext4_has_metadata_csum(sb)) 4352 + if (ext4_has_feature_metadata_csum(sb)) 4353 4353 set_opt(sb, JOURNAL_CHECKSUM); 4354 4354 4355 4355 if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) ··· 4642 4642 /* Precompute checksum seed for all metadata */ 4643 4643 if (ext4_has_feature_csum_seed(sb)) 4644 4644 sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); 4645 - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) 4645 + else if (ext4_has_feature_metadata_csum(sb) || 4646 + ext4_has_feature_ea_inode(sb)) 4646 4647 sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, 4647 4648 sizeof(es->s_uuid)); 4648 4649 return 0; ··· 4974 4973 return 0; 4975 4974 4976 4975 out: 4977 - /* flush s_sb_upd_work before destroying the journal. */ 4978 - flush_work(&sbi->s_sb_upd_work); 4979 - jbd2_journal_destroy(sbi->s_journal); 4980 - sbi->s_journal = NULL; 4976 + ext4_journal_destroy(sbi, sbi->s_journal); 4981 4977 return -EINVAL; 4982 4978 } 4983 4979 ··· 5009 5011 } 5010 5012 5011 5013 return 0; 5014 + } 5015 + 5016 + static const char *ext4_has_journal_option(struct super_block *sb) 5017 + { 5018 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5019 + 5020 + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) 5021 + return "journal_async_commit"; 5022 + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) 5023 + return "journal_checksum"; 5024 + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) 5025 + return "commit="; 5026 + if (EXT4_MOUNT_DATA_FLAGS & 5027 + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) 5028 + return "data="; 5029 + if (test_opt(sb, DATA_ERR_ABORT)) 5030 + return "data_err=abort"; 5031 + return NULL; 5012 5032 } 5013 5033 5014 5034 static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, ··· 5279 5263 sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; 5280 5264 sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; 5281 5265 sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; 5266 + sbi->s_sb_update_kb = 
EXT4_DEF_SB_UPDATE_INTERVAL_KB; 5267 + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; 5282 5268 5283 5269 /* 5284 5270 * set default s_li_wait_mult for lazyinit, for the case there is ··· 5422 5404 "suppressed and not mounted read-only"); 5423 5405 goto failed_mount3a; 5424 5406 } else { 5407 + const char *journal_option; 5408 + 5425 5409 /* Nojournal mode, all journal mount options are illegal */ 5426 - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { 5427 - ext4_msg(sb, KERN_ERR, "can't mount with " 5428 - "journal_async_commit, fs mounted w/o journal"); 5410 + journal_option = ext4_has_journal_option(sb); 5411 + if (journal_option != NULL) { 5412 + ext4_msg(sb, KERN_ERR, 5413 + "can't mount with %s, fs mounted w/o journal", 5414 + journal_option); 5429 5415 goto failed_mount3a; 5430 5416 } 5431 5417 5432 - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { 5433 - ext4_msg(sb, KERN_ERR, "can't mount with " 5434 - "journal_checksum, fs mounted w/o journal"); 5435 - goto failed_mount3a; 5436 - } 5437 - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { 5438 - ext4_msg(sb, KERN_ERR, "can't mount with " 5439 - "commit=%lu, fs mounted w/o journal", 5440 - sbi->s_commit_interval / HZ); 5441 - goto failed_mount3a; 5442 - } 5443 - if (EXT4_MOUNT_DATA_FLAGS & 5444 - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { 5445 - ext4_msg(sb, KERN_ERR, "can't mount with " 5446 - "data=, fs mounted w/o journal"); 5447 - goto failed_mount3a; 5448 - } 5449 5418 sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; 5450 5419 clear_opt(sb, JOURNAL_CHECKSUM); 5451 5420 clear_opt(sb, DATA_FLAGS); ··· 5621 5616 goto failed_mount9; 5622 5617 } 5623 5618 5624 - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) 5619 + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { 5625 5620 ext4_msg(sb, KERN_WARNING, 5626 5621 "mounting with \"discard\" option, but the device does not support discard"); 5622 + clear_opt(sb, DISCARD); 5623 + } 5627 5624 5628 
5625 if (es->s_error_count) 5629 5626 mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ ··· 5672 5665 sbi->s_ea_block_cache = NULL; 5673 5666 5674 5667 if (sbi->s_journal) { 5675 - /* flush s_sb_upd_work before journal destroy. */ 5676 - flush_work(&sbi->s_sb_upd_work); 5677 - jbd2_journal_destroy(sbi->s_journal); 5678 - sbi->s_journal = NULL; 5668 + ext4_journal_destroy(sbi, sbi->s_journal); 5679 5669 } 5680 5670 failed_mount3a: 5681 5671 ext4_es_unregister_shrinker(sbi); ··· 5777 5773 journal->j_flags |= JBD2_BARRIER; 5778 5774 else 5779 5775 journal->j_flags &= ~JBD2_BARRIER; 5780 - if (test_opt(sb, DATA_ERR_ABORT)) 5781 - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; 5782 - else 5783 - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; 5784 5776 /* 5785 5777 * Always enable journal cycle record option, letting the journal 5786 5778 * records log transactions continuously between each mount. ··· 5973 5973 return journal; 5974 5974 5975 5975 out_journal: 5976 - jbd2_journal_destroy(journal); 5976 + ext4_journal_destroy(EXT4_SB(sb), journal); 5977 5977 out_bdev: 5978 5978 bdev_fput(bdev_file); 5979 5979 return ERR_PTR(errno); ··· 6090 6090 EXT4_SB(sb)->s_journal = journal; 6091 6091 err = ext4_clear_journal_err(sb, es); 6092 6092 if (err) { 6093 - EXT4_SB(sb)->s_journal = NULL; 6094 - jbd2_journal_destroy(journal); 6093 + ext4_journal_destroy(EXT4_SB(sb), journal); 6095 6094 return err; 6096 6095 } 6097 6096 ··· 6108 6109 return 0; 6109 6110 6110 6111 err_out: 6111 - jbd2_journal_destroy(journal); 6112 + ext4_journal_destroy(EXT4_SB(sb), journal); 6112 6113 return err; 6113 6114 } 6114 6115 ··· 6335 6336 bool needs_barrier = false; 6336 6337 struct ext4_sb_info *sbi = EXT4_SB(sb); 6337 6338 6338 - if (unlikely(ext4_forced_shutdown(sb))) 6339 - return -EIO; 6339 + ret = ext4_emergency_state(sb); 6340 + if (unlikely(ret)) 6341 + return ret; 6340 6342 6341 6343 trace_ext4_sync_fs(sb, wait); 6342 6344 flush_workqueue(sbi->rsv_conversion_wq); ··· 6419 
6419 */ 6420 6420 static int ext4_unfreeze(struct super_block *sb) 6421 6421 { 6422 - if (ext4_forced_shutdown(sb)) 6422 + if (ext4_emergency_state(sb)) 6423 6423 return 0; 6424 6424 6425 6425 if (EXT4_SB(sb)->s_journal) { ··· 6575 6575 flush_work(&sbi->s_sb_upd_work); 6576 6576 6577 6577 if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { 6578 - if (ext4_forced_shutdown(sb)) { 6578 + if (ext4_emergency_state(sb)) { 6579 6579 err = -EROFS; 6580 6580 goto restore_opts; 6581 6581 } ··· 6780 6780 { 6781 6781 struct super_block *sb = fc->root->d_sb; 6782 6782 int ret; 6783 + bool old_ro = sb_rdonly(sb); 6783 6784 6784 6785 fc->s_fs_info = EXT4_SB(sb); 6785 6786 ··· 6792 6791 if (ret < 0) 6793 6792 return ret; 6794 6793 6795 - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", 6796 - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", 6797 - ext4_quota_mode(sb)); 6794 + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", 6795 + &sb->s_uuid, 6796 + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); 6798 6797 6799 6798 return 0; 6800 6799 } ··· 6818 6817 dquot->dq_dqb.dqb_bhardlimit); 6819 6818 limit >>= sb->s_blocksize_bits; 6820 6819 6821 - if (limit && buf->f_blocks > limit) { 6820 + if (limit) { 6821 + uint64_t remaining = 0; 6822 + 6822 6823 curblock = (dquot->dq_dqb.dqb_curspace + 6823 6824 dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; 6824 - buf->f_blocks = limit; 6825 - buf->f_bfree = buf->f_bavail = 6826 - (buf->f_blocks > curblock) ? 
6827 - (buf->f_blocks - curblock) : 0; 6825 + if (limit > curblock) 6826 + remaining = limit - curblock; 6827 + 6828 + buf->f_blocks = min(buf->f_blocks, limit); 6829 + buf->f_bfree = min(buf->f_bfree, remaining); 6830 + buf->f_bavail = min(buf->f_bavail, remaining); 6828 6831 } 6829 6832 6830 6833 limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, 6831 6834 dquot->dq_dqb.dqb_ihardlimit); 6832 - if (limit && buf->f_files > limit) { 6833 - buf->f_files = limit; 6834 - buf->f_ffree = 6835 - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 6836 - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; 6835 + if (limit) { 6836 + uint64_t remaining = 0; 6837 + 6838 + if (limit > dquot->dq_dqb.dqb_curinodes) 6839 + remaining = limit - dquot->dq_dqb.dqb_curinodes; 6840 + 6841 + buf->f_files = min(buf->f_files, limit); 6842 + buf->f_ffree = min(buf->f_ffree, remaining); 6837 6843 } 6838 6844 6839 6845 spin_unlock(&dquot->dq_dqb_lock); ··· 6943 6935 { 6944 6936 int ret, err; 6945 6937 handle_t *handle; 6938 + bool freeze_protected = false; 6939 + 6940 + /* 6941 + * Trying to sb_start_intwrite() in a running transaction 6942 + * can result in a deadlock. Further, running transactions 6943 + * are already protected from freezing. 
6944 + */ 6945 + if (!ext4_journal_current_handle()) { 6946 + sb_start_intwrite(dquot->dq_sb); 6947 + freeze_protected = true; 6948 + } 6946 6949 6947 6950 handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, 6948 6951 EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); 6949 6952 if (IS_ERR(handle)) { 6950 6953 /* Release dquot anyway to avoid endless cycle in dqput() */ 6951 6954 dquot_release(dquot); 6955 + if (freeze_protected) 6956 + sb_end_intwrite(dquot->dq_sb); 6952 6957 return PTR_ERR(handle); 6953 6958 } 6954 6959 ret = dquot_release(dquot); ··· 6972 6951 err = ext4_journal_stop(handle); 6973 6952 if (!ret) 6974 6953 ret = err; 6954 + 6955 + if (freeze_protected) 6956 + sb_end_intwrite(dquot->dq_sb); 6957 + 6975 6958 return ret; 6976 6959 } 6977 6960 ··· 7313 7288 } 7314 7289 lock_buffer(bh); 7315 7290 memcpy(bh->b_data+offset, data, len); 7316 - flush_dcache_page(bh->b_page); 7291 + flush_dcache_folio(bh->b_folio); 7317 7292 unlock_buffer(bh); 7318 7293 err = ext4_handle_dirty_metadata(handle, NULL, bh); 7319 7294 brelse(bh); ··· 7406 7381 }; 7407 7382 MODULE_ALIAS_FS("ext4"); 7408 7383 7409 - /* Shared across all ext4 file systems */ 7410 - wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; 7411 - 7412 7384 static int __init ext4_init_fs(void) 7413 7385 { 7414 - int i, err; 7386 + int err; 7415 7387 7416 7388 ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); 7417 7389 ext4_li_info = NULL; 7418 7390 7419 7391 /* Build-time check for flags consistency */ 7420 7392 ext4_check_flag_values(); 7421 - 7422 - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) 7423 - init_waitqueue_head(&ext4__ioend_wq[i]); 7424 7393 7425 7394 err = ext4_init_es(); 7426 7395 if (err)

+4

fs/ext4/sysfs.c

··· 254 254 EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); 255 255 EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); 256 256 EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); 257 + EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); 258 + EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); 257 259 258 260 static unsigned int old_bump_val = 128; 259 261 EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); ··· 307 305 ATTR_LIST(mb_prefetch), 308 306 ATTR_LIST(mb_prefetch_limit), 309 307 ATTR_LIST(last_trim_minblks), 308 + ATTR_LIST(sb_update_sec), 309 + ATTR_LIST(sb_update_kb), 310 310 NULL, 311 311 }; 312 312 ATTRIBUTE_GROUPS(ext4);

+16 -31

fs/ext4/xattr.c

··· 156 156 struct ext4_xattr_header *hdr = BHDR(bh); 157 157 int ret = 1; 158 158 159 - if (ext4_has_metadata_csum(inode->i_sb)) { 159 + if (ext4_has_feature_metadata_csum(inode->i_sb)) { 160 160 lock_buffer(bh); 161 161 ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, 162 162 bh->b_blocknr, hdr)); ··· 168 168 static void ext4_xattr_block_csum_set(struct inode *inode, 169 169 struct buffer_head *bh) 170 170 { 171 - if (ext4_has_metadata_csum(inode->i_sb)) 171 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 172 172 BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, 173 173 bh->b_blocknr, BHDR(bh)); 174 174 } ··· 308 308 __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) 309 309 310 310 311 - static inline int 311 + int 312 312 __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, 313 313 void *end, const char *function, unsigned int line) 314 314 { 315 315 return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), 316 316 function, line); 317 317 } 318 - 319 - #define xattr_check_inode(inode, header, end) \ 320 - __xattr_check_inode((inode), (header), (end), __func__, __LINE__) 321 318 322 319 static int 323 320 xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, ··· 646 649 return error; 647 650 raw_inode = ext4_raw_inode(&iloc); 648 651 header = IHDR(inode, raw_inode); 649 - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 650 - error = xattr_check_inode(inode, header, end); 651 - if (error) 652 - goto cleanup; 652 + end = ITAIL(inode, raw_inode); 653 653 entry = IFIRST(header); 654 654 error = xattr_find_entry(inode, &entry, end, name_index, name, 0); 655 655 if (error) ··· 777 783 struct ext4_xattr_ibody_header *header; 778 784 struct ext4_inode *raw_inode; 779 785 struct ext4_iloc iloc; 780 - void *end; 781 786 int error; 782 787 783 788 if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) ··· 786 793 return error; 787 794 raw_inode = ext4_raw_inode(&iloc); 788 795 header 
= IHDR(inode, raw_inode); 789 - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 790 - error = xattr_check_inode(inode, header, end); 791 - if (error) 792 - goto cleanup; 793 796 error = ext4_xattr_list_entries(dentry, IFIRST(header), 794 797 buffer, buffer_size); 795 798 796 - cleanup: 797 799 brelse(iloc.bh); 798 800 return error; 799 801 } ··· 856 868 struct ext4_xattr_ibody_header *header; 857 869 struct ext4_xattr_entry *entry; 858 870 qsize_t ea_inode_refs = 0; 859 - void *end; 860 871 int ret; 861 872 862 873 lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); ··· 866 879 goto out; 867 880 raw_inode = ext4_raw_inode(&iloc); 868 881 header = IHDR(inode, raw_inode); 869 - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 870 - ret = xattr_check_inode(inode, header, end); 871 - if (ret) 872 - goto out; 873 882 874 883 for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); 875 884 entry = EXT4_XATTR_NEXT(entry)) ··· 1159 1176 { 1160 1177 struct inode *ea_inode; 1161 1178 struct ext4_xattr_entry *entry; 1179 + struct ext4_iloc iloc; 1162 1180 bool dirty = false; 1163 1181 unsigned int ea_ino; 1164 1182 int err; 1165 1183 int credits; 1184 + void *end; 1185 + 1186 + if (block_csum) 1187 + end = (void *)bh->b_data + bh->b_size; 1188 + else { 1189 + ext4_get_inode_loc(parent, &iloc); 1190 + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; 1191 + } 1166 1192 1167 1193 /* One credit for dec ref on ea_inode, one for orphan list addition, */ 1168 1194 credits = 2 + extra_credits; 1169 1195 1170 - for (entry = first; !IS_LAST_ENTRY(entry); 1196 + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); 1171 1197 entry = EXT4_XATTR_NEXT(entry)) { 1172 1198 if (!entry->e_value_inum) 1173 1199 continue; ··· 2227 2235 header = IHDR(inode, raw_inode); 2228 2236 is->s.base = is->s.first = IFIRST(header); 2229 2237 is->s.here = is->s.first; 2230 - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 2238 + 
is->s.end = ITAIL(inode, raw_inode); 2231 2239 if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { 2232 - error = xattr_check_inode(inode, header, is->s.end); 2233 - if (error) 2234 - return error; 2235 2240 /* Find the named attribute. */ 2236 2241 error = xattr_find_entry(inode, &is->s.here, is->s.end, 2237 2242 i->name_index, i->name, 0); ··· 2775 2786 */ 2776 2787 2777 2788 base = IFIRST(header); 2778 - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; 2789 + end = ITAIL(inode, raw_inode); 2779 2790 min_offs = end - base; 2780 2791 total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); 2781 - 2782 - error = xattr_check_inode(inode, header, end); 2783 - if (error) 2784 - goto cleanup; 2785 2792 2786 2793 ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); 2787 2794 if (ifree >= isize_diff)

+10

fs/ext4/xattr.h

··· 67 67 ((void *)raw_inode + \ 68 68 EXT4_GOOD_OLD_INODE_SIZE + \ 69 69 EXT4_I(inode)->i_extra_isize)) 70 + #define ITAIL(inode, raw_inode) \ 71 + ((void *)(raw_inode) + \ 72 + EXT4_SB((inode)->i_sb)->s_inode_size) 70 73 #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) 71 74 72 75 /* ··· 208 205 209 206 extern struct mb_cache *ext4_xattr_create_cache(void); 210 207 extern void ext4_xattr_destroy_cache(struct mb_cache *); 208 + 209 + extern int 210 + __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, 211 + void *end, const char *function, unsigned int line); 212 + 213 + #define xattr_check_inode(inode, header, end) \ 214 + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) 211 215 212 216 #ifdef CONFIG_EXT4_FS_SECURITY 213 217 extern int ext4_init_security(handle_t *handle, struct inode *inode,

+4 -6

fs/jbd2/commit.c

··· 57 57 * So here, we have a buffer which has just come off the forget list. Look to 58 58 * see if we can strip all buffers from the backing page. 59 59 * 60 - * Called under lock_journal(), and possibly under journal_datalist_lock. The 61 - * caller provided us with a ref against the buffer, and we drop that here. 60 + * Called under j_list_lock. The caller provided us with a ref against the 61 + * buffer, and we drop that here. 62 62 */ 63 63 static void release_buffer_page(struct buffer_head *bh) 64 64 { ··· 738 738 err = journal_finish_inode_data_buffers(journal, commit_transaction); 739 739 if (err) { 740 740 printk(KERN_WARNING 741 - "JBD2: Detected IO errors while flushing file data " 742 - "on %s\n", journal->j_devname); 743 - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) 744 - jbd2_journal_abort(journal, err); 741 + "JBD2: Detected IO errors %d while flushing file data on %s\n", 742 + err, journal->j_devname); 745 743 err = 0; 746 744 } 747 745

+20 -14

fs/jbd2/journal.c

··· 603 603 int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) 604 604 { 605 605 int ret = 0; 606 - transaction_t *commit_trans; 606 + transaction_t *commit_trans, *running_trans; 607 607 608 608 if (!(journal->j_flags & JBD2_BARRIER)) 609 609 return 0; ··· 613 613 goto out; 614 614 commit_trans = journal->j_committing_transaction; 615 615 if (!commit_trans || commit_trans->t_tid != tid) { 616 + running_trans = journal->j_running_transaction; 617 + /* 618 + * The query transaction hasn't started committing, 619 + * it must still be running. 620 + */ 621 + if (WARN_ON_ONCE(!running_trans || 622 + running_trans->t_tid != tid)) 623 + goto out; 624 + 625 + running_trans->t_need_data_flush = 1; 616 626 ret = 1; 617 627 goto out; 618 628 } ··· 957 947 * descriptor blocks we do need to generate bona fide buffers. 958 948 * 959 949 * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying 960 - * the buffer's contents they really should run flush_dcache_page(bh->b_page). 950 + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). 961 951 * But we don't bother doing that, so there will be coherency problems with 962 952 * mmaps of blockdevs which hold live JBD-controlled filesystems. 963 953 */ ··· 1371 1361 return err; 1372 1362 } 1373 1363 1374 - if (jbd2_journal_has_csum_v2or3_feature(journal) && 1364 + if (jbd2_journal_has_csum_v2or3(journal) && 1375 1365 jbd2_has_feature_checksum(journal)) { 1376 1366 /* Can't have checksum v1 and v2 on at the same time! 
*/ 1377 1367 printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " ··· 1379 1369 return err; 1380 1370 } 1381 1371 1382 - if (jbd2_journal_has_csum_v2or3_feature(journal)) { 1372 + if (jbd2_journal_has_csum_v2or3(journal)) { 1383 1373 if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { 1384 1374 printk(KERN_ERR "JBD2: Unknown checksum type\n"); 1385 1375 return err; ··· 1879 1869 1880 1870 /* Log is no longer empty */ 1881 1871 write_lock(&journal->j_state_lock); 1882 - WARN_ON(!sb->s_sequence); 1883 1872 journal->j_flags &= ~JBD2_FLUSHED; 1884 1873 write_unlock(&journal->j_state_lock); 1885 1874 ··· 1974 1965 return err; 1975 1966 } 1976 1967 1977 - if (block_start == ~0ULL) { 1978 - block_start = phys_block; 1979 - block_stop = block_start - 1; 1980 - } 1968 + if (block_start == ~0ULL) 1969 + block_stop = block_start = phys_block; 1981 1970 1982 1971 /* 1983 1972 * last block not contiguous with current block, 1984 1973 * process last contiguous region and return to this block on 1985 1974 * next loop 1986 1975 */ 1987 - if (phys_block != block_stop + 1) { 1976 + if (phys_block != block_stop) { 1988 1977 block--; 1989 1978 } else { 1990 1979 block_stop++; ··· 2001 1994 */ 2002 1995 byte_start = block_start * journal->j_blocksize; 2003 1996 byte_stop = block_stop * journal->j_blocksize; 2004 - byte_count = (block_stop - block_start + 1) * 2005 - journal->j_blocksize; 1997 + byte_count = (block_stop - block_start) * journal->j_blocksize; 2006 1998 2007 1999 truncate_inode_pages_range(journal->j_dev->bd_mapping, 2008 - byte_start, byte_stop); 2000 + byte_start, byte_stop - 1); 2009 2001 2010 2002 if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { 2011 2003 err = blkdev_issue_discard(journal->j_dev, ··· 2019 2013 } 2020 2014 2021 2015 if (unlikely(err != 0)) { 2022 - pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", 2016 + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", 2023 2017 err, block_start, 
block_stop); 2024 2018 return err; 2025 2019 }

+57 -23

fs/jbd2/recovery.c

··· 39 39 40 40 static int do_one_pass(journal_t *journal, 41 41 struct recovery_info *info, enum passtype pass); 42 - static int scan_revoke_records(journal_t *, struct buffer_head *, 42 + static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, 43 43 tid_t, struct recovery_info *); 44 44 45 45 #ifdef __KERNEL__ ··· 65 65 */ 66 66 67 67 #define MAXBUF 8 68 - static int do_readahead(journal_t *journal, unsigned int start) 68 + static void do_readahead(journal_t *journal, unsigned int start) 69 69 { 70 - int err; 71 70 unsigned int max, nbufs, next; 72 71 unsigned long long blocknr; 73 72 struct buffer_head *bh; ··· 84 85 nbufs = 0; 85 86 86 87 for (next = start; next < max; next++) { 87 - err = jbd2_journal_bmap(journal, next, &blocknr); 88 + int err = jbd2_journal_bmap(journal, next, &blocknr); 88 89 89 90 if (err) { 90 91 printk(KERN_ERR "JBD2: bad block at offset %u\n", ··· 93 94 } 94 95 95 96 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 96 - if (!bh) { 97 - err = -ENOMEM; 97 + if (!bh) 98 98 goto failed; 99 - } 100 99 101 100 if (!buffer_uptodate(bh) && !buffer_locked(bh)) { 102 101 bufs[nbufs++] = bh; ··· 109 112 110 113 if (nbufs) 111 114 bh_readahead_batch(nbufs, bufs, 0); 112 - err = 0; 113 115 114 116 failed: 115 117 if (nbufs) 116 118 journal_brelse_array(bufs, nbufs); 117 - return err; 118 119 } 119 120 120 121 #endif /* __KERNEL__ */ ··· 282 287 int jbd2_journal_recover(journal_t *journal) 283 288 { 284 289 int err, err2; 285 - journal_superblock_t * sb; 286 - 287 290 struct recovery_info info; 288 291 289 292 memset(&info, 0, sizeof(info)); 290 - sb = journal->j_superblock; 291 293 292 294 /* 293 295 * The journal superblock's s_start field (the current log head) 294 296 * is always zero if, and only if, the journal was cleanly 295 - * unmounted. 297 + * unmounted. We use its in-memory version j_tail here because 298 + * jbd2_journal_wipe() could have updated it without updating journal 299 + * superblock. 
296 300 */ 297 - if (!sb->s_start) { 301 + if (!journal->j_tail) { 302 + journal_superblock_t *sb = journal->j_superblock; 303 + 298 304 jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", 299 305 be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); 300 306 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; ··· 323 327 journal->j_transaction_sequence, journal->j_head); 324 328 325 329 jbd2_journal_clear_revoke(journal); 330 + /* Free revoke table allocated for replay */ 331 + if (journal->j_revoke != journal->j_revoke_table[0] && 332 + journal->j_revoke != journal->j_revoke_table[1]) { 333 + jbd2_journal_destroy_revoke_table(journal->j_revoke); 334 + journal->j_revoke = journal->j_revoke_table[1]; 335 + } 326 336 err2 = sync_blockdev(journal->j_fs_dev); 327 337 if (!err) 328 338 err = err2; ··· 614 612 first_commit_ID = next_commit_ID; 615 613 if (pass == PASS_SCAN) 616 614 info->start_transaction = first_commit_ID; 615 + else if (pass == PASS_REVOKE) { 616 + /* 617 + * Would the default revoke table have too long hash chains 618 + * during replay? 619 + */ 620 + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { 621 + unsigned int hash_size; 622 + 623 + /* 624 + * Aim for average chain length of 8, limit at 1M 625 + * entries to avoid problems with malicious 626 + * filesystems. 627 + */ 628 + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), 629 + 1U << 20); 630 + journal->j_revoke = 631 + jbd2_journal_init_revoke_table(hash_size); 632 + if (!journal->j_revoke) { 633 + printk(KERN_ERR 634 + "JBD2: failed to allocate revoke table for replay with %u entries. 
" 635 + "Journal replay may be slow.\n", hash_size); 636 + journal->j_revoke = journal->j_revoke_table[1]; 637 + } 638 + } 639 + } 617 640 618 641 jbd2_debug(1, "Starting recovery pass %d\n", pass); 619 642 ··· 879 852 880 853 case JBD2_REVOKE_BLOCK: 881 854 /* 855 + * If we aren't in the SCAN or REVOKE pass, then we can 856 + * just skip over this block. 857 + */ 858 + if (pass != PASS_REVOKE && pass != PASS_SCAN) 859 + continue; 860 + 861 + /* 882 862 * Check revoke block crc in pass_scan, if csum verify 883 863 * failed, check commit block time later. 884 864 */ ··· 897 863 need_check_commit_time = true; 898 864 } 899 865 900 - /* If we aren't in the REVOKE pass, then we can 901 - * just skip over this block. */ 902 - if (pass != PASS_REVOKE) 903 - continue; 904 - 905 - err = scan_revoke_records(journal, bh, 866 + err = scan_revoke_records(journal, pass, bh, 906 867 next_commit_ID, info); 907 868 if (err) 908 869 goto failed; ··· 951 922 952 923 /* Scan a revoke record, marking all blocks mentioned as revoked. */ 953 924 954 - static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 955 - tid_t sequence, struct recovery_info *info) 925 + static int scan_revoke_records(journal_t *journal, enum passtype pass, 926 + struct buffer_head *bh, tid_t sequence, 927 + struct recovery_info *info) 956 928 { 957 929 jbd2_journal_revoke_header_t *header; 958 930 int offset, max; ··· 974 944 if (jbd2_has_feature_64bit(journal)) 975 945 record_len = 8; 976 946 947 + if (pass == PASS_SCAN) { 948 + info->nr_revokes += (max - offset) / record_len; 949 + return 0; 950 + } 951 + 977 952 while (offset + record_len <= max) { 978 953 unsigned long long blocknr; 979 954 int err; ··· 991 956 err = jbd2_journal_set_revoke(journal, blocknr, sequence); 992 957 if (err) 993 958 return err; 994 - ++info->nr_revokes; 995 959 } 996 960 return 0; 997 961 }

+9 -12

fs/jbd2/revoke.c

··· 215 215 return 0; 216 216 } 217 217 218 - static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) 218 + struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) 219 219 { 220 220 int shift = 0; 221 221 int tmp = hash_size; ··· 231 231 table->hash_size = hash_size; 232 232 table->hash_shift = shift; 233 233 table->hash_table = 234 - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); 234 + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); 235 235 if (!table->hash_table) { 236 236 kmem_cache_free(jbd2_revoke_table_cache, table); 237 237 table = NULL; ··· 245 245 return table; 246 246 } 247 247 248 - static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) 248 + void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) 249 249 { 250 250 int i; 251 251 struct list_head *hash_list; ··· 255 255 J_ASSERT(list_empty(hash_list)); 256 256 } 257 257 258 - kfree(table->hash_table); 258 + kvfree(table->hash_table); 259 259 kmem_cache_free(jbd2_revoke_table_cache, table); 260 260 } 261 261 ··· 420 420 * do not trust the Revoked bit on buffers unless RevokeValid is also 421 421 * set. 
422 422 */ 423 - int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 423 + void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) 424 424 { 425 425 struct jbd2_revoke_record_s *record; 426 426 journal_t *journal = handle->h_transaction->t_journal; 427 427 int need_cancel; 428 - int did_revoke = 0; /* akpm: debug */ 429 428 struct buffer_head *bh = jh2bh(jh); 430 429 431 430 jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); ··· 449 450 list_del(&record->hash); 450 451 spin_unlock(&journal->j_revoke_lock); 451 452 kmem_cache_free(jbd2_revoke_record_cache, record); 452 - did_revoke = 1; 453 453 } 454 454 } 455 455 ··· 471 473 __brelse(bh2); 472 474 } 473 475 } 474 - return did_revoke; 475 476 } 476 477 477 478 /* 478 - * journal_clear_revoked_flag clears revoked flag of buffers in 479 + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in 479 480 * revoke table to reflect there is no revoked buffers in the next 480 481 * transaction which is going to be started. 481 482 */ ··· 503 506 } 504 507 } 505 508 506 - /* journal_switch_revoke table select j_revoke for next transaction 507 - * we do not want to suspend any processing until all revokes are 508 - * written -bzzz 509 + /* jbd2_journal_switch_revoke_table table select j_revoke for next 510 + * transaction we do not want to suspend any processing until all 511 + * revokes are written -bzzz 509 512 */ 510 513 void jbd2_journal_switch_revoke_table(journal_t *journal) 511 514 {

+1 -20

fs/jbd2/transaction.c

··· 92 92 atomic_set(&transaction->t_outstanding_revokes, 0); 93 93 atomic_set(&transaction->t_handle_count, 0); 94 94 INIT_LIST_HEAD(&transaction->t_inode_list); 95 - INIT_LIST_HEAD(&transaction->t_private_list); 96 95 97 96 /* Set up the commit timer for the new transaction. */ 98 97 journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); ··· 113 114 */ 114 115 115 116 /* 116 - * Update transaction's maximum wait time, if debugging is enabled. 117 - * 118 117 * t_max_wait is carefully updated here with use of atomic compare exchange. 119 118 * Note that there could be multiplre threads trying to do this simultaneously 120 119 * hence using cmpxchg to avoid any use of locks in this case. 121 - * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. 122 120 */ 123 121 static inline void update_t_max_wait(transaction_t *transaction, 124 122 unsigned long ts) ··· 2075 2079 jh->b_transaction = NULL; 2076 2080 } 2077 2081 2078 - void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) 2079 - { 2080 - struct buffer_head *bh = jh2bh(jh); 2081 - 2082 - /* Get reference so that buffer cannot be freed before we unlock it */ 2083 - get_bh(bh); 2084 - spin_lock(&jh->b_state_lock); 2085 - spin_lock(&journal->j_list_lock); 2086 - __jbd2_journal_unfile_buffer(jh); 2087 - spin_unlock(&journal->j_list_lock); 2088 - spin_unlock(&jh->b_state_lock); 2089 - jbd2_journal_put_journal_head(jh); 2090 - __brelse(bh); 2091 - } 2092 - 2093 2082 /** 2094 2083 * jbd2_journal_try_to_free_buffers() - try to free page buffers. 2095 2084 * @journal: journal for operation ··· 2173 2192 /* 2174 2193 * We don't want to write the buffer anymore, clear the 2175 2194 * bit so that we don't confuse checks in 2176 - * __journal_file_buffer 2195 + * __jbd2_journal_file_buffer 2177 2196 */ 2178 2197 clear_buffer_dirty(bh); 2179 2198 __jbd2_journal_file_buffer(jh, transaction, BJ_Forget);

+9 -1

include/linux/fs.h

··· 1248 1248 #define SB_NOUSER BIT(31) 1249 1249 1250 1250 /* These flags relate to encoding and casefolding */ 1251 - #define SB_ENC_STRICT_MODE_FL (1 << 0) 1251 + #define SB_ENC_STRICT_MODE_FL (1 << 0) 1252 + #define SB_ENC_NO_COMPAT_FALLBACK_FL (1 << 1) 1252 1253 1253 1254 #define sb_has_strict_encoding(sb) \ 1254 1255 (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) 1256 + 1257 + #if IS_ENABLED(CONFIG_UNICODE) 1258 + #define sb_no_casefold_compat_fallback(sb) \ 1259 + (sb->s_encoding_flags & SB_ENC_NO_COMPAT_FALLBACK_FL) 1260 + #else 1261 + #define sb_no_casefold_compat_fallback(sb) (1) 1262 + #endif 1255 1263 1256 1264 /* 1257 1265 * Umount options

+5 -19

include/linux/jbd2.h

··· 459 459 * @h_ref: Reference count on this handle. 460 460 * @h_err: Field for caller's use to track errors through large fs operations. 461 461 * @h_sync: Flag for sync-on-close. 462 - * @h_jdata: Flag to force data journaling. 463 462 * @h_reserved: Flag for handle for reserved credits. 464 463 * @h_aborted: Flag indicating fatal error on handle. 465 464 * @h_type: For handle statistics. ··· 490 491 491 492 /* Flags [no locking] */ 492 493 unsigned int h_sync: 1; 493 - unsigned int h_jdata: 1; 494 494 unsigned int h_reserved: 1; 495 495 unsigned int h_aborted: 1; 496 496 unsigned int h_type: 8; ··· 698 700 699 701 /* Disk flush needs to be sent to fs partition [no locking] */ 700 702 int t_need_data_flush; 701 - 702 - /* 703 - * For use by the filesystem to store fs-specific data 704 - * structures associated with the transaction 705 - */ 706 - struct list_head t_private_list; 707 703 }; 708 704 709 705 struct transaction_run_stats_s { ··· 1380 1388 #define JBD2_FLUSHED 0x008 /* The journal superblock has been flushed */ 1381 1389 #define JBD2_LOADED 0x010 /* The journal superblock has been loaded */ 1382 1390 #define JBD2_BARRIER 0x020 /* Use IDE barriers */ 1383 - #define JBD2_ABORT_ON_SYNCDATA_ERR 0x040 /* Abort the journal on file 1384 - * data write error in ordered 1385 - * mode */ 1386 1391 #define JBD2_CYCLE_RECORD 0x080 /* Journal cycled record log on 1387 1392 * clean and empty filesystem 1388 1393 * logging area */ ··· 1396 1407 */ 1397 1408 1398 1409 /* Filing buffers */ 1399 - extern void jbd2_journal_unfile_buffer(journal_t *, struct journal_head *); 1400 1410 extern bool __jbd2_journal_refile_buffer(struct journal_head *); 1401 1411 extern void jbd2_journal_refile_buffer(journal_t *, struct journal_head *); 1402 1412 extern void __jbd2_journal_file_buffer(struct journal_head *, transaction_t *, int); ··· 1615 1627 extern void jbd2_journal_destroy_revoke_table_cache(void); 1616 1628 extern int __init jbd2_journal_init_revoke_record_cache(void); 
1617 1629 extern int __init jbd2_journal_init_revoke_table_cache(void); 1630 + struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size); 1631 + void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table); 1618 1632 1619 1633 extern void jbd2_journal_destroy_revoke(journal_t *); 1620 1634 extern int jbd2_journal_revoke (handle_t *, unsigned long long, struct buffer_head *); 1621 - extern int jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); 1635 + extern void jbd2_journal_cancel_revoke(handle_t *, struct journal_head *); 1622 1636 extern void jbd2_journal_write_revoke_records(transaction_t *transaction, 1623 1637 struct list_head *log_bufs); 1624 1638 ··· 1726 1736 extern int jbd2_journal_blocks_per_page(struct inode *inode); 1727 1737 extern size_t journal_tag_bytes(journal_t *journal); 1728 1738 1729 - static inline bool jbd2_journal_has_csum_v2or3_feature(journal_t *j) 1730 - { 1731 - return jbd2_has_feature_csum2(j) || jbd2_has_feature_csum3(j); 1732 - } 1733 - 1734 1739 static inline int jbd2_journal_has_csum_v2or3(journal_t *journal) 1735 1740 { 1736 - return jbd2_journal_has_csum_v2or3_feature(journal); 1741 + return jbd2_has_feature_csum2(journal) || 1742 + jbd2_has_feature_csum3(journal); 1737 1743 } 1738 1744 1739 1745 static inline int jbd2_journal_get_num_fc_blks(journal_superblock_t *jsb)

Configure Feed

Configure Feed