Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"New features and improvements for the ext4 file system:
- Optimize online defragmentation by using folios instead of
individual buffer heads
- Improve error codes stored in the superblock when the journal
aborts
- Minor cleanups and clarifications in ext4_map_blocks()
- Add documentation of the casefold and encrypt flags
- Add support for file systems with a blocksize greater than the
pagesize
- Improve performance by caching the fact that an inode
does not have a Posix ACL

Various Bug Fixes:
- Fix false positive complaints from smatch
- Fix error code which is returned by ext4fs_dirhash() when Siphash
is used without the encryption key
- Fix races when writing to inline data files which could trigger a
BUG
- Fix potential NULL dereference when there is a corrupt file system
with an extended attribute value stored in an inode
- Fix false positive lockdep report when syzbot uses ext4 and ocfs2
together
- Fix false positive reported by DEPT by adjusting lock annotation
- Avoid a potential BUG_ON in jbd2 when a file system is massively
corrupted
- Fix a WARN_ON when superblock is corrupted with a non-NULL
terminated mount options field
- Add check if the userspace passes in a non-NULL terminated mount
options field to EXT4_IOC_SET_TUNE_SB_PARAM
- Fix a potential journal checksum failure when a file system is
copied while it is mounted read-only
- Fix a potential orphan file tracking error which only
showed on 32-bit systems
- Fix assertion checks in mballoc (which have to be explicitly enabled
by manually enabling AGGRESSIVE_CHECKS and recompiling)
- Avoid complaining about overly large orphan files created by mke2fs
with file systems with a 64k block size"

* tag 'ext4_for_linus-6.19-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (58 commits)
ext4: mark inodes without acls in __ext4_iget()
ext4: enable block size larger than page size
ext4: add checks for large folio incompatibilities when BS > PS
ext4: support verifying data from large folios with fs-verity
ext4: make data=journal support large block size
ext4: support large block size in __ext4_block_zero_page_range()
ext4: support large block size in mpage_prepare_extent_to_map()
ext4: support large block size in mpage_map_and_submit_buffers()
ext4: support large block size in ext4_block_write_begin()
ext4: support large block size in ext4_mpage_readpages()
ext4: rename 'page' references to 'folio' in multi-block allocator
ext4: prepare buddy cache inode for BS > PS with large folios
ext4: support large block size in ext4_mb_init_cache()
ext4: support large block size in ext4_mb_get_buddy_page_lock()
ext4: support large block size in ext4_mb_load_buddy_gfp()
ext4: add EXT4_LBLK_TO_PG and EXT4_PG_TO_LBLK for block/page conversion
ext4: add EXT4_LBLK_TO_B macro for logical block to bytes conversion
ext4: support large block size in ext4_readdir()
ext4: support large block size in ext4_calculate_overhead()
ext4: introduce s_min_folio_order for future BS > PS support
...

+887 -726
+2
Documentation/filesystems/ext4/inodes.rst
··· 297 297 - Inode has inline data (EXT4_INLINE_DATA_FL). 298 298 * - 0x20000000 299 299 - Create children with the same project ID (EXT4_PROJINHERIT_FL). 300 + * - 0x40000000 301 + - Use case-insensitive lookups for directory contents (EXT4_CASEFOLD_FL). 300 302 * - 0x80000000 301 303 - Reserved for ext4 library (EXT4_RESERVED_FL). 302 304 * -
+3 -1
Documentation/filesystems/ext4/super.rst
··· 671 671 * - 0x8000 672 672 - Data in inode (INCOMPAT_INLINE_DATA). 673 673 * - 0x10000 674 - - Encrypted inodes are present on the filesystem. (INCOMPAT_ENCRYPT). 674 + - Encrypted inodes can be present. (INCOMPAT_ENCRYPT). 675 + * - 0x20000 676 + - Directories can be marked case-insensitive. (INCOMPAT_CASEFOLD). 675 677 676 678 .. _super_rocompat: 677 679
+1 -1
fs/ext4/balloc.c
··· 752 752 *count = ar.len; 753 753 /* 754 754 * Account for the allocated meta blocks. We will never 755 - * fail EDQUOT for metdata, but we do account for it. 755 + * fail EDQUOT for metadata, but we do account for it. 756 756 */ 757 757 if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { 758 758 dquot_alloc_block_nofail(inode,
+4 -4
fs/ext4/dir.c
··· 192 192 continue; 193 193 } 194 194 if (err > 0) { 195 - pgoff_t index = map.m_pblk >> 196 - (PAGE_SHIFT - inode->i_blkbits); 195 + pgoff_t index = map.m_pblk << inode->i_blkbits >> 196 + PAGE_SHIFT; 197 197 if (!ra_has_index(&file->f_ra, index)) 198 198 page_cache_sync_readahead( 199 199 sb->s_bdev->bd_mapping, 200 - &file->f_ra, file, 201 - index, 1); 200 + &file->f_ra, file, index, 201 + 1 << EXT4_SB(sb)->s_min_folio_order); 202 202 file->f_ra.prev_pos = (loff_t)index << PAGE_SHIFT; 203 203 bh = ext4_bread(NULL, inode, map.m_lblk, 0); 204 204 if (IS_ERR(bh)) {
+30 -18
fs/ext4/ext4.h
··· 260 260 ext4_lblk_t m_lblk; 261 261 unsigned int m_len; 262 262 unsigned int m_flags; 263 + u64 m_seq; 263 264 }; 264 265 265 266 /* ··· 368 367 blkbits)) 369 368 #define EXT4_B_TO_LBLK(inode, offset) \ 370 369 (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) 370 + #define EXT4_LBLK_TO_B(inode, lblk) ((loff_t)(lblk) << (inode)->i_blkbits) 371 371 372 + /* Translate a block number to a page index */ 373 + #define EXT4_LBLK_TO_PG(inode, lblk) (EXT4_LBLK_TO_B((inode), (lblk)) >> \ 374 + PAGE_SHIFT) 375 + /* Translate a page index to a block number */ 376 + #define EXT4_PG_TO_LBLK(inode, pnum) (((loff_t)(pnum) << PAGE_SHIFT) >> \ 377 + (inode)->i_blkbits) 372 378 /* Translate a block number to a cluster number */ 373 379 #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) 374 380 /* Translate a cluster number to a block number */ ··· 702 694 /* Caller is from the delayed allocation writeout path 703 695 * finally doing the actual allocation of delayed blocks */ 704 696 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 705 - /* caller is from the direct IO path, request to creation of an 706 - unwritten extents if not allocated, split the unwritten 707 - extent if blocks has been preallocated already*/ 708 - #define EXT4_GET_BLOCKS_PRE_IO 0x0008 709 - #define EXT4_GET_BLOCKS_CONVERT 0x0010 710 - #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ 697 + /* 698 + * This means that we cannot merge newly allocated extents, and if we 699 + * found an unwritten extent, we need to split it. 700 + */ 701 + #define EXT4_GET_BLOCKS_SPLIT_NOMERGE 0x0008 702 + /* 703 + * Caller is from the dio or dioread_nolock buffered IO, reqest to 704 + * create an unwritten extent if it does not exist or split the 705 + * found unwritten extent. Also do not merge the newly created 706 + * unwritten extent, io end will convert unwritten to written, 707 + * and try to merge the written extent. 
708 + */ 709 + #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_SPLIT_NOMERGE|\ 711 710 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) 711 + /* Convert unwritten extent to initialized. */ 712 + #define EXT4_GET_BLOCKS_CONVERT 0x0010 712 713 /* Eventual metadata allocation (due to growing extent tree) 713 714 * should not fail, so try to use reserved blocks for that.*/ 714 715 #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 ··· 1155 1138 ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for 1156 1139 extents to shrink. Protected by 1157 1140 i_es_lock */ 1141 + u64 i_es_seq; /* Change counter for extents. 1142 + Protected by i_es_lock */ 1158 1143 1159 1144 /* ialloc */ 1160 1145 ext4_group_t i_last_alloc_group; ··· 1703 1684 1704 1685 /* record the last minlen when FITRIM is called. */ 1705 1686 unsigned long s_last_trim_minblks; 1687 + 1688 + /* minimum folio order of a page cache allocation */ 1689 + u16 s_min_folio_order; 1690 + /* supported maximum folio order, 0 means not supported */ 1691 + u16 s_max_folio_order; 1706 1692 1707 1693 /* Precomputed FS UUID checksum for seeding other checksums */ 1708 1694 __u32 s_csum_seed; ··· 2496 2472 return (rec_len & ~EXT4_DIR_ROUND); 2497 2473 } 2498 2474 2499 - /* 2500 - * If we ever get support for fs block sizes > page_size, we'll need 2501 - * to remove the #if statements in the next two functions... 
2502 - */ 2503 2475 static inline unsigned int 2504 2476 ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) 2505 2477 { 2506 2478 unsigned len = le16_to_cpu(dlen); 2507 2479 2508 - #if (PAGE_SIZE >= 65536) 2509 2480 if (len == EXT4_MAX_REC_LEN || len == 0) 2510 2481 return blocksize; 2511 2482 return (len & 65532) | ((len & 3) << 16); 2512 - #else 2513 - return len; 2514 - #endif 2515 2483 } 2516 2484 2517 2485 static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) 2518 2486 { 2519 2487 BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); 2520 - #if (PAGE_SIZE >= 65536) 2521 2488 if (len < 65536) 2522 2489 return cpu_to_le16(len); 2523 2490 if (len == blocksize) { ··· 2518 2503 return cpu_to_le16(0); 2519 2504 } 2520 2505 return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); 2521 - #else 2522 - return cpu_to_le16(len); 2523 - #endif 2524 2506 } 2525 2507 2526 2508 /*
+1 -2
fs/ext4/ext4_jbd2.c
··· 16 16 ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || 17 17 test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 18 18 (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && 19 - !test_opt(inode->i_sb, DELALLOC) && 20 - !mapping_large_folio_support(inode->i_mapping))) { 19 + !test_opt(inode->i_sb, DELALLOC))) { 21 20 /* We do not support data journalling for encrypted data */ 22 21 if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) 23 22 return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
+14 -14
fs/ext4/extents.c
··· 333 333 int nofail) 334 334 { 335 335 int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); 336 - int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; 336 + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_SPLIT_NOMERGE; 337 337 338 338 if (nofail) 339 339 flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; ··· 2002 2002 } 2003 2003 2004 2004 /* try to insert block into found extent and return */ 2005 - if (ex && !(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) { 2005 + if (ex && !(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) { 2006 2006 2007 2007 /* 2008 2008 * Try to see whether we should rather test the extent on ··· 2181 2181 2182 2182 merge: 2183 2183 /* try to merge extents */ 2184 - if (!(gb_flags & EXT4_GET_BLOCKS_PRE_IO)) 2184 + if (!(gb_flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) 2185 2185 ext4_ext_try_to_merge(handle, inode, path, nearex); 2186 2186 2187 2187 /* time to correct all indexes above */ ··· 2213 2213 while (block <= end) { 2214 2214 next = 0; 2215 2215 flags = 0; 2216 - if (!ext4_es_lookup_extent(inode, block, &next, &es)) 2216 + if (!ext4_es_lookup_extent(inode, block, &next, &es, NULL)) 2217 2217 break; 2218 2218 if (ext4_es_is_unwritten(&es)) 2219 2219 flags |= FIEMAP_EXTENT_UNWRITTEN; ··· 3224 3224 else 3225 3225 ext4_ext_mark_initialized(ex); 3226 3226 3227 - if (!(flags & EXT4_GET_BLOCKS_PRE_IO)) 3227 + if (!(flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE)) 3228 3228 ext4_ext_try_to_merge(handle, inode, path, ex); 3229 3229 3230 3230 err = ext4_ext_dirty(handle, inode, path + path->p_depth); ··· 3368 3368 3369 3369 if (map->m_lblk + map->m_len < ee_block + ee_len) { 3370 3370 split_flag1 = split_flag & EXT4_EXT_MAY_ZEROOUT; 3371 - flags1 = flags | EXT4_GET_BLOCKS_PRE_IO; 3371 + flags1 = flags | EXT4_GET_BLOCKS_SPLIT_NOMERGE; 3372 3372 if (unwritten) 3373 3373 split_flag1 |= EXT4_EXT_MARK_UNWRIT1 | 3374 3374 EXT4_EXT_MARK_UNWRIT2; ··· 3721 3721 >> inode->i_sb->s_blocksize_bits; 3722 3722 if (eof_block < map->m_lblk + map->m_len) 3723 3723 
eof_block = map->m_lblk + map->m_len; 3724 - /* 3725 - * It is safe to convert extent to initialized via explicit 3726 - * zeroout only if extent is fully inside i_size or new_size. 3727 - */ 3728 3724 depth = ext_depth(inode); 3729 3725 ex = path[depth].p_ext; 3730 3726 ee_block = le32_to_cpu(ex->ee_block); ··· 3731 3735 split_flag |= EXT4_EXT_DATA_VALID1; 3732 3736 /* Convert to initialized */ 3733 3737 } else if (flags & EXT4_GET_BLOCKS_CONVERT) { 3738 + /* 3739 + * It is safe to convert extent to initialized via explicit 3740 + * zeroout only if extent is fully inside i_size or new_size. 3741 + */ 3734 3742 split_flag |= ee_block + ee_len <= eof_block ? 3735 3743 EXT4_EXT_MAY_ZEROOUT : 0; 3736 3744 split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); 3737 3745 } 3738 - flags |= EXT4_GET_BLOCKS_PRE_IO; 3746 + flags |= EXT4_GET_BLOCKS_SPLIT_NOMERGE; 3739 3747 return ext4_split_extent(handle, inode, path, map, split_flag, flags, 3740 3748 allocated); 3741 3749 } ··· 3911 3911 *allocated, newblock); 3912 3912 3913 3913 /* get_block() before submitting IO, split the extent */ 3914 - if (flags & EXT4_GET_BLOCKS_PRE_IO) { 3914 + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE) { 3915 3915 path = ext4_split_convert_extents(handle, inode, map, path, 3916 3916 flags | EXT4_GET_BLOCKS_CONVERT, allocated); 3917 3917 if (IS_ERR(path)) ··· 4562 4562 * allow a full retry cycle for any remaining allocations 4563 4563 */ 4564 4564 retries = 0; 4565 - epos = (loff_t)(map.m_lblk + ret) << blkbits; 4565 + epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret); 4566 4566 inode_set_ctime_current(inode); 4567 4567 if (new_size) { 4568 4568 if (epos > new_size) ··· 5618 5618 path = ext4_split_extent_at(handle, inode, path, 5619 5619 start_lblk, split_flag, 5620 5620 EXT4_EX_NOCACHE | 5621 - EXT4_GET_BLOCKS_PRE_IO | 5621 + EXT4_GET_BLOCKS_SPLIT_NOMERGE | 5622 5622 EXT4_GET_BLOCKS_METADATA_NOFAIL); 5623 5623 } 5624 5624
+25 -6
fs/ext4/extents_status.c
··· 235 235 return es->es_lblk + es->es_len - 1; 236 236 } 237 237 238 + static inline void ext4_es_inc_seq(struct inode *inode) 239 + { 240 + struct ext4_inode_info *ei = EXT4_I(inode); 241 + 242 + WRITE_ONCE(ei->i_es_seq, ei->i_es_seq + 1); 243 + } 244 + 238 245 /* 239 246 * search through the tree for an delayed extent with a given offset. If 240 247 * it can't be found, try to find next extent. ··· 913 906 newes.es_lblk = lblk; 914 907 newes.es_len = len; 915 908 ext4_es_store_pblock_status(&newes, pblk, status); 916 - trace_ext4_es_insert_extent(inode, &newes); 917 909 918 910 ext4_es_insert_extent_check(inode, &newes); 919 911 ··· 961 955 } 962 956 pending = err3; 963 957 } 958 + /* 959 + * TODO: For cache on-disk extents, there is no need to increment 960 + * the sequence counter, this requires future optimization. 961 + */ 962 + ext4_es_inc_seq(inode); 964 963 error: 965 964 write_unlock(&EXT4_I(inode)->i_es_lock); 966 965 /* ··· 992 981 if (err1 || err2 || err3 < 0) 993 982 goto retry; 994 983 984 + trace_ext4_es_insert_extent(inode, &newes); 995 985 ext4_es_print_tree(inode); 996 986 return; 997 987 } ··· 1039 1027 * Return: 1 on found, 0 on not 1040 1028 */ 1041 1029 int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 1042 - ext4_lblk_t *next_lblk, 1043 - struct extent_status *es) 1030 + ext4_lblk_t *next_lblk, struct extent_status *es, 1031 + u64 *pseq) 1044 1032 { 1045 1033 struct ext4_es_tree *tree; 1046 1034 struct ext4_es_stats *stats; ··· 1099 1087 } else 1100 1088 *next_lblk = 0; 1101 1089 } 1090 + if (pseq) 1091 + *pseq = EXT4_I(inode)->i_es_seq; 1102 1092 } else { 1103 1093 percpu_counter_inc(&stats->es_stats_cache_misses); 1104 1094 } ··· 1564 1550 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 1565 1551 return; 1566 1552 1567 - trace_ext4_es_remove_extent(inode, lblk, len); 1568 1553 es_debug("remove [%u/%u) from extent status tree of inode %lu\n", 1569 1554 lblk, len, inode->i_ino); 1570 1555 ··· 1583 1570 */ 1584 1571 
write_lock(&EXT4_I(inode)->i_es_lock); 1585 1572 err = __es_remove_extent(inode, lblk, end, &reserved, es); 1573 + if (err) 1574 + goto error; 1586 1575 /* Free preallocated extent if it didn't get used. */ 1587 1576 if (es) { 1588 1577 if (!es->es_len) 1589 1578 __es_free_extent(es); 1590 1579 es = NULL; 1591 1580 } 1581 + ext4_es_inc_seq(inode); 1582 + error: 1592 1583 write_unlock(&EXT4_I(inode)->i_es_lock); 1593 1584 if (err) 1594 1585 goto retry; 1595 1586 1587 + trace_ext4_es_remove_extent(inode, lblk, len); 1596 1588 ext4_es_print_tree(inode); 1597 1589 ext4_da_release_space(inode, reserved); 1598 1590 } ··· 2158 2140 newes.es_lblk = lblk; 2159 2141 newes.es_len = len; 2160 2142 ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); 2161 - trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, 2162 - end_allocated); 2163 2143 2164 2144 ext4_es_insert_extent_check(inode, &newes); 2165 2145 ··· 2212 2196 pr2 = NULL; 2213 2197 } 2214 2198 } 2199 + ext4_es_inc_seq(inode); 2215 2200 error: 2216 2201 write_unlock(&EXT4_I(inode)->i_es_lock); 2217 2202 if (err1 || err2 || err3 < 0) 2218 2203 goto retry; 2219 2204 2205 + trace_ext4_es_insert_delayed_extent(inode, &newes, lclu_allocated, 2206 + end_allocated); 2220 2207 ext4_es_print_tree(inode); 2221 2208 ext4_print_pending_tree(inode); 2222 2209 return;
+1 -1
fs/ext4/extents_status.h
··· 148 148 struct extent_status *es); 149 149 extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, 150 150 ext4_lblk_t *next_lblk, 151 - struct extent_status *es); 151 + struct extent_status *es, u64 *pseq); 152 152 extern bool ext4_es_scan_range(struct inode *inode, 153 153 int (*matching_fn)(struct extent_status *es), 154 154 ext4_lblk_t lblk, ext4_lblk_t end);
+1 -1
fs/ext4/hash.c
··· 268 268 combined_hash = fscrypt_fname_siphash(dir, &qname); 269 269 } else { 270 270 ext4_warning_inode(dir, "Siphash requires key"); 271 - return -1; 271 + return -EINVAL; 272 272 } 273 273 274 274 hash = (__u32)(combined_hash >> 32);
-1
fs/ext4/ialloc.c
··· 1293 1293 ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); 1294 1294 } 1295 1295 1296 - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 1297 1296 ext4_set_inode_state(inode, EXT4_STATE_NEW); 1298 1297 1299 1298 ei->i_extra_isize = sbi->s_want_extra_isize;
+12 -2
fs/ext4/inline.c
··· 418 418 return -ENOSPC; 419 419 420 420 ext4_write_lock_xattr(inode, &no_expand); 421 - 421 + /* 422 + * ei->i_inline_size may have changed since the initial check 423 + * if other xattrs were added. Recalculate to ensure 424 + * ext4_update_inline_data() validates against current capacity. 425 + */ 426 + (void) ext4_find_inline_data_nolock(inode); 422 427 if (ei->i_inline_off) 423 428 ret = ext4_update_inline_data(handle, inode, len); 424 429 else ··· 451 446 if (!ei->i_inline_off) 452 447 return 0; 453 448 449 + down_write(&ei->i_data_sem); 450 + 454 451 error = ext4_get_inode_loc(inode, &is.iloc); 455 - if (error) 452 + if (error) { 453 + up_write(&ei->i_data_sem); 456 454 return error; 455 + } 457 456 458 457 error = ext4_xattr_ibody_find(inode, &i, &is); 459 458 if (error) ··· 496 487 brelse(is.iloc.bh); 497 488 if (error == -ENODATA) 498 489 error = 0; 490 + up_write(&ei->i_data_sem); 499 491 return error; 500 492 } 501 493
+79 -90
fs/ext4/inode.c
··· 549 549 retval = ext4_ext_map_blocks(handle, inode, map, flags); 550 550 else 551 551 retval = ext4_ind_map_blocks(handle, inode, map, flags); 552 - 553 - if (retval <= 0) 552 + if (retval < 0) 554 553 return retval; 554 + 555 + /* A hole? */ 556 + if (retval == 0) 557 + goto out; 555 558 556 559 if (unlikely(retval != map->m_len)) { 557 560 ext4_warning(inode->i_sb, ··· 575 572 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 576 573 ext4_es_insert_extent(inode, map->m_lblk, map->m_len, 577 574 map->m_pblk, status, false); 578 - return retval; 575 + } else { 576 + retval = ext4_map_query_blocks_next_in_leaf(handle, inode, map, 577 + orig_mlen); 579 578 } 580 - 581 - return ext4_map_query_blocks_next_in_leaf(handle, inode, map, 582 - orig_mlen); 579 + out: 580 + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 581 + return retval; 583 582 } 584 583 585 584 static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, ··· 652 647 * If the extent has been zeroed out, we don't need to update 653 648 * extent status tree. 
654 649 */ 655 - if (flags & EXT4_GET_BLOCKS_PRE_IO && 656 - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 650 + if (flags & EXT4_GET_BLOCKS_SPLIT_NOMERGE && 651 + ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { 657 652 if (ext4_es_is_written(&es)) 658 653 return retval; 659 654 } ··· 662 657 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 663 658 ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, 664 659 status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE); 660 + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 665 661 666 662 return retval; 667 663 } ··· 728 722 ext4_check_map_extents_env(inode); 729 723 730 724 /* Lookup extent status tree firstly */ 731 - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 725 + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, &map->m_seq)) { 732 726 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 733 727 map->m_pblk = ext4_es_pblock(&es) + 734 728 map->m_lblk - es.es_lblk; ··· 815 809 down_write(&EXT4_I(inode)->i_data_sem); 816 810 retval = ext4_map_create_blocks(handle, inode, map, flags); 817 811 up_write((&EXT4_I(inode)->i_data_sem)); 818 - if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { 812 + 813 + if (retval < 0) 814 + ext_debug(inode, "failed with err %d\n", retval); 815 + if (retval <= 0) 816 + return retval; 817 + 818 + if (map->m_flags & EXT4_MAP_MAPPED) { 819 819 ret = check_block_validity(inode, map); 820 820 if (ret != 0) 821 821 return ret; ··· 836 824 !(flags & EXT4_GET_BLOCKS_ZERO) && 837 825 !ext4_is_quota_file(inode) && 838 826 ext4_should_order_data(inode)) { 839 - loff_t start_byte = 840 - (loff_t)map->m_lblk << inode->i_blkbits; 841 - loff_t length = (loff_t)map->m_len << inode->i_blkbits; 827 + loff_t start_byte = EXT4_LBLK_TO_B(inode, map->m_lblk); 828 + loff_t length = EXT4_LBLK_TO_B(inode, map->m_len); 842 829 843 830 if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) 844 831 ret = ext4_jbd2_inode_add_wait(handle, inode, ··· 849 838 return 
ret; 850 839 } 851 840 } 852 - if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || 853 - map->m_flags & EXT4_MAP_MAPPED)) 854 - ext4_fc_track_range(handle, inode, map->m_lblk, 855 - map->m_lblk + map->m_len - 1); 856 - if (retval < 0) 857 - ext_debug(inode, "failed with err %d\n", retval); 841 + ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + 842 + map->m_len - 1); 858 843 return retval; 859 844 } 860 845 ··· 1169 1162 unsigned block_start, block_end; 1170 1163 sector_t block; 1171 1164 int err = 0; 1172 - unsigned blocksize = inode->i_sb->s_blocksize; 1173 - unsigned bbits; 1165 + unsigned int blocksize = i_blocksize(inode); 1174 1166 struct buffer_head *bh, *head, *wait[2]; 1175 1167 int nr_wait = 0; 1176 1168 int i; ··· 1178 1172 BUG_ON(!folio_test_locked(folio)); 1179 1173 BUG_ON(to > folio_size(folio)); 1180 1174 BUG_ON(from > to); 1175 + WARN_ON_ONCE(blocksize > folio_size(folio)); 1181 1176 1182 1177 head = folio_buffers(folio); 1183 1178 if (!head) 1184 1179 head = create_empty_buffers(folio, blocksize, 0); 1185 - bbits = ilog2(blocksize); 1186 - block = (sector_t)folio->index << (PAGE_SHIFT - bbits); 1180 + block = EXT4_PG_TO_LBLK(inode, folio->index); 1187 1181 1188 1182 for (bh = head, block_start = 0; bh != head || !block_start; 1189 1183 block++, block_start = block_end, bh = bh->b_this_page) { ··· 1913 1907 ext4_check_map_extents_env(inode); 1914 1908 1915 1909 /* Lookup extent status tree firstly */ 1916 - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 1910 + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { 1917 1911 map->m_len = min_t(unsigned int, map->m_len, 1918 1912 es.es_len - (map->m_lblk - es.es_lblk)); 1919 1913 ··· 1966 1960 * is held in write mode, before inserting a new da entry in 1967 1961 * the extent status tree. 
1968 1962 */ 1969 - if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 1963 + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es, NULL)) { 1970 1964 map->m_len = min_t(unsigned int, map->m_len, 1971 1965 es.es_len - (map->m_lblk - es.es_lblk)); 1972 1966 ··· 1984 1978 1985 1979 map->m_flags |= EXT4_MAP_DELAYED; 1986 1980 retval = ext4_insert_delayed_blocks(inode, map->m_lblk, map->m_len); 1981 + if (!retval) 1982 + map->m_seq = READ_ONCE(EXT4_I(inode)->i_es_seq); 1987 1983 up_write(&EXT4_I(inode)->i_data_sem); 1988 1984 1989 1985 return retval; ··· 2232 2224 ext4_lblk_t lblk = *m_lblk; 2233 2225 ext4_fsblk_t pblock = *m_pblk; 2234 2226 int err = 0; 2235 - int blkbits = mpd->inode->i_blkbits; 2236 2227 ssize_t io_end_size = 0; 2237 2228 struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); 2238 2229 ··· 2257 2250 err = PTR_ERR(io_end_vec); 2258 2251 goto out; 2259 2252 } 2260 - io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; 2253 + io_end_vec->offset = EXT4_LBLK_TO_B(mpd->inode, 2254 + mpd->map.m_lblk); 2261 2255 } 2262 2256 *map_bh = true; 2263 2257 goto out; ··· 2268 2260 bh->b_blocknr = pblock++; 2269 2261 } 2270 2262 clear_buffer_unwritten(bh); 2271 - io_end_size += (1 << blkbits); 2263 + io_end_size += i_blocksize(mpd->inode); 2272 2264 } while (lblk++, (bh = bh->b_this_page) != head); 2273 2265 2274 2266 io_end_vec->size += io_end_size; ··· 2298 2290 struct folio_batch fbatch; 2299 2291 unsigned nr, i; 2300 2292 struct inode *inode = mpd->inode; 2301 - int bpp_bits = PAGE_SHIFT - inode->i_blkbits; 2302 2293 pgoff_t start, end; 2303 2294 ext4_lblk_t lblk; 2304 2295 ext4_fsblk_t pblock; 2305 2296 int err; 2306 2297 bool map_bh = false; 2307 2298 2308 - start = mpd->map.m_lblk >> bpp_bits; 2309 - end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; 2299 + start = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk); 2300 + end = EXT4_LBLK_TO_PG(inode, mpd->map.m_lblk + mpd->map.m_len - 1); 2310 2301 pblock = mpd->map.m_pblk; 2311 2302 
2312 2303 folio_batch_init(&fbatch); ··· 2316 2309 for (i = 0; i < nr; i++) { 2317 2310 struct folio *folio = fbatch.folios[i]; 2318 2311 2319 - lblk = folio->index << bpp_bits; 2312 + lblk = EXT4_PG_TO_LBLK(inode, folio->index); 2320 2313 err = mpage_process_folio(mpd, folio, &lblk, &pblock, 2321 2314 &map_bh); 2322 2315 /* ··· 2469 2462 io_end_vec = ext4_alloc_io_end_vec(io_end); 2470 2463 if (IS_ERR(io_end_vec)) 2471 2464 return PTR_ERR(io_end_vec); 2472 - io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; 2465 + io_end_vec->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); 2473 2466 do { 2474 2467 err = mpage_map_one_extent(handle, mpd); 2475 2468 if (err < 0) { ··· 2619 2612 pgoff_t end = mpd->end_pos >> PAGE_SHIFT; 2620 2613 xa_mark_t tag; 2621 2614 int i, err = 0; 2622 - int blkbits = mpd->inode->i_blkbits; 2623 2615 ext4_lblk_t lblk; 2624 2616 struct buffer_head *head; 2625 2617 handle_t *handle = NULL; ··· 2654 2648 */ 2655 2649 if (mpd->wbc->sync_mode == WB_SYNC_NONE && 2656 2650 mpd->wbc->nr_to_write <= 2657 - mpd->map.m_len >> (PAGE_SHIFT - blkbits)) 2651 + EXT4_LBLK_TO_PG(mpd->inode, mpd->map.m_len)) 2658 2652 goto out; 2659 2653 2660 2654 /* If we can't merge this page, we are done. 
*/ ··· 2732 2726 mpage_folio_done(mpd, folio); 2733 2727 } else { 2734 2728 /* Add all dirty buffers to mpd */ 2735 - lblk = ((ext4_lblk_t)folio->index) << 2736 - (PAGE_SHIFT - blkbits); 2729 + lblk = EXT4_PG_TO_LBLK(mpd->inode, folio->index); 2737 2730 head = folio_buffers(folio); 2738 2731 err = mpage_process_page_bufs(mpd, head, head, 2739 2732 lblk); ··· 3504 3499 iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; 3505 3500 else 3506 3501 iomap->bdev = inode->i_sb->s_bdev; 3507 - iomap->offset = (u64) map->m_lblk << blkbits; 3508 - iomap->length = (u64) map->m_len << blkbits; 3502 + iomap->offset = EXT4_LBLK_TO_B(inode, map->m_lblk); 3503 + iomap->length = EXT4_LBLK_TO_B(inode, map->m_len); 3509 3504 3510 3505 if ((map->m_flags & EXT4_MAP_MAPPED) && 3511 3506 !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ··· 3545 3540 ext4_lblk_t m_lblk = map->m_lblk; 3546 3541 unsigned int m_len = map->m_len; 3547 3542 unsigned int mapped_len = 0, m_flags = 0; 3548 - ext4_fsblk_t next_pblk; 3543 + ext4_fsblk_t next_pblk = 0; 3549 3544 bool check_next_pblk = false; 3550 3545 int ret = 0; 3551 3546 ··· 3679 3674 unsigned int flags) 3680 3675 { 3681 3676 handle_t *handle; 3682 - u8 blkbits = inode->i_blkbits; 3683 3677 int ret, dio_credits, m_flags = 0, retries = 0; 3684 3678 bool force_commit = false; 3685 3679 ··· 3737 3733 * i_disksize out to i_size. This could be beyond where direct I/O is 3738 3734 * happening and thus expose allocated blocks to direct I/O reads. 
3739 3735 */ 3740 - else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) 3736 + else if (EXT4_LBLK_TO_B(inode, map->m_lblk) >= i_size_read(inode)) 3741 3737 m_flags = EXT4_GET_BLOCKS_CREATE; 3742 3738 else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3743 3739 m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ··· 4072 4068 4073 4069 blocksize = inode->i_sb->s_blocksize; 4074 4070 4075 - iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); 4071 + iblock = EXT4_PG_TO_LBLK(inode, folio->index); 4076 4072 4077 4073 bh = folio_buffers(folio); 4078 4074 if (!bh) ··· 4157 4153 struct address_space *mapping, loff_t from, loff_t length) 4158 4154 { 4159 4155 struct inode *inode = mapping->host; 4160 - unsigned offset = from & (PAGE_SIZE-1); 4161 4156 unsigned blocksize = inode->i_sb->s_blocksize; 4162 - unsigned max = blocksize - (offset & (blocksize - 1)); 4157 + unsigned int max = blocksize - (from & (blocksize - 1)); 4163 4158 4164 4159 /* 4165 4160 * correct length if it does not fall between ··· 4183 4180 static int ext4_block_truncate_page(handle_t *handle, 4184 4181 struct address_space *mapping, loff_t from) 4185 4182 { 4186 - unsigned offset = from & (PAGE_SIZE-1); 4187 4183 unsigned length; 4188 4184 unsigned blocksize; 4189 4185 struct inode *inode = mapping->host; ··· 4191 4189 if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) 4192 4190 return 0; 4193 4191 4194 - blocksize = inode->i_sb->s_blocksize; 4195 - length = blocksize - (offset & (blocksize - 1)); 4192 + blocksize = i_blocksize(inode); 4193 + length = blocksize - (from & (blocksize - 1)); 4196 4194 4197 4195 return ext4_block_zero_page_range(handle, mapping, from, length); 4198 4196 } ··· 4398 4396 4399 4397 /* 4400 4398 * If the hole extends beyond i_size, set the hole to end after 4401 - * the page that contains i_size. 4399 + * the block that contains i_size to save pointless tail block zeroing. 
4402 4400 */ 4403 - if (end > inode->i_size) 4404 - end = round_up(inode->i_size, PAGE_SIZE); 4401 + if (end >= inode->i_size) 4402 + end = round_up(inode->i_size, sb->s_blocksize); 4405 4403 if (end > max_end) 4406 4404 end = max_end; 4407 4405 length = end - offset; ··· 5144 5142 return -EFSCORRUPTED; 5145 5143 } 5146 5144 5147 - static bool ext4_should_enable_large_folio(struct inode *inode) 5148 - { 5149 - struct super_block *sb = inode->i_sb; 5150 - 5151 - if (!S_ISREG(inode->i_mode)) 5152 - return false; 5153 - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || 5154 - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 5155 - return false; 5156 - if (ext4_has_feature_verity(sb)) 5157 - return false; 5158 - if (ext4_has_feature_encrypt(sb)) 5159 - return false; 5160 - 5161 - return true; 5162 - } 5163 - 5164 - /* 5165 - * Limit the maximum folio order to 2048 blocks to prevent overestimation 5166 - * of reserve handle credits during the folio writeback in environments 5167 - * where the PAGE_SIZE exceeds 4KB. 
5168 - */ 5169 - #define EXT4_MAX_PAGECACHE_ORDER(i) \ 5170 - umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) 5171 5145 void ext4_set_inode_mapping_order(struct inode *inode) 5172 5146 { 5173 - if (!ext4_should_enable_large_folio(inode)) 5147 + struct super_block *sb = inode->i_sb; 5148 + u16 min_order, max_order; 5149 + 5150 + max_order = EXT4_SB(sb)->s_max_folio_order; 5151 + if (!max_order) 5174 5152 return; 5175 5153 5176 - mapping_set_folio_order_range(inode->i_mapping, 0, 5177 - EXT4_MAX_PAGECACHE_ORDER(inode)); 5154 + min_order = EXT4_SB(sb)->s_min_folio_order; 5155 + if (!min_order && !S_ISREG(inode->i_mode)) 5156 + return; 5157 + 5158 + if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) 5159 + max_order = min_order; 5160 + 5161 + mapping_set_folio_order_range(inode->i_mapping, min_order, max_order); 5178 5162 } 5179 5163 5180 5164 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ··· 5272 5284 ei->i_projid = make_kprojid(&init_user_ns, i_projid); 5273 5285 set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); 5274 5286 5275 - ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 5276 5287 ei->i_inline_off = 0; 5277 5288 ei->i_dir_start_lookup = 0; 5278 5289 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); ··· 5504 5517 if (ret) 5505 5518 goto bad_inode; 5506 5519 brelse(iloc.bh); 5507 - 5520 + /* Initialize the "no ACL's" state for the simple cases */ 5521 + if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) && !ei->i_file_acl) 5522 + cache_no_acl(inode); 5508 5523 unlock_new_inode(inode); 5509 5524 return inode; 5510 5525 ··· 6537 6548 * dirty data which can be converted only after flushing the dirty 6538 6549 * data (and journalled aops don't know how to handle these cases). 
6539 6550 */ 6540 - if (val) { 6541 - filemap_invalidate_lock(inode->i_mapping); 6542 - err = filemap_write_and_wait(inode->i_mapping); 6543 - if (err < 0) { 6544 - filemap_invalidate_unlock(inode->i_mapping); 6545 - return err; 6546 - } 6551 + filemap_invalidate_lock(inode->i_mapping); 6552 + err = filemap_write_and_wait(inode->i_mapping); 6553 + if (err < 0) { 6554 + filemap_invalidate_unlock(inode->i_mapping); 6555 + return err; 6547 6556 } 6557 + /* Before switch the inode journalling mode evict all the page cache. */ 6558 + truncate_pagecache(inode, 0); 6548 6559 6549 6560 alloc_ctx = ext4_writepages_down_write(inode->i_sb); 6550 6561 jbd2_journal_lock_updates(journal); ··· 6564 6575 if (err < 0) { 6565 6576 jbd2_journal_unlock_updates(journal); 6566 6577 ext4_writepages_up_write(inode->i_sb, alloc_ctx); 6578 + filemap_invalidate_unlock(inode->i_mapping); 6567 6579 return err; 6568 6580 } 6569 6581 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); 6570 6582 } 6571 6583 ext4_set_aops(inode); 6584 + ext4_set_inode_mapping_order(inode); 6572 6585 6573 6586 jbd2_journal_unlock_updates(journal); 6574 6587 ext4_writepages_up_write(inode->i_sb, alloc_ctx); 6575 - 6576 - if (val) 6577 - filemap_invalidate_unlock(inode->i_mapping); 6588 + filemap_invalidate_unlock(inode->i_mapping); 6578 6589 6579 6590 /* Finally we can mark the inode as dirty. */ 6580 6591
+4 -10
fs/ext4/ioctl.c
··· 1394 1394 if (copy_from_user(&params, in, sizeof(params))) 1395 1395 return -EFAULT; 1396 1396 1397 + if (strnlen(params.mount_opts, sizeof(params.mount_opts)) == 1398 + sizeof(params.mount_opts)) 1399 + return -E2BIG; 1400 + 1397 1401 if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) 1398 1402 return -EOPNOTSUPP; 1399 1403 ··· 1644 1640 1645 1641 if (!(fd_file(donor)->f_mode & FMODE_WRITE)) 1646 1642 return -EBADF; 1647 - 1648 - if (ext4_has_feature_bigalloc(sb)) { 1649 - ext4_msg(sb, KERN_ERR, 1650 - "Online defrag not supported with bigalloc"); 1651 - return -EOPNOTSUPP; 1652 - } else if (IS_DAX(inode)) { 1653 - ext4_msg(sb, KERN_ERR, 1654 - "Online defrag not supported with DAX"); 1655 - return -EOPNOTSUPP; 1656 - } 1657 1643 1658 1644 err = mnt_want_write_file(filp); 1659 1645 if (err)
+105 -83
fs/ext4/mballoc.c
··· 98 98 * block bitmap and buddy information. The information are stored in the 99 99 * inode as: 100 100 * 101 - * { page } 101 + * { folio } 102 102 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 103 103 * 104 104 * 105 105 * one block each for bitmap and buddy information. So for each group we 106 - * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / 107 - * blocksize) blocks. So it can have information regarding groups_per_page 108 - * which is blocks_per_page/2 106 + * take up 2 blocks. A folio can contain blocks_per_folio (folio_size / 107 + * blocksize) blocks. So it can have information regarding groups_per_folio 108 + * which is blocks_per_folio/2 109 109 * 110 110 * The buddy cache inode is not stored on disk. The inode is thrown 111 111 * away when the filesystem is unmounted. ··· 682 682 } \ 683 683 } while (0) 684 684 685 + /* 686 + * Perform buddy integrity check with the following steps: 687 + * 688 + * 1. Top-down validation (from highest order down to order 1, excluding order-0 bitmap): 689 + * For each pair of adjacent orders, if a higher-order bit is set (indicating a free block), 690 + * at most one of the two corresponding lower-order bits may be clear (free). 691 + * 692 + * 2. Order-0 (bitmap) validation, performed on bit pairs: 693 + * - If either bit in a pair is set (1, allocated), then all corresponding higher-order bits 694 + * must not be free (0). 695 + * - If both bits in a pair are clear (0, free), then exactly one of the corresponding 696 + * higher-order bits must be free (0). 697 + * 698 + * 3. Preallocation (pa) list validation: 699 + * For each preallocated block (pa) in the group: 700 + * - Verify that pa_pstart falls within the bounds of this block group. 701 + * - Ensure the corresponding bit(s) in the order-0 bitmap are marked as allocated (1). 
702 + */ 685 703 static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, 686 704 const char *function, int line) 687 705 { ··· 741 723 continue; 742 724 } 743 725 744 - /* both bits in buddy2 must be 1 */ 745 - MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); 746 - MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); 747 - 748 - for (j = 0; j < (1 << order); j++) { 749 - k = (i * (1 << order)) + j; 750 - MB_CHECK_ASSERT( 751 - !mb_test_bit(k, e4b->bd_bitmap)); 752 - } 753 726 count++; 754 727 } 755 728 MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); ··· 756 747 fragments++; 757 748 fstart = i; 758 749 } 759 - continue; 750 + } else { 751 + fstart = -1; 760 752 } 761 - fstart = -1; 762 - /* check used bits only */ 763 - for (j = 0; j < e4b->bd_blkbits + 1; j++) { 764 - buddy2 = mb_find_buddy(e4b, j, &max2); 765 - k = i >> j; 766 - MB_CHECK_ASSERT(k < max2); 767 - MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); 753 + if (!(i & 1)) { 754 + int in_use, zero_bit_count = 0; 755 + 756 + in_use = mb_test_bit(i, buddy) || mb_test_bit(i + 1, buddy); 757 + for (j = 1; j < e4b->bd_blkbits + 2; j++) { 758 + buddy2 = mb_find_buddy(e4b, j, &max2); 759 + k = i >> j; 760 + MB_CHECK_ASSERT(k < max2); 761 + if (!mb_test_bit(k, buddy2)) 762 + zero_bit_count++; 763 + } 764 + MB_CHECK_ASSERT(zero_bit_count == !in_use); 768 765 } 769 766 } 770 767 MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); ··· 783 768 ext4_group_t groupnr; 784 769 struct ext4_prealloc_space *pa; 785 770 pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); 771 + if (!pa->pa_len) 772 + continue; 786 773 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); 787 774 MB_CHECK_ASSERT(groupnr == e4b->bd_group); 788 775 for (i = 0; i < pa->pa_len; i++) ··· 1346 1329 * block bitmap and buddy information. The information are 1347 1330 * stored in the inode as 1348 1331 * 1349 - * { page } 1332 + * { folio } 1350 1333 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... 
1351 1334 * 1352 1335 * 1353 1336 * one block each for bitmap and buddy information. 1354 - * So for each group we take up 2 blocks. A page can 1355 - * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. 1356 - * So it can have information regarding groups_per_page which 1357 - * is blocks_per_page/2 1337 + * So for each group we take up 2 blocks. A folio can 1338 + * contain blocks_per_folio (folio_size / blocksize) blocks. 1339 + * So it can have information regarding groups_per_folio which 1340 + * is blocks_per_folio/2 1358 1341 * 1359 1342 * Locking note: This routine takes the block group lock of all groups 1360 - * for this page; do not hold this lock when calling this routine! 1343 + * for this folio; do not hold this lock when calling this routine! 1361 1344 */ 1362 - 1363 1345 static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) 1364 1346 { 1365 1347 ext4_group_t ngroups; 1366 1348 unsigned int blocksize; 1367 - int blocks_per_page; 1368 - int groups_per_page; 1349 + int blocks_per_folio; 1350 + int groups_per_folio; 1369 1351 int err = 0; 1370 1352 int i; 1371 1353 ext4_group_t first_group, group; ··· 1381 1365 sb = inode->i_sb; 1382 1366 ngroups = ext4_get_groups_count(sb); 1383 1367 blocksize = i_blocksize(inode); 1384 - blocks_per_page = PAGE_SIZE / blocksize; 1368 + blocks_per_folio = folio_size(folio) / blocksize; 1369 + WARN_ON_ONCE(!blocks_per_folio); 1370 + groups_per_folio = DIV_ROUND_UP(blocks_per_folio, 2); 1385 1371 1386 1372 mb_debug(sb, "init folio %lu\n", folio->index); 1387 1373 1388 - groups_per_page = blocks_per_page >> 1; 1389 - if (groups_per_page == 0) 1390 - groups_per_page = 1; 1391 - 1392 1374 /* allocate buffer_heads to read bitmaps */ 1393 - if (groups_per_page > 1) { 1394 - i = sizeof(struct buffer_head *) * groups_per_page; 1375 + if (groups_per_folio > 1) { 1376 + i = sizeof(struct buffer_head *) * groups_per_folio; 1395 1377 bh = kzalloc(i, gfp); 1396 1378 if (bh == NULL) 1397 1379 return -ENOMEM; 
1398 1380 } else 1399 1381 bh = &bhs; 1400 1382 1401 - first_group = folio->index * blocks_per_page / 2; 1402 - 1403 1383 /* read all groups the folio covers into the cache */ 1404 - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1384 + first_group = EXT4_PG_TO_LBLK(inode, folio->index) / 2; 1385 + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { 1405 1386 if (group >= ngroups) 1406 1387 break; 1407 1388 ··· 1406 1393 if (!grinfo) 1407 1394 continue; 1408 1395 /* 1409 - * If page is uptodate then we came here after online resize 1396 + * If folio is uptodate then we came here after online resize 1410 1397 * which added some new uninitialized group info structs, so 1411 1398 * we must skip all initialized uptodate buddies on the folio, 1412 1399 * which may be currently in use by an allocating task. ··· 1426 1413 } 1427 1414 1428 1415 /* wait for I/O completion */ 1429 - for (i = 0, group = first_group; i < groups_per_page; i++, group++) { 1416 + for (i = 0, group = first_group; i < groups_per_folio; i++, group++) { 1430 1417 int err2; 1431 1418 1432 1419 if (!bh[i]) ··· 1436 1423 err = err2; 1437 1424 } 1438 1425 1439 - first_block = folio->index * blocks_per_page; 1440 - for (i = 0; i < blocks_per_page; i++) { 1426 + first_block = EXT4_PG_TO_LBLK(inode, folio->index); 1427 + for (i = 0; i < blocks_per_folio; i++) { 1441 1428 group = (first_block + i) >> 1; 1442 1429 if (group >= ngroups) 1443 1430 break; ··· 1514 1501 1515 1502 out: 1516 1503 if (bh) { 1517 - for (i = 0; i < groups_per_page; i++) 1504 + for (i = 0; i < groups_per_folio; i++) 1518 1505 brelse(bh[i]); 1519 1506 if (bh != &bhs) 1520 1507 kfree(bh); ··· 1523 1510 } 1524 1511 1525 1512 /* 1526 - * Lock the buddy and bitmap pages. This make sure other parallel init_group 1527 - * on the same buddy page doesn't happen whild holding the buddy page lock. 1528 - * Return locked buddy and bitmap pages on e4b struct. 
If buddy and bitmap 1529 - * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. 1513 + * Lock the buddy and bitmap folios. This makes sure other parallel init_group 1514 + * on the same buddy folio doesn't happen while holding the buddy folio lock. 1515 + * Return locked buddy and bitmap folios on e4b struct. If buddy and bitmap 1516 + * are on the same folio e4b->bd_buddy_folio is NULL and return value is 0. 1530 1517 */ 1531 - static int ext4_mb_get_buddy_page_lock(struct super_block *sb, 1518 + static int ext4_mb_get_buddy_folio_lock(struct super_block *sb, 1532 1519 ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) 1533 1520 { 1534 1521 struct inode *inode = EXT4_SB(sb)->s_buddy_cache; 1535 - int block, pnum, poff; 1536 - int blocks_per_page; 1522 + int block, pnum; 1537 1523 struct folio *folio; 1538 1524 1539 1525 e4b->bd_buddy_folio = NULL; 1540 1526 e4b->bd_bitmap_folio = NULL; 1541 1527 1542 - blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1543 1528 /* 1544 1529 * the buddy cache inode stores the block bitmap 1545 1530 * and buddy information in consecutive blocks. 1546 1531 * So for each group we need two blocks. 
1547 1532 */ 1548 1533 block = group * 2; 1549 - pnum = block / blocks_per_page; 1550 - poff = block % blocks_per_page; 1534 + pnum = EXT4_LBLK_TO_PG(inode, block); 1551 1535 folio = __filemap_get_folio(inode->i_mapping, pnum, 1552 1536 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1553 1537 if (IS_ERR(folio)) 1554 1538 return PTR_ERR(folio); 1555 1539 BUG_ON(folio->mapping != inode->i_mapping); 1540 + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); 1556 1541 e4b->bd_bitmap_folio = folio; 1557 - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); 1542 + e4b->bd_bitmap = folio_address(folio) + 1543 + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); 1558 1544 1559 - if (blocks_per_page >= 2) { 1560 - /* buddy and bitmap are on the same page */ 1545 + block++; 1546 + pnum = EXT4_LBLK_TO_PG(inode, block); 1547 + if (folio_contains(folio, pnum)) { 1548 + /* buddy and bitmap are on the same folio */ 1561 1549 return 0; 1562 1550 } 1563 1551 1564 - /* blocks_per_page == 1, hence we need another page for the buddy */ 1565 - folio = __filemap_get_folio(inode->i_mapping, block + 1, 1552 + /* we need another folio for the buddy */ 1553 + folio = __filemap_get_folio(inode->i_mapping, pnum, 1566 1554 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); 1567 1555 if (IS_ERR(folio)) 1568 1556 return PTR_ERR(folio); 1569 1557 BUG_ON(folio->mapping != inode->i_mapping); 1558 + WARN_ON_ONCE(folio_size(folio) < sb->s_blocksize); 1570 1559 e4b->bd_buddy_folio = folio; 1571 1560 return 0; 1572 1561 } 1573 1562 1574 - static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) 1563 + static void ext4_mb_put_buddy_folio_lock(struct ext4_buddy *e4b) 1575 1564 { 1576 1565 if (e4b->bd_bitmap_folio) { 1577 1566 folio_unlock(e4b->bd_bitmap_folio); ··· 1587 1572 1588 1573 /* 1589 1574 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1590 - * block group lock of all groups for this page; do not hold the BG lock when 1575 + * block group lock of all groups 
for this folio; do not hold the BG lock when 1591 1576 * calling this routine! 1592 1577 */ 1593 1578 static noinline_for_stack ··· 1607 1592 1608 1593 /* 1609 1594 * This ensures that we don't reinit the buddy cache 1610 - * page which map to the group from which we are already 1595 + * folio which map to the group from which we are already 1611 1596 * allocating. If we are looking at the buddy cache we would 1612 1597 * have taken a reference using ext4_mb_load_buddy and that 1613 - * would have pinned buddy page to page cache. 1614 - * The call to ext4_mb_get_buddy_page_lock will mark the 1615 - * page accessed. 1598 + * would have pinned buddy folio to page cache. 1599 + * The call to ext4_mb_get_buddy_folio_lock will mark the 1600 + * folio accessed. 1616 1601 */ 1617 - ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); 1602 + ret = ext4_mb_get_buddy_folio_lock(sb, group, &e4b, gfp); 1618 1603 if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { 1619 1604 /* 1620 1605 * somebody initialized the group ··· 1635 1620 if (e4b.bd_buddy_folio == NULL) { 1636 1621 /* 1637 1622 * If both the bitmap and buddy are in 1638 - * the same page we don't need to force 1623 + * the same folio we don't need to force 1639 1624 * init the buddy 1640 1625 */ 1641 1626 ret = 0; ··· 1651 1636 goto err; 1652 1637 } 1653 1638 err: 1654 - ext4_mb_put_buddy_page_lock(&e4b); 1639 + ext4_mb_put_buddy_folio_lock(&e4b); 1655 1640 return ret; 1656 1641 } 1657 1642 1658 1643 /* 1659 1644 * Locking note: This routine calls ext4_mb_init_cache(), which takes the 1660 - * block group lock of all groups for this page; do not hold the BG lock when 1645 + * block group lock of all groups for this folio; do not hold the BG lock when 1661 1646 * calling this routine! 
1662 1647 */ 1663 1648 static noinline_for_stack int 1664 1649 ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, 1665 1650 struct ext4_buddy *e4b, gfp_t gfp) 1666 1651 { 1667 - int blocks_per_page; 1668 1652 int block; 1669 1653 int pnum; 1670 - int poff; 1671 1654 struct folio *folio; 1672 1655 int ret; 1673 1656 struct ext4_group_info *grp; ··· 1675 1662 might_sleep(); 1676 1663 mb_debug(sb, "load group %u\n", group); 1677 1664 1678 - blocks_per_page = PAGE_SIZE / sb->s_blocksize; 1679 1665 grp = ext4_get_group_info(sb, group); 1680 1666 if (!grp) 1681 1667 return -EFSCORRUPTED; ··· 1702 1690 * So for each group we need two blocks. 1703 1691 */ 1704 1692 block = group * 2; 1705 - pnum = block / blocks_per_page; 1706 - poff = block % blocks_per_page; 1693 + pnum = EXT4_LBLK_TO_PG(inode, block); 1707 1694 1708 1695 /* Avoid locking the folio in the fast path ... */ 1709 1696 folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); ··· 1734 1723 goto err; 1735 1724 } 1736 1725 mb_cmp_bitmaps(e4b, folio_address(folio) + 1737 - (poff * sb->s_blocksize)); 1726 + offset_in_folio(folio, 1727 + EXT4_LBLK_TO_B(inode, block))); 1738 1728 } 1739 1729 folio_unlock(folio); 1740 1730 } ··· 1751 1739 1752 1740 /* Folios marked accessed already */ 1753 1741 e4b->bd_bitmap_folio = folio; 1754 - e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); 1742 + e4b->bd_bitmap = folio_address(folio) + 1743 + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); 1755 1744 1756 1745 block++; 1757 - pnum = block / blocks_per_page; 1758 - poff = block % blocks_per_page; 1746 + pnum = EXT4_LBLK_TO_PG(inode, block); 1747 + /* buddy and bitmap are on the same folio? 
*/ 1748 + if (folio_contains(folio, pnum)) { 1749 + folio_get(folio); 1750 + goto update_buddy; 1751 + } 1759 1752 1753 + /* we need another folio for the buddy */ 1760 1754 folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); 1761 1755 if (IS_ERR(folio) || !folio_test_uptodate(folio)) { 1762 1756 if (!IS_ERR(folio)) ··· 1797 1779 goto err; 1798 1780 } 1799 1781 1782 + update_buddy: 1800 1783 /* Folios marked accessed already */ 1801 1784 e4b->bd_buddy_folio = folio; 1802 - e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); 1785 + e4b->bd_buddy = folio_address(folio) + 1786 + offset_in_folio(folio, EXT4_LBLK_TO_B(inode, block)); 1803 1787 1804 1788 return 0; 1805 1789 ··· 2244 2224 ac->ac_buddy = ret >> 16; 2245 2225 2246 2226 /* 2247 - * take the page reference. We want the page to be pinned 2227 + * take the folio reference. We want the folio to be pinned 2248 2228 * so that we don't get a ext4_mb_init_cache_call for this 2249 2229 * group until we update the bitmap. That would mean we 2250 2230 * double allocate blocks. The reference is dropped ··· 2950 2930 if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) 2951 2931 return 0; 2952 2932 2953 - /* This now checks without needing the buddy page */ 2933 + /* This now checks without needing the buddy folio */ 2954 2934 ret = ext4_mb_good_group_nolock(ac, group, cr); 2955 2935 if (ret <= 0) { 2956 2936 if (!ac->ac_first_err) ··· 3510 3490 * this will avoid confusion if it ever shows up during debugging. 
*/ 3511 3491 sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; 3512 3492 EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; 3493 + ext4_set_inode_mapping_order(sbi->s_buddy_cache); 3494 + 3513 3495 for (i = 0; i < ngroups; i++) { 3514 3496 cond_resched(); 3515 3497 desc = ext4_get_group_desc(sb, i, NULL); ··· 4742 4720 "ext4: mb_load_buddy failed (%d)", err)) 4743 4721 /* 4744 4722 * This should never happen since we pin the 4745 - * pages in the ext4_allocation_context so 4723 + * folios in the ext4_allocation_context so 4746 4724 * ext4_mb_load_buddy() should never fail. 4747 4725 */ 4748 4726 return;
+381 -433
fs/ext4/move_extent.c
··· 13 13 #include "ext4.h" 14 14 #include "ext4_extents.h" 15 15 16 - /** 17 - * get_ext_path() - Find an extent path for designated logical block number. 18 - * @inode: inode to be searched 19 - * @lblock: logical block number to find an extent path 20 - * @path: pointer to an extent path 21 - * 22 - * ext4_find_extent wrapper. Return an extent path pointer on success, 23 - * or an error pointer on failure. 24 - */ 25 - static inline struct ext4_ext_path * 26 - get_ext_path(struct inode *inode, ext4_lblk_t lblock, 27 - struct ext4_ext_path *path) 28 - { 29 - path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); 30 - if (IS_ERR(path)) 31 - return path; 32 - if (path[ext_depth(inode)].p_ext == NULL) { 33 - ext4_free_ext_path(path); 34 - return ERR_PTR(-ENODATA); 35 - } 36 - return path; 37 - } 16 + #include <trace/events/ext4.h> 17 + 18 + struct mext_data { 19 + struct inode *orig_inode; /* Origin file inode */ 20 + struct inode *donor_inode; /* Donor file inode */ 21 + struct ext4_map_blocks orig_map;/* Origin file's move mapping */ 22 + ext4_lblk_t donor_lblk; /* Start block of the donor file */ 23 + }; 38 24 39 25 /** 40 26 * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem ··· 38 52 } else { 39 53 down_write(&EXT4_I(second)->i_data_sem); 40 54 down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); 41 - 42 55 } 43 56 } 44 57 ··· 56 71 up_write(&EXT4_I(donor_inode)->i_data_sem); 57 72 } 58 73 59 - /** 60 - * mext_check_coverage - Check that all extents in range has the same type 61 - * 62 - * @inode: inode in question 63 - * @from: block offset of inode 64 - * @count: block count to be checked 65 - * @unwritten: extents expected to be unwritten 66 - * @err: pointer to save error value 67 - * 68 - * Return 1 if all extents in range has expected type, and zero otherwise. 
69 - */ 70 - static int 71 - mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, 72 - int unwritten, int *err) 73 - { 74 - struct ext4_ext_path *path = NULL; 75 - struct ext4_extent *ext; 76 - int ret = 0; 77 - ext4_lblk_t last = from + count; 78 - while (from < last) { 79 - path = get_ext_path(inode, from, path); 80 - if (IS_ERR(path)) { 81 - *err = PTR_ERR(path); 82 - return ret; 83 - } 84 - ext = path[ext_depth(inode)].p_ext; 85 - if (unwritten != ext4_ext_is_unwritten(ext)) 86 - goto out; 87 - from += ext4_ext_get_actual_len(ext); 88 - } 89 - ret = 1; 90 - out: 91 - ext4_free_ext_path(path); 92 - return ret; 93 - } 94 - 95 - /** 96 - * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 97 - * 98 - * @inode1: the inode structure 99 - * @inode2: the inode structure 100 - * @index1: folio index 101 - * @index2: folio index 102 - * @folio: result folio vector 103 - * 104 - * Grab two locked folio for inode's by inode order 105 - */ 106 - static int 107 - mext_folio_double_lock(struct inode *inode1, struct inode *inode2, 108 - pgoff_t index1, pgoff_t index2, struct folio *folio[2]) 74 + /* Grab and lock folio on both @inode1 and @inode2 by inode order. 
*/ 75 + static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, 76 + pgoff_t index1, pgoff_t index2, size_t len, 77 + struct folio *folio[2]) 109 78 { 110 79 struct address_space *mapping[2]; 111 80 unsigned int flags; 81 + fgf_t fgp_flags = FGP_WRITEBEGIN; 112 82 113 83 BUG_ON(!inode1 || !inode2); 114 84 if (inode1 < inode2) { ··· 76 136 } 77 137 78 138 flags = memalloc_nofs_save(); 79 - folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, 139 + fgp_flags |= fgf_set_order(len); 140 + folio[0] = __filemap_get_folio(mapping[0], index1, fgp_flags, 80 141 mapping_gfp_mask(mapping[0])); 81 142 if (IS_ERR(folio[0])) { 82 143 memalloc_nofs_restore(flags); 83 144 return PTR_ERR(folio[0]); 84 145 } 85 146 86 - folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, 147 + folio[1] = __filemap_get_folio(mapping[1], index2, fgp_flags, 87 148 mapping_gfp_mask(mapping[1])); 88 149 memalloc_nofs_restore(flags); 89 150 if (IS_ERR(folio[1])) { ··· 105 164 return 0; 106 165 } 107 166 167 + static void mext_folio_double_unlock(struct folio *folio[2]) 168 + { 169 + folio_unlock(folio[0]); 170 + folio_put(folio[0]); 171 + folio_unlock(folio[1]); 172 + folio_put(folio[1]); 173 + } 174 + 108 175 /* Force folio buffers uptodate w/o dropping folio's lock */ 109 - static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) 176 + static int mext_folio_mkuptodate(struct folio *folio, size_t from, size_t to) 110 177 { 111 178 struct inode *inode = folio->mapping->host; 112 179 sector_t block; ··· 187 238 return 0; 188 239 } 189 240 190 - /** 191 - * move_extent_per_page - Move extent data per page 192 - * 193 - * @o_filp: file structure of original file 194 - * @donor_inode: donor inode 195 - * @orig_page_offset: page index on original file 196 - * @donor_page_offset: page index on donor file 197 - * @data_offset_in_page: block index where data swapping starts 198 - * @block_len_in_page: the number of blocks to be swapped 199 - * 
@unwritten: orig extent is unwritten or not 200 - * @err: pointer to save return value 201 - * 202 - * Save the data in original inode blocks and replace original inode extents 203 - * with donor inode extents by calling ext4_swap_extents(). 204 - * Finally, write out the saved data in new original inode blocks. Return 205 - * replaced block count. 241 + enum mext_move_type {MEXT_SKIP_EXTENT, MEXT_MOVE_EXTENT, MEXT_COPY_DATA}; 242 + 243 + /* 244 + * Start to move extent between the origin inode and the donor inode, 245 + * hold one folio for each inode and check the candidate moving extent 246 + * mapping status again. 206 247 */ 207 - static int 208 - move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 209 - pgoff_t orig_page_offset, pgoff_t donor_page_offset, 210 - int data_offset_in_page, 211 - int block_len_in_page, int unwritten, int *err) 248 + static int mext_move_begin(struct mext_data *mext, struct folio *folio[2], 249 + enum mext_move_type *move_type) 212 250 { 213 - struct inode *orig_inode = file_inode(o_filp); 214 - struct folio *folio[2] = {NULL, NULL}; 215 - handle_t *handle; 216 - ext4_lblk_t orig_blk_offset, donor_blk_offset; 217 - unsigned long blocksize = orig_inode->i_sb->s_blocksize; 218 - unsigned int tmp_data_size, data_size, replaced_size; 219 - int i, err2, jblocks, retries = 0; 220 - int replaced_count = 0; 221 - int from; 222 - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; 223 - struct super_block *sb = orig_inode->i_sb; 224 - struct buffer_head *bh = NULL; 251 + struct inode *orig_inode = mext->orig_inode; 252 + struct inode *donor_inode = mext->donor_inode; 253 + unsigned int blkbits = orig_inode->i_blkbits; 254 + struct ext4_map_blocks donor_map = {0}; 255 + loff_t orig_pos, donor_pos; 256 + size_t move_len; 257 + int ret; 258 + 259 + orig_pos = ((loff_t)mext->orig_map.m_lblk) << blkbits; 260 + donor_pos = ((loff_t)mext->donor_lblk) << blkbits; 261 + ret = mext_folio_double_lock(orig_inode, donor_inode, 262 
+ orig_pos >> PAGE_SHIFT, donor_pos >> PAGE_SHIFT, 263 + ((size_t)mext->orig_map.m_len) << blkbits, folio); 264 + if (ret) 265 + return ret; 225 266 226 267 /* 227 - * It needs twice the amount of ordinary journal buffers because 228 - * inode and donor_inode may change each different metadata blocks. 268 + * Check the origin inode's mapping information again under the 269 + * folio lock, as we do not hold the i_data_sem at all times, and 270 + * it may change during the concurrent write-back operation. 229 271 */ 230 - again: 231 - *err = 0; 232 - jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, 233 - block_len_in_page) * 2; 234 - handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); 235 - if (IS_ERR(handle)) { 236 - *err = PTR_ERR(handle); 237 - return 0; 272 + if (mext->orig_map.m_seq != READ_ONCE(EXT4_I(orig_inode)->i_es_seq)) { 273 + ret = -ESTALE; 274 + goto error; 238 275 } 239 276 240 - orig_blk_offset = orig_page_offset * blocks_per_page + 241 - data_offset_in_page; 277 + /* Adjust the moving length according to the length of shorter folio. 
*/ 278 + move_len = umin(folio_pos(folio[0]) + folio_size(folio[0]) - orig_pos, 279 + folio_pos(folio[1]) + folio_size(folio[1]) - donor_pos); 280 + move_len >>= blkbits; 281 + if (move_len < mext->orig_map.m_len) 282 + mext->orig_map.m_len = move_len; 242 283 243 - donor_blk_offset = donor_page_offset * blocks_per_page + 244 - data_offset_in_page; 284 + donor_map.m_lblk = mext->donor_lblk; 285 + donor_map.m_len = mext->orig_map.m_len; 286 + donor_map.m_flags = 0; 287 + ret = ext4_map_blocks(NULL, donor_inode, &donor_map, 0); 288 + if (ret < 0) 289 + goto error; 245 290 246 - /* Calculate data_size */ 247 - if ((orig_blk_offset + block_len_in_page - 1) == 248 - ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 249 - /* Replace the last block */ 250 - tmp_data_size = orig_inode->i_size & (blocksize - 1); 251 - /* 252 - * If data_size equal zero, it shows data_size is multiples of 253 - * blocksize. So we set appropriate value. 254 - */ 255 - if (tmp_data_size == 0) 256 - tmp_data_size = blocksize; 291 + /* Adjust the moving length according to the donor mapping length. */ 292 + mext->orig_map.m_len = donor_map.m_len; 257 293 258 - data_size = tmp_data_size + 259 - ((block_len_in_page - 1) << orig_inode->i_blkbits); 260 - } else 261 - data_size = block_len_in_page << orig_inode->i_blkbits; 294 + /* Skip moving if the donor range is a hole or a delalloc extent. */ 295 + if (!(donor_map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) 296 + *move_type = MEXT_SKIP_EXTENT; 297 + /* If both mapping ranges are unwritten, no need to copy data. 
*/ 298 + else if ((mext->orig_map.m_flags & EXT4_MAP_UNWRITTEN) && 299 + (donor_map.m_flags & EXT4_MAP_UNWRITTEN)) 300 + *move_type = MEXT_MOVE_EXTENT; 301 + else 302 + *move_type = MEXT_COPY_DATA; 262 303 263 - replaced_size = data_size; 264 - 265 - *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, 266 - donor_page_offset, folio); 267 - if (unlikely(*err < 0)) 268 - goto stop_journal; 269 - /* 270 - * If orig extent was unwritten it can become initialized 271 - * at any time after i_data_sem was dropped, in order to 272 - * serialize with delalloc we have recheck extent while we 273 - * hold page's lock, if it is still the case data copy is not 274 - * necessary, just swap data blocks between orig and donor. 275 - */ 276 - if (unwritten) { 277 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 278 - /* If any of extents in range became initialized we have to 279 - * fallback to data copying */ 280 - unwritten = mext_check_coverage(orig_inode, orig_blk_offset, 281 - block_len_in_page, 1, err); 282 - if (*err) 283 - goto drop_data_sem; 284 - 285 - unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, 286 - block_len_in_page, 1, err); 287 - if (*err) 288 - goto drop_data_sem; 289 - 290 - if (!unwritten) { 291 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 292 - goto data_copy; 293 - } 294 - if (!filemap_release_folio(folio[0], 0) || 295 - !filemap_release_folio(folio[1], 0)) { 296 - *err = -EBUSY; 297 - goto drop_data_sem; 298 - } 299 - replaced_count = ext4_swap_extents(handle, orig_inode, 300 - donor_inode, orig_blk_offset, 301 - donor_blk_offset, 302 - block_len_in_page, 1, err); 303 - drop_data_sem: 304 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 305 - goto unlock_folios; 306 - } 307 - data_copy: 308 - from = offset_in_folio(folio[0], 309 - orig_blk_offset << orig_inode->i_blkbits); 310 - *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); 311 - if (*err) 312 - goto unlock_folios; 
313 - 314 - /* At this point all buffers in range are uptodate, old mapping layout 315 - * is no longer required, try to drop it now. */ 316 - if (!filemap_release_folio(folio[0], 0) || 317 - !filemap_release_folio(folio[1], 0)) { 318 - *err = -EBUSY; 319 - goto unlock_folios; 320 - } 321 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 322 - replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, 323 - orig_blk_offset, donor_blk_offset, 324 - block_len_in_page, 1, err); 325 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 326 - if (*err) { 327 - if (replaced_count) { 328 - block_len_in_page = replaced_count; 329 - replaced_size = 330 - block_len_in_page << orig_inode->i_blkbits; 331 - } else 332 - goto unlock_folios; 333 - } 334 - /* Perform all necessary steps similar write_begin()/write_end() 335 - * but keeping in mind that i_size will not change */ 336 - bh = folio_buffers(folio[0]); 337 - if (!bh) 338 - bh = create_empty_buffers(folio[0], 339 - 1 << orig_inode->i_blkbits, 0); 340 - for (i = 0; i < from >> orig_inode->i_blkbits; i++) 341 - bh = bh->b_this_page; 342 - for (i = 0; i < block_len_in_page; i++) { 343 - *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); 344 - if (*err < 0) 345 - goto repair_branches; 346 - bh = bh->b_this_page; 347 - } 348 - 349 - block_commit_write(folio[0], from, from + replaced_size); 350 - 351 - /* Even in case of data=writeback it is reasonable to pin 352 - * inode to transaction, to prevent unexpected data loss */ 353 - *err = ext4_jbd2_inode_add_write(handle, orig_inode, 354 - (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); 355 - 356 - unlock_folios: 357 - folio_unlock(folio[0]); 358 - folio_put(folio[0]); 359 - folio_unlock(folio[1]); 360 - folio_put(folio[1]); 361 - stop_journal: 362 - ext4_journal_stop(handle); 363 - if (*err == -ENOSPC && 364 - ext4_should_retry_alloc(sb, &retries)) 365 - goto again; 366 - /* Buffer was busy because probably is pinned to journal 
transaction, 367 - * force transaction commit may help to free it. */ 368 - if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && 369 - jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) 370 - goto again; 371 - return replaced_count; 372 - 373 - repair_branches: 374 - /* 375 - * This should never ever happen! 376 - * Extents are swapped already, but we are not able to copy data. 377 - * Try to swap extents to it's original places 378 - */ 379 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 380 - replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, 381 - orig_blk_offset, donor_blk_offset, 382 - block_len_in_page, 0, &err2); 383 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 384 - if (replaced_count != block_len_in_page) { 385 - ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), 386 - EIO, "Unable to copy data block," 387 - " data will be lost."); 388 - *err = -EIO; 389 - } 390 - replaced_count = 0; 391 - goto unlock_folios; 304 + return 0; 305 + error: 306 + mext_folio_double_unlock(folio); 307 + return ret; 392 308 } 393 309 394 - /** 395 - * mext_check_arguments - Check whether move extent can be done 396 - * 397 - * @orig_inode: original inode 398 - * @donor_inode: donor inode 399 - * @orig_start: logical start offset in block for orig 400 - * @donor_start: logical start offset in block for donor 401 - * @len: the number of blocks to be moved 402 - * 403 - * Check the arguments of ext4_move_extents() whether the files can be 404 - * exchanged with each other. 405 - * Return 0 on success, or a negative error value on failure. 310 + /* 311 + * Re-create the new moved mapping buffers of the original inode and commit 312 + * the entire written range. 
406 313 */ 407 - static int 408 - mext_check_arguments(struct inode *orig_inode, 409 - struct inode *donor_inode, __u64 orig_start, 410 - __u64 donor_start, __u64 *len) 314 + static int mext_folio_mkwrite(struct inode *inode, struct folio *folio, 315 + size_t from, size_t to) 411 316 { 412 - __u64 orig_eof, donor_eof; 317 + unsigned int blocksize = i_blocksize(inode); 318 + struct buffer_head *bh, *head; 319 + size_t block_start, block_end; 320 + sector_t block; 321 + int ret; 322 + 323 + head = folio_buffers(folio); 324 + if (!head) 325 + head = create_empty_buffers(folio, blocksize, 0); 326 + 327 + block = folio_pos(folio) >> inode->i_blkbits; 328 + block_end = 0; 329 + bh = head; 330 + do { 331 + block_start = block_end; 332 + block_end = block_start + blocksize; 333 + if (block_end <= from || block_start >= to) 334 + continue; 335 + 336 + ret = ext4_get_block(inode, block, bh, 0); 337 + if (ret) 338 + return ret; 339 + } while (block++, (bh = bh->b_this_page) != head); 340 + 341 + block_commit_write(folio, from, to); 342 + return 0; 343 + } 344 + 345 + /* 346 + * Save the data in original inode extent blocks and replace one folio size 347 + * aligned original inode extent with one or one partial donor inode extent, 348 + * and then write out the saved data in new original inode blocks. Pass out 349 + * the replaced block count through m_len. Return 0 on success, and an error 350 + * code otherwise. 
351 + */ 352 + static int mext_move_extent(struct mext_data *mext, u64 *m_len) 353 + { 354 + struct inode *orig_inode = mext->orig_inode; 355 + struct inode *donor_inode = mext->donor_inode; 356 + struct ext4_map_blocks *orig_map = &mext->orig_map; 413 357 unsigned int blkbits = orig_inode->i_blkbits; 414 - unsigned int blocksize = 1 << blkbits; 358 + struct folio *folio[2] = {NULL, NULL}; 359 + loff_t from, length; 360 + enum mext_move_type move_type = 0; 361 + handle_t *handle; 362 + u64 r_len = 0; 363 + unsigned int credits; 364 + int ret, ret2; 415 365 416 - orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; 417 - donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; 366 + *m_len = 0; 367 + trace_ext4_move_extent_enter(orig_inode, orig_map, donor_inode, 368 + mext->donor_lblk); 369 + credits = ext4_chunk_trans_extent(orig_inode, 0) * 2; 370 + handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, credits); 371 + if (IS_ERR(handle)) { 372 + ret = PTR_ERR(handle); 373 + goto out; 374 + } 418 375 376 + ret = mext_move_begin(mext, folio, &move_type); 377 + if (ret) 378 + goto stop_handle; 419 379 420 - if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 421 - ext4_debug("ext4 move extent: suid or sgid is set" 422 - " to donor file [ino:orig %lu, donor %lu]\n", 380 + if (move_type == MEXT_SKIP_EXTENT) 381 + goto unlock; 382 + 383 + /* 384 + * Copy the data. First, read the original inode data into the page 385 + * cache. Then, release the existing mapping relationships and swap 386 + * the extent. Finally, re-establish the new mapping relationships 387 + * and dirty the page cache. 
388 + */ 389 + if (move_type == MEXT_COPY_DATA) { 390 + from = offset_in_folio(folio[0], 391 + ((loff_t)orig_map->m_lblk) << blkbits); 392 + length = ((loff_t)orig_map->m_len) << blkbits; 393 + 394 + ret = mext_folio_mkuptodate(folio[0], from, from + length); 395 + if (ret) 396 + goto unlock; 397 + } 398 + 399 + if (!filemap_release_folio(folio[0], 0) || 400 + !filemap_release_folio(folio[1], 0)) { 401 + ret = -EBUSY; 402 + goto unlock; 403 + } 404 + 405 + /* Move extent */ 406 + ext4_double_down_write_data_sem(orig_inode, donor_inode); 407 + *m_len = ext4_swap_extents(handle, orig_inode, donor_inode, 408 + orig_map->m_lblk, mext->donor_lblk, 409 + orig_map->m_len, 1, &ret); 410 + ext4_double_up_write_data_sem(orig_inode, donor_inode); 411 + 412 + /* A short-length swap cannot occur after a successful swap extent. */ 413 + if (WARN_ON_ONCE(!ret && (*m_len != orig_map->m_len))) 414 + ret = -EIO; 415 + 416 + if (!(*m_len) || (move_type == MEXT_MOVE_EXTENT)) 417 + goto unlock; 418 + 419 + /* Copy data */ 420 + length = (*m_len) << blkbits; 421 + ret2 = mext_folio_mkwrite(orig_inode, folio[0], from, from + length); 422 + if (ret2) { 423 + if (!ret) 424 + ret = ret2; 425 + goto repair_branches; 426 + } 427 + /* 428 + * Even in case of data=writeback it is reasonable to pin 429 + * inode to transaction, to prevent unexpected data loss. 
430 + */ 431 + ret2 = ext4_jbd2_inode_add_write(handle, orig_inode, 432 + ((loff_t)orig_map->m_lblk) << blkbits, length); 433 + if (!ret) 434 + ret = ret2; 435 + unlock: 436 + mext_folio_double_unlock(folio); 437 + stop_handle: 438 + ext4_journal_stop(handle); 439 + out: 440 + trace_ext4_move_extent_exit(orig_inode, orig_map->m_lblk, donor_inode, 441 + mext->donor_lblk, orig_map->m_len, *m_len, 442 + move_type, ret); 443 + return ret; 444 + 445 + repair_branches: 446 + ret2 = 0; 447 + r_len = ext4_swap_extents(handle, donor_inode, orig_inode, 448 + mext->donor_lblk, orig_map->m_lblk, 449 + *m_len, 0, &ret2); 450 + if (ret2 || r_len != *m_len) { 451 + ext4_error_inode_block(orig_inode, (sector_t)(orig_map->m_lblk), 452 + EIO, "Unable to copy data block, data will be lost!"); 453 + ret = -EIO; 454 + } 455 + *m_len = 0; 456 + goto unlock; 457 + } 458 + 459 + /* 460 + * Check the validity of the basic filesystem environment and the 461 + * inodes' support status. 462 + */ 463 + static int mext_check_validity(struct inode *orig_inode, 464 + struct inode *donor_inode) 465 + { 466 + struct super_block *sb = orig_inode->i_sb; 467 + 468 + /* origin and donor should be different inodes */ 469 + if (orig_inode == donor_inode) { 470 + ext4_debug("ext4 move extent: The argument files should not be same inode [ino:orig %lu, donor %lu]\n", 423 471 orig_inode->i_ino, donor_inode->i_ino); 424 472 return -EINVAL; 425 473 } 426 474 427 - if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) 428 - return -EPERM; 429 - 430 - /* Ext4 move extent does not support swap files */ 431 - if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 432 - ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", 433 - orig_inode->i_ino, donor_inode->i_ino); 434 - return -ETXTBSY; 475 + /* origin and donor should belone to the same filesystem */ 476 + if (orig_inode->i_sb != donor_inode->i_sb) { 477 + ext4_debug("ext4 move extent: The argument files 
should be in same FS [ino:orig %lu, donor %lu]\n", 478 + orig_inode->i_ino, donor_inode->i_ino); 479 + return -EINVAL; 435 480 } 436 481 437 - if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { 438 - ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", 439 - orig_inode->i_ino, donor_inode->i_ino); 482 + /* Regular file check */ 483 + if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 484 + ext4_debug("ext4 move extent: The argument files should be regular file [ino:orig %lu, donor %lu]\n", 485 + orig_inode->i_ino, donor_inode->i_ino); 486 + return -EINVAL; 487 + } 488 + 489 + if (ext4_has_feature_bigalloc(sb)) { 490 + ext4_msg(sb, KERN_ERR, 491 + "Online defrag not supported with bigalloc"); 492 + return -EOPNOTSUPP; 493 + } 494 + 495 + if (IS_DAX(orig_inode)) { 496 + ext4_msg(sb, KERN_ERR, 497 + "Online defrag not supported with DAX"); 498 + return -EOPNOTSUPP; 499 + } 500 + 501 + /* 502 + * TODO: it's not obvious how to swap blocks for inodes with full 503 + * journaling enabled. 
504 + */ 505 + if (ext4_should_journal_data(orig_inode) || 506 + ext4_should_journal_data(donor_inode)) { 507 + ext4_msg(sb, KERN_ERR, 508 + "Online defrag not supported with data journaling"); 509 + return -EOPNOTSUPP; 510 + } 511 + 512 + if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { 513 + ext4_msg(sb, KERN_ERR, 514 + "Online defrag not supported for encrypted files"); 440 515 return -EOPNOTSUPP; 441 516 } 442 517 443 518 /* Ext4 move extent supports only extent based file */ 444 - if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { 445 - ext4_debug("ext4 move extent: orig file is not extents " 446 - "based file [ino:orig %lu]\n", orig_inode->i_ino); 519 + if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS)) || 520 + !(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { 521 + ext4_msg(sb, KERN_ERR, 522 + "Online defrag not supported for non-extent files"); 447 523 return -EOPNOTSUPP; 448 - } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { 449 - ext4_debug("ext4 move extent: donor file is not extents " 450 - "based file [ino:donor %lu]\n", donor_inode->i_ino); 524 + } 525 + 526 + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { 527 + ext4_debug("ext4 move extent: suid or sgid is set to donor file [ino:orig %lu, donor %lu]\n", 528 + orig_inode->i_ino, donor_inode->i_ino); 529 + return -EINVAL; 530 + } 531 + 532 + if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) { 533 + ext4_debug("ext4 move extent: donor should not be immutable or append file [ino:orig %lu, donor %lu]\n", 534 + orig_inode->i_ino, donor_inode->i_ino); 535 + return -EPERM; 536 + } 537 + 538 + /* Ext4 move extent does not support swap files */ 539 + if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 540 + ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", 541 + orig_inode->i_ino, donor_inode->i_ino); 542 + return -ETXTBSY; 543 + } 544 + 545 + if (ext4_is_quota_file(orig_inode) 
|| ext4_is_quota_file(donor_inode)) { 546 + ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", 547 + orig_inode->i_ino, donor_inode->i_ino); 451 548 return -EOPNOTSUPP; 452 549 } 453 550 ··· 502 507 return -EINVAL; 503 508 } 504 509 510 + return 0; 511 + } 512 + 513 + /* 514 + * Check the moving range of ext4_move_extents() whether the files can be 515 + * exchanged with each other, and adjust the length to fit within the file 516 + * size. Return 0 on success, or a negative error value on failure. 517 + */ 518 + static int mext_check_adjust_range(struct inode *orig_inode, 519 + struct inode *donor_inode, __u64 orig_start, 520 + __u64 donor_start, __u64 *len) 521 + { 522 + __u64 orig_eof, donor_eof; 523 + 505 524 /* Start offset should be same */ 506 525 if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != 507 526 (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { 508 - ext4_debug("ext4 move extent: orig and donor's start " 509 - "offsets are not aligned [ino:orig %lu, donor %lu]\n", 510 - orig_inode->i_ino, donor_inode->i_ino); 527 + ext4_debug("ext4 move extent: orig and donor's start offsets are not aligned [ino:orig %lu, donor %lu]\n", 528 + orig_inode->i_ino, donor_inode->i_ino); 511 529 return -EINVAL; 512 530 } 513 531 ··· 529 521 (*len > EXT_MAX_BLOCKS) || 530 522 (donor_start + *len >= EXT_MAX_BLOCKS) || 531 523 (orig_start + *len >= EXT_MAX_BLOCKS)) { 532 - ext4_debug("ext4 move extent: Can't handle over [%u] blocks " 533 - "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, 534 - orig_inode->i_ino, donor_inode->i_ino); 524 + ext4_debug("ext4 move extent: Can't handle over [%u] blocks [ino:orig %lu, donor %lu]\n", 525 + EXT_MAX_BLOCKS, 526 + orig_inode->i_ino, donor_inode->i_ino); 535 527 return -EINVAL; 536 528 } 529 + 530 + orig_eof = EXT4_B_TO_LBLK(orig_inode, i_size_read(orig_inode)); 531 + donor_eof = EXT4_B_TO_LBLK(donor_inode, i_size_read(donor_inode)); 537 532 if (orig_eof <= 
orig_start) 538 533 *len = 0; 539 534 else if (orig_eof < orig_start + *len - 1) ··· 546 535 else if (donor_eof < donor_start + *len - 1) 547 536 *len = donor_eof - donor_start; 548 537 if (!*len) { 549 - ext4_debug("ext4 move extent: len should not be 0 " 550 - "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 551 - donor_inode->i_ino); 538 + ext4_debug("ext4 move extent: len should not be 0 [ino:orig %lu, donor %lu]\n", 539 + orig_inode->i_ino, donor_inode->i_ino); 552 540 return -EINVAL; 553 541 } 554 542 ··· 566 556 * 567 557 * This function returns 0 and moved block length is set in moved_len 568 558 * if succeed, otherwise returns error value. 569 - * 570 559 */ 571 - int 572 - ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, 573 - __u64 donor_blk, __u64 len, __u64 *moved_len) 560 + int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, 561 + __u64 donor_blk, __u64 len, __u64 *moved_len) 574 562 { 575 563 struct inode *orig_inode = file_inode(o_filp); 576 564 struct inode *donor_inode = file_inode(d_filp); 577 - struct ext4_ext_path *path = NULL; 578 - int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; 579 - ext4_lblk_t o_end, o_start = orig_blk; 580 - ext4_lblk_t d_start = donor_blk; 565 + struct mext_data mext; 566 + struct super_block *sb = orig_inode->i_sb; 567 + struct ext4_sb_info *sbi = EXT4_SB(sb); 568 + int retries = 0; 569 + u64 m_len; 581 570 int ret; 582 571 583 - if (orig_inode->i_sb != donor_inode->i_sb) { 584 - ext4_debug("ext4 move extent: The argument files " 585 - "should be in same FS [ino:orig %lu, donor %lu]\n", 586 - orig_inode->i_ino, donor_inode->i_ino); 587 - return -EINVAL; 588 - } 589 - 590 - /* orig and donor should be different inodes */ 591 - if (orig_inode == donor_inode) { 592 - ext4_debug("ext4 move extent: The argument files should not " 593 - "be same inode [ino:orig %lu, donor %lu]\n", 594 - orig_inode->i_ino, donor_inode->i_ino); 595 - return -EINVAL; 596 - } 597 - 
598 - /* Regular file check */ 599 - if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { 600 - ext4_debug("ext4 move extent: The argument files should be " 601 - "regular file [ino:orig %lu, donor %lu]\n", 602 - orig_inode->i_ino, donor_inode->i_ino); 603 - return -EINVAL; 604 - } 605 - 606 - /* TODO: it's not obvious how to swap blocks for inodes with full 607 - journaling enabled */ 608 - if (ext4_should_journal_data(orig_inode) || 609 - ext4_should_journal_data(donor_inode)) { 610 - ext4_msg(orig_inode->i_sb, KERN_ERR, 611 - "Online defrag not supported with data journaling"); 612 - return -EOPNOTSUPP; 613 - } 614 - 615 - if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { 616 - ext4_msg(orig_inode->i_sb, KERN_ERR, 617 - "Online defrag not supported for encrypted files"); 618 - return -EOPNOTSUPP; 619 - } 572 + *moved_len = 0; 620 573 621 574 /* Protect orig and donor inodes against a truncate */ 622 575 lock_two_nondirectories(orig_inode, donor_inode); 576 + 577 + ret = mext_check_validity(orig_inode, donor_inode); 578 + if (ret) 579 + goto out; 623 580 624 581 /* Wait for all existing dio workers */ 625 582 inode_dio_wait(orig_inode); 626 583 inode_dio_wait(donor_inode); 627 584 628 - /* Protect extent tree against block allocations via delalloc */ 629 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 630 - /* Check the filesystem environment whether move_extent can be done */ 631 - ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, 632 - donor_blk, &len); 585 + /* Check and adjust the specified move_extent range. 
*/ 586 + ret = mext_check_adjust_range(orig_inode, donor_inode, orig_blk, 587 + donor_blk, &len); 633 588 if (ret) 634 589 goto out; 635 - o_end = o_start + len; 636 590 637 - *moved_len = 0; 638 - while (o_start < o_end) { 639 - struct ext4_extent *ex; 640 - ext4_lblk_t cur_blk, next_blk; 641 - pgoff_t orig_page_index, donor_page_index; 642 - int offset_in_page; 643 - int unwritten, cur_len; 591 + mext.orig_inode = orig_inode; 592 + mext.donor_inode = donor_inode; 593 + while (len) { 594 + mext.orig_map.m_lblk = orig_blk; 595 + mext.orig_map.m_len = len; 596 + mext.orig_map.m_flags = 0; 597 + mext.donor_lblk = donor_blk; 644 598 645 - path = get_ext_path(orig_inode, o_start, path); 646 - if (IS_ERR(path)) { 647 - ret = PTR_ERR(path); 599 + ret = ext4_map_blocks(NULL, orig_inode, &mext.orig_map, 0); 600 + if (ret < 0) 601 + goto out; 602 + 603 + /* Skip moving if it is a hole or a delalloc extent. */ 604 + if (mext.orig_map.m_flags & 605 + (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN)) { 606 + ret = mext_move_extent(&mext, &m_len); 607 + *moved_len += m_len; 608 + if (!ret) 609 + goto next; 610 + 611 + /* Move failed or partially failed. 
*/ 612 + if (m_len) { 613 + orig_blk += m_len; 614 + donor_blk += m_len; 615 + len -= m_len; 616 + } 617 + if (ret == -ESTALE) 618 + continue; 619 + if (ret == -ENOSPC && 620 + ext4_should_retry_alloc(sb, &retries)) 621 + continue; 622 + if (ret == -EBUSY && 623 + sbi->s_journal && retries++ < 4 && 624 + jbd2_journal_force_commit_nested(sbi->s_journal)) 625 + continue; 626 + 648 627 goto out; 649 628 } 650 - ex = path[path->p_depth].p_ext; 651 - cur_blk = le32_to_cpu(ex->ee_block); 652 - cur_len = ext4_ext_get_actual_len(ex); 653 - /* Check hole before the start pos */ 654 - if (cur_blk + cur_len - 1 < o_start) { 655 - next_blk = ext4_ext_next_allocated_block(path); 656 - if (next_blk == EXT_MAX_BLOCKS) { 657 - ret = -ENODATA; 658 - goto out; 659 - } 660 - d_start += next_blk - o_start; 661 - o_start = next_blk; 662 - continue; 663 - /* Check hole after the start pos */ 664 - } else if (cur_blk > o_start) { 665 - /* Skip hole */ 666 - d_start += cur_blk - o_start; 667 - o_start = cur_blk; 668 - /* Extent inside requested range ?*/ 669 - if (cur_blk >= o_end) 670 - goto out; 671 - } else { /* in_range(o_start, o_blk, o_len) */ 672 - cur_len += cur_blk - o_start; 673 - } 674 - unwritten = ext4_ext_is_unwritten(ex); 675 - if (o_end - o_start < cur_len) 676 - cur_len = o_end - o_start; 677 - 678 - orig_page_index = o_start >> (PAGE_SHIFT - 679 - orig_inode->i_blkbits); 680 - donor_page_index = d_start >> (PAGE_SHIFT - 681 - donor_inode->i_blkbits); 682 - offset_in_page = o_start % blocks_per_page; 683 - if (cur_len > blocks_per_page - offset_in_page) 684 - cur_len = blocks_per_page - offset_in_page; 685 - /* 686 - * Up semaphore to avoid following problems: 687 - * a. transaction deadlock among ext4_journal_start, 688 - * ->write_begin via pagefault, and jbd2_journal_commit 689 - * b. 
racing with ->read_folio, ->write_begin, and 690 - * ext4_get_block in move_extent_per_page 691 - */ 692 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 693 - /* Swap original branches with new branches */ 694 - *moved_len += move_extent_per_page(o_filp, donor_inode, 695 - orig_page_index, donor_page_index, 696 - offset_in_page, cur_len, 697 - unwritten, &ret); 698 - ext4_double_down_write_data_sem(orig_inode, donor_inode); 699 - if (ret < 0) 700 - break; 701 - o_start += cur_len; 702 - d_start += cur_len; 629 + next: 630 + orig_blk += mext.orig_map.m_len; 631 + donor_blk += mext.orig_map.m_len; 632 + len -= mext.orig_map.m_len; 633 + retries = 0; 703 634 } 704 635 705 636 out: ··· 649 698 ext4_discard_preallocations(donor_inode); 650 699 } 651 700 652 - ext4_free_ext_path(path); 653 - ext4_double_up_write_data_sem(orig_inode, donor_inode); 654 701 unlock_two_nondirectories(orig_inode, donor_inode); 655 - 656 702 return ret; 657 703 }
+8 -10
fs/ext4/namei.c
··· 1076 1076 for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { 1077 1077 if (ext4_check_dir_entry(dir, NULL, de, bh, 1078 1078 bh->b_data, bh->b_size, 1079 - (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 1079 + EXT4_LBLK_TO_B(dir, block) 1080 1080 + ((char *)de - bh->b_data))) { 1081 1081 /* silently ignore the rest of the block */ 1082 1082 break; ··· 1630 1630 } 1631 1631 set_buffer_verified(bh); 1632 1632 i = search_dirblock(bh, dir, fname, 1633 - block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); 1633 + EXT4_LBLK_TO_B(dir, block), res_dir); 1634 1634 if (i == 1) { 1635 1635 EXT4_I(dir)->i_dir_start_lookup = block; 1636 1636 ret = bh; ··· 1710 1710 struct ext4_filename *fname, 1711 1711 struct ext4_dir_entry_2 **res_dir) 1712 1712 { 1713 - struct super_block * sb = dir->i_sb; 1714 1713 struct dx_frame frames[EXT4_HTREE_LEVEL], *frame; 1715 1714 struct buffer_head *bh; 1716 1715 ext4_lblk_t block; ··· 1728 1729 goto errout; 1729 1730 1730 1731 retval = search_dirblock(bh, dir, fname, 1731 - block << EXT4_BLOCK_SIZE_BITS(sb), 1732 - res_dir); 1732 + EXT4_LBLK_TO_B(dir, block), res_dir); 1733 1733 if (retval == 1) 1734 1734 goto success; 1735 1735 brelse(bh); ··· 1760 1762 static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 1761 1763 { 1762 1764 struct inode *inode; 1763 - struct ext4_dir_entry_2 *de; 1765 + struct ext4_dir_entry_2 *de = NULL; 1764 1766 struct buffer_head *bh; 1765 1767 1766 1768 if (dentry->d_name.len > EXT4_NAME_LEN) ··· 1816 1818 struct dentry *ext4_get_parent(struct dentry *child) 1817 1819 { 1818 1820 __u32 ino; 1819 - struct ext4_dir_entry_2 * de; 1821 + struct ext4_dir_entry_2 * de = NULL; 1820 1822 struct buffer_head *bh; 1821 1823 1822 1824 bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); ··· 3131 3133 int retval; 3132 3134 struct inode *inode; 3133 3135 struct buffer_head *bh; 3134 - struct ext4_dir_entry_2 *de; 3136 + struct ext4_dir_entry_2 *de = NULL; 3135 3137 handle_t 
*handle = NULL; 3136 3138 3137 3139 retval = ext4_emergency_state(dir->i_sb); ··· 3222 3224 { 3223 3225 int retval = -ENOENT; 3224 3226 struct buffer_head *bh; 3225 - struct ext4_dir_entry_2 *de; 3227 + struct ext4_dir_entry_2 *de = NULL; 3226 3228 handle_t *handle; 3227 3229 int skip_remove_dentry = 0; 3228 3230 ··· 3686 3688 { 3687 3689 int retval = -ENOENT; 3688 3690 struct buffer_head *bh; 3689 - struct ext4_dir_entry_2 *de; 3691 + struct ext4_dir_entry_2 *de = NULL; 3690 3692 3691 3693 bh = ext4_find_entry(dir, d_name, &de, NULL); 3692 3694 if (IS_ERR(bh))
+3 -1
fs/ext4/orphan.c
··· 8 8 #include "ext4.h" 9 9 #include "ext4_jbd2.h" 10 10 11 + #define EXT4_MAX_ORPHAN_FILE_BLOCKS 512 12 + 11 13 static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) 12 14 { 13 15 int i, j, start; ··· 590 588 * consuming absurd amounts of memory when pinning blocks of orphan 591 589 * file in memory. 592 590 */ 593 - if (inode->i_size > 8 << 20) { 591 + if (inode->i_size > (EXT4_MAX_ORPHAN_FILE_BLOCKS << inode->i_blkbits)) { 594 592 ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", 595 593 (unsigned long long)inode->i_size); 596 594 ret = -EFSCORRUPTED;
+2 -5
fs/ext4/readpage.c
··· 213 213 { 214 214 struct bio *bio = NULL; 215 215 sector_t last_block_in_bio = 0; 216 - 217 216 const unsigned blkbits = inode->i_blkbits; 218 - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; 219 217 const unsigned blocksize = 1 << blkbits; 220 218 sector_t next_block; 221 219 sector_t block_in_file; ··· 249 251 250 252 blocks_per_folio = folio_size(folio) >> blkbits; 251 253 first_hole = blocks_per_folio; 252 - block_in_file = next_block = 253 - (sector_t)folio->index << (PAGE_SHIFT - blkbits); 254 - last_block = block_in_file + nr_pages * blocks_per_page; 254 + block_in_file = next_block = EXT4_PG_TO_LBLK(inode, folio->index); 255 + last_block = EXT4_PG_TO_LBLK(inode, folio->index + nr_pages); 255 256 last_block_in_file = (ext4_readpage_limit(inode) + 256 257 blocksize - 1) >> blkbits; 257 258 if (last_block > last_block_in_file)
+57 -15
fs/ext4/super.c
··· 698 698 WARN_ON_ONCE(1); 699 699 700 700 if (!continue_fs && !ext4_emergency_ro(sb) && journal) 701 - jbd2_journal_abort(journal, -EIO); 701 + jbd2_journal_abort(journal, -error); 702 702 703 703 if (!bdev_read_only(sb->s_bdev)) { 704 704 save_error_info(sb, error, ino, block, func, line); ··· 1396 1396 1397 1397 inode_set_iversion(&ei->vfs_inode, 1); 1398 1398 ei->i_flags = 0; 1399 + ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ 1399 1400 spin_lock_init(&ei->i_raw_lock); 1400 1401 ei->i_prealloc_node = RB_ROOT; 1401 1402 atomic_set(&ei->i_prealloc_active, 0); ··· 1407 1406 ei->i_es_all_nr = 0; 1408 1407 ei->i_es_shk_nr = 0; 1409 1408 ei->i_es_shrink_lblk = 0; 1409 + ei->i_es_seq = 0; 1410 1410 ei->i_reserved_data_blocks = 0; 1411 1411 spin_lock_init(&(ei->i_block_reservation_lock)); 1412 1412 ext4_init_pending_tree(&ei->i_pending_tree); ··· 2477 2475 struct ext4_fs_context *m_ctx) 2478 2476 { 2479 2477 struct ext4_sb_info *sbi = EXT4_SB(sb); 2480 - char s_mount_opts[65]; 2478 + char s_mount_opts[64]; 2481 2479 struct ext4_fs_context *s_ctx = NULL; 2482 2480 struct fs_context *fc = NULL; 2483 2481 int ret = -ENOMEM; ··· 2485 2483 if (!sbi->s_es->s_mount_opts[0]) 2486 2484 return 0; 2487 2485 2488 - strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts); 2486 + if (strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts) < 0) 2487 + return -E2BIG; 2489 2488 2490 2489 fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); 2491 2490 if (!fc) ··· 4191 4188 unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum); 4192 4189 ext4_group_t i, ngroups = ext4_get_groups_count(sb); 4193 4190 ext4_fsblk_t overhead = 0; 4194 - char *buf = (char *) get_zeroed_page(GFP_NOFS); 4191 + char *buf = kvmalloc(sb->s_blocksize, GFP_NOFS | __GFP_ZERO); 4195 4192 4196 4193 if (!buf) 4197 4194 return -ENOMEM; ··· 4216 4213 blks = count_overhead(sb, i, buf); 4217 4214 overhead += blks; 4218 4215 if (blks) 4219 - memset(buf, 0, PAGE_SIZE); 4216 + memset(buf, 0, 
sb->s_blocksize); 4220 4217 cond_resched(); 4221 4218 } 4222 4219 ··· 4239 4236 } 4240 4237 sbi->s_overhead = overhead; 4241 4238 smp_wmb(); 4242 - free_page((unsigned long) buf); 4239 + kvfree(buf); 4243 4240 return 0; 4244 4241 } 4245 4242 ··· 4392 4389 ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) 4393 4390 set_opt(sb, DELALLOC); 4394 4391 4395 - if (sb->s_blocksize <= PAGE_SIZE) 4396 - set_opt(sb, DIOREAD_NOLOCK); 4392 + set_opt(sb, DIOREAD_NOLOCK); 4397 4393 } 4398 4394 4399 4395 static int ext4_handle_clustersize(struct super_block *sb) ··· 5042 5040 return NULL; 5043 5041 } 5044 5042 5043 + /* 5044 + * Limit the maximum folio order to 2048 blocks to prevent overestimation 5045 + * of reserve handle credits during the folio writeback in environments 5046 + * where the PAGE_SIZE exceeds 4KB. 5047 + */ 5048 + #define EXT4_MAX_PAGECACHE_ORDER(sb) \ 5049 + umin(MAX_PAGECACHE_ORDER, (11 + (sb)->s_blocksize_bits - PAGE_SHIFT)) 5050 + static void ext4_set_max_mapping_order(struct super_block *sb) 5051 + { 5052 + struct ext4_sb_info *sbi = EXT4_SB(sb); 5053 + 5054 + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 5055 + sbi->s_max_folio_order = sbi->s_min_folio_order; 5056 + else 5057 + sbi->s_max_folio_order = EXT4_MAX_PAGECACHE_ORDER(sb); 5058 + } 5059 + 5060 + static int ext4_check_large_folio(struct super_block *sb) 5061 + { 5062 + const char *err_str = NULL; 5063 + 5064 + if (ext4_has_feature_encrypt(sb)) 5065 + err_str = "encrypt"; 5066 + 5067 + if (!err_str) { 5068 + ext4_set_max_mapping_order(sb); 5069 + } else if (sb->s_blocksize > PAGE_SIZE) { 5070 + ext4_msg(sb, KERN_ERR, "bs(%lu) > ps(%lu) unsupported for %s", 5071 + sb->s_blocksize, PAGE_SIZE, err_str); 5072 + return -EINVAL; 5073 + } 5074 + 5075 + return 0; 5076 + } 5077 + 5045 5078 static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, 5046 5079 int silent) 5047 5080 { ··· 5144 5107 * If the default block size is not the same as the real block size, 5145 5108 * we need to 
reload it. 5146 5109 */ 5147 - if (sb->s_blocksize == blocksize) { 5148 - *lsb = logical_sb_block; 5149 - sbi->s_sbh = bh; 5150 - return 0; 5151 - } 5110 + if (sb->s_blocksize == blocksize) 5111 + goto success; 5152 5112 5153 5113 /* 5154 5114 * bh must be released before kill_bdev(), otherwise ··· 5176 5142 ext4_msg(sb, KERN_ERR, "Magic mismatch, very weird!"); 5177 5143 goto out; 5178 5144 } 5145 + 5146 + success: 5147 + sbi->s_min_folio_order = get_order(blocksize); 5179 5148 *lsb = logical_sb_block; 5180 5149 sbi->s_sbh = bh; 5181 5150 return 0; ··· 5352 5315 goto failed_mount; 5353 5316 5354 5317 ext4_apply_options(fc, sb); 5318 + 5319 + err = ext4_check_large_folio(sb); 5320 + if (err < 0) 5321 + goto failed_mount; 5355 5322 5356 5323 err = ext4_encoding_init(sb, es); 5357 5324 if (err) ··· 5883 5842 ext4_msg(journal->j_inode->i_sb, KERN_CRIT, 5884 5843 "journal bmap failed: block %llu ret %d\n", 5885 5844 *block, ret); 5886 - jbd2_journal_abort(journal, ret ? ret : -EIO); 5845 + jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); 5887 5846 return ret; 5888 5847 } 5889 5848 *block = map.m_pblk; ··· 7453 7412 .init_fs_context = ext4_init_fs_context, 7454 7413 .parameters = ext4_param_specs, 7455 7414 .kill_sb = ext4_kill_sb, 7456 - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, 7415 + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | 7416 + FS_LBS, 7457 7417 }; 7458 7418 MODULE_ALIAS_FS("ext4"); 7459 7419
+6
fs/ext4/sysfs.c
··· 332 332 #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) 333 333 EXT4_ATTR_FEATURE(encrypted_casefold); 334 334 #endif 335 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 336 + EXT4_ATTR_FEATURE(blocksize_gt_pagesize); 337 + #endif 335 338 336 339 static struct attribute *ext4_feat_attrs[] = { 337 340 ATTR_LIST(lazy_itable_init), ··· 354 351 ATTR_LIST(fast_commit), 355 352 #if IS_ENABLED(CONFIG_UNICODE) && defined(CONFIG_FS_ENCRYPTION) 356 353 ATTR_LIST(encrypted_casefold), 354 + #endif 355 + #ifdef CONFIG_TRANSPARENT_HUGEPAGE 356 + ATTR_LIST(blocksize_gt_pagesize), 357 357 #endif 358 358 NULL, 359 359 };
+1 -1
fs/ext4/verity.c
··· 302 302 303 303 end_lblk = le32_to_cpu(last_extent->ee_block) + 304 304 ext4_ext_get_actual_len(last_extent); 305 - desc_size_pos = (u64)end_lblk << inode->i_blkbits; 305 + desc_size_pos = EXT4_LBLK_TO_B(inode, end_lblk); 306 306 ext4_free_ext_path(path); 307 307 308 308 if (desc_size_pos < sizeof(desc_size_disk))
+5 -1
fs/ext4/xattr.c
··· 1174 1174 if (block_csum) 1175 1175 end = (void *)bh->b_data + bh->b_size; 1176 1176 else { 1177 - ext4_get_inode_loc(parent, &iloc); 1177 + err = ext4_get_inode_loc(parent, &iloc); 1178 + if (err) { 1179 + EXT4_ERROR_INODE(parent, "parent inode loc (error %d)", err); 1180 + return; 1181 + } 1178 1182 end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; 1179 1183 } 1180 1184
+1 -1
fs/jbd2/checkpoint.c
··· 113 113 "journal space in %s\n", __func__, 114 114 journal->j_devname); 115 115 WARN_ON(1); 116 - jbd2_journal_abort(journal, -EIO); 116 + jbd2_journal_abort(journal, -ENOSPC); 117 117 } 118 118 write_lock(&journal->j_state_lock); 119 119 } else {
+27 -8
fs/jbd2/journal.c
··· 937 937 printk(KERN_ALERT "%s: journal block not found " 938 938 "at offset %lu on %s\n", 939 939 __func__, blocknr, journal->j_devname); 940 + jbd2_journal_abort(journal, ret ? ret : -EFSCORRUPTED); 940 941 err = -EIO; 941 - jbd2_journal_abort(journal, err); 942 942 } else { 943 943 *retp = block; 944 944 } ··· 1521 1521 struct block_device *fs_dev, 1522 1522 unsigned long long start, int len, int blocksize) 1523 1523 { 1524 - static struct lock_class_key jbd2_trans_commit_key; 1525 1524 journal_t *journal; 1526 1525 int err; 1527 1526 int n; ··· 1529 1530 if (!journal) 1530 1531 return ERR_PTR(-ENOMEM); 1531 1532 1533 + lockdep_register_key(&journal->jbd2_trans_commit_key); 1532 1534 journal->j_blocksize = blocksize; 1533 1535 journal->j_dev = bdev; 1534 1536 journal->j_fs_dev = fs_dev; ··· 1560 1560 journal->j_max_batch_time = 15000; /* 15ms */ 1561 1561 atomic_set(&journal->j_reserved_credits, 0); 1562 1562 lockdep_init_map(&journal->j_trans_commit_map, "jbd2_handle", 1563 - &jbd2_trans_commit_key, 0); 1563 + &journal->jbd2_trans_commit_key, 0); 1564 1564 1565 1565 /* The journal is marked for error until we succeed with recovery! */ 1566 1566 journal->j_flags = JBD2_ABORT; ··· 1611 1611 kfree(journal->j_wbuf); 1612 1612 jbd2_journal_destroy_revoke(journal); 1613 1613 journal_fail_superblock(journal); 1614 + lockdep_unregister_key(&journal->jbd2_trans_commit_key); 1614 1615 kfree(journal); 1615 1616 return ERR_PTR(err); 1616 1617 } ··· 1859 1858 1860 1859 if (is_journal_aborted(journal)) 1861 1860 return -EIO; 1862 - if (jbd2_check_fs_dev_write_error(journal)) { 1863 - jbd2_journal_abort(journal, -EIO); 1861 + ret = jbd2_check_fs_dev_write_error(journal); 1862 + if (ret) { 1863 + jbd2_journal_abort(journal, ret); 1864 1864 return -EIO; 1865 1865 } 1866 1866 ··· 2158 2156 * failed to write back to the original location, otherwise the 2159 2157 * filesystem may become inconsistent. 
2160 2158 */ 2161 - if (!is_journal_aborted(journal) && 2162 - jbd2_check_fs_dev_write_error(journal)) 2163 - jbd2_journal_abort(journal, -EIO); 2159 + if (!is_journal_aborted(journal)) { 2160 + int ret = jbd2_check_fs_dev_write_error(journal); 2161 + if (ret) 2162 + jbd2_journal_abort(journal, ret); 2163 + } 2164 2164 2165 2165 if (journal->j_sb_buffer) { 2166 2166 if (!is_journal_aborted(journal)) { ··· 2191 2187 jbd2_journal_destroy_revoke(journal); 2192 2188 kfree(journal->j_fc_wbuf); 2193 2189 kfree(journal->j_wbuf); 2190 + lockdep_unregister_key(&journal->jbd2_trans_commit_key); 2194 2191 kfree(journal); 2195 2192 2196 2193 return err; ··· 2354 2349 sb->s_feature_compat |= cpu_to_be32(compat); 2355 2350 sb->s_feature_ro_compat |= cpu_to_be32(ro); 2356 2351 sb->s_feature_incompat |= cpu_to_be32(incompat); 2352 + /* 2353 + * Update the checksum now so that it is valid even for read-only 2354 + * filesystems where jbd2_write_superblock() doesn't get called. 2355 + */ 2356 + if (jbd2_journal_has_csum_v2or3(journal)) 2357 + sb->s_checksum = jbd2_superblock_csum(sb); 2357 2358 unlock_buffer(journal->j_sb_buffer); 2358 2359 jbd2_journal_init_transaction_limits(journal); 2359 2360 ··· 2389 2378 2390 2379 sb = journal->j_superblock; 2391 2380 2381 + lock_buffer(journal->j_sb_buffer); 2392 2382 sb->s_feature_compat &= ~cpu_to_be32(compat); 2393 2383 sb->s_feature_ro_compat &= ~cpu_to_be32(ro); 2394 2384 sb->s_feature_incompat &= ~cpu_to_be32(incompat); 2385 + /* 2386 + * Update the checksum now so that it is valid even for read-only 2387 + * filesystems where jbd2_write_superblock() doesn't get called. 2388 + */ 2389 + if (jbd2_journal_has_csum_v2or3(journal)) 2390 + sb->s_checksum = jbd2_superblock_csum(sb); 2391 + unlock_buffer(journal->j_sb_buffer); 2395 2392 jbd2_journal_init_transaction_limits(journal); 2396 2393 } 2397 2394 EXPORT_SYMBOL(jbd2_journal_clear_features);
+18 -8
fs/jbd2/transaction.c
··· 441 441 read_unlock(&journal->j_state_lock); 442 442 current->journal_info = handle; 443 443 444 - rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); 444 + rwsem_acquire_read(&journal->j_trans_commit_map, 0, 1, _THIS_IP_); 445 445 jbd2_journal_free_transaction(new_transaction); 446 446 /* 447 447 * Ensure that no allocations done while the transaction is open are ··· 1219 1219 return -EROFS; 1220 1220 1221 1221 journal = handle->h_transaction->t_journal; 1222 - if (jbd2_check_fs_dev_write_error(journal)) { 1222 + rc = jbd2_check_fs_dev_write_error(journal); 1223 + if (rc) { 1223 1224 /* 1224 1225 * If the fs dev has writeback errors, it may have failed 1225 1226 * to async write out metadata buffers in the background. ··· 1228 1227 * it out again, which may lead to on-disk filesystem 1229 1228 * inconsistency. Aborting journal can avoid it happen. 1230 1229 */ 1231 - jbd2_journal_abort(journal, -EIO); 1230 + jbd2_journal_abort(journal, rc); 1232 1231 return -EIO; 1233 1232 } 1234 1233 ··· 1285 1284 * committing transaction's lists, but it HAS to be in Forget state in 1286 1285 * that case: the transaction must have deleted the buffer for it to be 1287 1286 * reused here. 1287 + * In the case of file system data inconsistency, for example, if the 1288 + * block bitmap of a referenced block is not set, it can lead to the 1289 + * situation where a block being committed is allocated and used again. 1290 + * As a result, the following condition will not be satisfied, so here 1291 + * we directly trigger a JBD abort instead of immediately invoking 1292 + * bugon. 
1288 1293 */ 1289 1294 spin_lock(&jh->b_state_lock); 1290 - J_ASSERT_JH(jh, (jh->b_transaction == transaction || 1291 - jh->b_transaction == NULL || 1292 - (jh->b_transaction == journal->j_committing_transaction && 1293 - jh->b_jlist == BJ_Forget))); 1295 + if (!(jh->b_transaction == transaction || jh->b_transaction == NULL || 1296 + (jh->b_transaction == journal->j_committing_transaction && 1297 + jh->b_jlist == BJ_Forget)) || jh->b_next_transaction != NULL) { 1298 + err = -EROFS; 1299 + spin_unlock(&jh->b_state_lock); 1300 + jbd2_journal_abort(journal, err); 1301 + goto out; 1302 + } 1294 1303 1295 - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); 1296 1304 J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); 1297 1305 1298 1306 if (jh->b_transaction == NULL) {
+6
include/linux/jbd2.h
··· 1253 1253 */ 1254 1254 struct lockdep_map j_trans_commit_map; 1255 1255 #endif 1256 + /** 1257 + * @jbd2_trans_commit_key: 1258 + * 1259 + * "struct lock_class_key" for @j_trans_commit_map 1260 + */ 1261 + struct lock_class_key jbd2_trans_commit_key; 1256 1262 1257 1263 /** 1258 1264 * @j_fc_cleanup_callback:
+90 -9
include/trace/events/ext4.h
··· 39 39 { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ 40 40 { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ 41 41 { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ 42 - { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ 42 + { EXT4_GET_BLOCKS_SPLIT_NOMERGE, "SPLIT_NOMERGE" }, \ 43 43 { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ 44 44 { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ 45 45 { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ ··· 2210 2210 __field( ext4_lblk_t, lblk ) 2211 2211 __field( ext4_lblk_t, len ) 2212 2212 __field( ext4_fsblk_t, pblk ) 2213 - __field( char, status ) 2213 + __field( char, status ) 2214 + __field( u64, seq ) 2214 2215 ), 2215 2216 2216 2217 TP_fast_assign( ··· 2221 2220 __entry->len = es->es_len; 2222 2221 __entry->pblk = ext4_es_show_pblock(es); 2223 2222 __entry->status = ext4_es_status(es); 2223 + __entry->seq = EXT4_I(inode)->i_es_seq; 2224 2224 ), 2225 2225 2226 - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", 2226 + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %llu", 2227 2227 MAJOR(__entry->dev), MINOR(__entry->dev), 2228 2228 (unsigned long) __entry->ino, 2229 2229 __entry->lblk, __entry->len, 2230 - __entry->pblk, show_extent_status(__entry->status)) 2230 + __entry->pblk, show_extent_status(__entry->status), 2231 + __entry->seq) 2231 2232 ); 2232 2233 2233 2234 DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, ··· 2254 2251 __field( ino_t, ino ) 2255 2252 __field( loff_t, lblk ) 2256 2253 __field( loff_t, len ) 2254 + __field( u64, seq ) 2257 2255 ), 2258 2256 2259 2257 TP_fast_assign( ··· 2262 2258 __entry->ino = inode->i_ino; 2263 2259 __entry->lblk = lblk; 2264 2260 __entry->len = len; 2261 + __entry->seq = EXT4_I(inode)->i_es_seq; 2265 2262 ), 2266 2263 2267 - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", 2264 + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %llu", 2268 2265 MAJOR(__entry->dev), MINOR(__entry->dev), 2269 2266 (unsigned long) __entry->ino, 2270 - __entry->lblk, 
__entry->len) 2267 + __entry->lblk, __entry->len, __entry->seq) 2271 2268 ); 2272 2269 2273 2270 TRACE_EVENT(ext4_es_find_extent_range_enter, ··· 2528 2523 __field( char, status ) 2529 2524 __field( bool, lclu_allocated ) 2530 2525 __field( bool, end_allocated ) 2526 + __field( u64, seq ) 2531 2527 ), 2532 2528 2533 2529 TP_fast_assign( ··· 2540 2534 __entry->status = ext4_es_status(es); 2541 2535 __entry->lclu_allocated = lclu_allocated; 2542 2536 __entry->end_allocated = end_allocated; 2537 + __entry->seq = EXT4_I(inode)->i_es_seq; 2543 2538 ), 2544 2539 2545 - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " 2546 - "allocated %d %d", 2540 + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %llu", 2547 2541 MAJOR(__entry->dev), MINOR(__entry->dev), 2548 2542 (unsigned long) __entry->ino, 2549 2543 __entry->lblk, __entry->len, 2550 2544 __entry->pblk, show_extent_status(__entry->status), 2551 - __entry->lclu_allocated, __entry->end_allocated) 2545 + __entry->lclu_allocated, __entry->end_allocated, 2546 + __entry->seq) 2552 2547 ); 2553 2548 2554 2549 /* fsmap traces */ ··· 3014 3007 TP_printk("dev %d,%d fsblk %llu flags %u", 3015 3008 MAJOR(__entry->dev), MINOR(__entry->dev), 3016 3009 __entry->fsblk, __entry->flags) 3010 + ); 3011 + 3012 + TRACE_EVENT(ext4_move_extent_enter, 3013 + TP_PROTO(struct inode *orig_inode, struct ext4_map_blocks *orig_map, 3014 + struct inode *donor_inode, ext4_lblk_t donor_lblk), 3015 + 3016 + TP_ARGS(orig_inode, orig_map, donor_inode, donor_lblk), 3017 + 3018 + TP_STRUCT__entry( 3019 + __field(dev_t, dev) 3020 + __field(ino_t, orig_ino) 3021 + __field(ext4_lblk_t, orig_lblk) 3022 + __field(unsigned int, orig_flags) 3023 + __field(ino_t, donor_ino) 3024 + __field(ext4_lblk_t, donor_lblk) 3025 + __field(unsigned int, len) 3026 + ), 3027 + 3028 + TP_fast_assign( 3029 + __entry->dev = orig_inode->i_sb->s_dev; 3030 + __entry->orig_ino = orig_inode->i_ino; 3031 + __entry->orig_lblk = 
orig_map->m_lblk; 3032 + __entry->orig_flags = orig_map->m_flags; 3033 + __entry->donor_ino = donor_inode->i_ino; 3034 + __entry->donor_lblk = donor_lblk; 3035 + __entry->len = orig_map->m_len; 3036 + ), 3037 + 3038 + TP_printk("dev %d,%d origin ino %lu lblk %u flags %s donor ino %lu lblk %u len %u", 3039 + MAJOR(__entry->dev), MINOR(__entry->dev), 3040 + (unsigned long) __entry->orig_ino, __entry->orig_lblk, 3041 + show_mflags(__entry->orig_flags), 3042 + (unsigned long) __entry->donor_ino, __entry->donor_lblk, 3043 + __entry->len) 3044 + ); 3045 + 3046 + TRACE_EVENT(ext4_move_extent_exit, 3047 + TP_PROTO(struct inode *orig_inode, ext4_lblk_t orig_lblk, 3048 + struct inode *donor_inode, ext4_lblk_t donor_lblk, 3049 + unsigned int m_len, u64 move_len, int move_type, int ret), 3050 + 3051 + TP_ARGS(orig_inode, orig_lblk, donor_inode, donor_lblk, m_len, 3052 + move_len, move_type, ret), 3053 + 3054 + TP_STRUCT__entry( 3055 + __field(dev_t, dev) 3056 + __field(ino_t, orig_ino) 3057 + __field(ext4_lblk_t, orig_lblk) 3058 + __field(ino_t, donor_ino) 3059 + __field(ext4_lblk_t, donor_lblk) 3060 + __field(unsigned int, m_len) 3061 + __field(u64, move_len) 3062 + __field(int, move_type) 3063 + __field(int, ret) 3064 + ), 3065 + 3066 + TP_fast_assign( 3067 + __entry->dev = orig_inode->i_sb->s_dev; 3068 + __entry->orig_ino = orig_inode->i_ino; 3069 + __entry->orig_lblk = orig_lblk; 3070 + __entry->donor_ino = donor_inode->i_ino; 3071 + __entry->donor_lblk = donor_lblk; 3072 + __entry->m_len = m_len; 3073 + __entry->move_len = move_len; 3074 + __entry->move_type = move_type; 3075 + __entry->ret = ret; 3076 + ), 3077 + 3078 + TP_printk("dev %d,%d origin ino %lu lblk %u donor ino %lu lblk %u m_len %u, move_len %llu type %d ret %d", 3079 + MAJOR(__entry->dev), MINOR(__entry->dev), 3080 + (unsigned long) __entry->orig_ino, __entry->orig_lblk, 3081 + (unsigned long) __entry->donor_ino, __entry->donor_lblk, 3082 + __entry->m_len, __entry->move_len, __entry->move_type, 3083 + 
__entry->ret) 3017 3084 ); 3018 3085 3019 3086 #endif /* _TRACE_EXT4_H */