Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ext4_for_linus_6.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates from Ted Ts'o:
"Major ext4 changes for 6.17:

- Better scalability for ext4 block allocation

- Fix insufficient credits when writing back large folios

Miscellaneous bug fixes, especially when handling extended attributes,
inline data, and fast commit"

* tag 'ext4_for_linus_6.17-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (39 commits)
ext4: do not BUG when INLINE_DATA_FL lacks system.data xattr
ext4: implement linear-like traversal across order xarrays
ext4: refactor choose group to scan group
ext4: convert free groups order lists to xarrays
ext4: factor out ext4_mb_scan_group()
ext4: factor out ext4_mb_might_prefetch()
ext4: factor out __ext4_mb_scan_group()
ext4: fix largest free orders lists corruption on mb_optimize_scan switch
ext4: fix zombie groups in average fragment size lists
ext4: merge freed extent with existing extents before insertion
ext4: convert sbi->s_mb_free_pending to atomic_t
ext4: fix typo in CR_GOAL_LEN_SLOW comment
ext4: get rid of some obsolete EXT4_MB_HINT flags
ext4: utilize multiple global goals to reduce contention
ext4: remove unnecessary s_md_lock on update s_mb_last_group
ext4: remove unnecessary s_mb_last_start
ext4: separate stream goal hits from s_bal_goals for better tracking
ext4: add ext4_try_lock_group() to skip busy groups
ext4: initialize superblock fields in the kballoc-test.c kunit tests
ext4: refactor the inline directory conversion and new directory codepaths
...

+902 -694
+1 -1
fs/ext4/balloc.c
··· 703 703 * possible we just missed a transaction commit that did so 704 704 */ 705 705 smp_mb(); 706 - if (sbi->s_mb_free_pending == 0) { 706 + if (atomic_read(&sbi->s_mb_free_pending) == 0) { 707 707 if (test_opt(sb, DISCARD)) { 708 708 atomic_inc(&sbi->s_retry_alloc_pending); 709 709 flush_work(&sbi->s_discard_work);
+30 -44
fs/ext4/ext4.h
··· 157 157 158 158 /* 159 159 * Reads each block group sequentially, performing disk IO if 160 - * necessary, to find find_suitable block group. Tries to 160 + * necessary, to find suitable block group. Tries to 161 161 * allocate goal length but might trim the request if nothing 162 162 * is found after enough tries. 163 163 */ ··· 185 185 186 186 /* prefer goal again. length */ 187 187 #define EXT4_MB_HINT_MERGE 0x0001 188 - /* blocks already reserved */ 189 - #define EXT4_MB_HINT_RESERVED 0x0002 190 - /* metadata is being allocated */ 191 - #define EXT4_MB_HINT_METADATA 0x0004 192 188 /* first blocks in the file */ 193 189 #define EXT4_MB_HINT_FIRST 0x0008 194 - /* search for the best chunk */ 195 - #define EXT4_MB_HINT_BEST 0x0010 196 190 /* data is being allocated */ 197 191 #define EXT4_MB_HINT_DATA 0x0020 198 192 /* don't preallocate (for tails) */ ··· 207 213 #define EXT4_MB_USE_RESERVED 0x2000 208 214 /* Do strict check for free blocks while retrying block allocation */ 209 215 #define EXT4_MB_STRICT_CHECK 0x4000 210 - /* Large fragment size list lookup succeeded at least once for 211 - * CR_POWER2_ALIGNED */ 212 - #define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000 213 - /* Avg fragment size rb tree lookup succeeded at least once for 214 - * CR_GOAL_LEN_FAST */ 215 - #define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000 216 - /* Avg fragment size rb tree lookup succeeded at least once for 217 - * CR_BEST_AVAIL_LEN */ 218 - #define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000 219 216 220 217 struct ext4_allocation_request { 221 218 /* target inode for block we're allocating */ ··· 1593 1608 unsigned short *s_mb_offsets; 1594 1609 unsigned int *s_mb_maxs; 1595 1610 unsigned int s_group_info_size; 1596 - unsigned int s_mb_free_pending; 1611 + atomic_t s_mb_free_pending; 1597 1612 struct list_head s_freed_data_list[2]; /* List of blocks to be freed 1598 1613 after commit completed */ 1599 1614 struct list_head s_discard_list; 1600 1615 struct work_struct 
s_discard_work; 1601 1616 atomic_t s_retry_alloc_pending; 1602 - struct list_head *s_mb_avg_fragment_size; 1603 - rwlock_t *s_mb_avg_fragment_size_locks; 1604 - struct list_head *s_mb_largest_free_orders; 1605 - rwlock_t *s_mb_largest_free_orders_locks; 1617 + struct xarray *s_mb_avg_fragment_size; 1618 + struct xarray *s_mb_largest_free_orders; 1606 1619 1607 1620 /* tunables */ 1608 1621 unsigned long s_stripe; ··· 1612 1629 unsigned int s_mb_order2_reqs; 1613 1630 unsigned int s_mb_group_prealloc; 1614 1631 unsigned int s_max_dir_size_kb; 1615 - /* where last allocation was done - for stream allocation */ 1616 - unsigned long s_mb_last_group; 1617 - unsigned long s_mb_last_start; 1618 1632 unsigned int s_mb_prefetch; 1619 1633 unsigned int s_mb_prefetch_limit; 1620 1634 unsigned int s_mb_best_avail_max_trim_order; 1621 1635 unsigned int s_sb_update_sec; 1622 1636 unsigned int s_sb_update_kb; 1637 + 1638 + /* where last allocation was done - for stream allocation */ 1639 + ext4_group_t *s_mb_last_groups; 1640 + unsigned int s_mb_nr_global_goals; 1623 1641 1624 1642 /* stats for buddy allocator */ 1625 1643 atomic_t s_bal_reqs; /* number of reqs with len > 1 */ ··· 1630 1646 atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ 1631 1647 atomic_t s_bal_groups_scanned; /* number of groups scanned */ 1632 1648 atomic_t s_bal_goals; /* goal hits */ 1649 + atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ 1633 1650 atomic_t s_bal_len_goals; /* len goal hits */ 1634 1651 atomic_t s_bal_breaks; /* too long searches */ 1635 1652 atomic_t s_bal_2orders; /* 2^order hits */ 1636 - atomic_t s_bal_p2_aligned_bad_suggestions; 1637 - atomic_t s_bal_goal_fast_bad_suggestions; 1638 - atomic_t s_bal_best_avail_bad_suggestions; 1639 1653 atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; 1640 1654 atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; 1641 1655 atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ ··· 3002 
3020 struct buffer_head *bh)); 3003 3021 int do_journal_get_write_access(handle_t *handle, struct inode *inode, 3004 3022 struct buffer_head *bh); 3005 - bool ext4_should_enable_large_folio(struct inode *inode); 3023 + void ext4_set_inode_mapping_order(struct inode *inode); 3006 3024 #define FALL_BACK_TO_NONDELALLOC 1 3007 3025 #define CONVERT_INLINE_DATA 2 3008 3026 ··· 3046 3064 extern void ext4_set_inode_flags(struct inode *, bool init); 3047 3065 extern int ext4_alloc_da_blocks(struct inode *inode); 3048 3066 extern void ext4_set_aops(struct inode *inode); 3049 - extern int ext4_writepage_trans_blocks(struct inode *); 3050 3067 extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); 3051 3068 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); 3069 + extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); 3052 3070 extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, 3053 3071 int pextents); 3054 3072 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, ··· 3471 3489 void *bb_bitmap; 3472 3490 #endif 3473 3491 struct rw_semaphore alloc_sem; 3474 - struct list_head bb_avg_fragment_size_node; 3475 - struct list_head bb_largest_free_order_node; 3476 3492 ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block 3477 3493 * regions, index is order. 3478 3494 * bb_counters[3] = 5 means ··· 3521 3541 return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); 3522 3542 } 3523 3543 3544 + static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) 3545 + { 3546 + if (!spin_trylock(ext4_group_lock_ptr(sb, group))) 3547 + return false; 3548 + /* 3549 + * We're able to grab the lock right away, so drop the lock 3550 + * contention counter. 
3551 + */ 3552 + atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); 3553 + return true; 3554 + } 3555 + 3524 3556 static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) 3525 3557 { 3526 - spinlock_t *lock = ext4_group_lock_ptr(sb, group); 3527 - if (spin_trylock(lock)) 3528 - /* 3529 - * We're able to grab the lock right away, so drop the 3530 - * lock contention counter. 3531 - */ 3532 - atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); 3533 - else { 3558 + if (!ext4_try_lock_group(sb, group)) { 3534 3559 /* 3535 3560 * The lock is busy, so bump the contention counter, 3536 3561 * and then wait on the spin lock. 3537 3562 */ 3538 3563 atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, 3539 3564 EXT4_MAX_CONTENTION); 3540 - spin_lock(lock); 3565 + spin_lock(ext4_group_lock_ptr(sb, group)); 3541 3566 } 3542 3567 } 3543 3568 ··· 3597 3612 extern int ext4_get_max_inline_size(struct inode *inode); 3598 3613 extern int ext4_find_inline_data_nolock(struct inode *inode); 3599 3614 extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); 3615 + extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); 3600 3616 3601 3617 int ext4_readpage_inline(struct inode *inode, struct folio *folio); 3602 3618 extern int ext4_try_to_write_inline_data(struct address_space *mapping, ··· 3657 3671 extern const struct inode_operations ext4_dir_inode_operations; 3658 3672 extern const struct inode_operations ext4_special_inode_operations; 3659 3673 extern struct dentry *ext4_get_parent(struct dentry *child); 3660 - extern struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 3661 - struct ext4_dir_entry_2 *de, 3662 - int blocksize, int csum_size, 3663 - unsigned int parent_ino, int dotdot_real_len); 3674 + extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, 3675 + struct buffer_head *dir_block, 3676 + unsigned int parent_ino, void *inline_buf, 3677 + int inline_size); 3664 3678 extern void 
ext4_initialize_dirent_tail(struct buffer_head *bh, 3665 3679 unsigned int blocksize); 3666 3680 extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode,
-7
fs/ext4/ext4_extents.h
··· 31 31 #define CHECK_BINSEARCH__ 32 32 33 33 /* 34 - * If EXT_STATS is defined then stats numbers are collected. 35 - * These number will be displayed at umount time. 36 - */ 37 - #define EXT_STATS_ 38 - 39 - 40 - /* 41 34 * ext4_inode has i_block array (60 bytes total). 42 35 * The first 12 bytes store ext4_extent_header; 43 36 * the remainder stores an array of ext4_extent.
+3 -3
fs/ext4/extents.c
··· 5215 5215 credits = depth + 2; 5216 5216 } 5217 5217 5218 - restart_credits = ext4_writepage_trans_blocks(inode); 5218 + restart_credits = ext4_chunk_trans_extent(inode, 0); 5219 5219 err = ext4_datasem_ensure_credits(handle, inode, credits, 5220 5220 restart_credits, 0); 5221 5221 if (err) { ··· 5475 5475 5476 5476 truncate_pagecache(inode, start); 5477 5477 5478 - credits = ext4_writepage_trans_blocks(inode); 5478 + credits = ext4_chunk_trans_extent(inode, 0); 5479 5479 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5480 5480 if (IS_ERR(handle)) 5481 5481 return PTR_ERR(handle); ··· 5571 5571 5572 5572 truncate_pagecache(inode, start); 5573 5573 5574 - credits = ext4_writepage_trans_blocks(inode); 5574 + credits = ext4_chunk_trans_extent(inode, 0); 5575 5575 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); 5576 5576 if (IS_ERR(handle)) 5577 5577 return PTR_ERR(handle);
+1 -2
fs/ext4/ialloc.c
··· 1335 1335 } 1336 1336 } 1337 1337 1338 - if (ext4_should_enable_large_folio(inode)) 1339 - mapping_set_large_folios(inode->i_mapping); 1338 + ext4_set_inode_mapping_order(inode); 1340 1339 1341 1340 ext4_update_inode_fsync_trans(handle, inode, 1); 1342 1341
+35 -56
fs/ext4/inline.c
··· 303 303 if (error) 304 304 goto out; 305 305 306 - BUG_ON(!is.s.not_found); 306 + if (!is.s.not_found) { 307 + EXT4_ERROR_INODE(inode, "unexpected inline data xattr"); 308 + error = -EFSCORRUPTED; 309 + goto out; 310 + } 307 311 308 312 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 309 313 if (error) { ··· 358 354 if (error) 359 355 goto out; 360 356 361 - BUG_ON(is.s.not_found); 357 + if (is.s.not_found) { 358 + EXT4_ERROR_INODE(inode, "missing inline data xattr"); 359 + error = -EFSCORRUPTED; 360 + goto out; 361 + } 362 362 363 363 len -= EXT4_MIN_INLINE_DATA_SIZE; 364 364 value = kzalloc(len, GFP_NOFS); ··· 570 562 return 0; 571 563 } 572 564 573 - needed_blocks = ext4_writepage_trans_blocks(inode); 565 + needed_blocks = ext4_chunk_trans_extent(inode, 1); 574 566 575 567 ret = ext4_get_inode_loc(inode, &iloc); 576 568 if (ret) ··· 620 612 } else 621 613 ret = ext4_block_write_begin(handle, folio, from, to, 622 614 ext4_get_block); 615 + clear_buffer_new(folio_buffers(folio)); 623 616 624 617 if (!ret && ext4_should_journal_data(inode)) { 625 618 ret = ext4_walk_page_buffers(handle, inode, ··· 900 891 return ret; 901 892 } 902 893 894 + clear_buffer_new(folio_buffers(folio)); 903 895 folio_mark_dirty(folio); 904 896 folio_mark_uptodate(folio); 905 897 ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ··· 1005 995 } 1006 996 1007 997 /* Set the final de to cover the whole block. 
*/ 1008 - static void ext4_update_final_de(void *de_buf, int old_size, int new_size) 998 + void ext4_update_final_de(void *de_buf, int old_size, int new_size) 1009 999 { 1010 1000 struct ext4_dir_entry_2 *de, *prev_de; 1011 1001 void *limit; ··· 1067 1057 } 1068 1058 ext4_write_inline_data(inode, iloc, buf, 0, inline_size); 1069 1059 ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 1070 - } 1071 - 1072 - static int ext4_finish_convert_inline_dir(handle_t *handle, 1073 - struct inode *inode, 1074 - struct buffer_head *dir_block, 1075 - void *buf, 1076 - int inline_size) 1077 - { 1078 - int err, csum_size = 0, header_size = 0; 1079 - struct ext4_dir_entry_2 *de; 1080 - void *target = dir_block->b_data; 1081 - 1082 - /* 1083 - * First create "." and ".." and then copy the dir information 1084 - * back to the block. 1085 - */ 1086 - de = target; 1087 - de = ext4_init_dot_dotdot(inode, de, 1088 - inode->i_sb->s_blocksize, csum_size, 1089 - le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1); 1090 - header_size = (void *)de - target; 1091 - 1092 - memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, 1093 - inline_size - EXT4_INLINE_DOTDOT_SIZE); 1094 - 1095 - if (ext4_has_feature_metadata_csum(inode->i_sb)) 1096 - csum_size = sizeof(struct ext4_dir_entry_tail); 1097 - 1098 - inode->i_size = inode->i_sb->s_blocksize; 1099 - i_size_write(inode, inode->i_sb->s_blocksize); 1100 - EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1101 - ext4_update_final_de(dir_block->b_data, 1102 - inline_size - EXT4_INLINE_DOTDOT_SIZE + header_size, 1103 - inode->i_sb->s_blocksize - csum_size); 1104 - 1105 - if (csum_size) 1106 - ext4_initialize_dirent_tail(dir_block, 1107 - inode->i_sb->s_blocksize); 1108 - set_buffer_uptodate(dir_block); 1109 - unlock_buffer(dir_block); 1110 - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); 1111 - if (err) 1112 - return err; 1113 - set_buffer_verified(dir_block); 1114 - return ext4_mark_inode_dirty(handle, inode); 1115 1060 } 
1116 1061 1117 1062 static int ext4_convert_inline_data_nolock(handle_t *handle, ··· 1140 1175 error = ext4_handle_dirty_metadata(handle, 1141 1176 inode, data_bh); 1142 1177 } else { 1143 - error = ext4_finish_convert_inline_dir(handle, inode, data_bh, 1144 - buf, inline_size); 1178 + unlock_buffer(data_bh); 1179 + inode->i_size = inode->i_sb->s_blocksize; 1180 + i_size_write(inode, inode->i_sb->s_blocksize); 1181 + EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; 1182 + 1183 + error = ext4_init_dirblock(handle, inode, data_bh, 1184 + le32_to_cpu(((struct ext4_dir_entry_2 *)buf)->inode), 1185 + buf + EXT4_INLINE_DOTDOT_SIZE, 1186 + inline_size - EXT4_INLINE_DOTDOT_SIZE); 1187 + if (!error) 1188 + error = ext4_mark_inode_dirty(handle, inode); 1145 1189 } 1146 1190 1147 1191 out_restore: ··· 1289 1315 if (pos == 0) { 1290 1316 fake.inode = cpu_to_le32(inode->i_ino); 1291 1317 fake.name_len = 1; 1292 - strcpy(fake.name, "."); 1318 + memcpy(fake.name, ".", 2); 1293 1319 fake.rec_len = ext4_rec_len_to_disk( 1294 1320 ext4_dir_rec_len(fake.name_len, NULL), 1295 1321 inline_size); ··· 1299 1325 } else if (pos == EXT4_INLINE_DOTDOT_OFFSET) { 1300 1326 fake.inode = cpu_to_le32(parent_ino); 1301 1327 fake.name_len = 2; 1302 - strcpy(fake.name, ".."); 1328 + memcpy(fake.name, "..", 3); 1303 1329 fake.rec_len = ext4_rec_len_to_disk( 1304 1330 ext4_dir_rec_len(fake.name_len, NULL), 1305 1331 inline_size); ··· 1838 1864 }; 1839 1865 1840 1866 1841 - needed_blocks = ext4_writepage_trans_blocks(inode); 1867 + needed_blocks = ext4_chunk_trans_extent(inode, 1); 1842 1868 handle = ext4_journal_start(inode, EXT4_HT_INODE, needed_blocks); 1843 1869 if (IS_ERR(handle)) 1844 1870 return PTR_ERR(handle); ··· 1877 1903 if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) 1878 1904 goto out_error; 1879 1905 1880 - BUG_ON(is.s.not_found); 1906 + if (is.s.not_found) { 1907 + EXT4_ERROR_INODE(inode, 1908 + "missing inline data xattr"); 1909 + err = -EFSCORRUPTED; 1910 + goto 
out_error; 1911 + } 1881 1912 1882 1913 value_len = le32_to_cpu(is.s.here->e_value_size); 1883 1914 value = kmalloc(value_len, GFP_NOFS); ··· 1958 1979 return 0; 1959 1980 } 1960 1981 1961 - needed_blocks = ext4_writepage_trans_blocks(inode); 1982 + needed_blocks = ext4_chunk_trans_extent(inode, 1); 1962 1983 1963 1984 iloc.bh = NULL; 1964 1985 error = ext4_get_inode_loc(inode, &iloc);
+225 -133
fs/ext4/inode.c
··· 723 723 ext4_check_map_extents_env(inode); 724 724 725 725 /* Lookup extent status tree firstly */ 726 - if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && 727 - ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 726 + if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { 728 727 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 729 728 map->m_pblk = ext4_es_pblock(&es) + 730 729 map->m_lblk - es.es_lblk; ··· 756 757 orig_mlen == map->m_len) 757 758 goto found; 758 759 759 - if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) 760 - map->m_len = orig_mlen; 760 + map->m_len = orig_mlen; 761 761 } 762 762 /* 763 763 * In the query cache no-wait mode, nothing we can do more if we ··· 873 875 do { 874 876 new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; 875 877 } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); 878 + } 879 + 880 + /* 881 + * Make sure that the current journal transaction has enough credits to map 882 + * one extent. Return -EAGAIN if it cannot extend the current running 883 + * transaction. 884 + */ 885 + static inline int ext4_journal_ensure_extent_credits(handle_t *handle, 886 + struct inode *inode) 887 + { 888 + int credits; 889 + int ret; 890 + 891 + /* Called from ext4_da_write_begin() which has no handle started? */ 892 + if (!handle) 893 + return 0; 894 + 895 + credits = ext4_chunk_trans_blocks(inode, 1); 896 + ret = __ext4_journal_ensure_credits(handle, credits, credits, 0); 897 + return ret <= 0 ? 
ret : -EAGAIN; 876 898 } 877 899 878 900 static int _ext4_get_block(struct inode *inode, sector_t iblock, ··· 1189 1171 } 1190 1172 continue; 1191 1173 } 1192 - if (buffer_new(bh)) 1174 + if (WARN_ON_ONCE(buffer_new(bh))) 1193 1175 clear_buffer_new(bh); 1194 1176 if (!buffer_mapped(bh)) { 1195 1177 WARN_ON(bh->b_size != blocksize); 1196 - err = get_block(inode, block, bh, 1); 1178 + err = ext4_journal_ensure_extent_credits(handle, inode); 1179 + if (!err) 1180 + err = get_block(inode, block, bh, 1); 1197 1181 if (err) 1198 1182 break; 1199 1183 if (buffer_new(bh)) { ··· 1294 1274 * Reserve one block more for addition to orphan list in case 1295 1275 * we allocate blocks but write fails for some reason 1296 1276 */ 1297 - needed_blocks = ext4_writepage_trans_blocks(inode) + 1; 1277 + needed_blocks = ext4_chunk_trans_extent(inode, 1278 + ext4_journal_blocks_per_folio(inode)) + 1; 1298 1279 index = pos >> PAGE_SHIFT; 1299 1280 1300 1281 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ··· 1393 1372 ext4_orphan_del(NULL, inode); 1394 1373 } 1395 1374 1396 - if (ret == -ENOSPC && 1397 - ext4_should_retry_alloc(inode->i_sb, &retries)) 1375 + if (ret == -EAGAIN || 1376 + (ret == -ENOSPC && 1377 + ext4_should_retry_alloc(inode->i_sb, &retries))) 1398 1378 goto retry_journal; 1399 1379 folio_put(folio); 1400 1380 return ret; ··· 1415 1393 ret = ext4_dirty_journalled_data(handle, bh); 1416 1394 clear_buffer_meta(bh); 1417 1395 clear_buffer_prio(bh); 1396 + clear_buffer_new(bh); 1418 1397 return ret; 1419 1398 } 1420 1399 ··· 1688 1665 unsigned int can_map:1; /* Can writepages call map blocks? 
*/ 1689 1666 1690 1667 /* These are internal state of ext4_do_writepages() */ 1691 - pgoff_t first_page; /* The first page to write */ 1692 - pgoff_t next_page; /* Current page to examine */ 1693 - pgoff_t last_page; /* Last page to examine */ 1668 + loff_t start_pos; /* The start pos to write */ 1669 + loff_t next_pos; /* Current pos to examine */ 1670 + loff_t end_pos; /* Last pos to examine */ 1671 + 1694 1672 /* 1695 - * Extent to map - this can be after first_page because that can be 1673 + * Extent to map - this can be after start_pos because that can be 1696 1674 * fully mapped. We somewhat abuse m_flags to store whether the extent 1697 1675 * is delalloc or unwritten. 1698 1676 */ ··· 1713 1689 struct inode *inode = mpd->inode; 1714 1690 struct address_space *mapping = inode->i_mapping; 1715 1691 1716 - /* This is necessary when next_page == 0. */ 1717 - if (mpd->first_page >= mpd->next_page) 1692 + /* This is necessary when next_pos == 0. */ 1693 + if (mpd->start_pos >= mpd->next_pos) 1718 1694 return; 1719 1695 1720 1696 mpd->scanned_until_end = 0; 1721 - index = mpd->first_page; 1722 - end = mpd->next_page - 1; 1723 1697 if (invalidate) { 1724 1698 ext4_lblk_t start, last; 1725 - start = index << (PAGE_SHIFT - inode->i_blkbits); 1726 - last = end << (PAGE_SHIFT - inode->i_blkbits); 1699 + start = EXT4_B_TO_LBLK(inode, mpd->start_pos); 1700 + last = mpd->next_pos >> inode->i_blkbits; 1727 1701 1728 1702 /* 1729 1703 * avoid racing with extent status tree scans made by 1730 1704 * ext4_insert_delayed_block() 1731 1705 */ 1732 1706 down_write(&EXT4_I(inode)->i_data_sem); 1733 - ext4_es_remove_extent(inode, start, last - start + 1); 1707 + ext4_es_remove_extent(inode, start, last - start); 1734 1708 up_write(&EXT4_I(inode)->i_data_sem); 1735 1709 } 1736 1710 1737 1711 folio_batch_init(&fbatch); 1738 - while (index <= end) { 1739 - nr = filemap_get_folios(mapping, &index, end, &fbatch); 1712 + index = mpd->start_pos >> PAGE_SHIFT; 1713 + end = mpd->next_pos 
>> PAGE_SHIFT; 1714 + while (index < end) { 1715 + nr = filemap_get_folios(mapping, &index, end - 1, &fbatch); 1740 1716 if (nr == 0) 1741 1717 break; 1742 1718 for (i = 0; i < nr; i++) { 1743 1719 struct folio *folio = fbatch.folios[i]; 1744 1720 1745 - if (folio->index < mpd->first_page) 1721 + if (folio_pos(folio) < mpd->start_pos) 1746 1722 continue; 1747 - if (folio_next_index(folio) - 1 > end) 1723 + if (folio_next_index(folio) > end) 1748 1724 continue; 1749 1725 BUG_ON(!folio_test_locked(folio)); 1750 1726 BUG_ON(folio_test_writeback(folio)); ··· 2046 2022 2047 2023 static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) 2048 2024 { 2049 - mpd->first_page += folio_nr_pages(folio); 2025 + mpd->start_pos += folio_size(folio); 2026 + mpd->wbc->nr_to_write -= folio_nr_pages(folio); 2050 2027 folio_unlock(folio); 2051 2028 } 2052 2029 ··· 2057 2032 loff_t size; 2058 2033 int err; 2059 2034 2060 - BUG_ON(folio->index != mpd->first_page); 2035 + WARN_ON_ONCE(folio_pos(folio) != mpd->start_pos); 2061 2036 folio_clear_dirty_for_io(folio); 2062 2037 /* 2063 2038 * We have to be very careful here! 
Nothing protects writeback path ··· 2078 2053 !ext4_verity_in_progress(mpd->inode)) 2079 2054 len = size & (len - 1); 2080 2055 err = ext4_bio_write_folio(&mpd->io_submit, folio, len); 2081 - if (!err) 2082 - mpd->wbc->nr_to_write -= folio_nr_pages(folio); 2083 2056 2084 2057 return err; 2085 2058 } ··· 2344 2321 int get_blocks_flags; 2345 2322 int err, dioread_nolock; 2346 2323 2324 + /* Make sure transaction has enough credits for this extent */ 2325 + err = ext4_journal_ensure_extent_credits(handle, inode); 2326 + if (err < 0) 2327 + return err; 2328 + 2347 2329 trace_ext4_da_write_pages_extent(inode, map); 2348 2330 /* 2349 2331 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or ··· 2385 2357 2386 2358 BUG_ON(map->m_len == 0); 2387 2359 return 0; 2360 + } 2361 + 2362 + /* 2363 + * This is used to submit mapped buffers in a single folio that is not fully 2364 + * mapped for various reasons, such as insufficient space or journal credits. 2365 + */ 2366 + static int mpage_submit_partial_folio(struct mpage_da_data *mpd) 2367 + { 2368 + struct inode *inode = mpd->inode; 2369 + struct folio *folio; 2370 + loff_t pos; 2371 + int ret; 2372 + 2373 + folio = filemap_get_folio(inode->i_mapping, 2374 + mpd->start_pos >> PAGE_SHIFT); 2375 + if (IS_ERR(folio)) 2376 + return PTR_ERR(folio); 2377 + /* 2378 + * The mapped position should be within the current processing folio 2379 + * but must not be the folio start position. 2380 + */ 2381 + pos = ((loff_t)mpd->map.m_lblk) << inode->i_blkbits; 2382 + if (WARN_ON_ONCE((folio_pos(folio) == pos) || 2383 + !folio_contains(folio, pos >> PAGE_SHIFT))) 2384 + return -EINVAL; 2385 + 2386 + ret = mpage_submit_folio(mpd, folio); 2387 + if (ret) 2388 + goto out; 2389 + /* 2390 + * Update start_pos to prevent this folio from being released in 2391 + * mpage_release_unused_pages(), it will be reset to the aligned folio 2392 + * pos when this folio is written again in the next round. 
Additionally, 2393 + * do not update wbc->nr_to_write here, as it will be updated once the 2394 + * entire folio has finished processing. 2395 + */ 2396 + mpd->start_pos = pos; 2397 + out: 2398 + folio_unlock(folio); 2399 + folio_put(folio); 2400 + return ret; 2388 2401 } 2389 2402 2390 2403 /* ··· 2476 2407 * In the case of ENOSPC, if ext4_count_free_blocks() 2477 2408 * is non-zero, a commit should free up blocks. 2478 2409 */ 2479 - if ((err == -ENOMEM) || 2410 + if ((err == -ENOMEM) || (err == -EAGAIN) || 2480 2411 (err == -ENOSPC && ext4_count_free_clusters(sb))) { 2481 - if (progress) 2412 + /* 2413 + * We may have already allocated extents for 2414 + * some bhs inside the folio, issue the 2415 + * corresponding data to prevent stale data. 2416 + */ 2417 + if (progress) { 2418 + if (mpage_submit_partial_folio(mpd)) 2419 + goto invalidate_dirty_pages; 2482 2420 goto update_disksize; 2421 + } 2483 2422 return err; 2484 2423 } 2485 2424 ext4_msg(sb, KERN_CRIT, ··· 2521 2444 * Update on-disk size after IO is submitted. Races with 2522 2445 * truncate are avoided by checking i_size under i_data_sem. 2523 2446 */ 2524 - disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; 2447 + disksize = mpd->start_pos; 2525 2448 if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { 2526 2449 int err2; 2527 2450 loff_t i_size; ··· 2543 2466 err = err2; 2544 2467 } 2545 2468 return err; 2546 - } 2547 - 2548 - /* 2549 - * Calculate the total number of credits to reserve for one writepages 2550 - * iteration. This is called from ext4_writepages(). We map an extent of 2551 - * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping 2552 - * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + 2553 - * bpp - 1 blocks in bpp different extents. 
2554 - */ 2555 - static int ext4_da_writepages_trans_blocks(struct inode *inode) 2556 - { 2557 - int bpp = ext4_journal_blocks_per_folio(inode); 2558 - 2559 - return ext4_meta_trans_blocks(inode, 2560 - MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); 2561 2469 } 2562 2470 2563 2471 static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, ··· 2609 2547 struct address_space *mapping = mpd->inode->i_mapping; 2610 2548 struct folio_batch fbatch; 2611 2549 unsigned int nr_folios; 2612 - pgoff_t index = mpd->first_page; 2613 - pgoff_t end = mpd->last_page; 2550 + pgoff_t index = mpd->start_pos >> PAGE_SHIFT; 2551 + pgoff_t end = mpd->end_pos >> PAGE_SHIFT; 2614 2552 xa_mark_t tag; 2615 2553 int i, err = 0; 2616 2554 int blkbits = mpd->inode->i_blkbits; ··· 2625 2563 tag = PAGECACHE_TAG_DIRTY; 2626 2564 2627 2565 mpd->map.m_len = 0; 2628 - mpd->next_page = index; 2566 + mpd->next_pos = mpd->start_pos; 2629 2567 if (ext4_should_journal_data(mpd->inode)) { 2630 2568 handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, 2631 2569 bpp); ··· 2656 2594 goto out; 2657 2595 2658 2596 /* If we can't merge this page, we are done. */ 2659 - if (mpd->map.m_len > 0 && mpd->next_page != folio->index) 2597 + if (mpd->map.m_len > 0 && 2598 + mpd->next_pos != folio_pos(folio)) 2660 2599 goto out; 2661 2600 2662 2601 if (handle) { ··· 2703 2640 } 2704 2641 2705 2642 if (mpd->map.m_len == 0) 2706 - mpd->first_page = folio->index; 2707 - mpd->next_page = folio_next_index(folio); 2643 + mpd->start_pos = folio_pos(folio); 2644 + mpd->next_pos = folio_pos(folio) + folio_size(folio); 2708 2645 /* 2709 2646 * Writeout when we cannot modify metadata is simple. 2710 2647 * Just submit the page. 
For data=journal mode we ··· 2832 2769 mpd->journalled_more_data = 0; 2833 2770 2834 2771 if (ext4_should_dioread_nolock(inode)) { 2772 + int bpf = ext4_journal_blocks_per_folio(inode); 2835 2773 /* 2836 2774 * We may need to convert up to one extent per block in 2837 - * the page and we may dirty the inode. 2775 + * the folio and we may dirty the inode. 2838 2776 */ 2839 - rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, 2840 - PAGE_SIZE >> inode->i_blkbits); 2777 + rsv_blocks = 1 + ext4_ext_index_trans_blocks(inode, bpf); 2841 2778 } 2842 2779 2843 2780 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) ··· 2847 2784 writeback_index = mapping->writeback_index; 2848 2785 if (writeback_index) 2849 2786 cycled = 0; 2850 - mpd->first_page = writeback_index; 2851 - mpd->last_page = -1; 2787 + mpd->start_pos = writeback_index << PAGE_SHIFT; 2788 + mpd->end_pos = LLONG_MAX; 2852 2789 } else { 2853 - mpd->first_page = wbc->range_start >> PAGE_SHIFT; 2854 - mpd->last_page = wbc->range_end >> PAGE_SHIFT; 2790 + mpd->start_pos = wbc->range_start; 2791 + mpd->end_pos = wbc->range_end; 2855 2792 } 2856 2793 2857 2794 ext4_io_submit_init(&mpd->io_submit, wbc); 2858 2795 retry: 2859 2796 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2860 - tag_pages_for_writeback(mapping, mpd->first_page, 2861 - mpd->last_page); 2797 + tag_pages_for_writeback(mapping, mpd->start_pos >> PAGE_SHIFT, 2798 + mpd->end_pos >> PAGE_SHIFT); 2862 2799 blk_start_plug(&plug); 2863 2800 2864 2801 /* ··· 2901 2838 * not supported by delalloc. 2902 2839 */ 2903 2840 BUG_ON(ext4_should_journal_data(inode)); 2904 - needed_blocks = ext4_da_writepages_trans_blocks(inode); 2905 - 2841 + /* 2842 + * Calculate the number of credits needed to reserve for one 2843 + * extent of up to MAX_WRITEPAGES_EXTENT_LEN blocks. It will 2844 + * attempt to extend the transaction or start a new iteration 2845 + * if the reserved credits are insufficient. 
2846 + */ 2847 + needed_blocks = ext4_chunk_trans_blocks(inode, 2848 + MAX_WRITEPAGES_EXTENT_LEN); 2906 2849 /* start a new transaction */ 2907 2850 handle = ext4_journal_start_with_reserve(inode, 2908 2851 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); ··· 2924 2855 } 2925 2856 mpd->do_map = 1; 2926 2857 2927 - trace_ext4_da_write_pages(inode, mpd->first_page, wbc); 2858 + trace_ext4_da_write_folios_start(inode, mpd->start_pos, 2859 + mpd->next_pos, wbc); 2928 2860 ret = mpage_prepare_extent_to_map(mpd); 2929 2861 if (!ret && mpd->map.m_len) 2930 2862 ret = mpage_map_and_submit_extent(handle, mpd, ··· 2963 2893 } else 2964 2894 ext4_put_io_end(mpd->io_submit.io_end); 2965 2895 mpd->io_submit.io_end = NULL; 2896 + trace_ext4_da_write_folios_end(inode, mpd->start_pos, 2897 + mpd->next_pos, wbc, ret); 2966 2898 2967 2899 if (ret == -ENOSPC && sbi->s_journal) { 2968 2900 /* ··· 2976 2904 ret = 0; 2977 2905 continue; 2978 2906 } 2907 + if (ret == -EAGAIN) 2908 + ret = 0; 2979 2909 /* Fatal error - ENOMEM, EIO... 
*/ 2980 2910 if (ret) 2981 2911 break; ··· 2986 2912 blk_finish_plug(&plug); 2987 2913 if (!ret && !cycled && wbc->nr_to_write > 0) { 2988 2914 cycled = 1; 2989 - mpd->last_page = writeback_index - 1; 2990 - mpd->first_page = 0; 2915 + mpd->end_pos = (writeback_index << PAGE_SHIFT) - 1; 2916 + mpd->start_pos = 0; 2991 2917 goto retry; 2992 2918 } 2993 2919 ··· 2997 2923 * Set the writeback_index so that range_cyclic 2998 2924 * mode will write it back later 2999 2925 */ 3000 - mapping->writeback_index = mpd->first_page; 2926 + mapping->writeback_index = mpd->start_pos >> PAGE_SHIFT; 3001 2927 3002 2928 out_writepages: 3003 2929 trace_ext4_writepages_result(inode, wbc, ret, ··· 4458 4384 return ret; 4459 4385 4460 4386 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4461 - credits = ext4_writepage_trans_blocks(inode); 4387 + credits = ext4_chunk_trans_extent(inode, 2); 4462 4388 else 4463 4389 credits = ext4_blocks_for_truncate(inode); 4464 4390 handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); ··· 4607 4533 } 4608 4534 4609 4535 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 4610 - credits = ext4_writepage_trans_blocks(inode); 4536 + credits = ext4_chunk_trans_extent(inode, 1); 4611 4537 else 4612 4538 credits = ext4_blocks_for_truncate(inode); 4613 4539 ··· 5175 5101 return -EFSCORRUPTED; 5176 5102 } 5177 5103 5178 - bool ext4_should_enable_large_folio(struct inode *inode) 5104 + static bool ext4_should_enable_large_folio(struct inode *inode) 5179 5105 { 5180 5106 struct super_block *sb = inode->i_sb; 5181 5107 ··· 5190 5116 return false; 5191 5117 5192 5118 return true; 5119 + } 5120 + 5121 + /* 5122 + * Limit the maximum folio order to 2048 blocks to prevent overestimation 5123 + * of reserve handle credits during the folio writeback in environments 5124 + * where the PAGE_SIZE exceeds 4KB. 
5125 + */ 5126 + #define EXT4_MAX_PAGECACHE_ORDER(i) \ 5127 + umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT)) 5128 + void ext4_set_inode_mapping_order(struct inode *inode) 5129 + { 5130 + if (!ext4_should_enable_large_folio(inode)) 5131 + return; 5132 + 5133 + mapping_set_folio_order_range(inode->i_mapping, 0, 5134 + EXT4_MAX_PAGECACHE_ORDER(inode)); 5193 5135 } 5194 5136 5195 5137 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ··· 5525 5435 ret = -EFSCORRUPTED; 5526 5436 goto bad_inode; 5527 5437 } 5528 - if (ext4_should_enable_large_folio(inode)) 5529 - mapping_set_large_folios(inode->i_mapping); 5438 + 5439 + ext4_set_inode_mapping_order(inode); 5530 5440 5531 5441 ret = check_igot_inode(inode, flags, function, line); 5532 5442 /* ··· 6224 6134 int ret; 6225 6135 6226 6136 /* 6227 - * How many index and lead blocks need to touch to map @lblocks 6137 + * How many index and leaf blocks need to touch to map @lblocks 6228 6138 * logical blocks to @pextents physical extents? 6229 6139 */ 6230 6140 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); ··· 6233 6143 * Now let's see how many group bitmaps and group descriptors need 6234 6144 * to account 6235 6145 */ 6236 - groups = idxblocks; 6146 + groups = idxblocks + pextents; 6237 6147 gdpblocks = groups; 6238 6148 if (groups > ngroups) 6239 6149 groups = ngroups; ··· 6250 6160 } 6251 6161 6252 6162 /* 6253 - * Calculate the total number of credits to reserve to fit 6254 - * the modification of a single pages into a single transaction, 6255 - * which may include multiple chunks of block allocations. 6256 - * 6257 - * This could be called via ext4_write_begin() 6258 - * 6259 - * We need to consider the worse case, when 6260 - * one new block per extent. 6163 + * Calculate the journal credits for modifying the number of blocks 6164 + * in a single extent within one transaction. 'nrblocks' is used only 6165 + * for non-extent inodes. 
For extent type inodes, 'nrblocks' can be 6166 + * zero if the exact number of blocks is unknown. 6261 6167 */ 6262 - int ext4_writepage_trans_blocks(struct inode *inode) 6168 + int ext4_chunk_trans_extent(struct inode *inode, int nrblocks) 6263 6169 { 6264 - int bpp = ext4_journal_blocks_per_folio(inode); 6265 6170 int ret; 6266 6171 6267 - ret = ext4_meta_trans_blocks(inode, bpp, bpp); 6268 - 6172 + ret = ext4_meta_trans_blocks(inode, nrblocks, 1); 6269 6173 /* Account for data blocks for journalled mode */ 6270 6174 if (ext4_should_journal_data(inode)) 6271 - ret += bpp; 6175 + ret += nrblocks; 6272 6176 return ret; 6273 6177 } 6274 6178 ··· 6634 6550 return !buffer_mapped(bh); 6635 6551 } 6636 6552 6553 + static int ext4_block_page_mkwrite(struct inode *inode, struct folio *folio, 6554 + get_block_t get_block) 6555 + { 6556 + handle_t *handle; 6557 + loff_t size; 6558 + unsigned long len; 6559 + int credits; 6560 + int ret; 6561 + 6562 + credits = ext4_chunk_trans_extent(inode, 6563 + ext4_journal_blocks_per_folio(inode)); 6564 + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, credits); 6565 + if (IS_ERR(handle)) 6566 + return PTR_ERR(handle); 6567 + 6568 + folio_lock(folio); 6569 + size = i_size_read(inode); 6570 + /* Page got truncated from under us? 
*/ 6571 + if (folio->mapping != inode->i_mapping || folio_pos(folio) > size) { 6572 + ret = -EFAULT; 6573 + goto out_error; 6574 + } 6575 + 6576 + len = folio_size(folio); 6577 + if (folio_pos(folio) + len > size) 6578 + len = size - folio_pos(folio); 6579 + 6580 + ret = ext4_block_write_begin(handle, folio, 0, len, get_block); 6581 + if (ret) 6582 + goto out_error; 6583 + 6584 + if (!ext4_should_journal_data(inode)) { 6585 + block_commit_write(folio, 0, len); 6586 + folio_mark_dirty(folio); 6587 + } else { 6588 + ret = ext4_journal_folio_buffers(handle, folio, len); 6589 + if (ret) 6590 + goto out_error; 6591 + } 6592 + ext4_journal_stop(handle); 6593 + folio_wait_stable(folio); 6594 + return ret; 6595 + 6596 + out_error: 6597 + folio_unlock(folio); 6598 + ext4_journal_stop(handle); 6599 + return ret; 6600 + } 6601 + 6637 6602 vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) 6638 6603 { 6639 6604 struct vm_area_struct *vma = vmf->vma; ··· 6694 6561 struct file *file = vma->vm_file; 6695 6562 struct inode *inode = file_inode(file); 6696 6563 struct address_space *mapping = inode->i_mapping; 6697 - handle_t *handle; 6698 - get_block_t *get_block; 6564 + get_block_t *get_block = ext4_get_block; 6699 6565 int retries = 0; 6700 6566 6701 6567 if (unlikely(IS_IMMUTABLE(inode))) ··· 6762 6630 /* OK, we need to fill the hole... */ 6763 6631 if (ext4_should_dioread_nolock(inode)) 6764 6632 get_block = ext4_get_block_unwritten; 6765 - else 6766 - get_block = ext4_get_block; 6767 6633 retry_alloc: 6768 - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 6769 - ext4_writepage_trans_blocks(inode)); 6770 - if (IS_ERR(handle)) { 6771 - ret = VM_FAULT_SIGBUS; 6772 - goto out; 6773 - } 6774 - /* 6775 - * Data journalling can't use block_page_mkwrite() because it 6776 - * will set_buffer_dirty() before do_journal_get_write_access() 6777 - * thus might hit warning messages for dirty metadata buffers. 
6778 - */ 6779 - if (!ext4_should_journal_data(inode)) { 6780 - err = block_page_mkwrite(vma, vmf, get_block); 6781 - } else { 6782 - folio_lock(folio); 6783 - size = i_size_read(inode); 6784 - /* Page got truncated from under us? */ 6785 - if (folio->mapping != mapping || folio_pos(folio) > size) { 6786 - ret = VM_FAULT_NOPAGE; 6787 - goto out_error; 6788 - } 6789 - 6790 - len = folio_size(folio); 6791 - if (folio_pos(folio) + len > size) 6792 - len = size - folio_pos(folio); 6793 - 6794 - err = ext4_block_write_begin(handle, folio, 0, len, 6795 - ext4_get_block); 6796 - if (!err) { 6797 - ret = VM_FAULT_SIGBUS; 6798 - if (ext4_journal_folio_buffers(handle, folio, len)) 6799 - goto out_error; 6800 - } else { 6801 - folio_unlock(folio); 6802 - } 6803 - } 6804 - ext4_journal_stop(handle); 6805 - if (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 6634 + /* Start journal and allocate blocks */ 6635 + err = ext4_block_page_mkwrite(inode, folio, get_block); 6636 + if (err == -EAGAIN || 6637 + (err == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))) 6806 6638 goto retry_alloc; 6807 6639 out_ret: 6808 6640 ret = vmf_fs_error(err); ··· 6774 6678 filemap_invalidate_unlock_shared(mapping); 6775 6679 sb_end_pagefault(inode->i_sb); 6776 6680 return ret; 6777 - out_error: 6778 - folio_unlock(folio); 6779 - ext4_journal_stop(handle); 6780 - goto out; 6781 6681 }
+5
fs/ext4/mballoc-test.c
··· 155 155 bgl_lock_init(sbi->s_blockgroup_lock); 156 156 157 157 sbi->s_es = &fsb->es; 158 + sbi->s_sb = sb; 158 159 sb->s_fs_info = sbi; 159 160 160 161 up_write(&sb->s_umount); ··· 803 802 KUNIT_ASSERT_EQ(test, ret, 0); 804 803 805 804 grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb); 805 + grp->bb_largest_free_order = -1; 806 + grp->bb_avg_fragment_size_order = -1; 806 807 mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT); 807 808 for (i = 0; i < TEST_RANGE_COUNT; i++) 808 809 test_mb_mark_used_range(test, &e4b, ranges[i].start, ··· 878 875 ext4_unlock_group(sb, TEST_GOAL_GROUP); 879 876 880 877 grp->bb_free = 0; 878 + grp->bb_largest_free_order = -1; 879 + grp->bb_avg_fragment_size_order = -1; 881 880 memset(bitmap, 0xff, sb->s_blocksize); 882 881 883 882 mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+504 -395
fs/ext4/mballoc.c
··· 132 132 * If "mb_optimize_scan" mount option is set, we maintain in memory group info 133 133 * structures in two data structures: 134 134 * 135 - * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) 135 + * 1) Array of largest free order xarrays (sbi->s_mb_largest_free_orders) 136 136 * 137 - * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) 137 + * Locking: Writers use xa_lock, readers use rcu_read_lock. 138 138 * 139 - * This is an array of lists where the index in the array represents the 139 + * This is an array of xarrays where the index in the array represents the 140 140 * largest free order in the buddy bitmap of the participating group infos of 141 - * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total 142 - * number of buddy bitmap orders possible) number of lists. Group-infos are 143 - * placed in appropriate lists. 141 + * that xarray. So, there are exactly MB_NUM_ORDERS(sb) (which means total 142 + * number of buddy bitmap orders possible) number of xarrays. Group-infos are 143 + * placed in appropriate xarrays. 144 144 * 145 - * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) 145 + * 2) Average fragment size xarrays (sbi->s_mb_avg_fragment_size) 146 146 * 147 - * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) 147 + * Locking: Writers use xa_lock, readers use rcu_read_lock. 148 148 * 149 - * This is an array of lists where in the i-th list there are groups with 149 + * This is an array of xarrays where in the i-th xarray there are groups with 150 150 * average fragment size >= 2^i and < 2^(i+1). The average fragment size 151 151 * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. 152 - * Note that we don't bother with a special list for completely empty groups 153 - * so we only have MB_NUM_ORDERS(sb) lists. 152 + * Note that we don't bother with a special xarray for completely empty 153 + * groups so we only have MB_NUM_ORDERS(sb) xarrays. 
Group-infos are placed 154 + * in appropriate xarrays. 155 + * 156 + * In xarray, the index is the block group number, the value is the block group 157 + * information, and a non-empty value indicates the block group is present in 158 + * the current xarray. 154 159 * 155 160 * When "mb_optimize_scan" mount option is set, mballoc consults the above data 156 161 * structures to decide the order in which groups are to be traversed for ··· 425 420 ext4_group_t group); 426 421 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 427 422 428 - static bool ext4_mb_good_group(struct ext4_allocation_context *ac, 429 - ext4_group_t group, enum criteria cr); 423 + static int ext4_mb_scan_group(struct ext4_allocation_context *ac, 424 + ext4_group_t group); 430 425 431 426 static int ext4_try_to_trim_range(struct super_block *sb, 432 427 struct ext4_buddy *e4b, ext4_grpblk_t start, ··· 846 841 mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) 847 842 { 848 843 struct ext4_sb_info *sbi = EXT4_SB(sb); 849 - int new_order; 844 + int new, old; 850 845 851 - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) 846 + if (!test_opt2(sb, MB_OPTIMIZE_SCAN)) 852 847 return; 853 848 854 - new_order = mb_avg_fragment_size_order(sb, 855 - grp->bb_free / grp->bb_fragments); 856 - if (new_order == grp->bb_avg_fragment_size_order) 849 + old = grp->bb_avg_fragment_size_order; 850 + new = grp->bb_fragments == 0 ? 
-1 : 851 + mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); 852 + if (new == old) 857 853 return; 858 854 859 - if (grp->bb_avg_fragment_size_order != -1) { 860 - write_lock(&sbi->s_mb_avg_fragment_size_locks[ 861 - grp->bb_avg_fragment_size_order]); 862 - list_del(&grp->bb_avg_fragment_size_node); 863 - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 864 - grp->bb_avg_fragment_size_order]); 855 + if (old >= 0) 856 + xa_erase(&sbi->s_mb_avg_fragment_size[old], grp->bb_group); 857 + 858 + grp->bb_avg_fragment_size_order = new; 859 + if (new >= 0) { 860 + /* 861 + * Cannot use __GFP_NOFAIL because we hold the group lock. 862 + * Although allocation for insertion may fails, it's not fatal 863 + * as we have linear traversal to fall back on. 864 + */ 865 + int err = xa_insert(&sbi->s_mb_avg_fragment_size[new], 866 + grp->bb_group, grp, GFP_ATOMIC); 867 + if (err) 868 + mb_debug(sb, "insert group: %u to s_mb_avg_fragment_size[%d] failed, err %d", 869 + grp->bb_group, new, err); 865 870 } 866 - grp->bb_avg_fragment_size_order = new_order; 867 - write_lock(&sbi->s_mb_avg_fragment_size_locks[ 868 - grp->bb_avg_fragment_size_order]); 869 - list_add_tail(&grp->bb_avg_fragment_size_node, 870 - &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); 871 - write_unlock(&sbi->s_mb_avg_fragment_size_locks[ 872 - grp->bb_avg_fragment_size_order]); 871 + } 872 + 873 + static int ext4_mb_scan_groups_xa_range(struct ext4_allocation_context *ac, 874 + struct xarray *xa, 875 + ext4_group_t start, ext4_group_t end) 876 + { 877 + struct super_block *sb = ac->ac_sb; 878 + struct ext4_sb_info *sbi = EXT4_SB(sb); 879 + enum criteria cr = ac->ac_criteria; 880 + ext4_group_t ngroups = ext4_get_groups_count(sb); 881 + unsigned long group = start; 882 + struct ext4_group_info *grp; 883 + 884 + if (WARN_ON_ONCE(end > ngroups || start >= end)) 885 + return 0; 886 + 887 + xa_for_each_range(xa, group, grp, start, end - 1) { 888 + int err; 889 + 890 + if (sbi->s_mb_stats) 
891 + atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); 892 + 893 + err = ext4_mb_scan_group(ac, grp->bb_group); 894 + if (err || ac->ac_status != AC_STATUS_CONTINUE) 895 + return err; 896 + 897 + cond_resched(); 898 + } 899 + 900 + return 0; 901 + } 902 + 903 + /* 904 + * Find a suitable group of given order from the largest free orders xarray. 905 + */ 906 + static inline int 907 + ext4_mb_scan_groups_largest_free_order_range(struct ext4_allocation_context *ac, 908 + int order, ext4_group_t start, 909 + ext4_group_t end) 910 + { 911 + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_largest_free_orders[order]; 912 + 913 + if (xa_empty(xa)) 914 + return 0; 915 + 916 + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); 873 917 } 874 918 875 919 /* 876 920 * Choose next group by traversing largest_free_order lists. Updates *new_cr if 877 921 * cr level needs an update. 878 922 */ 879 - static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, 880 - enum criteria *new_cr, ext4_group_t *group) 923 + static int ext4_mb_scan_groups_p2_aligned(struct ext4_allocation_context *ac, 924 + ext4_group_t group) 881 925 { 882 926 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 883 - struct ext4_group_info *iter; 884 927 int i; 928 + int ret = 0; 929 + ext4_group_t start, end; 885 930 886 - if (ac->ac_status == AC_STATUS_FOUND) 887 - return; 888 - 889 - if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) 890 - atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); 891 - 931 + start = group; 932 + end = ext4_get_groups_count(ac->ac_sb); 933 + wrap_around: 892 934 for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 893 - if (list_empty(&sbi->s_mb_largest_free_orders[i])) 894 - continue; 895 - read_lock(&sbi->s_mb_largest_free_orders_locks[i]); 896 - if (list_empty(&sbi->s_mb_largest_free_orders[i])) { 897 - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 898 - continue; 899 - } 900 - 
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], 901 - bb_largest_free_order_node) { 902 - if (sbi->s_mb_stats) 903 - atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); 904 - if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { 905 - *group = iter->bb_group; 906 - ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; 907 - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 908 - return; 909 - } 910 - } 911 - read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); 935 + ret = ext4_mb_scan_groups_largest_free_order_range(ac, i, 936 + start, end); 937 + if (ret || ac->ac_status != AC_STATUS_CONTINUE) 938 + return ret; 939 + } 940 + if (start) { 941 + end = start; 942 + start = 0; 943 + goto wrap_around; 912 944 } 913 945 946 + if (sbi->s_mb_stats) 947 + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); 948 + 914 949 /* Increment cr and search again if no group is found */ 915 - *new_cr = CR_GOAL_LEN_FAST; 950 + ac->ac_criteria = CR_GOAL_LEN_FAST; 951 + return ret; 916 952 } 917 953 918 954 /* 919 - * Find a suitable group of given order from the average fragments list. 955 + * Find a suitable group of given order from the average fragments xarray. 
920 956 */ 921 - static struct ext4_group_info * 922 - ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) 957 + static int 958 + ext4_mb_scan_groups_avg_frag_order_range(struct ext4_allocation_context *ac, 959 + int order, ext4_group_t start, 960 + ext4_group_t end) 923 961 { 924 - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 925 - struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; 926 - rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; 927 - struct ext4_group_info *grp = NULL, *iter; 928 - enum criteria cr = ac->ac_criteria; 962 + struct xarray *xa = &EXT4_SB(ac->ac_sb)->s_mb_avg_fragment_size[order]; 929 963 930 - if (list_empty(frag_list)) 931 - return NULL; 932 - read_lock(frag_list_lock); 933 - if (list_empty(frag_list)) { 934 - read_unlock(frag_list_lock); 935 - return NULL; 936 - } 937 - list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { 938 - if (sbi->s_mb_stats) 939 - atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); 940 - if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { 941 - grp = iter; 942 - break; 943 - } 944 - } 945 - read_unlock(frag_list_lock); 946 - return grp; 964 + if (xa_empty(xa)) 965 + return 0; 966 + 967 + return ext4_mb_scan_groups_xa_range(ac, xa, start, end); 947 968 } 948 969 949 970 /* 950 971 * Choose next group by traversing average fragment size list of suitable 951 972 * order. Updates *new_cr if cr level needs an update. 
952 973 */ 953 - static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, 954 - enum criteria *new_cr, ext4_group_t *group) 974 + static int ext4_mb_scan_groups_goal_fast(struct ext4_allocation_context *ac, 975 + ext4_group_t group) 955 976 { 956 977 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 957 - struct ext4_group_info *grp = NULL; 958 - int i; 978 + int i, ret = 0; 979 + ext4_group_t start, end; 959 980 960 - if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { 961 - if (sbi->s_mb_stats) 962 - atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); 981 + start = group; 982 + end = ext4_get_groups_count(ac->ac_sb); 983 + wrap_around: 984 + i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 985 + for (; i < MB_NUM_ORDERS(ac->ac_sb); i++) { 986 + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, i, 987 + start, end); 988 + if (ret || ac->ac_status != AC_STATUS_CONTINUE) 989 + return ret; 990 + } 991 + if (start) { 992 + end = start; 993 + start = 0; 994 + goto wrap_around; 963 995 } 964 996 965 - for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); 966 - i < MB_NUM_ORDERS(ac->ac_sb); i++) { 967 - grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); 968 - if (grp) { 969 - *group = grp->bb_group; 970 - ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; 971 - return; 972 - } 973 - } 974 - 997 + if (sbi->s_mb_stats) 998 + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); 975 999 /* 976 1000 * CR_BEST_AVAIL_LEN works based on the concept that we have 977 1001 * a larger normalized goal len request which can be trimmed to ··· 1010 976 * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). 
1011 977 */ 1012 978 if (ac->ac_flags & EXT4_MB_HINT_DATA) 1013 - *new_cr = CR_BEST_AVAIL_LEN; 979 + ac->ac_criteria = CR_BEST_AVAIL_LEN; 1014 980 else 1015 - *new_cr = CR_GOAL_LEN_SLOW; 981 + ac->ac_criteria = CR_GOAL_LEN_SLOW; 982 + 983 + return ret; 1016 984 } 1017 985 1018 986 /* ··· 1026 990 * preallocations. However, we make sure that we don't trim the request too 1027 991 * much and fall to CR_GOAL_LEN_SLOW in that case. 1028 992 */ 1029 - static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, 1030 - enum criteria *new_cr, ext4_group_t *group) 993 + static int ext4_mb_scan_groups_best_avail(struct ext4_allocation_context *ac, 994 + ext4_group_t group) 1031 995 { 996 + int ret = 0; 1032 997 struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1033 - struct ext4_group_info *grp = NULL; 1034 998 int i, order, min_order; 1035 999 unsigned long num_stripe_clusters = 0; 1036 - 1037 - if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { 1038 - if (sbi->s_mb_stats) 1039 - atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); 1040 - } 1000 + ext4_group_t start, end; 1041 1001 1042 1002 /* 1043 1003 * mb_avg_fragment_size_order() returns order in a way that makes ··· 1065 1033 if (1 << min_order < ac->ac_o_ex.fe_len) 1066 1034 min_order = fls(ac->ac_o_ex.fe_len); 1067 1035 1036 + start = group; 1037 + end = ext4_get_groups_count(ac->ac_sb); 1038 + wrap_around: 1068 1039 for (i = order; i >= min_order; i--) { 1069 1040 int frag_order; 1070 1041 /* ··· 1090 1055 frag_order = mb_avg_fragment_size_order(ac->ac_sb, 1091 1056 ac->ac_g_ex.fe_len); 1092 1057 1093 - grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); 1094 - if (grp) { 1095 - *group = grp->bb_group; 1096 - ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; 1097 - return; 1098 - } 1058 + ret = ext4_mb_scan_groups_avg_frag_order_range(ac, frag_order, 1059 + start, end); 1060 + if (ret || ac->ac_status != AC_STATUS_CONTINUE) 1061 + return ret; 1062 + } 1063 + if 
(start) { 1064 + end = start; 1065 + start = 0; 1066 + goto wrap_around; 1099 1067 } 1100 1068 1101 1069 /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ 1102 1070 ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; 1103 - *new_cr = CR_GOAL_LEN_SLOW; 1071 + if (sbi->s_mb_stats) 1072 + atomic64_inc(&sbi->s_bal_cX_failed[ac->ac_criteria]); 1073 + ac->ac_criteria = CR_GOAL_LEN_SLOW; 1074 + 1075 + return ret; 1104 1076 } 1105 1077 1106 1078 static inline int should_optimize_scan(struct ext4_allocation_context *ac) ··· 1122 1080 } 1123 1081 1124 1082 /* 1125 - * Return next linear group for allocation. 1083 + * next linear group for allocation. 1126 1084 */ 1127 - static ext4_group_t 1128 - next_linear_group(ext4_group_t group, ext4_group_t ngroups) 1085 + static void next_linear_group(ext4_group_t *group, ext4_group_t ngroups) 1129 1086 { 1130 1087 /* 1131 1088 * Artificially restricted ngroups for non-extent 1132 1089 * files makes group > ngroups possible on first loop. 1133 1090 */ 1134 - return group + 1 >= ngroups ? 0 : group + 1; 1091 + *group = *group + 1 >= ngroups ? 0 : *group + 1; 1135 1092 } 1136 1093 1137 - /* 1138 - * ext4_mb_choose_next_group: choose next group for allocation. 1139 - * 1140 - * @ac Allocation Context 1141 - * @new_cr This is an output parameter. If the there is no good group 1142 - * available at current CR level, this field is updated to indicate 1143 - * the new cr level that should be used. 1144 - * @group This is an input / output parameter. As an input it indicates the 1145 - * next group that the allocator intends to use for allocation. As 1146 - * output, this field indicates the next group that should be used as 1147 - * determined by the optimization functions. 
1148 - * @ngroups Total number of groups 1149 - */ 1150 - static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, 1151 - enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) 1094 + static int ext4_mb_scan_groups_linear(struct ext4_allocation_context *ac, 1095 + ext4_group_t ngroups, ext4_group_t *start, ext4_group_t count) 1152 1096 { 1153 - *new_cr = ac->ac_criteria; 1097 + int ret, i; 1098 + enum criteria cr = ac->ac_criteria; 1099 + struct super_block *sb = ac->ac_sb; 1100 + struct ext4_sb_info *sbi = EXT4_SB(sb); 1101 + ext4_group_t group = *start; 1154 1102 1155 - if (!should_optimize_scan(ac)) { 1156 - *group = next_linear_group(*group, ngroups); 1157 - return; 1103 + for (i = 0; i < count; i++, next_linear_group(&group, ngroups)) { 1104 + ret = ext4_mb_scan_group(ac, group); 1105 + if (ret || ac->ac_status != AC_STATUS_CONTINUE) 1106 + return ret; 1107 + cond_resched(); 1158 1108 } 1109 + 1110 + *start = group; 1111 + if (count == ngroups) 1112 + ac->ac_criteria++; 1113 + 1114 + /* Processed all groups and haven't found blocks */ 1115 + if (sbi->s_mb_stats && i == ngroups) 1116 + atomic64_inc(&sbi->s_bal_cX_failed[cr]); 1117 + 1118 + return 0; 1119 + } 1120 + 1121 + static int ext4_mb_scan_groups(struct ext4_allocation_context *ac) 1122 + { 1123 + int ret = 0; 1124 + ext4_group_t start; 1125 + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); 1126 + ext4_group_t ngroups = ext4_get_groups_count(ac->ac_sb); 1127 + 1128 + /* non-extent files are limited to low blocks/groups */ 1129 + if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 1130 + ngroups = sbi->s_blockfile_groups; 1131 + 1132 + /* searching for the right group start from the goal value specified */ 1133 + start = ac->ac_g_ex.fe_group; 1134 + ac->ac_prefetch_grp = start; 1135 + ac->ac_prefetch_nr = 0; 1136 + 1137 + if (!should_optimize_scan(ac)) 1138 + return ext4_mb_scan_groups_linear(ac, ngroups, &start, ngroups); 1159 1139 1160 1140 /* 1161 1141 * Optimized 
scanning can return non adjacent groups which can cause 1162 1142 * seek overhead for rotational disks. So try few linear groups before 1163 1143 * trying optimized scan. 1164 1144 */ 1165 - if (ac->ac_groups_linear_remaining) { 1166 - *group = next_linear_group(*group, ngroups); 1167 - ac->ac_groups_linear_remaining--; 1168 - return; 1169 - } 1145 + if (sbi->s_mb_max_linear_groups) 1146 + ret = ext4_mb_scan_groups_linear(ac, ngroups, &start, 1147 + sbi->s_mb_max_linear_groups); 1148 + if (ret || ac->ac_status != AC_STATUS_CONTINUE) 1149 + return ret; 1170 1150 1171 - if (*new_cr == CR_POWER2_ALIGNED) { 1172 - ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); 1173 - } else if (*new_cr == CR_GOAL_LEN_FAST) { 1174 - ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); 1175 - } else if (*new_cr == CR_BEST_AVAIL_LEN) { 1176 - ext4_mb_choose_next_group_best_avail(ac, new_cr, group); 1177 - } else { 1151 + switch (ac->ac_criteria) { 1152 + case CR_POWER2_ALIGNED: 1153 + return ext4_mb_scan_groups_p2_aligned(ac, start); 1154 + case CR_GOAL_LEN_FAST: 1155 + return ext4_mb_scan_groups_goal_fast(ac, start); 1156 + case CR_BEST_AVAIL_LEN: 1157 + return ext4_mb_scan_groups_best_avail(ac, start); 1158 + default: 1178 1159 /* 1179 1160 * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an 1180 1161 * rb tree sorted by bb_free. But until that happens, we should ··· 1205 1140 */ 1206 1141 WARN_ON(1); 1207 1142 } 1143 + 1144 + return 0; 1208 1145 } 1209 1146 1210 1147 /* ··· 1217 1150 mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) 1218 1151 { 1219 1152 struct ext4_sb_info *sbi = EXT4_SB(sb); 1220 - int i; 1153 + int new, old = grp->bb_largest_free_order; 1221 1154 1222 - for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) 1223 - if (grp->bb_counters[i] > 0) 1155 + for (new = MB_NUM_ORDERS(sb) - 1; new >= 0; new--) 1156 + if (grp->bb_counters[new] > 0) 1224 1157 break; 1158 + 1225 1159 /* No need to move between order lists? 
*/ 1226 - if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || 1227 - i == grp->bb_largest_free_order) { 1228 - grp->bb_largest_free_order = i; 1160 + if (new == old) 1229 1161 return; 1162 + 1163 + if (old >= 0) { 1164 + struct xarray *xa = &sbi->s_mb_largest_free_orders[old]; 1165 + 1166 + if (!xa_empty(xa) && xa_load(xa, grp->bb_group)) 1167 + xa_erase(xa, grp->bb_group); 1230 1168 } 1231 1169 1232 - if (grp->bb_largest_free_order >= 0) { 1233 - write_lock(&sbi->s_mb_largest_free_orders_locks[ 1234 - grp->bb_largest_free_order]); 1235 - list_del_init(&grp->bb_largest_free_order_node); 1236 - write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1237 - grp->bb_largest_free_order]); 1238 - } 1239 - grp->bb_largest_free_order = i; 1240 - if (grp->bb_largest_free_order >= 0 && grp->bb_free) { 1241 - write_lock(&sbi->s_mb_largest_free_orders_locks[ 1242 - grp->bb_largest_free_order]); 1243 - list_add_tail(&grp->bb_largest_free_order_node, 1244 - &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); 1245 - write_unlock(&sbi->s_mb_largest_free_orders_locks[ 1246 - grp->bb_largest_free_order]); 1170 + grp->bb_largest_free_order = new; 1171 + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && new >= 0 && grp->bb_free) { 1172 + /* 1173 + * Cannot use __GFP_NOFAIL because we hold the group lock. 1174 + * Although allocation for insertion may fails, it's not fatal 1175 + * as we have linear traversal to fall back on. 
1176 + */ 1177 + int err = xa_insert(&sbi->s_mb_largest_free_orders[new], 1178 + grp->bb_group, grp, GFP_ATOMIC); 1179 + if (err) 1180 + mb_debug(sb, "insert group: %u to s_mb_largest_free_orders[%d] failed, err %d", 1181 + grp->bb_group, new, err); 1247 1182 } 1248 1183 } 1249 1184 ··· 2236 2167 folio_get(ac->ac_buddy_folio); 2237 2168 /* store last allocated for subsequent stream allocation */ 2238 2169 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 2239 - spin_lock(&sbi->s_md_lock); 2240 - sbi->s_mb_last_group = ac->ac_f_ex.fe_group; 2241 - sbi->s_mb_last_start = ac->ac_f_ex.fe_start; 2242 - spin_unlock(&sbi->s_md_lock); 2170 + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; 2171 + 2172 + WRITE_ONCE(sbi->s_mb_last_groups[hash], ac->ac_f_ex.fe_group); 2243 2173 } 2174 + 2244 2175 /* 2245 2176 * As we've just preallocated more space than 2246 2177 * user requested originally, we store allocated ··· 2640 2571 } 2641 2572 } 2642 2573 2574 + static void __ext4_mb_scan_group(struct ext4_allocation_context *ac) 2575 + { 2576 + bool is_stripe_aligned; 2577 + struct ext4_sb_info *sbi; 2578 + enum criteria cr = ac->ac_criteria; 2579 + 2580 + ac->ac_groups_scanned++; 2581 + if (cr == CR_POWER2_ALIGNED) 2582 + return ext4_mb_simple_scan_group(ac, ac->ac_e4b); 2583 + 2584 + sbi = EXT4_SB(ac->ac_sb); 2585 + is_stripe_aligned = false; 2586 + if ((sbi->s_stripe >= sbi->s_cluster_ratio) && 2587 + !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe))) 2588 + is_stripe_aligned = true; 2589 + 2590 + if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && 2591 + is_stripe_aligned) 2592 + ext4_mb_scan_aligned(ac, ac->ac_e4b); 2593 + 2594 + if (ac->ac_status == AC_STATUS_CONTINUE) 2595 + ext4_mb_complex_scan_group(ac, ac->ac_e4b); 2596 + } 2597 + 2643 2598 /* 2644 2599 * This is also called BEFORE we load the buddy bitmap. 
2645 2600 * Returns either 1 or 0 indicating that the group is either suitable ··· 2854 2761 } 2855 2762 2856 2763 /* 2764 + * Batch reads of the block allocation bitmaps to get 2765 + * multiple READs in flight; limit prefetching at inexpensive 2766 + * CR, otherwise mballoc can spend a lot of time loading 2767 + * imperfect groups 2768 + */ 2769 + static void ext4_mb_might_prefetch(struct ext4_allocation_context *ac, 2770 + ext4_group_t group) 2771 + { 2772 + struct ext4_sb_info *sbi; 2773 + 2774 + if (ac->ac_prefetch_grp != group) 2775 + return; 2776 + 2777 + sbi = EXT4_SB(ac->ac_sb); 2778 + if (ext4_mb_cr_expensive(ac->ac_criteria) || 2779 + ac->ac_prefetch_ios < sbi->s_mb_prefetch_limit) { 2780 + unsigned int nr = sbi->s_mb_prefetch; 2781 + 2782 + if (ext4_has_feature_flex_bg(ac->ac_sb)) { 2783 + nr = 1 << sbi->s_log_groups_per_flex; 2784 + nr -= group & (nr - 1); 2785 + nr = umin(nr, sbi->s_mb_prefetch); 2786 + } 2787 + 2788 + ac->ac_prefetch_nr = nr; 2789 + ac->ac_prefetch_grp = ext4_mb_prefetch(ac->ac_sb, group, nr, 2790 + &ac->ac_prefetch_ios); 2791 + } 2792 + } 2793 + 2794 + /* 2857 2795 * Prefetching reads the block bitmap into the buffer cache; but we 2858 2796 * need to make sure that the buddy bitmap in the page cache has been 2859 2797 * initialized. Note that ext4_mb_init_group() will block if the I/O ··· 2917 2793 } 2918 2794 } 2919 2795 2796 + static int ext4_mb_scan_group(struct ext4_allocation_context *ac, 2797 + ext4_group_t group) 2798 + { 2799 + int ret; 2800 + struct super_block *sb = ac->ac_sb; 2801 + enum criteria cr = ac->ac_criteria; 2802 + 2803 + ext4_mb_might_prefetch(ac, group); 2804 + 2805 + /* prevent unnecessary buddy loading. 
*/ 2806 + if (cr < CR_ANY_FREE && spin_is_locked(ext4_group_lock_ptr(sb, group))) 2807 + return 0; 2808 + 2809 + /* This now checks without needing the buddy page */ 2810 + ret = ext4_mb_good_group_nolock(ac, group, cr); 2811 + if (ret <= 0) { 2812 + if (!ac->ac_first_err) 2813 + ac->ac_first_err = ret; 2814 + return 0; 2815 + } 2816 + 2817 + ret = ext4_mb_load_buddy(sb, group, ac->ac_e4b); 2818 + if (ret) 2819 + return ret; 2820 + 2821 + /* skip busy group */ 2822 + if (cr >= CR_ANY_FREE) 2823 + ext4_lock_group(sb, group); 2824 + else if (!ext4_try_lock_group(sb, group)) 2825 + goto out_unload; 2826 + 2827 + /* We need to check again after locking the block group. */ 2828 + if (unlikely(!ext4_mb_good_group(ac, group, cr))) 2829 + goto out_unlock; 2830 + 2831 + __ext4_mb_scan_group(ac); 2832 + 2833 + out_unlock: 2834 + ext4_unlock_group(sb, group); 2835 + out_unload: 2836 + ext4_mb_unload_buddy(ac->ac_e4b); 2837 + return ret; 2838 + } 2839 + 2920 2840 static noinline_for_stack int 2921 2841 ext4_mb_regular_allocator(struct ext4_allocation_context *ac) 2922 2842 { 2923 - ext4_group_t prefetch_grp = 0, ngroups, group, i; 2924 - enum criteria new_cr, cr = CR_GOAL_LEN_FAST; 2925 - int err = 0, first_err = 0; 2926 - unsigned int nr = 0, prefetch_ios = 0; 2927 - struct ext4_sb_info *sbi; 2928 - struct super_block *sb; 2843 + ext4_group_t i; 2844 + int err = 0; 2845 + struct super_block *sb = ac->ac_sb; 2846 + struct ext4_sb_info *sbi = EXT4_SB(sb); 2929 2847 struct ext4_buddy e4b; 2930 - int lost; 2931 - 2932 - sb = ac->ac_sb; 2933 - sbi = EXT4_SB(sb); 2934 - ngroups = ext4_get_groups_count(sb); 2935 - /* non-extent files are limited to low blocks/groups */ 2936 - if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) 2937 - ngroups = sbi->s_blockfile_groups; 2938 2848 2939 2849 BUG_ON(ac->ac_status == AC_STATUS_FOUND); 2940 2850 ··· 3002 2844 3003 2845 /* if stream allocation is enabled, use global goal */ 3004 2846 if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { 
3005 - /* TBD: may be hot point */ 3006 - spin_lock(&sbi->s_md_lock); 3007 - ac->ac_g_ex.fe_group = sbi->s_mb_last_group; 3008 - ac->ac_g_ex.fe_start = sbi->s_mb_last_start; 3009 - spin_unlock(&sbi->s_md_lock); 2847 + int hash = ac->ac_inode->i_ino % sbi->s_mb_nr_global_goals; 2848 + 2849 + ac->ac_g_ex.fe_group = READ_ONCE(sbi->s_mb_last_groups[hash]); 2850 + ac->ac_g_ex.fe_start = -1; 2851 + ac->ac_flags &= ~EXT4_MB_HINT_TRY_GOAL; 3010 2852 } 3011 2853 3012 2854 /* ··· 3014 2856 * start with CR_GOAL_LEN_FAST, unless it is power of 2 3015 2857 * aligned, in which case let's do that faster approach first. 3016 2858 */ 2859 + ac->ac_criteria = CR_GOAL_LEN_FAST; 3017 2860 if (ac->ac_2order) 3018 - cr = CR_POWER2_ALIGNED; 2861 + ac->ac_criteria = CR_POWER2_ALIGNED; 2862 + 2863 + ac->ac_e4b = &e4b; 2864 + ac->ac_prefetch_ios = 0; 2865 + ac->ac_first_err = 0; 3019 2866 repeat: 3020 - for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { 3021 - ac->ac_criteria = cr; 3022 - /* 3023 - * searching for the right group start 3024 - * from the goal value specified 3025 - */ 3026 - group = ac->ac_g_ex.fe_group; 3027 - ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; 3028 - prefetch_grp = group; 3029 - nr = 0; 2867 + while (ac->ac_criteria < EXT4_MB_NUM_CRS) { 2868 + err = ext4_mb_scan_groups(ac); 2869 + if (err) 2870 + goto out; 3030 2871 3031 - for (i = 0, new_cr = cr; i < ngroups; i++, 3032 - ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { 3033 - int ret = 0; 3034 - 3035 - cond_resched(); 3036 - if (new_cr != cr) { 3037 - cr = new_cr; 3038 - goto repeat; 3039 - } 3040 - 3041 - /* 3042 - * Batch reads of the block allocation bitmaps 3043 - * to get multiple READs in flight; limit 3044 - * prefetching at inexpensive CR, otherwise mballoc 3045 - * can spend a lot of time loading imperfect groups 3046 - */ 3047 - if ((prefetch_grp == group) && 3048 - (ext4_mb_cr_expensive(cr) || 3049 - prefetch_ios < sbi->s_mb_prefetch_limit)) { 3050 - 
nr = sbi->s_mb_prefetch; 3051 - if (ext4_has_feature_flex_bg(sb)) { 3052 - nr = 1 << sbi->s_log_groups_per_flex; 3053 - nr -= group & (nr - 1); 3054 - nr = min(nr, sbi->s_mb_prefetch); 3055 - } 3056 - prefetch_grp = ext4_mb_prefetch(sb, group, 3057 - nr, &prefetch_ios); 3058 - } 3059 - 3060 - /* This now checks without needing the buddy page */ 3061 - ret = ext4_mb_good_group_nolock(ac, group, cr); 3062 - if (ret <= 0) { 3063 - if (!first_err) 3064 - first_err = ret; 3065 - continue; 3066 - } 3067 - 3068 - err = ext4_mb_load_buddy(sb, group, &e4b); 3069 - if (err) 3070 - goto out; 3071 - 3072 - ext4_lock_group(sb, group); 3073 - 3074 - /* 3075 - * We need to check again after locking the 3076 - * block group 3077 - */ 3078 - ret = ext4_mb_good_group(ac, group, cr); 3079 - if (ret == 0) { 3080 - ext4_unlock_group(sb, group); 3081 - ext4_mb_unload_buddy(&e4b); 3082 - continue; 3083 - } 3084 - 3085 - ac->ac_groups_scanned++; 3086 - if (cr == CR_POWER2_ALIGNED) 3087 - ext4_mb_simple_scan_group(ac, &e4b); 3088 - else { 3089 - bool is_stripe_aligned = 3090 - (sbi->s_stripe >= 3091 - sbi->s_cluster_ratio) && 3092 - !(ac->ac_g_ex.fe_len % 3093 - EXT4_NUM_B2C(sbi, sbi->s_stripe)); 3094 - 3095 - if ((cr == CR_GOAL_LEN_FAST || 3096 - cr == CR_BEST_AVAIL_LEN) && 3097 - is_stripe_aligned) 3098 - ext4_mb_scan_aligned(ac, &e4b); 3099 - 3100 - if (ac->ac_status == AC_STATUS_CONTINUE) 3101 - ext4_mb_complex_scan_group(ac, &e4b); 3102 - } 3103 - 3104 - ext4_unlock_group(sb, group); 3105 - ext4_mb_unload_buddy(&e4b); 3106 - 3107 - if (ac->ac_status != AC_STATUS_CONTINUE) 3108 - break; 3109 - } 3110 - /* Processed all groups and haven't found blocks */ 3111 - if (sbi->s_mb_stats && i == ngroups) 3112 - atomic64_inc(&sbi->s_bal_cX_failed[cr]); 3113 - 3114 - if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) 3115 - /* Reset goal length to original goal length before 3116 - * falling into CR_GOAL_LEN_SLOW */ 3117 - ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; 2872 + if 
(ac->ac_status != AC_STATUS_CONTINUE) 2873 + break; 3118 2874 } 3119 2875 3120 2876 if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && ··· 3039 2967 */ 3040 2968 ext4_mb_try_best_found(ac, &e4b); 3041 2969 if (ac->ac_status != AC_STATUS_FOUND) { 2970 + int lost; 2971 + 3042 2972 /* 3043 2973 * Someone more lucky has already allocated it. 3044 2974 * The only thing we can do is just take first ··· 3056 2982 ac->ac_b_ex.fe_len = 0; 3057 2983 ac->ac_status = AC_STATUS_CONTINUE; 3058 2984 ac->ac_flags |= EXT4_MB_HINT_FIRST; 3059 - cr = CR_ANY_FREE; 2985 + ac->ac_criteria = CR_ANY_FREE; 3060 2986 goto repeat; 3061 2987 } 3062 2988 } 3063 2989 3064 - if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) 2990 + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) { 3065 2991 atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); 2992 + if (ac->ac_flags & EXT4_MB_STREAM_ALLOC && 2993 + ac->ac_b_ex.fe_group == ac->ac_g_ex.fe_group) 2994 + atomic_inc(&sbi->s_bal_stream_goals); 2995 + } 3066 2996 out: 3067 - if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) 3068 - err = first_err; 2997 + if (!err && ac->ac_status != AC_STATUS_FOUND && ac->ac_first_err) 2998 + err = ac->ac_first_err; 3069 2999 3070 3000 mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", 3071 3001 ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, 3072 - ac->ac_flags, cr, err); 3002 + ac->ac_flags, ac->ac_criteria, err); 3073 3003 3074 - if (nr) 3075 - ext4_mb_prefetch_fini(sb, prefetch_grp, nr); 3004 + if (ac->ac_prefetch_nr) 3005 + ext4_mb_prefetch_fini(sb, ac->ac_prefetch_grp, ac->ac_prefetch_nr); 3076 3006 3077 3007 return err; 3078 3008 } ··· 3199 3121 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); 3200 3122 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3201 3123 atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); 3202 - seq_printf(seq, "\t\tbad_suggestions: %u\n", 3203 - 
atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); 3204 3124 3205 3125 /* CR_GOAL_LEN_FAST stats */ 3206 3126 seq_puts(seq, "\tcr_goal_fast_stats:\n"); ··· 3211 3135 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); 3212 3136 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3213 3137 atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); 3214 - seq_printf(seq, "\t\tbad_suggestions: %u\n", 3215 - atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); 3216 3138 3217 3139 /* CR_BEST_AVAIL_LEN stats */ 3218 3140 seq_puts(seq, "\tcr_best_avail_stats:\n"); ··· 3224 3150 atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); 3225 3151 seq_printf(seq, "\t\tuseless_loops: %llu\n", 3226 3152 atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); 3227 - seq_printf(seq, "\t\tbad_suggestions: %u\n", 3228 - atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); 3229 3153 3230 3154 /* CR_GOAL_LEN_SLOW stats */ 3231 3155 seq_puts(seq, "\tcr_goal_slow_stats:\n"); ··· 3253 3181 seq_printf(seq, "\textents_scanned: %u\n", 3254 3182 atomic_read(&sbi->s_bal_ex_scanned)); 3255 3183 seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); 3184 + seq_printf(seq, "\t\tstream_goal_hits: %u\n", 3185 + atomic_read(&sbi->s_bal_stream_goals)); 3256 3186 seq_printf(seq, "\t\tlen_goal_hits: %u\n", 3257 3187 atomic_read(&sbi->s_bal_len_goals)); 3258 3188 seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); ··· 3301 3227 unsigned long position = ((unsigned long) v); 3302 3228 struct ext4_group_info *grp; 3303 3229 unsigned int count; 3230 + unsigned long idx; 3304 3231 3305 3232 position--; 3306 3233 if (position >= MB_NUM_ORDERS(sb)) { ··· 3310 3235 seq_puts(seq, "avg_fragment_size_lists:\n"); 3311 3236 3312 3237 count = 0; 3313 - read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); 3314 - list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], 3315 - bb_avg_fragment_size_node) 3238 + xa_for_each(&sbi->s_mb_avg_fragment_size[position], 
idx, grp) 3316 3239 count++; 3317 - read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); 3318 3240 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3319 3241 (unsigned int)position, count); 3320 3242 return 0; ··· 3323 3251 seq_puts(seq, "max_free_order_lists:\n"); 3324 3252 } 3325 3253 count = 0; 3326 - read_lock(&sbi->s_mb_largest_free_orders_locks[position]); 3327 - list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], 3328 - bb_largest_free_order_node) 3254 + xa_for_each(&sbi->s_mb_largest_free_orders[position], idx, grp) 3329 3255 count++; 3330 - read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); 3331 3256 seq_printf(seq, "\tlist_order_%u_groups: %u\n", 3332 3257 (unsigned int)position, count); 3333 3258 ··· 3444 3375 INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); 3445 3376 init_rwsem(&meta_group_info[i]->alloc_sem); 3446 3377 meta_group_info[i]->bb_free_root = RB_ROOT; 3447 - INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); 3448 - INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); 3449 3378 meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ 3450 3379 meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ 3451 3380 meta_group_info[i]->bb_group = group; ··· 3653 3586 ext4_mb_unload_buddy(&e4b); 3654 3587 } 3655 3588 3589 + static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) 3590 + { 3591 + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) 3592 + xa_destroy(&sbi->s_mb_avg_fragment_size[i]); 3593 + kfree(sbi->s_mb_avg_fragment_size); 3594 + } 3595 + 3596 + static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) 3597 + { 3598 + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) 3599 + xa_destroy(&sbi->s_mb_largest_free_orders[i]); 3600 + kfree(sbi->s_mb_largest_free_orders); 3601 + } 3602 + 3656 3603 int ext4_mb_init(struct super_block *sb) 3657 3604 { 3658 3605 struct ext4_sb_info *sbi = EXT4_SB(sb); ··· 3712 3631 } while (i < 
MB_NUM_ORDERS(sb)); 3713 3632 3714 3633 sbi->s_mb_avg_fragment_size = 3715 - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3634 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), 3716 3635 GFP_KERNEL); 3717 3636 if (!sbi->s_mb_avg_fragment_size) { 3718 3637 ret = -ENOMEM; 3719 3638 goto out; 3720 3639 } 3721 - sbi->s_mb_avg_fragment_size_locks = 3722 - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3723 - GFP_KERNEL); 3724 - if (!sbi->s_mb_avg_fragment_size_locks) { 3725 - ret = -ENOMEM; 3726 - goto out; 3727 - } 3728 - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3729 - INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); 3730 - rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); 3731 - } 3640 + for (i = 0; i < MB_NUM_ORDERS(sb); i++) 3641 + xa_init(&sbi->s_mb_avg_fragment_size[i]); 3642 + 3732 3643 sbi->s_mb_largest_free_orders = 3733 - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), 3644 + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct xarray), 3734 3645 GFP_KERNEL); 3735 3646 if (!sbi->s_mb_largest_free_orders) { 3736 3647 ret = -ENOMEM; 3737 3648 goto out; 3738 3649 } 3739 - sbi->s_mb_largest_free_orders_locks = 3740 - kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), 3741 - GFP_KERNEL); 3742 - if (!sbi->s_mb_largest_free_orders_locks) { 3743 - ret = -ENOMEM; 3744 - goto out; 3745 - } 3746 - for (i = 0; i < MB_NUM_ORDERS(sb); i++) { 3747 - INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); 3748 - rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); 3749 - } 3650 + for (i = 0; i < MB_NUM_ORDERS(sb); i++) 3651 + xa_init(&sbi->s_mb_largest_free_orders[i]); 3750 3652 3751 3653 spin_lock_init(&sbi->s_md_lock); 3752 - sbi->s_mb_free_pending = 0; 3654 + atomic_set(&sbi->s_mb_free_pending, 0); 3753 3655 INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); 3754 3656 INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); 3755 3657 INIT_LIST_HEAD(&sbi->s_discard_list); ··· 3773 3709 sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); 3774 3710 } 3775 
3711 3712 + sbi->s_mb_nr_global_goals = umin(num_possible_cpus(), 3713 + DIV_ROUND_UP(sbi->s_groups_count, 4)); 3714 + sbi->s_mb_last_groups = kcalloc(sbi->s_mb_nr_global_goals, 3715 + sizeof(ext4_group_t), GFP_KERNEL); 3716 + if (sbi->s_mb_last_groups == NULL) { 3717 + ret = -ENOMEM; 3718 + goto out; 3719 + } 3720 + 3776 3721 sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); 3777 3722 if (sbi->s_locality_groups == NULL) { 3778 3723 ret = -ENOMEM; 3779 - goto out; 3724 + goto out_free_last_groups; 3780 3725 } 3781 3726 for_each_possible_cpu(i) { 3782 3727 struct ext4_locality_group *lg; ··· 3810 3737 out_free_locality_groups: 3811 3738 free_percpu(sbi->s_locality_groups); 3812 3739 sbi->s_locality_groups = NULL; 3740 + out_free_last_groups: 3741 + kfree(sbi->s_mb_last_groups); 3742 + sbi->s_mb_last_groups = NULL; 3813 3743 out: 3814 - kfree(sbi->s_mb_avg_fragment_size); 3815 - kfree(sbi->s_mb_avg_fragment_size_locks); 3816 - kfree(sbi->s_mb_largest_free_orders); 3817 - kfree(sbi->s_mb_largest_free_orders_locks); 3744 + ext4_mb_avg_fragment_size_destroy(sbi); 3745 + ext4_mb_largest_free_orders_destroy(sbi); 3818 3746 kfree(sbi->s_mb_offsets); 3819 3747 sbi->s_mb_offsets = NULL; 3820 3748 kfree(sbi->s_mb_maxs); ··· 3882 3808 kvfree(group_info); 3883 3809 rcu_read_unlock(); 3884 3810 } 3885 - kfree(sbi->s_mb_avg_fragment_size); 3886 - kfree(sbi->s_mb_avg_fragment_size_locks); 3887 - kfree(sbi->s_mb_largest_free_orders); 3888 - kfree(sbi->s_mb_largest_free_orders_locks); 3811 + ext4_mb_avg_fragment_size_destroy(sbi); 3812 + ext4_mb_largest_free_orders_destroy(sbi); 3889 3813 kfree(sbi->s_mb_offsets); 3890 3814 kfree(sbi->s_mb_maxs); 3891 3815 iput(sbi->s_buddy_cache); ··· 3913 3841 } 3914 3842 3915 3843 free_percpu(sbi->s_locality_groups); 3844 + kfree(sbi->s_mb_last_groups); 3916 3845 } 3917 3846 3918 3847 static inline int ext4_issue_discard(struct super_block *sb, ··· 3944 3871 /* we expect to find existing buddy because it's pinned */ 3945 3872 
BUG_ON(err != 0); 3946 3873 3947 - spin_lock(&EXT4_SB(sb)->s_md_lock); 3948 - EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; 3949 - spin_unlock(&EXT4_SB(sb)->s_md_lock); 3950 - 3874 + atomic_sub(entry->efd_count, &EXT4_SB(sb)->s_mb_free_pending); 3951 3875 db = e4b.bd_info; 3952 3876 /* there are blocks to put in buddy to make them really free */ 3953 3877 count += entry->efd_count; ··· 6348 6278 * are contiguous, AND the extents were freed by the same transaction, 6349 6279 * AND the blocks are associated with the same group. 6350 6280 */ 6351 - static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, 6352 - struct ext4_free_data *entry, 6353 - struct ext4_free_data *new_entry, 6354 - struct rb_root *entry_rb_root) 6281 + static inline bool 6282 + ext4_freed_extents_can_be_merged(struct ext4_free_data *entry1, 6283 + struct ext4_free_data *entry2) 6355 6284 { 6356 - if ((entry->efd_tid != new_entry->efd_tid) || 6357 - (entry->efd_group != new_entry->efd_group)) 6358 - return; 6359 - if (entry->efd_start_cluster + entry->efd_count == 6360 - new_entry->efd_start_cluster) { 6361 - new_entry->efd_start_cluster = entry->efd_start_cluster; 6362 - new_entry->efd_count += entry->efd_count; 6363 - } else if (new_entry->efd_start_cluster + new_entry->efd_count == 6364 - entry->efd_start_cluster) { 6365 - new_entry->efd_count += entry->efd_count; 6366 - } else 6367 - return; 6285 + if (entry1->efd_tid != entry2->efd_tid) 6286 + return false; 6287 + if (entry1->efd_start_cluster + entry1->efd_count != 6288 + entry2->efd_start_cluster) 6289 + return false; 6290 + if (WARN_ON_ONCE(entry1->efd_group != entry2->efd_group)) 6291 + return false; 6292 + return true; 6293 + } 6294 + 6295 + static inline void 6296 + ext4_merge_freed_extents(struct ext4_sb_info *sbi, struct rb_root *root, 6297 + struct ext4_free_data *entry1, 6298 + struct ext4_free_data *entry2) 6299 + { 6300 + entry1->efd_count += entry2->efd_count; 6368 6301 spin_lock(&sbi->s_md_lock); 6369 - 
list_del(&entry->efd_list); 6302 + list_del(&entry2->efd_list); 6370 6303 spin_unlock(&sbi->s_md_lock); 6371 - rb_erase(&entry->efd_node, entry_rb_root); 6372 - kmem_cache_free(ext4_free_data_cachep, entry); 6304 + rb_erase(&entry2->efd_node, root); 6305 + kmem_cache_free(ext4_free_data_cachep, entry2); 6306 + } 6307 + 6308 + static inline void 6309 + ext4_try_merge_freed_extent_prev(struct ext4_sb_info *sbi, struct rb_root *root, 6310 + struct ext4_free_data *entry) 6311 + { 6312 + struct ext4_free_data *prev; 6313 + struct rb_node *node; 6314 + 6315 + node = rb_prev(&entry->efd_node); 6316 + if (!node) 6317 + return; 6318 + 6319 + prev = rb_entry(node, struct ext4_free_data, efd_node); 6320 + if (ext4_freed_extents_can_be_merged(prev, entry)) 6321 + ext4_merge_freed_extents(sbi, root, prev, entry); 6322 + } 6323 + 6324 + static inline void 6325 + ext4_try_merge_freed_extent_next(struct ext4_sb_info *sbi, struct rb_root *root, 6326 + struct ext4_free_data *entry) 6327 + { 6328 + struct ext4_free_data *next; 6329 + struct rb_node *node; 6330 + 6331 + node = rb_next(&entry->efd_node); 6332 + if (!node) 6333 + return; 6334 + 6335 + next = rb_entry(node, struct ext4_free_data, efd_node); 6336 + if (ext4_freed_extents_can_be_merged(entry, next)) 6337 + ext4_merge_freed_extents(sbi, root, entry, next); 6373 6338 } 6374 6339 6375 6340 static noinline_for_stack void ··· 6414 6309 ext4_group_t group = e4b->bd_group; 6415 6310 ext4_grpblk_t cluster; 6416 6311 ext4_grpblk_t clusters = new_entry->efd_count; 6417 - struct ext4_free_data *entry; 6312 + struct ext4_free_data *entry = NULL; 6418 6313 struct ext4_group_info *db = e4b->bd_info; 6419 6314 struct super_block *sb = e4b->bd_sb; 6420 6315 struct ext4_sb_info *sbi = EXT4_SB(sb); 6421 - struct rb_node **n = &db->bb_free_root.rb_node, *node; 6316 + struct rb_root *root = &db->bb_free_root; 6317 + struct rb_node **n = &root->rb_node; 6422 6318 struct rb_node *parent = NULL, *new_node; 6423 6319 6424 6320 
BUG_ON(!ext4_handle_valid(handle)); ··· 6455 6349 } 6456 6350 } 6457 6351 6352 + atomic_add(clusters, &sbi->s_mb_free_pending); 6353 + if (!entry) 6354 + goto insert; 6355 + 6356 + /* Now try to see the extent can be merged to prev and next */ 6357 + if (ext4_freed_extents_can_be_merged(new_entry, entry)) { 6358 + entry->efd_start_cluster = cluster; 6359 + entry->efd_count += new_entry->efd_count; 6360 + kmem_cache_free(ext4_free_data_cachep, new_entry); 6361 + ext4_try_merge_freed_extent_prev(sbi, root, entry); 6362 + return; 6363 + } 6364 + if (ext4_freed_extents_can_be_merged(entry, new_entry)) { 6365 + entry->efd_count += new_entry->efd_count; 6366 + kmem_cache_free(ext4_free_data_cachep, new_entry); 6367 + ext4_try_merge_freed_extent_next(sbi, root, entry); 6368 + return; 6369 + } 6370 + insert: 6458 6371 rb_link_node(new_node, parent, n); 6459 - rb_insert_color(new_node, &db->bb_free_root); 6460 - 6461 - /* Now try to see the extent can be merged to left and right */ 6462 - node = rb_prev(new_node); 6463 - if (node) { 6464 - entry = rb_entry(node, struct ext4_free_data, efd_node); 6465 - ext4_try_merge_freed_extent(sbi, entry, new_entry, 6466 - &(db->bb_free_root)); 6467 - } 6468 - 6469 - node = rb_next(new_node); 6470 - if (node) { 6471 - entry = rb_entry(node, struct ext4_free_data, efd_node); 6472 - ext4_try_merge_freed_extent(sbi, entry, new_entry, 6473 - &(db->bb_free_root)); 6474 - } 6372 + rb_insert_color(new_node, root); 6475 6373 6476 6374 spin_lock(&sbi->s_md_lock); 6477 6375 list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); 6478 - sbi->s_mb_free_pending += clusters; 6479 6376 spin_unlock(&sbi->s_md_lock); 6480 6377 } 6481 6378
+8 -1
fs/ext4/mballoc.h
··· 192 192 */ 193 193 ext4_grpblk_t ac_orig_goal_len; 194 194 195 + ext4_group_t ac_prefetch_grp; 196 + unsigned int ac_prefetch_ios; 197 + unsigned int ac_prefetch_nr; 198 + 199 + int ac_first_err; 200 + 195 201 __u32 ac_flags; /* allocation hints */ 196 - __u32 ac_groups_linear_remaining; 197 202 __u16 ac_groups_scanned; 198 203 __u16 ac_found; 199 204 __u16 ac_cX_found[EXT4_MB_NUM_CRS]; ··· 209 204 __u8 ac_2order; /* if request is to allocate 2^N blocks and 210 205 * N > 0, the field stores N, otherwise 0 */ 211 206 __u8 ac_op; /* operation, for history only */ 207 + 208 + struct ext4_buddy *ac_e4b; 212 209 struct folio *ac_bitmap_folio; 213 210 struct folio *ac_buddy_folio; 214 211 struct ext4_prealloc_space *ac_pa;
+2 -1
fs/ext4/move_extent.c
··· 280 280 */ 281 281 again: 282 282 *err = 0; 283 - jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 283 + jblocks = ext4_meta_trans_blocks(orig_inode, block_len_in_page, 284 + block_len_in_page) * 2; 284 285 handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); 285 286 if (IS_ERR(handle)) { 286 287 *err = PTR_ERR(handle);
+39 -30
fs/ext4/namei.c
··· 2915 2915 return err; 2916 2916 } 2917 2917 2918 - struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2919 - struct ext4_dir_entry_2 *de, 2920 - int blocksize, int csum_size, 2921 - unsigned int parent_ino, int dotdot_real_len) 2918 + int ext4_init_dirblock(handle_t *handle, struct inode *inode, 2919 + struct buffer_head *bh, unsigned int parent_ino, 2920 + void *inline_buf, int inline_size) 2922 2921 { 2922 + struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) bh->b_data; 2923 + size_t blocksize = bh->b_size; 2924 + int csum_size = 0, header_size; 2925 + 2926 + if (ext4_has_feature_metadata_csum(inode->i_sb)) 2927 + csum_size = sizeof(struct ext4_dir_entry_tail); 2928 + 2923 2929 de->inode = cpu_to_le32(inode->i_ino); 2924 2930 de->name_len = 1; 2925 2931 de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), 2926 2932 blocksize); 2927 - strcpy(de->name, "."); 2933 + memcpy(de->name, ".", 2); 2928 2934 ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2929 2935 2930 2936 de = ext4_next_entry(de, blocksize); 2931 2937 de->inode = cpu_to_le32(parent_ino); 2932 2938 de->name_len = 2; 2933 - if (!dotdot_real_len) 2934 - de->rec_len = ext4_rec_len_to_disk(blocksize - 2935 - (csum_size + ext4_dir_rec_len(1, NULL)), 2936 - blocksize); 2937 - else 2939 + memcpy(de->name, "..", 3); 2940 + ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2941 + if (inline_buf) { 2938 2942 de->rec_len = ext4_rec_len_to_disk( 2939 2943 ext4_dir_rec_len(de->name_len, NULL), 2940 2944 blocksize); 2941 - strcpy(de->name, ".."); 2942 - ext4_set_de_type(inode->i_sb, de, S_IFDIR); 2945 + de = ext4_next_entry(de, blocksize); 2946 + header_size = (char *)de - bh->b_data; 2947 + memcpy((void *)de, inline_buf, inline_size); 2948 + ext4_update_final_de(bh->b_data, inline_size + header_size, 2949 + blocksize - csum_size); 2950 + } else { 2951 + de->rec_len = ext4_rec_len_to_disk(blocksize - 2952 + (csum_size + ext4_dir_rec_len(1, NULL)), 2953 + blocksize); 2954 + } 2943 
2955 2944 - return ext4_next_entry(de, blocksize); 2956 + if (csum_size) 2957 + ext4_initialize_dirent_tail(bh, blocksize); 2958 + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 2959 + set_buffer_uptodate(bh); 2960 + set_buffer_verified(bh); 2961 + return ext4_handle_dirty_dirblock(handle, inode, bh); 2945 2962 } 2946 2963 2947 2964 int ext4_init_new_dir(handle_t *handle, struct inode *dir, ··· 2967 2950 struct buffer_head *dir_block = NULL; 2968 2951 struct ext4_dir_entry_2 *de; 2969 2952 ext4_lblk_t block = 0; 2970 - unsigned int blocksize = dir->i_sb->s_blocksize; 2971 - int csum_size = 0; 2972 2953 int err; 2973 - 2974 - if (ext4_has_feature_metadata_csum(dir->i_sb)) 2975 - csum_size = sizeof(struct ext4_dir_entry_tail); 2976 2954 2977 2955 if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { 2978 2956 err = ext4_try_create_inline_dir(handle, dir, inode); ··· 2977 2965 goto out; 2978 2966 } 2979 2967 2968 + set_nlink(inode, 2); 2980 2969 inode->i_size = 0; 2981 2970 dir_block = ext4_append(handle, inode, &block); 2982 2971 if (IS_ERR(dir_block)) 2983 2972 return PTR_ERR(dir_block); 2984 2973 de = (struct ext4_dir_entry_2 *)dir_block->b_data; 2985 - ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); 2986 - set_nlink(inode, 2); 2987 - if (csum_size) 2988 - ext4_initialize_dirent_tail(dir_block, blocksize); 2989 - 2990 - BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); 2991 - err = ext4_handle_dirty_dirblock(handle, inode, dir_block); 2974 + err = ext4_init_dirblock(handle, inode, dir_block, dir->i_ino, NULL, 0); 2992 2975 if (err) 2993 2976 goto out; 2994 - set_buffer_verified(dir_block); 2995 2977 out: 2996 2978 brelse(dir_block); 2997 2979 return err; ··· 3088 3082 de = (struct ext4_dir_entry_2 *) bh->b_data; 3089 3083 if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 3090 3084 0) || 3091 - le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) { 3085 + le32_to_cpu(de->inode) != 
inode->i_ino || de->name_len != 1 || 3086 + de->name[0] != '.') { 3092 3087 ext4_warning_inode(inode, "directory missing '.'"); 3093 3088 brelse(bh); 3094 3089 return false; ··· 3098 3091 de = ext4_next_entry(de, sb->s_blocksize); 3099 3092 if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size, 3100 3093 offset) || 3101 - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { 3094 + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || 3095 + de->name[0] != '.' || de->name[1] != '.') { 3102 3096 ext4_warning_inode(inode, "directory missing '..'"); 3103 3097 brelse(bh); 3104 3098 return false; ··· 3540 3532 if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, 3541 3533 bh->b_size, 0) || 3542 3534 le32_to_cpu(de->inode) != inode->i_ino || 3543 - strcmp(".", de->name)) { 3535 + de->name_len != 1 || de->name[0] != '.') { 3544 3536 EXT4_ERROR_INODE(inode, "directory missing '.'"); 3545 3537 brelse(bh); 3546 3538 *retval = -EFSCORRUPTED; ··· 3551 3543 de = ext4_next_entry(de, inode->i_sb->s_blocksize); 3552 3544 if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, 3553 3545 bh->b_size, offset) || 3554 - le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { 3546 + le32_to_cpu(de->inode) == 0 || de->name_len != 2 || 3547 + de->name[0] != '.' || de->name[1] != '.') { 3555 3548 EXT4_ERROR_INODE(inode, "directory missing '..'"); 3556 3549 brelse(bh); 3557 3550 *retval = -EFSCORRUPTED;
+8 -8
fs/ext4/page-io.c
··· 236 236 237 237 static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) 238 238 { 239 - if (io_end->flag & EXT4_IO_END_UNWRITTEN) 239 + if (io_end->flag & EXT4_IO_END_UNWRITTEN && 240 + !list_empty(&io_end->list_vec)) 240 241 return true; 241 242 if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && 242 - io_end->flag & EXT4_IO_END_FAILED) 243 + io_end->flag & EXT4_IO_END_FAILED && 244 + !ext4_emergency_state(io_end->inode->i_sb)) 243 245 return true; 244 246 return false; 245 247 } ··· 258 256 WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); 259 257 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && 260 258 !io_end->handle && sbi->s_journal); 259 + WARN_ON(!io_end->bio); 261 260 262 261 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 263 262 wq = sbi->rsv_conversion_wq; ··· 321 318 void ext4_put_io_end_defer(ext4_io_end_t *io_end) 322 319 { 323 320 if (refcount_dec_and_test(&io_end->count)) { 324 - if (io_end->flag & EXT4_IO_END_FAILED || 325 - (io_end->flag & EXT4_IO_END_UNWRITTEN && 326 - !list_empty(&io_end->list_vec))) { 327 - ext4_add_complete_io(io_end); 328 - return; 329 - } 321 + if (ext4_io_end_defer_completion(io_end)) 322 + return ext4_add_complete_io(io_end); 323 + 330 324 ext4_release_io_end(io_end); 331 325 } 332 326 }
+2 -2
fs/ext4/xattr.c
··· 338 338 cmp = name_len - entry->e_name_len; 339 339 if (!cmp) 340 340 cmp = memcmp(name, entry->e_name, name_len); 341 - if (cmp <= 0 && (sorted || cmp == 0)) 341 + if (!cmp || (cmp < 0 && sorted)) 342 342 break; 343 343 } 344 344 *pentry = entry; ··· 962 962 * so we need to reserve credits for this eventuality 963 963 */ 964 964 if (inode && ext4_has_inline_data(inode)) 965 - credits += ext4_writepage_trans_blocks(inode) + 1; 965 + credits += ext4_chunk_trans_extent(inode, 1) + 1; 966 966 967 967 /* We are done if ea_inode feature is not enabled. */ 968 968 if (!ext4_has_feature_ea_inode(sb))
+39 -11
include/trace/events/ext4.h
··· 23 23 24 24 #define show_mballoc_flags(flags) __print_flags(flags, "|", \ 25 25 { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ 26 - { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ 27 - { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ 28 26 { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ 29 - { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ 30 27 { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ 31 28 { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ 32 29 { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ ··· 480 483 (unsigned long) __entry->writeback_index) 481 484 ); 482 485 483 - TRACE_EVENT(ext4_da_write_pages, 484 - TP_PROTO(struct inode *inode, pgoff_t first_page, 486 + TRACE_EVENT(ext4_da_write_folios_start, 487 + TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, 485 488 struct writeback_control *wbc), 486 489 487 - TP_ARGS(inode, first_page, wbc), 490 + TP_ARGS(inode, start_pos, next_pos, wbc), 488 491 489 492 TP_STRUCT__entry( 490 493 __field( dev_t, dev ) 491 494 __field( ino_t, ino ) 492 - __field( pgoff_t, first_page ) 495 + __field( loff_t, start_pos ) 496 + __field( loff_t, next_pos ) 493 497 __field( long, nr_to_write ) 494 498 __field( int, sync_mode ) 495 499 ), ··· 498 500 TP_fast_assign( 499 501 __entry->dev = inode->i_sb->s_dev; 500 502 __entry->ino = inode->i_ino; 501 - __entry->first_page = first_page; 503 + __entry->start_pos = start_pos; 504 + __entry->next_pos = next_pos; 502 505 __entry->nr_to_write = wbc->nr_to_write; 503 506 __entry->sync_mode = wbc->sync_mode; 504 507 ), 505 508 506 - TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " 507 - "sync_mode %d", 509 + TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld sync_mode %d", 508 510 MAJOR(__entry->dev), MINOR(__entry->dev), 509 - (unsigned long) __entry->ino, __entry->first_page, 511 + (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, 510 512 __entry->nr_to_write, __entry->sync_mode) 513 + ); 514 + 515 + TRACE_EVENT(ext4_da_write_folios_end, 516 + 
TP_PROTO(struct inode *inode, loff_t start_pos, loff_t next_pos, 517 + struct writeback_control *wbc, int ret), 518 + 519 + TP_ARGS(inode, start_pos, next_pos, wbc, ret), 520 + 521 + TP_STRUCT__entry( 522 + __field( dev_t, dev ) 523 + __field( ino_t, ino ) 524 + __field( loff_t, start_pos ) 525 + __field( loff_t, next_pos ) 526 + __field( long, nr_to_write ) 527 + __field( int, ret ) 528 + ), 529 + 530 + TP_fast_assign( 531 + __entry->dev = inode->i_sb->s_dev; 532 + __entry->ino = inode->i_ino; 533 + __entry->start_pos = start_pos; 534 + __entry->next_pos = next_pos; 535 + __entry->nr_to_write = wbc->nr_to_write; 536 + __entry->ret = ret; 537 + ), 538 + 539 + TP_printk("dev %d,%d ino %lu start_pos 0x%llx next_pos 0x%llx nr_to_write %ld ret %d", 540 + MAJOR(__entry->dev), MINOR(__entry->dev), 541 + (unsigned long) __entry->ino, __entry->start_pos, __entry->next_pos, 542 + __entry->nr_to_write, __entry->ret) 511 543 ); 512 544 513 545 TRACE_EVENT(ext4_da_write_pages_extent,