Merge tag 'ext4_for_linus-6.7-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

+5

fs/ext4/acl.h

··· 68 68 static inline int 69 69 ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) 70 70 { 71 + /* usually, the umask is applied by posix_acl_create(), but if 72 + ext4 ACL support is disabled at compile time, we need to do 73 + it here, because posix_acl_create() will never be called */ 74 + inode->i_mode &= ~current_umask(); 75 + 71 76 return 0; 72 77 } 73 78 #endif /* CONFIG_EXT4_FS_POSIX_ACL */

+12 -4

fs/ext4/balloc.c

··· 22 22 #include "mballoc.h" 23 23 24 24 #include <trace/events/ext4.h> 25 + #include <kunit/static_stub.h> 25 26 26 27 static unsigned ext4_num_base_meta_clusters(struct super_block *sb, 27 28 ext4_group_t block_group); ··· 112 111 itbl_blk_start = ext4_inode_table(sb, gdp); 113 112 itbl_blk_end = itbl_blk_start + sbi->s_itb_per_group - 1; 114 113 if (itbl_blk_start <= end && itbl_blk_end >= start) { 115 - itbl_blk_start = itbl_blk_start >= start ? 116 - itbl_blk_start : start; 117 - itbl_blk_end = itbl_blk_end <= end ? 118 - itbl_blk_end : end; 114 + itbl_blk_start = max(itbl_blk_start, start); 115 + itbl_blk_end = min(itbl_blk_end, end); 119 116 120 117 itbl_cluster_start = EXT4_B2C(sbi, itbl_blk_start - start); 121 118 itbl_cluster_end = EXT4_B2C(sbi, itbl_blk_end - start); ··· 272 273 struct ext4_group_desc *desc; 273 274 struct ext4_sb_info *sbi = EXT4_SB(sb); 274 275 struct buffer_head *bh_p; 276 + 277 + KUNIT_STATIC_STUB_REDIRECT(ext4_get_group_desc, 278 + sb, block_group, bh); 275 279 276 280 if (block_group >= ngroups) { 277 281 ext4_error(sb, "block_group >= groups_count - block_group = %u," ··· 470 468 ext4_fsblk_t bitmap_blk; 471 469 int err; 472 470 471 + KUNIT_STATIC_STUB_REDIRECT(ext4_read_block_bitmap_nowait, 472 + sb, block_group, ignore_locked); 473 + 473 474 desc = ext4_get_group_desc(sb, block_group, NULL); 474 475 if (!desc) 475 476 return ERR_PTR(-EFSCORRUPTED); ··· 567 562 struct buffer_head *bh) 568 563 { 569 564 struct ext4_group_desc *desc; 565 + 566 + KUNIT_STATIC_STUB_REDIRECT(ext4_wait_block_bitmap, 567 + sb, block_group, bh); 570 568 571 569 if (!buffer_new(bh)) 572 570 return 0;

+5 -3

fs/ext4/ext4.h

··· 1504 1504 loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ 1505 1505 struct buffer_head * s_sbh; /* Buffer containing the super block */ 1506 1506 struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ 1507 + /* Array of bh's for the block group descriptors */ 1507 1508 struct buffer_head * __rcu *s_group_desc; 1508 1509 unsigned int s_mount_opt; 1509 1510 unsigned int s_mount_opt2; ··· 1575 1574 unsigned int *s_mb_maxs; 1576 1575 unsigned int s_group_info_size; 1577 1576 unsigned int s_mb_free_pending; 1578 - struct list_head s_freed_data_list; /* List of blocks to be freed 1577 + struct list_head s_freed_data_list[2]; /* List of blocks to be freed 1579 1578 after commit completed */ 1580 1579 struct list_head s_discard_list; 1581 1580 struct work_struct s_discard_work; ··· 1687 1686 1688 1687 /* 1689 1688 * Barrier between writepages ops and changing any inode's JOURNAL_DATA 1690 - * or EXTENTS flag. 1689 + * or EXTENTS flag or between writepages ops and changing DELALLOC or 1690 + * DIOREAD_NOLOCK mount options on remount. 1691 1691 */ 1692 1692 struct percpu_rw_semaphore s_writepages_rwsem; 1693 1693 struct dax_device *s_daxdev; ··· 2936 2934 extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); 2937 2935 extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); 2938 2936 extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, 2939 - int len, int state); 2937 + int len, bool state); 2940 2938 static inline bool ext4_mb_cr_expensive(enum criteria cr) 2941 2939 { 2942 2940 return cr >= CR_GOAL_LEN_SLOW;

+7 -7

fs/ext4/extents.c

··· 1010 1010 ix = curp->p_idx; 1011 1011 } 1012 1012 1013 + if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { 1014 + EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); 1015 + return -EFSCORRUPTED; 1016 + } 1017 + 1013 1018 len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; 1014 1019 BUG_ON(len < 0); 1015 1020 if (len > 0) { ··· 1022 1017 "move %d indices from 0x%p to 0x%p\n", 1023 1018 logical, len, ix, ix + 1); 1024 1019 memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); 1025 - } 1026 - 1027 - if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) { 1028 - EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!"); 1029 - return -EFSCORRUPTED; 1030 1020 } 1031 1021 1032 1022 ix->ei_block = cpu_to_le32(logical); ··· 6081 6081 for (j = 0; j < path->p_depth; j++) { 6082 6082 6083 6083 ext4_mb_mark_bb(inode->i_sb, 6084 - path[j].p_block, 1, 0); 6084 + path[j].p_block, 1, false); 6085 6085 ext4_fc_record_regions(inode->i_sb, inode->i_ino, 6086 6086 0, path[j].p_block, 1, 1); 6087 6087 } 6088 6088 ext4_free_ext_path(path); 6089 6089 } 6090 - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 6090 + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 6091 6091 ext4_fc_record_regions(inode->i_sb, inode->i_ino, 6092 6092 map.m_lblk, map.m_pblk, map.m_len, 1); 6093 6093 }

+91 -36

fs/ext4/extents_status.c

··· 152 152 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); 153 153 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, 154 154 struct ext4_inode_info *locked_ei); 155 - static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, 156 - ext4_lblk_t len); 155 + static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, 156 + ext4_lblk_t len, 157 + struct pending_reservation **prealloc); 157 158 158 159 int __init ext4_init_es(void) 159 160 { ··· 447 446 WARN_ON_ONCE(sbi->s_es_nr_inode < 0); 448 447 } 449 448 spin_unlock(&sbi->s_es_lock); 449 + } 450 + 451 + static inline struct pending_reservation *__alloc_pending(bool nofail) 452 + { 453 + if (!nofail) 454 + return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); 455 + 456 + return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL); 457 + } 458 + 459 + static inline void __free_pending(struct pending_reservation *pr) 460 + { 461 + kmem_cache_free(ext4_pending_cachep, pr); 450 462 } 451 463 452 464 /* ··· 850 836 { 851 837 struct extent_status newes; 852 838 ext4_lblk_t end = lblk + len - 1; 853 - int err1 = 0; 854 - int err2 = 0; 839 + int err1 = 0, err2 = 0, err3 = 0; 855 840 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 856 841 struct extent_status *es1 = NULL; 857 842 struct extent_status *es2 = NULL; 843 + struct pending_reservation *pr = NULL; 844 + bool revise_pending = false; 858 845 859 846 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 860 847 return; ··· 883 868 884 869 ext4_es_insert_extent_check(inode, &newes); 885 870 871 + revise_pending = sbi->s_cluster_ratio > 1 && 872 + test_opt(inode->i_sb, DELALLOC) && 873 + (status & (EXTENT_STATUS_WRITTEN | 874 + EXTENT_STATUS_UNWRITTEN)); 886 875 retry: 887 876 if (err1 && !es1) 888 877 es1 = __es_alloc_extent(true); 889 878 if ((err1 || err2) && !es2) 890 879 es2 = __es_alloc_extent(true); 880 + if ((err1 || err2 || err3) && revise_pending && !pr) 881 + pr = __alloc_pending(true); 891 882 write_lock(&EXT4_I(inode)->i_es_lock); 892 883 893 884 err1 = __es_remove_extent(inode, lblk, end, NULL, es1); ··· 918 897 es2 = NULL; 919 898 } 920 899 921 - if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) && 922 - (status & EXTENT_STATUS_WRITTEN || 923 - status & EXTENT_STATUS_UNWRITTEN)) 924 - __revise_pending(inode, lblk, len); 900 + if (revise_pending) { 901 + err3 = __revise_pending(inode, lblk, len, &pr); 902 + if (err3 != 0) 903 + goto error; 904 + if (pr) { 905 + __free_pending(pr); 906 + pr = NULL; 907 + } 908 + } 925 909 error: 926 910 write_unlock(&EXT4_I(inode)->i_es_lock); 927 - if (err1 || err2) 911 + if (err1 || err2 || err3) 928 912 goto retry; 929 913 930 914 ext4_es_print_tree(inode); ··· 1337 1311 rc->ndelonly--; 1338 1312 node = rb_next(&pr->rb_node); 1339 1313 rb_erase(&pr->rb_node, &tree->root); 1340 - kmem_cache_free(ext4_pending_cachep, pr); 1314 + __free_pending(pr); 1341 1315 if (!node) 1342 1316 break; 1343 1317 pr = rb_entry(node, struct pending_reservation, ··· 1431 1405 } 1432 1406 } 1433 1407 if (count_reserved) 1434 - count_rsvd(inode, lblk, orig_es.es_len - len1 - len2, 1435 - &orig_es, &rc); 1408 + count_rsvd(inode, orig_es.es_lblk + len1, 1409 + orig_es.es_len - len1 - len2, &orig_es, &rc); 1436 1410 goto out_get_reserved; 1437 1411 } 1438 1412 ··· 1933 1907 * 1934 1908 * @inode - file containing the cluster 1935 1909 * @lblk - logical block in the cluster to be added 1910 + * @prealloc - preallocated pending entry 1936 1911 * 1937 1912 * Returns 0 on successful insertion and -ENOMEM on failure. If the 1938 1913 * pending reservation is already in the set, returns successfully. 1939 1914 */ 1940 - static int __insert_pending(struct inode *inode, ext4_lblk_t lblk) 1915 + static int __insert_pending(struct inode *inode, ext4_lblk_t lblk, 1916 + struct pending_reservation **prealloc) 1941 1917 { 1942 1918 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1943 1919 struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree; ··· 1965 1937 } 1966 1938 } 1967 1939 1968 - pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC); 1969 - if (pr == NULL) { 1970 - ret = -ENOMEM; 1971 - goto out; 1940 + if (likely(*prealloc == NULL)) { 1941 + pr = __alloc_pending(false); 1942 + if (!pr) { 1943 + ret = -ENOMEM; 1944 + goto out; 1945 + } 1946 + } else { 1947 + pr = *prealloc; 1948 + *prealloc = NULL; 1972 1949 } 1973 1950 pr->lclu = lclu; 1974 1951 ··· 2003 1970 if (pr != NULL) { 2004 1971 tree = &EXT4_I(inode)->i_pending_tree; 2005 1972 rb_erase(&pr->rb_node, &tree->root); 2006 - kmem_cache_free(ext4_pending_cachep, pr); 1973 + __free_pending(pr); 2007 1974 } 2008 1975 } 2009 1976 ··· 2062 2029 bool allocated) 2063 2030 { 2064 2031 struct extent_status newes; 2065 - int err1 = 0; 2066 - int err2 = 0; 2032 + int err1 = 0, err2 = 0, err3 = 0; 2067 2033 struct extent_status *es1 = NULL; 2068 2034 struct extent_status *es2 = NULL; 2035 + struct pending_reservation *pr = NULL; 2069 2036 2070 2037 if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) 2071 2038 return; ··· 2085 2052 es1 = __es_alloc_extent(true); 2086 2053 if ((err1 || err2) && !es2) 2087 2054 es2 = __es_alloc_extent(true); 2055 + if ((err1 || err2 || err3) && allocated && !pr) 2056 + pr = __alloc_pending(true); 2088 2057 write_lock(&EXT4_I(inode)->i_es_lock); 2089 2058 2090 2059 err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1); ··· 2109 2074 es2 = NULL; 2110 2075 } 2111 2076 2112 - if (allocated) 2113 - __insert_pending(inode, lblk); 2077 + if (allocated) { 2078 + err3 = __insert_pending(inode, lblk, &pr); 2079 + if (err3 != 0) 2080 + goto error; 2081 + if (pr) { 2082 + __free_pending(pr); 2083 + pr = NULL; 2084 + } 2085 + } 2114 2086 error: 2115 2087 write_unlock(&EXT4_I(inode)->i_es_lock); 2116 - if (err1 || err2) 2088 + if (err1 || err2 || err3) 2117 2089 goto retry; 2118 2090 2119 2091 ext4_es_print_tree(inode); ··· 2226 2184 * @inode - file containing the range 2227 2185 * @lblk - logical block defining the start of range 2228 2186 * @len - length of range in blocks 2187 + * @prealloc - preallocated pending entry 2229 2188 * 2230 2189 * Used after a newly allocated extent is added to the extents status tree. 2231 2190 * Requires that the extents in the range have either written or unwritten 2232 2191 * status. Must be called while holding i_es_lock. 2233 2192 */ 2234 - static void __revise_pending(struct inode *inode, ext4_lblk_t lblk, 2235 - ext4_lblk_t len) 2193 + static int __revise_pending(struct inode *inode, ext4_lblk_t lblk, 2194 + ext4_lblk_t len, 2195 + struct pending_reservation **prealloc) 2236 2196 { 2237 2197 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2238 2198 ext4_lblk_t end = lblk + len - 1; 2239 2199 ext4_lblk_t first, last; 2240 2200 bool f_del = false, l_del = false; 2201 + int ret = 0; 2241 2202 2242 2203 if (len == 0) 2243 - return; 2204 + return 0; 2244 2205 2245 2206 /* 2246 2207 * Two cases - block range within single cluster and block range ··· 2264 2219 f_del = __es_scan_range(inode, &ext4_es_is_delonly, 2265 2220 first, lblk - 1); 2266 2221 if (f_del) { 2267 - __insert_pending(inode, first); 2222 + ret = __insert_pending(inode, first, prealloc); 2223 + if (ret < 0) 2224 + goto out; 2268 2225 } else { 2269 2226 last = EXT4_LBLK_CMASK(sbi, end) + 2270 2227 sbi->s_cluster_ratio - 1; ··· 2274 2227 l_del = __es_scan_range(inode, 2275 2228 &ext4_es_is_delonly, 2276 2229 end + 1, last); 2277 - if (l_del) 2278 - __insert_pending(inode, last); 2279 - else 2230 + if (l_del) { 2231 + ret = __insert_pending(inode, last, prealloc); 2232 + if (ret < 0) 2233 + goto out; 2234 + } else 2280 2235 __remove_pending(inode, last); 2281 2236 } 2282 2237 } else { ··· 2286 2237 if (first != lblk) 2287 2238 f_del = __es_scan_range(inode, &ext4_es_is_delonly, 2288 2239 first, lblk - 1); 2289 - if (f_del) 2290 - __insert_pending(inode, first); 2291 - else 2240 + if (f_del) { 2241 + ret = __insert_pending(inode, first, prealloc); 2242 + if (ret < 0) 2243 + goto out; 2244 + } else 2292 2245 __remove_pending(inode, first); 2293 2246 2294 2247 last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1; 2295 2248 if (last != end) 2296 2249 l_del = __es_scan_range(inode, &ext4_es_is_delonly, 2297 2250 end + 1, last); 2298 - if (l_del) 2299 - __insert_pending(inode, last); 2300 - else 2251 + if (l_del) { 2252 + ret = __insert_pending(inode, last, prealloc); 2253 + if (ret < 0) 2254 + goto out; 2255 + } else 2301 2256 __remove_pending(inode, last); 2302 2257 } 2258 + out: 2259 + return ret; 2303 2260 }

+4 -4

fs/ext4/fast_commit.c

··· 1806 1806 * at the end of the FC replay using our array of 1807 1807 * modified inodes. 1808 1808 */ 1809 - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1809 + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1810 1810 goto next; 1811 1811 } 1812 1812 ··· 1875 1875 if (ret > 0) { 1876 1876 remaining -= ret; 1877 1877 cur += ret; 1878 - ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, 0); 1878 + ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); 1879 1879 } else { 1880 1880 remaining -= map.m_len; 1881 1881 cur += map.m_len; ··· 1934 1934 if (!IS_ERR(path)) { 1935 1935 for (j = 0; j < path->p_depth; j++) 1936 1936 ext4_mb_mark_bb(inode->i_sb, 1937 - path[j].p_block, 1, 1); 1937 + path[j].p_block, 1, true); 1938 1938 ext4_free_ext_path(path); 1939 1939 } 1940 1940 cur += ret; 1941 1941 ext4_mb_mark_bb(inode->i_sb, map.m_pblk, 1942 - map.m_len, 1); 1942 + map.m_len, true); 1943 1943 } else { 1944 1944 cur = cur + (map.m_len ? map.m_len : 1); 1945 1945 }

+74 -95

fs/ext4/file.c

··· 306 306 } 307 307 308 308 static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, 309 - ssize_t written, size_t count) 309 + ssize_t count) 310 310 { 311 311 handle_t *handle; 312 - bool truncate = false; 313 - u8 blkbits = inode->i_blkbits; 314 - ext4_lblk_t written_blk, end_blk; 315 - int ret; 316 312 317 - /* 318 - * Note that EXT4_I(inode)->i_disksize can get extended up to 319 - * inode->i_size while the I/O was running due to writeback of delalloc 320 - * blocks. But, the code in ext4_iomap_alloc() is careful to use 321 - * zeroed/unwritten extents if this is possible; thus we won't leave 322 - * uninitialized blocks in a file even if we didn't succeed in writing 323 - * as much as we intended. 324 - */ 325 - WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize); 326 - if (offset + count <= EXT4_I(inode)->i_disksize) { 327 - /* 328 - * We need to ensure that the inode is removed from the orphan 329 - * list if it has been added prematurely, due to writeback of 330 - * delalloc blocks. 331 - */ 332 - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { 333 - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 334 - 335 - if (IS_ERR(handle)) { 336 - ext4_orphan_del(NULL, inode); 337 - return PTR_ERR(handle); 338 - } 339 - 340 - ext4_orphan_del(handle, inode); 341 - ext4_journal_stop(handle); 342 - } 343 - 344 - return written; 345 - } 346 - 347 - if (written < 0) 348 - goto truncate; 349 - 313 + lockdep_assert_held_write(&inode->i_rwsem); 350 314 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 351 - if (IS_ERR(handle)) { 352 - written = PTR_ERR(handle); 353 - goto truncate; 354 - } 315 + if (IS_ERR(handle)) 316 + return PTR_ERR(handle); 355 317 356 - if (ext4_update_inode_size(inode, offset + written)) { 357 - ret = ext4_mark_inode_dirty(handle, inode); 318 + if (ext4_update_inode_size(inode, offset + count)) { 319 + int ret = ext4_mark_inode_dirty(handle, inode); 358 320 if (unlikely(ret)) { 359 - written = ret; 360 321 ext4_journal_stop(handle); 361 - goto truncate; 322 + return ret; 362 323 } 363 324 } 364 325 365 - /* 366 - * We may need to truncate allocated but not written blocks beyond EOF. 367 - */ 368 - written_blk = ALIGN(offset + written, 1 << blkbits); 369 - end_blk = ALIGN(offset + count, 1 << blkbits); 370 - if (written_blk < end_blk && ext4_can_truncate(inode)) 371 - truncate = true; 372 - 373 - /* 374 - * Remove the inode from the orphan list if it has been extended and 375 - * everything went OK. 376 - */ 377 - if (!truncate && inode->i_nlink) 326 + if (inode->i_nlink) 378 327 ext4_orphan_del(handle, inode); 379 328 ext4_journal_stop(handle); 380 329 381 - if (truncate) { 382 - truncate: 330 + return count; 331 + } 332 + 333 + /* 334 + * Clean up the inode after DIO or DAX extending write has completed and the 335 + * inode size has been updated using ext4_handle_inode_extension(). 336 + */ 337 + static void ext4_inode_extension_cleanup(struct inode *inode, ssize_t count) 338 + { 339 + lockdep_assert_held_write(&inode->i_rwsem); 340 + if (count < 0) { 383 341 ext4_truncate_failed_write(inode); 384 342 /* 385 343 * If the truncate operation failed early, then the inode may ··· 346 388 */ 347 389 if (inode->i_nlink) 348 390 ext4_orphan_del(NULL, inode); 391 + return; 349 392 } 393 + /* 394 + * If i_disksize got extended due to writeback of delalloc blocks while 395 + * the DIO was running we could fail to cleanup the orphan list in 396 + * ext4_handle_inode_extension(). Do it now. 397 + */ 398 + if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { 399 + handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 350 400 351 - return written; 401 + if (IS_ERR(handle)) { 402 + /* 403 + * The write has successfully completed. Not much to 404 + * do with the error here so just cleanup the orphan 405 + * list and hope for the best. 406 + */ 407 + ext4_orphan_del(NULL, inode); 408 + return; 409 + } 410 + ext4_orphan_del(handle, inode); 411 + ext4_journal_stop(handle); 412 + } 352 413 } 353 414 354 415 static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, ··· 376 399 loff_t pos = iocb->ki_pos; 377 400 struct inode *inode = file_inode(iocb->ki_filp); 378 401 402 + if (!error && size && flags & IOMAP_DIO_UNWRITTEN) 403 + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); 379 404 if (error) 380 405 return error; 381 - 382 - if (size && flags & IOMAP_DIO_UNWRITTEN) { 383 - error = ext4_convert_unwritten_extents(NULL, inode, pos, size); 384 - if (error < 0) 385 - return error; 386 - } 387 406 /* 388 - * If we are extending the file, we have to update i_size here before 389 - * page cache gets invalidated in iomap_dio_rw(). Otherwise racing 390 - * buffered reads could zero out too much from page cache pages. Update 391 - * of on-disk size will happen later in ext4_dio_write_iter() where 392 - * we have enough information to also perform orphan list handling etc. 393 - * Note that we perform all extending writes synchronously under 394 - * i_rwsem held exclusively so i_size update is safe here in that case. 395 - * If the write was not extending, we cannot see pos > i_size here 396 - * because operations reducing i_size like truncate wait for all 397 - * outstanding DIO before updating i_size. 407 + * Note that EXT4_I(inode)->i_disksize can get extended up to 408 + * inode->i_size while the I/O was running due to writeback of delalloc 409 + * blocks. But the code in ext4_iomap_alloc() is careful to use 410 + * zeroed/unwritten extents if this is possible; thus we won't leave 411 + * uninitialized blocks in a file even if we didn't succeed in writing 412 + * as much as we intended. 398 413 */ 399 - pos += size; 400 - if (pos > i_size_read(inode)) 401 - i_size_write(inode, pos); 402 - 403 - return 0; 414 + WARN_ON_ONCE(i_size_read(inode) < READ_ONCE(EXT4_I(inode)->i_disksize)); 415 + if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize)) 416 + return size; 417 + return ext4_handle_inode_extension(inode, pos, size); 404 418 } 405 419 406 420 static const struct iomap_dio_ops ext4_dio_write_ops = { ··· 537 569 return ext4_buffered_write_iter(iocb, from); 538 570 } 539 571 572 + /* 573 + * Prevent inline data from being created since we are going to allocate 574 + * blocks for DIO. We know the inode does not currently have inline data 575 + * because ext4_should_use_dio() checked for it, but we have to clear 576 + * the state flag before the write checks because a lock cycle could 577 + * introduce races with other writers. 578 + */ 579 + ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 580 + 540 581 ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, 541 582 &unwritten, &dio_flags); 542 583 if (ret <= 0) 543 584 return ret; 544 - 545 - /* 546 - * Make sure inline data cannot be created anymore since we are going 547 - * to allocate blocks for DIO. We know the inode does not have any 548 - * inline data now because ext4_dio_supported() checked for that. 549 - */ 550 - ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); 551 585 552 586 offset = iocb->ki_pos; 553 587 count = ret; ··· 576 606 dio_flags, NULL, 0); 577 607 if (ret == -ENOTBLK) 578 608 ret = 0; 579 - 580 - if (extend) 581 - ret = ext4_handle_inode_extension(inode, offset, ret, count); 609 + if (extend) { 610 + /* 611 + * We always perform extending DIO write synchronously so by 612 + * now the IO is completed and ext4_handle_inode_extension() 613 + * was called. Cleanup the inode in case of error or race with 614 + * writeback of delalloc blocks. 615 + */ 616 + WARN_ON_ONCE(ret == -EIOCBQUEUED); 617 + ext4_inode_extension_cleanup(inode, ret); 618 + } 582 619 583 620 out: 584 621 if (ilock_shared) ··· 666 689 667 690 ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); 668 691 669 - if (extend) 670 - ret = ext4_handle_inode_extension(inode, offset, ret, count); 692 + if (extend) { 693 + ret = ext4_handle_inode_extension(inode, offset, ret); 694 + ext4_inode_extension_cleanup(inode, ret); 695 + } 671 696 out: 672 697 inode_unlock(inode); 673 698 if (ret > 0)

+13 -1

fs/ext4/inode.c

··· 789 789 int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, 790 790 struct buffer_head *bh_result, int create) 791 791 { 792 + int ret = 0; 793 + 792 794 ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", 793 795 inode->i_ino, create); 794 - return _ext4_get_block(inode, iblock, bh_result, 796 + ret = _ext4_get_block(inode, iblock, bh_result, 795 797 EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); 798 + 799 + /* 800 + * If the buffer is marked unwritten, mark it as new to make sure it is 801 + * zeroed out correctly in case of partial writes. Otherwise, there is 802 + * a chance of stale data getting exposed. 803 + */ 804 + if (ret == 0 && buffer_unwritten(bh_result)) 805 + set_buffer_new(bh_result); 806 + 807 + return ret; 796 808 } 797 809 798 810 /* Maximum number of blocks we map for direct IO at once. */

+349

fs/ext4/mballoc-test.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * KUnit test of ext4 multiblocks allocation. 4 + */ 5 + 6 + #include <kunit/test.h> 7 + #include <kunit/static_stub.h> 8 + 9 + #include "ext4.h" 10 + 11 + struct mbt_grp_ctx { 12 + struct buffer_head bitmap_bh; 13 + /* desc and gd_bh are just the place holders for now */ 14 + struct ext4_group_desc desc; 15 + struct buffer_head gd_bh; 16 + }; 17 + 18 + struct mbt_ctx { 19 + struct mbt_grp_ctx *grp_ctx; 20 + }; 21 + 22 + struct mbt_ext4_super_block { 23 + struct super_block sb; 24 + struct mbt_ctx mbt_ctx; 25 + }; 26 + 27 + #define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx)) 28 + #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group]) 29 + 30 + static struct super_block *mbt_ext4_alloc_super_block(void) 31 + { 32 + struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL); 33 + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 34 + struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL); 35 + 36 + if (fsb == NULL || sbi == NULL || es == NULL) 37 + goto out; 38 + 39 + sbi->s_es = es; 40 + fsb->sb.s_fs_info = sbi; 41 + return &fsb->sb; 42 + 43 + out: 44 + kfree(fsb); 45 + kfree(sbi); 46 + kfree(es); 47 + return NULL; 48 + } 49 + 50 + static void mbt_ext4_free_super_block(struct super_block *sb) 51 + { 52 + struct mbt_ext4_super_block *fsb = 53 + container_of(sb, struct mbt_ext4_super_block, sb); 54 + struct ext4_sb_info *sbi = EXT4_SB(sb); 55 + 56 + kfree(sbi->s_es); 57 + kfree(sbi); 58 + kfree(fsb); 59 + } 60 + 61 + struct mbt_ext4_block_layout { 62 + unsigned char blocksize_bits; 63 + unsigned int cluster_bits; 64 + uint32_t blocks_per_group; 65 + ext4_group_t group_count; 66 + uint16_t desc_size; 67 + }; 68 + 69 + static void mbt_init_sb_layout(struct super_block *sb, 70 + struct mbt_ext4_block_layout *layout) 71 + { 72 + struct ext4_sb_info *sbi = EXT4_SB(sb); 73 + struct ext4_super_block *es = sbi->s_es; 74 + 75 + sb->s_blocksize = 1UL << layout->blocksize_bits; 76 + sb->s_blocksize_bits = layout->blocksize_bits; 77 + 78 + sbi->s_groups_count = layout->group_count; 79 + sbi->s_blocks_per_group = layout->blocks_per_group; 80 + sbi->s_cluster_bits = layout->cluster_bits; 81 + sbi->s_cluster_ratio = 1U << layout->cluster_bits; 82 + sbi->s_clusters_per_group = layout->blocks_per_group >> 83 + layout->cluster_bits; 84 + sbi->s_desc_size = layout->desc_size; 85 + 86 + es->s_first_data_block = cpu_to_le32(0); 87 + es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group * 88 + layout->group_count); 89 + } 90 + 91 + static int mbt_grp_ctx_init(struct super_block *sb, 92 + struct mbt_grp_ctx *grp_ctx) 93 + { 94 + grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL); 95 + if (grp_ctx->bitmap_bh.b_data == NULL) 96 + return -ENOMEM; 97 + 98 + return 0; 99 + } 100 + 101 + static void mbt_grp_ctx_release(struct mbt_grp_ctx *grp_ctx) 102 + { 103 + kfree(grp_ctx->bitmap_bh.b_data); 104 + grp_ctx->bitmap_bh.b_data = NULL; 105 + } 106 + 107 + static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group, 108 + unsigned int start, unsigned int len) 109 + { 110 + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); 111 + 112 + mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len); 113 + } 114 + 115 + /* called after mbt_init_sb_layout */ 116 + static int mbt_ctx_init(struct super_block *sb) 117 + { 118 + struct mbt_ctx *ctx = MBT_CTX(sb); 119 + ext4_group_t i, ngroups = ext4_get_groups_count(sb); 120 + 121 + ctx->grp_ctx = kcalloc(ngroups, sizeof(struct mbt_grp_ctx), 122 + GFP_KERNEL); 123 + if (ctx->grp_ctx == NULL) 124 + return -ENOMEM; 125 + 126 + for (i = 0; i < ngroups; i++) 127 + if (mbt_grp_ctx_init(sb, &ctx->grp_ctx[i])) 128 + goto out; 129 + 130 + /* 131 + * first data block(first cluster in first group) is used by 132 + * metadata, mark it used to avoid to alloc data block at first 133 + * block which will fail ext4_sb_block_valid check. 134 + */ 135 + mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1); 136 + 137 + return 0; 138 + out: 139 + while (i-- > 0) 140 + mbt_grp_ctx_release(&ctx->grp_ctx[i]); 141 + kfree(ctx->grp_ctx); 142 + return -ENOMEM; 143 + } 144 + 145 + static void mbt_ctx_release(struct super_block *sb) 146 + { 147 + struct mbt_ctx *ctx = MBT_CTX(sb); 148 + ext4_group_t i, ngroups = ext4_get_groups_count(sb); 149 + 150 + for (i = 0; i < ngroups; i++) 151 + mbt_grp_ctx_release(&ctx->grp_ctx[i]); 152 + kfree(ctx->grp_ctx); 153 + } 154 + 155 + static struct buffer_head * 156 + ext4_read_block_bitmap_nowait_stub(struct super_block *sb, ext4_group_t block_group, 157 + bool ignore_locked) 158 + { 159 + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); 160 + 161 + /* paired with brelse from caller of ext4_read_block_bitmap_nowait */ 162 + get_bh(&grp_ctx->bitmap_bh); 163 + return &grp_ctx->bitmap_bh; 164 + } 165 + 166 + static int ext4_wait_block_bitmap_stub(struct super_block *sb, 167 + ext4_group_t block_group, 168 + struct buffer_head *bh) 169 + { 170 + return 0; 171 + } 172 + 173 + static struct ext4_group_desc * 174 + ext4_get_group_desc_stub(struct super_block *sb, ext4_group_t block_group, 175 + struct buffer_head **bh) 176 + { 177 + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, block_group); 178 + 179 + if (bh != NULL) 180 + *bh = &grp_ctx->gd_bh; 181 + 182 + return &grp_ctx->desc; 183 + } 184 + 185 + static int 186 + ext4_mb_mark_context_stub(handle_t *handle, struct super_block *sb, bool state, 187 + ext4_group_t group, ext4_grpblk_t blkoff, 188 + ext4_grpblk_t len, int flags, 189 + ext4_grpblk_t *ret_changed) 190 + { 191 + struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group); 192 + struct buffer_head *bitmap_bh = &grp_ctx->bitmap_bh; 193 + 194 + if (state) 195 + mb_set_bits(bitmap_bh->b_data, blkoff, len); 196 + else 197 + mb_clear_bits(bitmap_bh->b_data, blkoff, len); 198 + 199 + return 0; 200 + } 201 + 202 + #define TEST_GOAL_GROUP 1 203 + static int mbt_kunit_init(struct kunit *test) 204 + { 205 + struct mbt_ext4_block_layout *layout = 206 + (struct mbt_ext4_block_layout *)(test->param_value); 207 + struct super_block *sb; 208 + int ret; 209 + 210 + sb = mbt_ext4_alloc_super_block(); 211 + if (sb == NULL) 212 + return -ENOMEM; 213 + 214 + mbt_init_sb_layout(sb, layout); 215 + 216 + ret = mbt_ctx_init(sb); 217 + if (ret != 0) { 218 + mbt_ext4_free_super_block(sb); 219 + return ret; 220 + } 221 + 222 + test->priv = sb; 223 + kunit_activate_static_stub(test, 224 + ext4_read_block_bitmap_nowait, 225 + ext4_read_block_bitmap_nowait_stub); 226 + kunit_activate_static_stub(test, 227 + ext4_wait_block_bitmap, 228 + ext4_wait_block_bitmap_stub); 229 + kunit_activate_static_stub(test, 230 + ext4_get_group_desc, 231 + ext4_get_group_desc_stub); 232 + kunit_activate_static_stub(test, 233 + ext4_mb_mark_context, 234 + ext4_mb_mark_context_stub); 235 + return 0; 236 + } 237 + 238 + static void mbt_kunit_exit(struct kunit *test) 239 + { 240 + struct super_block *sb = (struct super_block *)test->priv; 241 + 242 + mbt_ctx_release(sb); 243 + mbt_ext4_free_super_block(sb); 244 + } 245 + 246 + static void test_new_blocks_simple(struct kunit *test) 247 + { 248 + struct super_block *sb = (struct super_block *)test->priv; 249 + struct inode inode = { .i_sb = sb, }; 250 + struct ext4_allocation_request ar; 251 + ext4_group_t i, goal_group = TEST_GOAL_GROUP; 252 + int err = 0; 253 + ext4_fsblk_t found; 254 + struct ext4_sb_info *sbi = EXT4_SB(sb); 255 + 256 + ar.inode = &inode; 257 + 258 + /* get block at goal */ 259 + ar.goal = ext4_group_first_block_no(sb, goal_group); 260 + found = ext4_mb_new_blocks_simple(&ar, &err); 261 + KUNIT_ASSERT_EQ_MSG(test, ar.goal, found, 262 + "failed to alloc block at goal, expected %llu found %llu", 263 + ar.goal, found); 264 + 265 + /* get block after goal in goal group */ 266 + ar.goal = ext4_group_first_block_no(sb, goal_group); 267 + found = ext4_mb_new_blocks_simple(&ar, &err); 268 + KUNIT_ASSERT_EQ_MSG(test, ar.goal + EXT4_C2B(sbi, 1), found, 269 + "failed to alloc block after goal in goal group, expected %llu found %llu", 270 + ar.goal + 1, found); 271 + 272 + /* get block after goal group */ 273 + mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb)); 274 + ar.goal = ext4_group_first_block_no(sb, goal_group); 275 + found = ext4_mb_new_blocks_simple(&ar, &err); 276 + KUNIT_ASSERT_EQ_MSG(test, 277 + ext4_group_first_block_no(sb, goal_group + 1), found, 278 + "failed to alloc block after goal group, expected %llu found %llu", 279 + ext4_group_first_block_no(sb, goal_group + 1), found); 280 + 281 + /* get block before goal group */ 282 + for (i = goal_group; i < ext4_get_groups_count(sb); i++) 283 + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); 284 + ar.goal = ext4_group_first_block_no(sb, goal_group); 285 + found = ext4_mb_new_blocks_simple(&ar, &err); 286 + KUNIT_ASSERT_EQ_MSG(test, 287 + ext4_group_first_block_no(sb, 0) + EXT4_C2B(sbi, 1), found, 288 + "failed to alloc block before goal group, expected %llu found %llu", 289 + ext4_group_first_block_no(sb, 0 + EXT4_C2B(sbi, 1)), found); 290 + 291 + /* no block available, fail to allocate block */ 292 + for (i = 0; i < ext4_get_groups_count(sb); i++) 293 + mbt_ctx_mark_used(sb, i, 0, EXT4_CLUSTERS_PER_GROUP(sb)); 294 + ar.goal = ext4_group_first_block_no(sb, goal_group); 295 + found = ext4_mb_new_blocks_simple(&ar, &err); 296 + KUNIT_ASSERT_NE_MSG(test, err, 0, 297 + "unexpectedly get block when no block is available"); 298 + } 299 + 300 + static const struct mbt_ext4_block_layout mbt_test_layouts[] = { 301 + { 302 + .blocksize_bits = 10, 303 + .cluster_bits = 3, 304 + .blocks_per_group = 8192, 305 + .group_count = 4, 306 + .desc_size = 64, 307 + }, 308 + { 309 + .blocksize_bits = 12, 310 + .cluster_bits = 3, 311 + .blocks_per_group = 8192, 312 + .group_count = 4, 313 + .desc_size = 64, 314 + }, 315 + { 316 + .blocksize_bits = 16, 317 + .cluster_bits = 3, 318 + .blocks_per_group = 8192, 319 + .group_count = 4, 320 + .desc_size = 64, 321 + }, 322 + }; 323 + 324 + static void mbt_show_layout(const struct mbt_ext4_block_layout *layout, 325 + char *desc) 326 + { 327 + snprintf(desc, KUNIT_PARAM_DESC_SIZE, "block_bits=%d cluster_bits=%d " 328 + "blocks_per_group=%d group_count=%d desc_size=%d\n", 329 + layout->blocksize_bits, layout->cluster_bits, 330 + layout->blocks_per_group, layout->group_count, 331 + layout->desc_size); 332 + } 333 + KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout); 334 + 335 + static struct kunit_case mbt_test_cases[] = { 336 + KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params), 337 + {} 338 + }; 339 + 340 + static struct kunit_suite mbt_test_suite = { 341 + .name = "ext4_mballoc_test", 342 + .init = mbt_kunit_init, 343 + .exit = mbt_kunit_exit, 344 + .test_cases = mbt_test_cases, 345 + }; 346 + 347 + kunit_test_suites(&mbt_test_suite); 348 + 349 + MODULE_LICENSE("GPL");

+203 -378

fs/ext4/mballoc.c

··· 18 18 #include <linux/backing-dev.h> 19 19 #include <linux/freezer.h> 20 20 #include <trace/events/ext4.h> 21 + #include <kunit/static_stub.h> 21 22 22 23 /* 23 24 * MUSTDO: ··· 418 417 419 418 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, 420 419 ext4_group_t group); 421 - static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 422 - ext4_group_t group); 423 420 static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); 424 421 425 422 static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ··· 1360 1361 * We place the buddy block and bitmap block 1361 1362 * close together 1362 1363 */ 1364 + grinfo = ext4_get_group_info(sb, group); 1365 + if (!grinfo) { 1366 + err = -EFSCORRUPTED; 1367 + goto out; 1368 + } 1363 1369 if ((first_block + i) & 1) { 1364 1370 /* this is block of buddy */ 1365 1371 BUG_ON(incore == NULL); 1366 1372 mb_debug(sb, "put buddy for group %u in page %lu/%x\n", 1367 1373 group, page->index, i * blocksize); 1368 1374 trace_ext4_mb_buddy_bitmap_load(sb, group); 1369 - grinfo = ext4_get_group_info(sb, group); 1370 - if (!grinfo) { 1371 - err = -EFSCORRUPTED; 1372 - goto out; 1373 - } 1374 1375 grinfo->bb_fragments = 0; 1375 1376 memset(grinfo->bb_counters, 0, 1376 1377 sizeof(*grinfo->bb_counters) * ··· 1397 1398 1398 1399 /* mark all preallocated blks used in in-core bitmap */ 1399 1400 ext4_mb_generate_from_pa(sb, data, group); 1400 - ext4_mb_generate_from_freelist(sb, data, group); 1401 + WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); 1401 1402 ext4_unlock_group(sb, group); 1402 1403 1403 1404 /* set incore so that the buddy information can be ··· 3630 3631 3631 3632 spin_lock_init(&sbi->s_md_lock); 3632 3633 sbi->s_mb_free_pending = 0; 3633 - INIT_LIST_HEAD(&sbi->s_freed_data_list); 3634 + INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); 3635 + INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); 3634 3636 INIT_LIST_HEAD(&sbi->s_discard_list); 3635 3637 INIT_WORK(&sbi->s_discard_work, ext4_discard_work); 3636 3638 atomic_set(&sbi->s_retry_alloc_pending, 0); ··· 3883 3883 struct ext4_sb_info *sbi = EXT4_SB(sb); 3884 3884 struct ext4_free_data *entry, *tmp; 3885 3885 LIST_HEAD(freed_data_list); 3886 - struct list_head *cut_pos = NULL; 3886 + struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; 3887 3887 bool wake; 3888 3888 3889 - spin_lock(&sbi->s_md_lock); 3890 - list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) { 3891 - if (entry->efd_tid != commit_tid) 3892 - break; 3893 - cut_pos = &entry->efd_list; 3894 - } 3895 - if (cut_pos) 3896 - list_cut_position(&freed_data_list, &sbi->s_freed_data_list, 3897 - cut_pos); 3898 - spin_unlock(&sbi->s_md_lock); 3889 + list_replace_init(s_freed_head, &freed_data_list); 3899 3890 3900 3891 list_for_each_entry(entry, &freed_data_list, efd_list) 3901 3892 ext4_free_data_in_buddy(sb, entry); ··· 3944 3953 ext4_groupinfo_destroy_slabs(); 3945 3954 } 3946 3955 3956 + #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 3957 + #define EXT4_MB_SYNC_UPDATE 0x0002 3958 + static int 3959 + ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, 3960 + ext4_group_t group, ext4_grpblk_t blkoff, 3961 + ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) 3962 + { 3963 + struct ext4_sb_info *sbi = EXT4_SB(sb); 3964 + struct buffer_head *bitmap_bh = NULL; 3965 + struct ext4_group_desc *gdp; 3966 + struct buffer_head *gdp_bh; 3967 + int err; 3968 + unsigned int i, already, changed = len; 3969 + 3970 + KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, 3971 + handle, sb, state, group, blkoff, len, 3972 + flags, ret_changed); 3973 + 3974 + if (ret_changed) 3975 + *ret_changed = 0; 3976 + bitmap_bh = ext4_read_block_bitmap(sb, group); 3977 + if (IS_ERR(bitmap_bh)) 3978 + return PTR_ERR(bitmap_bh); 3979 + 3980 + if (handle) { 3981 + BUFFER_TRACE(bitmap_bh, "getting write access"); 3982 + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 3983 + EXT4_JTR_NONE); 3984 + if (err) 3985 + goto out_err; 3986 + } 3987 + 3988 + err = -EIO; 3989 + gdp = ext4_get_group_desc(sb, group, &gdp_bh); 3990 + if (!gdp) 3991 + goto out_err; 3992 + 3993 + if (handle) { 3994 + BUFFER_TRACE(gdp_bh, "get_write_access"); 3995 + err = ext4_journal_get_write_access(handle, sb, gdp_bh, 3996 + EXT4_JTR_NONE); 3997 + if (err) 3998 + goto out_err; 3999 + } 4000 + 4001 + ext4_lock_group(sb, group); 4002 + if (ext4_has_group_desc_csum(sb) && 4003 + (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 4004 + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 4005 + ext4_free_group_clusters_set(sb, gdp, 4006 + ext4_free_clusters_after_init(sb, group, gdp)); 4007 + } 4008 + 4009 + if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { 4010 + already = 0; 4011 + for (i = 0; i < len; i++) 4012 + if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == 4013 + state) 4014 + already++; 4015 + changed = len - already; 4016 + } 4017 + 4018 + if (state) { 4019 + mb_set_bits(bitmap_bh->b_data, blkoff, len); 4020 + ext4_free_group_clusters_set(sb, gdp, 4021 + ext4_free_group_clusters(sb, gdp) - changed); 4022 + } else { 4023 + mb_clear_bits(bitmap_bh->b_data, blkoff, len); 4024 + ext4_free_group_clusters_set(sb, gdp, 4025 + ext4_free_group_clusters(sb, gdp) + changed); 4026 + } 4027 + 4028 + ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 4029 + ext4_group_desc_csum_set(sb, group, gdp); 4030 + ext4_unlock_group(sb, group); 4031 + if (ret_changed) 4032 + *ret_changed = changed; 4033 + 4034 + if (sbi->s_log_groups_per_flex) { 4035 + ext4_group_t flex_group = ext4_flex_group(sbi, group); 4036 + struct flex_groups *fg = sbi_array_rcu_deref(sbi, 4037 + s_flex_groups, flex_group); 4038 + 4039 + if (state) 4040 + atomic64_sub(changed, &fg->free_clusters); 4041 + else 4042 + atomic64_add(changed, &fg->free_clusters); 4043 + } 4044 + 4045 + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4046 + if (err) 4047 + goto out_err; 4048 + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 4049 + if (err) 4050 + goto out_err; 4051 + 4052 + if (flags & EXT4_MB_SYNC_UPDATE) { 4053 + sync_dirty_buffer(bitmap_bh); 4054 + sync_dirty_buffer(gdp_bh); 4055 + } 4056 + 4057 + out_err: 4058 + brelse(bitmap_bh); 4059 + return err; 4060 + } 3947 4061 3948 4062 /* 3949 4063 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps ··· 4058 3962 ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, 4059 3963 handle_t *handle, unsigned int reserv_clstrs) 4060 3964 { 4061 - struct buffer_head *bitmap_bh = NULL; 4062 3965 struct ext4_group_desc *gdp; 4063 - struct buffer_head *gdp_bh; 4064 3966 struct ext4_sb_info *sbi; 4065 3967 struct super_block *sb; 4066 3968 ext4_fsblk_t block; 4067 3969 int err, len; 3970 + int flags = 0; 3971 + ext4_grpblk_t changed; 4068 3972 4069 3973 BUG_ON(ac->ac_status != AC_STATUS_FOUND); 4070 3974 BUG_ON(ac->ac_b_ex.fe_len <= 0); ··· 4072 3976 sb = ac->ac_sb; 4073 3977 sbi = EXT4_SB(sb); 4074 3978 4075 - bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); 4076 - if (IS_ERR(bitmap_bh)) { 4077 - return PTR_ERR(bitmap_bh); 4078 - } 4079 - 4080 - BUFFER_TRACE(bitmap_bh, "getting write access"); 4081 - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 4082 - EXT4_JTR_NONE); 4083 - if (err) 4084 - goto out_err; 4085 - 4086 - err = -EIO; 4087 - gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh); 3979 + gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); 4088 3980 if (!gdp) 4089 - goto out_err; 4090 - 3981 + return -EIO; 4091 3982 ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, 4092 3983 ext4_free_group_clusters(sb, gdp)); 4093 3984 4094 - BUFFER_TRACE(gdp_bh, "get_write_access"); 4095 - err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); 4096 - if (err) 4097 - goto out_err; 4098 - 4099 3985 block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); 4100 - 4101 3986 len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); 4102 3987 if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { 4103 3988 ext4_error(sb, "Allocating blocks %llu-%llu which overlap " ··· 4087 4010 * Fix the bitmap and return EFSCORRUPTED 4088 4011 * We leak some of the blocks here. 4089 4012 */ 4090 - ext4_lock_group(sb, ac->ac_b_ex.fe_group); 4091 - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 4092 - ac->ac_b_ex.fe_len); 4093 - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 4094 - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4013 + err = ext4_mb_mark_context(handle, sb, true, 4014 + ac->ac_b_ex.fe_group, 4015 + ac->ac_b_ex.fe_start, 4016 + ac->ac_b_ex.fe_len, 4017 + 0, NULL); 4095 4018 if (!err) 4096 4019 err = -EFSCORRUPTED; 4097 - goto out_err; 4020 + return err; 4098 4021 } 4099 4022 4100 - ext4_lock_group(sb, ac->ac_b_ex.fe_group); 4101 4023 #ifdef AGGRESSIVE_CHECK 4102 - { 4103 - int i; 4104 - for (i = 0; i < ac->ac_b_ex.fe_len; i++) { 4105 - BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i, 4106 - bitmap_bh->b_data)); 4107 - } 4108 - } 4024 + flags |= EXT4_MB_BITMAP_MARKED_CHECK; 4109 4025 #endif 4110 - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, 4111 - ac->ac_b_ex.fe_len); 4112 - if (ext4_has_group_desc_csum(sb) && 4113 - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 4114 - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 4115 - ext4_free_group_clusters_set(sb, gdp, 4116 - ext4_free_clusters_after_init(sb, 4117 - ac->ac_b_ex.fe_group, gdp)); 4118 - } 4119 - len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len; 4120 - ext4_free_group_clusters_set(sb, gdp, len); 4121 - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 4122 - ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp); 4026 + err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, 4027 + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 4028 + flags, &changed); 4123 4029 4124 - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); 4030 + if (err && changed == 0) 4031 + return err; 4032 + 4033 + #ifdef AGGRESSIVE_CHECK 4034 + BUG_ON(changed != ac->ac_b_ex.fe_len); 4035 + #endif 4125 4036 percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); 4126 4037 /* 4127 4038 * Now reduce the dirty block count also. Should not go negative ··· 4119 4054 percpu_counter_sub(&sbi->s_dirtyclusters_counter, 4120 4055 reserv_clstrs); 4121 4056 4122 - if (sbi->s_log_groups_per_flex) { 4123 - ext4_group_t flex_group = ext4_flex_group(sbi, 4124 - ac->ac_b_ex.fe_group); 4125 - atomic64_sub(ac->ac_b_ex.fe_len, 4126 - &sbi_array_rcu_deref(sbi, s_flex_groups, 4127 - flex_group)->free_clusters); 4128 - } 4129 - 4130 - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 4131 - if (err) 4132 - goto out_err; 4133 - err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); 4134 - 4135 - out_err: 4136 - brelse(bitmap_bh); 4137 4057 return err; 4138 4058 } 4139 4059 ··· 4127 4077 * blocks in bitmaps and update counters. 4128 4078 */ 4129 4079 void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, 4130 - int len, int state) 4080 + int len, bool state) 4131 4081 { 4132 - struct buffer_head *bitmap_bh = NULL; 4133 - struct ext4_group_desc *gdp; 4134 - struct buffer_head *gdp_bh; 4135 4082 struct ext4_sb_info *sbi = EXT4_SB(sb); 4136 4083 ext4_group_t group; 4137 4084 ext4_grpblk_t blkoff; 4138 - int i, err = 0; 4139 - int already; 4140 - unsigned int clen, clen_changed, thisgrp_len; 4085 + int err = 0; 4086 + unsigned int clen, thisgrp_len; 4141 4087 4142 4088 while (len > 0) { 4143 4089 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ··· 4154 4108 ext4_error(sb, "Marking blocks in system zone - " 4155 4109 "Block = %llu, len = %u", 4156 4110 block, thisgrp_len); 4157 - bitmap_bh = NULL; 4158 4111 break; 4159 4112 } 4160 4113 4161 - bitmap_bh = ext4_read_block_bitmap(sb, group); 4162 - if (IS_ERR(bitmap_bh)) { 4163 - err = PTR_ERR(bitmap_bh); 4164 - bitmap_bh = NULL; 4165 - break; 4166 - } 4167 - 4168 - err = -EIO; 4169 - gdp = ext4_get_group_desc(sb, group, &gdp_bh); 4170 - if (!gdp) 4171 - break; 4172 - 4173 - ext4_lock_group(sb, group); 4174 - already = 0; 4175 - for (i = 0; i < clen; i++) 4176 - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == 4177 - !state) 4178 - already++; 4179 - 4180 - clen_changed = clen - already; 4181 - if (state) 4182 - mb_set_bits(bitmap_bh->b_data, blkoff, clen); 4183 - else 4184 - mb_clear_bits(bitmap_bh->b_data, blkoff, clen); 4185 - if (ext4_has_group_desc_csum(sb) && 4186 - (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { 4187 - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); 4188 - ext4_free_group_clusters_set(sb, gdp, 4189 - ext4_free_clusters_after_init(sb, group, gdp)); 4190 - } 4191 - if (state) 4192 - clen = ext4_free_group_clusters(sb, gdp) - clen_changed; 4193 - else 4194 - clen = ext4_free_group_clusters(sb, gdp) + clen_changed; 4195 - 4196 - ext4_free_group_clusters_set(sb, gdp, clen); 4197 - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 4198 - ext4_group_desc_csum_set(sb, group, gdp); 4199 - 4200 - ext4_unlock_group(sb, group); 4201 - 4202 - if (sbi->s_log_groups_per_flex) { 4203 - ext4_group_t flex_group = ext4_flex_group(sbi, group); 4204 - struct flex_groups *fg = sbi_array_rcu_deref(sbi, 4205 - s_flex_groups, flex_group); 4206 - 4207 - if (state) 4208 - atomic64_sub(clen_changed, &fg->free_clusters); 4209 - else 4210 - atomic64_add(clen_changed, &fg->free_clusters); 4211 - 4212 - } 4213 - 4214 - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 4215 - if (err) 4216 - break; 4217 - sync_dirty_buffer(bitmap_bh); 4218 - err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 4219 - sync_dirty_buffer(gdp_bh); 4114 + err = ext4_mb_mark_context(NULL, sb, state, 4115 + group, blkoff, clen, 4116 + EXT4_MB_BITMAP_MARKED_CHECK | 4117 + EXT4_MB_SYNC_UPDATE, 4118 + NULL); 4220 4119 if (err) 4221 4120 break; 4222 4121 4223 4122 block += thisgrp_len; 4224 4123 len -= thisgrp_len; 4225 - brelse(bitmap_bh); 4226 4124 BUG_ON(len < 0); 4227 4125 } 4228 - 4229 - if (err) 4230 - brelse(bitmap_bh); 4231 4126 } 4232 4127 4233 4128 /* ··· 4943 4956 return true; 4944 4957 } 4945 4958 return false; 4946 - } 4947 - 4948 - /* 4949 - * the function goes through all block freed in the group 4950 - * but not yet committed and marks them used in in-core bitmap. 4951 - * buddy must be generated from this bitmap 4952 - * Need to be called with the ext4 group lock held 4953 - */ 4954 - static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, 4955 - ext4_group_t group) 4956 - { 4957 - struct rb_node *n; 4958 - struct ext4_group_info *grp; 4959 - struct ext4_free_data *entry; 4960 - 4961 - grp = ext4_get_group_info(sb, group); 4962 - if (!grp) 4963 - return; 4964 - n = rb_first(&(grp->bb_free_root)); 4965 - 4966 - while (n) { 4967 - entry = rb_entry(n, struct ext4_free_data, efd_node); 4968 - mb_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count); 4969 - n = rb_next(n); 4970 - } 4971 4959 } 4972 4960 4973 4961 /* ··· 6092 6130 } 6093 6131 6094 6132 block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); 6095 - ext4_mb_mark_bb(sb, block, 1, 1); 6133 + ext4_mb_mark_bb(sb, block, 1, true); 6096 6134 ar->len = 1; 6097 6135 6098 6136 return block; ··· 6340 6378 } 6341 6379 6342 6380 spin_lock(&sbi->s_md_lock); 6343 - list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list); 6381 + list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); 6344 6382 sbi->s_mb_free_pending += clusters; 6345 6383 spin_unlock(&sbi->s_md_lock); 6346 6384 } ··· 6348 6386 static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, 6349 6387 unsigned long count) 6350 6388 { 6351 - struct buffer_head *bitmap_bh; 6352 6389 struct super_block *sb = inode->i_sb; 6353 - struct ext4_group_desc *gdp; 6354 - struct buffer_head *gdp_bh; 6355 6390 ext4_group_t group; 6356 6391 ext4_grpblk_t blkoff; 6357 - int already_freed = 0, err, i; 6358 6392 6359 6393 ext4_get_group_no_and_offset(sb, block, &group, &blkoff); 6360 - bitmap_bh = ext4_read_block_bitmap(sb, group); 6361 - if (IS_ERR(bitmap_bh)) { 6362 - pr_warn("Failed to read block bitmap\n"); 6363 - return; 6364 - } 6365 - gdp = ext4_get_group_desc(sb, group, &gdp_bh); 6366 - if (!gdp) 6367 - goto err_out; 6368 - 6369 - for (i = 0; i < count; i++) { 6370 - if (!mb_test_bit(blkoff + i, bitmap_bh->b_data)) 6371 - already_freed++; 6372 - } 6373 - mb_clear_bits(bitmap_bh->b_data, blkoff, count); 6374 - err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh); 6375 - if (err) 6376 - goto err_out; 6377 - ext4_free_group_clusters_set( 6378 - sb, gdp, ext4_free_group_clusters(sb, gdp) + 6379 - count - already_freed); 6380 - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6381 - ext4_group_desc_csum_set(sb, group, gdp); 6382 - ext4_handle_dirty_metadata(NULL, NULL, gdp_bh); 6383 - sync_dirty_buffer(bitmap_bh); 6384 - sync_dirty_buffer(gdp_bh); 6385 - 6386 - err_out: 6387 - brelse(bitmap_bh); 6394 + ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, 6395 + EXT4_MB_BITMAP_MARKED_CHECK | 6396 + EXT4_MB_SYNC_UPDATE, 6397 + NULL); 6388 6398 } 6389 6399 6390 6400 /** ··· 6372 6438 ext4_fsblk_t block, unsigned long count, 6373 6439 int flags) 6374 6440 { 6375 - struct buffer_head *bitmap_bh = NULL; 6376 6441 struct super_block *sb = inode->i_sb; 6377 - struct ext4_group_desc *gdp; 6378 6442 struct ext4_group_info *grp; 6379 6443 unsigned int overflow; 6380 6444 ext4_grpblk_t bit; 6381 - struct buffer_head *gd_bh; 6382 6445 ext4_group_t block_group; 6383 6446 struct ext4_sb_info *sbi; 6384 6447 struct ext4_buddy e4b; 6385 6448 unsigned int count_clusters; 6386 6449 int err = 0; 6387 - int ret; 6450 + int mark_flags = 0; 6451 + ext4_grpblk_t changed; 6388 6452 6389 6453 sbi = EXT4_SB(sb); 6390 6454 ··· 6391 6459 ext4_error(sb, "Freeing blocks in system zone - " 6392 6460 "Block = %llu, count = %lu", block, count); 6393 6461 /* err = 0. ext4_std_error should be a no op */ 6394 - goto error_return; 6462 + goto error_out; 6395 6463 } 6396 6464 flags |= EXT4_FREE_BLOCKS_VALIDATED; 6397 6465 ··· 6415 6483 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6416 6484 } 6417 6485 count_clusters = EXT4_NUM_B2C(sbi, count); 6418 - bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6419 - if (IS_ERR(bitmap_bh)) { 6420 - err = PTR_ERR(bitmap_bh); 6421 - bitmap_bh = NULL; 6422 - goto error_return; 6423 - } 6424 - gdp = ext4_get_group_desc(sb, block_group, &gd_bh); 6425 - if (!gdp) { 6426 - err = -EIO; 6427 - goto error_return; 6428 - } 6429 - 6430 - if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6431 - !ext4_inode_block_valid(inode, block, count)) { 6432 - ext4_error(sb, "Freeing blocks in system zone - " 6433 - "Block = %llu, count = %lu", block, count); 6434 - /* err = 0. ext4_std_error should be a no op */ 6435 - goto error_return; 6436 - } 6437 - 6438 - BUFFER_TRACE(bitmap_bh, "getting write access"); 6439 - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6440 - EXT4_JTR_NONE); 6441 - if (err) 6442 - goto error_return; 6443 - 6444 - /* 6445 - * We are about to modify some metadata. Call the journal APIs 6446 - * to unshare ->b_data if a currently-committing transaction is 6447 - * using it 6448 - */ 6449 - BUFFER_TRACE(gd_bh, "get_write_access"); 6450 - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6451 - if (err) 6452 - goto error_return; 6453 - #ifdef AGGRESSIVE_CHECK 6454 - { 6455 - int i; 6456 - for (i = 0; i < count_clusters; i++) 6457 - BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); 6458 - } 6459 - #endif 6460 6486 trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); 6461 6487 6462 6488 /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ 6463 6489 err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, 6464 6490 GFP_NOFS|__GFP_NOFAIL); 6465 6491 if (err) 6466 - goto error_return; 6492 + goto error_out; 6493 + 6494 + if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && 6495 + !ext4_inode_block_valid(inode, block, count)) { 6496 + ext4_error(sb, "Freeing blocks in system zone - " 6497 + "Block = %llu, count = %lu", block, count); 6498 + /* err = 0. ext4_std_error should be a no op */ 6499 + goto error_clean; 6500 + } 6501 + 6502 + #ifdef AGGRESSIVE_CHECK 6503 + mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; 6504 + #endif 6505 + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, 6506 + count_clusters, mark_flags, &changed); 6507 + 6508 + 6509 + if (err && changed == 0) 6510 + goto error_clean; 6511 + 6512 + #ifdef AGGRESSIVE_CHECK 6513 + BUG_ON(changed != count_clusters); 6514 + #endif 6467 6515 6468 6516 /* 6469 6517 * We need to make sure we don't reuse the freed block until after the ··· 6467 6555 new_entry->efd_tid = handle->h_transaction->t_tid; 6468 6556 6469 6557 ext4_lock_group(sb, block_group); 6470 - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6471 6558 ext4_mb_free_metadata(handle, &e4b, new_entry); 6472 6559 } else { 6473 - /* need to update group_info->bb_free and bitmap 6474 - * with group lock held. generate_buddy look at 6475 - * them with group lock_held 6476 - */ 6477 6560 if (test_opt(sb, DISCARD)) { 6478 6561 err = ext4_issue_discard(sb, block_group, bit, 6479 6562 count_clusters, NULL); ··· 6481 6574 EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); 6482 6575 6483 6576 ext4_lock_group(sb, block_group); 6484 - mb_clear_bits(bitmap_bh->b_data, bit, count_clusters); 6485 6577 mb_free_blocks(inode, &e4b, bit, count_clusters); 6486 6578 } 6487 6579 6488 - ret = ext4_free_group_clusters(sb, gdp) + count_clusters; 6489 - ext4_free_group_clusters_set(sb, gdp, ret); 6490 - ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); 6491 - ext4_group_desc_csum_set(sb, block_group, gdp); 6492 6580 ext4_unlock_group(sb, block_group); 6493 - 6494 - if (sbi->s_log_groups_per_flex) { 6495 - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6496 - atomic64_add(count_clusters, 6497 - &sbi_array_rcu_deref(sbi, s_flex_groups, 6498 - flex_group)->free_clusters); 6499 - } 6500 6581 6501 6582 /* 6502 6583 * on a bigalloc file system, defer the s_freeclusters_counter ··· 6498 6603 count_clusters); 6499 6604 } 6500 6605 6501 - ext4_mb_unload_buddy(&e4b); 6502 - 6503 - /* We dirtied the bitmap block */ 6504 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6505 - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6506 - 6507 - /* And the group descriptor block */ 6508 - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6509 - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6510 - if (!err) 6511 - err = ret; 6512 - 6513 6606 if (overflow && !err) { 6514 6607 block += count; 6515 6608 count = overflow; 6516 - put_bh(bitmap_bh); 6609 + ext4_mb_unload_buddy(&e4b); 6517 6610 /* The range changed so it's no longer validated */ 6518 6611 flags &= ~EXT4_FREE_BLOCKS_VALIDATED; 6519 6612 goto do_more; 6520 6613 } 6521 - error_return: 6522 - brelse(bitmap_bh); 6614 + 6615 + error_clean: 6616 + ext4_mb_unload_buddy(&e4b); 6617 + error_out: 6523 6618 ext4_std_error(sb, err); 6524 6619 } 6525 6620 ··· 6627 6742 int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, 6628 6743 ext4_fsblk_t block, unsigned long count) 6629 6744 { 6630 - struct buffer_head *bitmap_bh = NULL; 6631 - struct buffer_head *gd_bh; 6632 6745 ext4_group_t block_group; 6633 6746 ext4_grpblk_t bit; 6634 - unsigned int i; 6635 - struct ext4_group_desc *desc; 6636 6747 struct ext4_sb_info *sbi = EXT4_SB(sb); 6637 6748 struct ext4_buddy e4b; 6638 - int err = 0, ret, free_clusters_count; 6639 - ext4_grpblk_t clusters_freed; 6749 + int err = 0; 6640 6750 ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); 6641 6751 ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); 6642 6752 unsigned long cluster_count = last_cluster - first_cluster + 1; 6753 + ext4_grpblk_t changed; 6643 6754 6644 6755 ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); 6645 6756 6646 - if (count == 0) 6757 + if (cluster_count == 0) 6647 6758 return 0; 6648 6759 6649 6760 ext4_get_group_no_and_offset(sb, block, &block_group, &bit); ··· 6651 6770 ext4_warning(sb, "too many blocks added to group %u", 6652 6771 block_group); 6653 6772 err = -EINVAL; 6654 - goto error_return; 6773 + goto error_out; 6655 6774 } 6656 6775 6657 - bitmap_bh = ext4_read_block_bitmap(sb, block_group); 6658 - if (IS_ERR(bitmap_bh)) { 6659 - err = PTR_ERR(bitmap_bh); 6660 - bitmap_bh = NULL; 6661 - goto error_return; 6662 - } 6663 - 6664 - desc = ext4_get_group_desc(sb, block_group, &gd_bh); 6665 - if (!desc) { 6666 - err = -EIO; 6667 - goto error_return; 6668 - } 6776 + err = ext4_mb_load_buddy(sb, block_group, &e4b); 6777 + if (err) 6778 + goto error_out; 6669 6779 6670 6780 if (!ext4_sb_block_valid(sb, NULL, block, count)) { 6671 6781 ext4_error(sb, "Adding blocks in system zones - " 6672 6782 "Block = %llu, count = %lu", 6673 6783 block, count); 6674 6784 err = -EINVAL; 6675 - goto error_return; 6785 + goto error_clean; 6676 6786 } 6677 6787 6678 - BUFFER_TRACE(bitmap_bh, "getting write access"); 6679 - err = ext4_journal_get_write_access(handle, sb, bitmap_bh, 6680 - EXT4_JTR_NONE); 6681 - if (err) 6682 - goto error_return; 6788 + err = ext4_mb_mark_context(handle, sb, false, block_group, bit, 6789 + cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, 6790 + &changed); 6791 + if (err && changed == 0) 6792 + goto error_clean; 6683 6793 6684 - /* 6685 - * We are about to modify some metadata. Call the journal APIs 6686 - * to unshare ->b_data if a currently-committing transaction is 6687 - * using it 6688 - */ 6689 - BUFFER_TRACE(gd_bh, "get_write_access"); 6690 - err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); 6691 - if (err) 6692 - goto error_return; 6794 + if (changed != cluster_count) 6795 + ext4_error(sb, "bit already cleared in group %u", block_group); 6693 6796 6694 - for (i = 0, clusters_freed = 0; i < cluster_count; i++) { 6695 - BUFFER_TRACE(bitmap_bh, "clear bit"); 6696 - if (!mb_test_bit(bit + i, bitmap_bh->b_data)) { 6697 - ext4_error(sb, "bit already cleared for block %llu", 6698 - (ext4_fsblk_t)(block + i)); 6699 - BUFFER_TRACE(bitmap_bh, "bit already cleared"); 6700 - } else { 6701 - clusters_freed++; 6702 - } 6703 - } 6704 - 6705 - err = ext4_mb_load_buddy(sb, block_group, &e4b); 6706 - if (err) 6707 - goto error_return; 6708 - 6709 - /* 6710 - * need to update group_info->bb_free and bitmap 6711 - * with group lock held. generate_buddy look at 6712 - * them with group lock_held 6713 - */ 6714 6797 ext4_lock_group(sb, block_group); 6715 - mb_clear_bits(bitmap_bh->b_data, bit, cluster_count); 6716 6798 mb_free_blocks(NULL, &e4b, bit, cluster_count); 6717 - free_clusters_count = clusters_freed + 6718 - ext4_free_group_clusters(sb, desc); 6719 - ext4_free_group_clusters_set(sb, desc, free_clusters_count); 6720 - ext4_block_bitmap_csum_set(sb, desc, bitmap_bh); 6721 - ext4_group_desc_csum_set(sb, block_group, desc); 6722 6799 ext4_unlock_group(sb, block_group); 6723 6800 percpu_counter_add(&sbi->s_freeclusters_counter, 6724 - clusters_freed); 6801 + changed); 6725 6802 6726 - if (sbi->s_log_groups_per_flex) { 6727 - ext4_group_t flex_group = ext4_flex_group(sbi, block_group); 6728 - atomic64_add(clusters_freed, 6729 - &sbi_array_rcu_deref(sbi, s_flex_groups, 6730 - flex_group)->free_clusters); 6731 - } 6732 - 6803 + error_clean: 6733 6804 ext4_mb_unload_buddy(&e4b); 6734 - 6735 - /* We dirtied the bitmap block */ 6736 - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 6737 - err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 6738 - 6739 - /* And the group descriptor block */ 6740 - BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); 6741 - ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); 6742 - if (!err) 6743 - err = ret; 6744 - 6745 - error_return: 6746 - brelse(bitmap_bh); 6805 + error_out: 6747 6806 ext4_std_error(sb, err); 6748 6807 return err; 6749 6808 } ··· 6991 7170 6992 7171 return error; 6993 7172 } 7173 + 7174 + #ifdef CONFIG_EXT4_KUNIT_TESTS 7175 + #include "mballoc-test.c" 7176 + #endif

+1 -2

fs/ext4/namei.c

··· 2280 2280 top = data2 + len; 2281 2281 while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) { 2282 2282 if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len, 2283 - (data2 + (blocksize - csum_size) - 2284 - (char *) de))) { 2283 + (char *)de - data2)) { 2285 2284 brelse(bh2); 2286 2285 brelse(bh); 2287 2286 return -EFSCORRUPTED;

+37 -57

fs/ext4/resize.c

··· 10 10 */ 11 11 12 12 13 - #define EXT4FS_DEBUG 14 - 15 13 #include <linux/errno.h> 16 14 #include <linux/slab.h> 17 15 #include <linux/jiffies.h> ··· 55 57 * If the reserved GDT blocks is non-zero, the resize_inode feature 56 58 * should always be set. 57 59 */ 58 - if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks && 60 + if (sbi->s_es->s_reserved_gdt_blocks && 59 61 !ext4_has_feature_resize_inode(sb)) { 60 62 ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); 61 63 return -EFSCORRUPTED; ··· 67 69 * bad time to do it anyways. 68 70 */ 69 71 if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) != 70 - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { 72 + le32_to_cpu(sbi->s_es->s_first_data_block)) { 71 73 ext4_warning(sb, "won't resize using backup superblock at %llu", 72 - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); 74 + (unsigned long long)sbi->s_sbh->b_blocknr); 73 75 return -EPERM; 74 76 } 75 77 ··· 77 79 * We are not allowed to do online-resizing on a filesystem mounted 78 80 * with error, because it can destroy the filesystem easily. 79 81 */ 80 - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { 82 + if (sbi->s_mount_state & EXT4_ERROR_FS) { 81 83 ext4_warning(sb, "There are errors in the filesystem, " 82 84 "so online resizing is not allowed"); 83 85 return -EPERM; ··· 89 91 } 90 92 91 93 if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, 92 - &EXT4_SB(sb)->s_ext4_flags)) 94 + &sbi->s_ext4_flags)) 93 95 ret = -EBUSY; 94 96 95 97 return ret; ··· 102 104 if (update_backups) 103 105 return ext4_update_overhead(sb, true); 104 106 return 0; 105 - } 106 - 107 - static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb, 108 - ext4_group_t group) { 109 - return (group >> EXT4_DESC_PER_BLOCK_BITS(sb)) << 110 - EXT4_DESC_PER_BLOCK_BITS(sb); 111 - } 112 - 113 - static ext4_fsblk_t ext4_meta_bg_first_block_no(struct super_block *sb, 114 - ext4_group_t group) { 115 - group = ext4_meta_bg_first_group(sb, group); 116 - return ext4_group_first_block_no(sb, group); 117 107 } 118 108 119 109 static ext4_grpblk_t ext4_group_overhead_blocks(struct super_block *sb, ··· 140 154 141 155 overhead = ext4_group_overhead_blocks(sb, group); 142 156 metaend = start + overhead; 143 - input->free_clusters_count = free_blocks_count = 144 - input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 157 + free_blocks_count = input->blocks_count - 2 - overhead - 158 + sbi->s_itb_per_group; 159 + input->free_clusters_count = EXT4_B2C(sbi, free_blocks_count); 145 160 146 161 if (test_opt(sb, DEBUG)) 147 162 printk(KERN_DEBUG "EXT4-fs: adding %s group %u: %u blocks " ··· 447 460 448 461 ext4_debug("mark clusters [%llu-%llu] used\n", first_cluster, 449 462 last_cluster); 450 - for (count2 = count; count > 0; 451 - count -= count2, first_cluster += count2) { 463 + for (; count > 0; count -= count2, first_cluster += count2) { 452 464 ext4_fsblk_t start; 453 465 struct buffer_head *bh; 454 466 ext4_group_t group; ··· 546 560 if (meta_bg == 0 && !ext4_bg_has_super(sb, group)) 547 561 goto handle_itb; 548 562 549 - if (meta_bg == 1) { 550 - ext4_group_t first_group; 551 - first_group = ext4_meta_bg_first_group(sb, group); 552 - if (first_group != group + 1 && 553 - first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1) 554 - goto handle_itb; 555 - } 563 + if (meta_bg == 1) 564 + goto handle_itb; 556 565 557 566 block = start + ext4_bg_has_super(sb, group); 558 567 /* Copy all of the GDT blocks into the backup in this group */ ··· 595 614 } 596 615 597 616 handle_itb: 598 - /* Initialize group tables of the grop @group */ 617 + /* Initialize group tables of the group @group */ 599 618 if (!(bg_flags[i] & EXT4_BG_INODE_ZEROED)) 600 619 goto handle_bb; 601 620 ··· 685 704 block = start; 686 705 } 687 706 688 - if (count) { 689 - err = set_flexbg_block_bitmap(sb, handle, 690 - flex_gd, 691 - EXT4_B2C(sbi, start), 692 - EXT4_B2C(sbi, 693 - start + count 694 - - 1)); 695 - if (err) 696 - goto out; 697 - } 707 + err = set_flexbg_block_bitmap(sb, handle, 708 + flex_gd, 709 + EXT4_B2C(sbi, start), 710 + EXT4_B2C(sbi, 711 + start + count 712 + - 1)); 713 + if (err) 714 + goto out; 698 715 } 699 716 700 717 out: ··· 931 952 } 932 953 933 954 /* 934 - * add_new_gdb_meta_bg is the sister of add_new_gdb. 955 + * If there is no available space in the existing block group descriptors for 956 + * the new block group and there are no reserved block group descriptors, then 957 + * the meta_bg feature will get enabled, and es->s_first_meta_bg will get set 958 + * to the first block group that is managed using meta_bg and s_first_meta_bg 959 + * must be a multiple of EXT4_DESC_PER_BLOCK(sb). 960 + * This function will be called when first group of meta_bg is added to bring 961 + * new group descriptors block of new added meta_bg. 935 962 */ 936 963 static int add_new_gdb_meta_bg(struct super_block *sb, 937 964 handle_t *handle, ext4_group_t group) { ··· 947 962 unsigned long gdb_num = group / EXT4_DESC_PER_BLOCK(sb); 948 963 int err; 949 964 950 - gdblock = ext4_meta_bg_first_block_no(sb, group) + 951 - ext4_bg_has_super(sb, group); 965 + gdblock = ext4_group_first_block_no(sb, group) + 966 + ext4_bg_has_super(sb, group); 952 967 gdb_bh = ext4_sb_bread(sb, gdblock, 0); 953 968 if (IS_ERR(gdb_bh)) 954 969 return PTR_ERR(gdb_bh); ··· 1072 1087 for (i = 0; i < reserved_gdb; i++) { 1073 1088 int err2; 1074 1089 data = (__le32 *)primary[i]->b_data; 1075 - /* printk("reserving backup %lu[%u] = %lu\n", 1076 - primary[i]->b_blocknr, gdbackups, 1077 - blk + primary[i]->b_blocknr); */ 1078 1090 data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); 1079 1091 err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); 1080 1092 if (!err) ··· 1173 1191 ext4_group_first_block_no(sb, group)); 1174 1192 BUFFER_TRACE(bh, "get_write_access"); 1175 1193 if ((err = ext4_journal_get_write_access(handle, sb, bh, 1176 - EXT4_JTR_NONE))) 1194 + EXT4_JTR_NONE))) { 1195 + brelse(bh); 1177 1196 break; 1197 + } 1178 1198 lock_buffer(bh); 1179 1199 memcpy(bh->b_data, data, size); 1180 1200 if (rest) ··· 1585 1601 int gdb_num_end = ((group + flex_gd->count - 1) / 1586 1602 EXT4_DESC_PER_BLOCK(sb)); 1587 1603 int meta_bg = ext4_has_feature_meta_bg(sb); 1588 - sector_t old_gdb = 0; 1604 + sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr - 1605 + ext4_group_first_block_no(sb, 0); 1589 1606 1590 1607 update_backups(sb, ext4_group_first_block_no(sb, 0), 1591 1608 (char *)es, sizeof(struct ext4_super_block), 0); ··· 1595 1610 1596 1611 gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, 1597 1612 gdb_num); 1598 - if (old_gdb == gdb_bh->b_blocknr) 1599 - continue; 1600 - update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data, 1601 - gdb_bh->b_size, meta_bg); 1602 - old_gdb = gdb_bh->b_blocknr; 1613 + update_backups(sb, gdb_bh->b_blocknr - padding_blocks, 1614 + gdb_bh->b_data, gdb_bh->b_size, meta_bg); 1603 1615 } 1604 1616 } 1605 1617 exit: ··· 1962 1980 1963 1981 errout: 1964 1982 ret = ext4_journal_stop(handle); 1965 - if (!err) 1966 - err = ret; 1967 - return ret; 1983 + return err ? err : ret; 1968 1984 1969 1985 invalid_resize_inode: 1970 1986 ext4_error(sb, "corrupted/inconsistent resize inode");

+16 -1

fs/ext4/super.c

··· 768 768 */ 769 769 if (!sb_rdonly(sbi->s_sb) && journal) { 770 770 struct buffer_head *sbh = sbi->s_sbh; 771 - bool call_notify_err; 771 + bool call_notify_err = false; 772 + 772 773 handle = jbd2_journal_start(journal, 1); 773 774 if (IS_ERR(handle)) 774 775 goto write_directly; ··· 6445 6444 struct ext4_mount_options old_opts; 6446 6445 ext4_group_t g; 6447 6446 int err = 0; 6447 + int alloc_ctx; 6448 6448 #ifdef CONFIG_QUOTA 6449 6449 int enable_quota = 0; 6450 6450 int i, j; ··· 6486 6484 6487 6485 } 6488 6486 6487 + /* 6488 + * Changing the DIOREAD_NOLOCK or DELALLOC mount options may cause 6489 + * two calls to ext4_should_dioread_nolock() to return inconsistent 6490 + * values, triggering WARN_ON in ext4_add_complete_io(). we grab 6491 + * here s_writepages_rwsem to avoid race between writepages ops and 6492 + * remount. 6493 + */ 6494 + alloc_ctx = ext4_writepages_down_write(sb); 6489 6495 ext4_apply_options(fc, sb); 6496 + ext4_writepages_up_write(sb, alloc_ctx); 6490 6497 6491 6498 if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ 6492 6499 test_opt(sb, JOURNAL_CHECKSUM)) { ··· 6713 6702 if (sb_rdonly(sb) && !(old_sb_flags & SB_RDONLY) && 6714 6703 sb_any_quota_suspended(sb)) 6715 6704 dquot_resume(sb, -1); 6705 + 6706 + alloc_ctx = ext4_writepages_down_write(sb); 6716 6707 sb->s_flags = old_sb_flags; 6717 6708 sbi->s_mount_opt = old_opts.s_mount_opt; 6718 6709 sbi->s_mount_opt2 = old_opts.s_mount_opt2; ··· 6723 6710 sbi->s_commit_interval = old_opts.s_commit_interval; 6724 6711 sbi->s_min_batch_time = old_opts.s_min_batch_time; 6725 6712 sbi->s_max_batch_time = old_opts.s_max_batch_time; 6713 + ext4_writepages_up_write(sb, alloc_ctx); 6714 + 6726 6715 if (!test_opt(sb, BLOCK_VALIDITY) && sbi->s_system_blks) 6727 6716 ext4_release_system_zone(sb); 6728 6717 #ifdef CONFIG_QUOTA

+11 -2

fs/jbd2/recovery.c

··· 289 289 journal_superblock_t * sb; 290 290 291 291 struct recovery_info info; 292 + errseq_t wb_err; 293 + struct address_space *mapping; 292 294 293 295 memset(&info, 0, sizeof(info)); 294 296 sb = journal->j_superblock; ··· 308 306 return 0; 309 307 } 310 308 309 + wb_err = 0; 310 + mapping = journal->j_fs_dev->bd_inode->i_mapping; 311 + errseq_check_and_advance(&mapping->wb_err, &wb_err); 311 312 err = do_one_pass(journal, &info, PASS_SCAN); 312 313 if (!err) 313 314 err = do_one_pass(journal, &info, PASS_REVOKE); ··· 332 327 333 328 jbd2_journal_clear_revoke(journal); 334 329 err2 = sync_blockdev(journal->j_fs_dev); 330 + if (!err) 331 + err = err2; 332 + err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err); 335 333 if (!err) 336 334 err = err2; 337 335 /* Make sure all replayed data is on permanent storage */ ··· 640 632 success = err; 641 633 printk(KERN_ERR 642 634 "JBD2: IO error %d recovering " 643 - "block %ld in log\n", 635 + "block %lu in log\n", 644 636 err, io_block); 645 637 } else { 646 638 unsigned long long blocknr; ··· 669 661 printk(KERN_ERR "JBD2: Invalid " 670 662 "checksum recovering " 671 663 "data block %llu in " 672 - "log\n", blocknr); 664 + "journal block %lu\n", 665 + blocknr, io_block); 673 666 block_error = 1; 674 667 goto skip_write; 675 668 }

Configure Feed

Configure Feed