Merge tag 'for-7.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

-2

fs/btrfs/Kconfig

··· 112 112 113 113 - large folio and block size (> page size) support 114 114 115 - - shutdown ioctl and auto-degradation support 116 - 117 115 - asynchronous checksum generation for data writes 118 116 119 117 - remap-tree - logical address remapping tree

+4

fs/btrfs/Makefile

··· 45 45 tests/free-space-tree-tests.o tests/extent-map-tests.o \ 46 46 tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \ 47 47 tests/chunk-allocation-tests.o 48 + 49 + ifeq ($(CONFIG_BLK_DEV_ZONED),y) 50 + btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/zoned-tests.o 51 + endif

-10

fs/btrfs/backref.c

··· 858 858 free_pref(ref); 859 859 return PTR_ERR(eb); 860 860 } 861 - if (unlikely(!extent_buffer_uptodate(eb))) { 862 - free_pref(ref); 863 - free_extent_buffer(eb); 864 - return -EIO; 865 - } 866 861 867 862 if (lock) 868 863 btrfs_tree_read_lock(eb); ··· 1613 1618 &check); 1614 1619 if (IS_ERR(eb)) { 1615 1620 ret = PTR_ERR(eb); 1616 - goto out; 1617 - } 1618 - if (unlikely(!extent_buffer_uptodate(eb))) { 1619 - free_extent_buffer(eb); 1620 - ret = -EIO; 1621 1621 goto out; 1622 1622 } 1623 1623

+10 -2

fs/btrfs/bio.c

··· 4 4 * Copyright (C) 2022 Christoph Hellwig. 5 5 */ 6 6 7 + #include <linux/blk_types.h> 7 8 #include <linux/bio.h> 8 9 #include "bio.h" 9 10 #include "ctree.h" ··· 351 350 352 351 static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev) 353 352 { 353 + blk_status_t sts = bio->bi_status; 354 + 354 355 if (!dev || !dev->bdev) 355 356 return; 356 - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) 357 + if (unlikely(sts == BLK_STS_OK)) 357 358 return; 358 - 359 + if (unlikely(sts != BLK_STS_IOERR && sts != BLK_STS_TARGET && 360 + sts != BLK_STS_MEDIUM && sts != BLK_STS_PROTECTION)) { 361 + btrfs_warn_rl(dev->fs_info, "bdev %s unexpected block io error: %d", 362 + btrfs_dev_name(dev), sts); 363 + return; 364 + } 359 365 if (btrfs_op(bio) == BTRFS_MAP_WRITE) 360 366 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); 361 367 else if (!(bio->bi_opf & REQ_RAHEAD))

+188 -147

fs/btrfs/block-group.c

··· 728 728 struct extent_buffer *leaf; 729 729 struct btrfs_key key; 730 730 u64 total_found = 0; 731 - u64 last = 0; 731 + u64 last = block_group->start; 732 732 u32 nritems; 733 733 int ret; 734 734 bool wakeup = true; ··· 737 737 if (!path) 738 738 return -ENOMEM; 739 739 740 - last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); 741 740 extent_root = btrfs_extent_root(fs_info, last); 742 741 if (unlikely(!extent_root)) { 743 742 btrfs_err(fs_info, ··· 1612 1613 1613 1614 spin_lock(&space_info->lock); 1614 1615 spin_lock(&block_group->lock); 1616 + 1617 + if (btrfs_is_zoned(fs_info) && btrfs_is_block_group_used(block_group) && 1618 + block_group->zone_unusable >= div_u64(block_group->length, 2)) { 1619 + /* 1620 + * If the block group has data left, but at least half 1621 + * of the block group is zone_unusable, mark it as 1622 + * reclaimable before continuing with the next block group. 1623 + */ 1624 + 1625 + spin_unlock(&block_group->lock); 1626 + spin_unlock(&space_info->lock); 1627 + up_write(&space_info->groups_sem); 1628 + 1629 + btrfs_mark_bg_to_reclaim(block_group); 1630 + 1631 + goto next; 1632 + } 1633 + 1615 1634 if (btrfs_is_block_group_used(block_group) || 1616 1635 (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) || 1617 1636 list_is_singular(&block_group->list) || ··· 1696 1679 spin_unlock(&space_info->lock); 1697 1680 1698 1681 /* We don't want to force the issue, only flip if it's ok. 
*/ 1699 - ret = inc_block_group_ro(block_group, 0); 1682 + ret = inc_block_group_ro(block_group, false); 1700 1683 up_write(&space_info->groups_sem); 1701 1684 if (ret < 0) { 1702 1685 ret = 0; ··· 1909 1892 return true; 1910 1893 } 1911 1894 1912 - void btrfs_reclaim_bgs_work(struct work_struct *work) 1895 + static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed) 1913 1896 { 1914 - struct btrfs_fs_info *fs_info = 1915 - container_of(work, struct btrfs_fs_info, reclaim_bgs_work); 1897 + struct btrfs_fs_info *fs_info = bg->fs_info; 1898 + struct btrfs_space_info *space_info = bg->space_info; 1899 + u64 used; 1900 + u64 reserved; 1901 + u64 old_total; 1902 + int ret = 0; 1903 + 1904 + /* Don't race with allocators so take the groups_sem */ 1905 + down_write(&space_info->groups_sem); 1906 + 1907 + spin_lock(&space_info->lock); 1908 + spin_lock(&bg->lock); 1909 + if (bg->reserved || bg->pinned || bg->ro) { 1910 + /* 1911 + * We want to bail if we made new allocations or have 1912 + * outstanding allocations in this block group. We do 1913 + * the ro check in case balance is currently acting on 1914 + * this block group. 1915 + */ 1916 + spin_unlock(&bg->lock); 1917 + spin_unlock(&space_info->lock); 1918 + up_write(&space_info->groups_sem); 1919 + return 0; 1920 + } 1921 + 1922 + if (bg->used == 0) { 1923 + /* 1924 + * It is possible that we trigger relocation on a block 1925 + * group as its extents are deleted and it first goes 1926 + * below the threshold, then shortly after goes empty. 1927 + * 1928 + * In this case, relocating it does delete it, but has 1929 + * some overhead in relocation specific metadata, looking 1930 + * for the non-existent extents and running some extra 1931 + * transactions, which we can avoid by using one of the 1932 + * other mechanisms for dealing with empty block groups. 
1933 + */ 1934 + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) 1935 + btrfs_mark_bg_unused(bg); 1936 + spin_unlock(&bg->lock); 1937 + spin_unlock(&space_info->lock); 1938 + up_write(&space_info->groups_sem); 1939 + return 0; 1940 + } 1941 + 1942 + /* 1943 + * The block group might no longer meet the reclaim condition by 1944 + * the time we get around to reclaiming it, so to avoid 1945 + * reclaiming overly full block_groups, skip reclaiming them. 1946 + * 1947 + * Since the decision making process also depends on the amount 1948 + * being freed, pass in a fake giant value to skip that extra 1949 + * check, which is more meaningful when adding to the list in 1950 + * the first place. 1951 + */ 1952 + if (!should_reclaim_block_group(bg, bg->length)) { 1953 + spin_unlock(&bg->lock); 1954 + spin_unlock(&space_info->lock); 1955 + up_write(&space_info->groups_sem); 1956 + return 0; 1957 + } 1958 + 1959 + spin_unlock(&bg->lock); 1960 + old_total = space_info->total_bytes; 1961 + spin_unlock(&space_info->lock); 1962 + 1963 + /* 1964 + * Get out fast, in case we're read-only or unmounting the 1965 + * filesystem. It is OK to drop block groups from the list even 1966 + * for the read-only case. As we did take the super write lock, 1967 + * "mount -o remount,ro" won't happen and read-only filesystem 1968 + * means it is forced read-only due to a fatal error. So, it 1969 + * never gets back to read-write to let us reclaim again. 1970 + */ 1971 + if (btrfs_need_cleaner_sleep(fs_info)) { 1972 + up_write(&space_info->groups_sem); 1973 + return 0; 1974 + } 1975 + 1976 + ret = inc_block_group_ro(bg, false); 1977 + up_write(&space_info->groups_sem); 1978 + if (ret < 0) 1979 + return ret; 1980 + 1981 + /* 1982 + * The amount of bytes reclaimed corresponds to the sum of the 1983 + * "used" and "reserved" counters. 
We have set the block group 1984 + * to RO above, which prevents reservations from happening but 1985 + * we may have existing reservations for which allocation has 1986 + * not yet been done - btrfs_update_block_group() was not yet 1987 + * called, which is where we will transfer a reserved extent's 1988 + * size from the "reserved" counter to the "used" counter - this 1989 + * happens when running delayed references. When we relocate the 1990 + * chunk below, relocation first flushes delalloc, waits for 1991 + * ordered extent completion (which is where we create delayed 1992 + * references for data extents) and commits the current 1993 + * transaction (which runs delayed references), and only after 1994 + * it does the actual work to move extents out of the block 1995 + * group. So the reported amount of reclaimed bytes is 1996 + * effectively the sum of the 'used' and 'reserved' counters. 1997 + */ 1998 + spin_lock(&bg->lock); 1999 + used = bg->used; 2000 + reserved = bg->reserved; 2001 + spin_unlock(&bg->lock); 2002 + 2003 + trace_btrfs_reclaim_block_group(bg); 2004 + ret = btrfs_relocate_chunk(fs_info, bg->start, false); 2005 + if (ret) { 2006 + btrfs_dec_block_group_ro(bg); 2007 + btrfs_err(fs_info, "error relocating chunk %llu", 2008 + bg->start); 2009 + used = 0; 2010 + reserved = 0; 2011 + spin_lock(&space_info->lock); 2012 + space_info->reclaim_errors++; 2013 + spin_unlock(&space_info->lock); 2014 + } 2015 + spin_lock(&space_info->lock); 2016 + space_info->reclaim_count++; 2017 + space_info->reclaim_bytes += used; 2018 + space_info->reclaim_bytes += reserved; 2019 + if (space_info->total_bytes < old_total) 2020 + btrfs_set_periodic_reclaim_ready(space_info, true); 2021 + spin_unlock(&space_info->lock); 2022 + if (!ret) 2023 + (*reclaimed)++; 2024 + 2025 + return ret; 2026 + } 2027 + 2028 + void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit) 2029 + { 1916 2030 struct btrfs_block_group *bg; 1917 2031 struct btrfs_space_info 
*space_info; 1918 2032 LIST_HEAD(retry_list); 2033 + int reclaimed = 0; 1919 2034 1920 2035 if (!btrfs_should_reclaim(fs_info)) 1921 2036 return; ··· 2074 1925 */ 2075 1926 list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); 2076 1927 while (!list_empty(&fs_info->reclaim_bgs)) { 2077 - u64 used; 2078 - u64 reserved; 2079 - u64 old_total; 2080 - int ret = 0; 1928 + int ret; 2081 1929 2082 1930 bg = list_first_entry(&fs_info->reclaim_bgs, 2083 1931 struct btrfs_block_group, ··· 2083 1937 2084 1938 space_info = bg->space_info; 2085 1939 spin_unlock(&fs_info->unused_bgs_lock); 1940 + ret = btrfs_reclaim_block_group(bg, &reclaimed); 2086 1941 2087 - /* Don't race with allocators so take the groups_sem */ 2088 - down_write(&space_info->groups_sem); 2089 - 2090 - spin_lock(&space_info->lock); 2091 - spin_lock(&bg->lock); 2092 - if (bg->reserved || bg->pinned || bg->ro) { 2093 - /* 2094 - * We want to bail if we made new allocations or have 2095 - * outstanding allocations in this block group. We do 2096 - * the ro check in case balance is currently acting on 2097 - * this block group. 2098 - */ 2099 - spin_unlock(&bg->lock); 2100 - spin_unlock(&space_info->lock); 2101 - up_write(&space_info->groups_sem); 2102 - goto next; 2103 - } 2104 - if (bg->used == 0) { 2105 - /* 2106 - * It is possible that we trigger relocation on a block 2107 - * group as its extents are deleted and it first goes 2108 - * below the threshold, then shortly after goes empty. 2109 - * 2110 - * In this case, relocating it does delete it, but has 2111 - * some overhead in relocation specific metadata, looking 2112 - * for the non-existent extents and running some extra 2113 - * transactions, which we can avoid by using one of the 2114 - * other mechanisms for dealing with empty block groups. 
2115 - */ 2116 - if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) 2117 - btrfs_mark_bg_unused(bg); 2118 - spin_unlock(&bg->lock); 2119 - spin_unlock(&space_info->lock); 2120 - up_write(&space_info->groups_sem); 2121 - goto next; 2122 - 2123 - } 2124 - /* 2125 - * The block group might no longer meet the reclaim condition by 2126 - * the time we get around to reclaiming it, so to avoid 2127 - * reclaiming overly full block_groups, skip reclaiming them. 2128 - * 2129 - * Since the decision making process also depends on the amount 2130 - * being freed, pass in a fake giant value to skip that extra 2131 - * check, which is more meaningful when adding to the list in 2132 - * the first place. 2133 - */ 2134 - if (!should_reclaim_block_group(bg, bg->length)) { 2135 - spin_unlock(&bg->lock); 2136 - spin_unlock(&space_info->lock); 2137 - up_write(&space_info->groups_sem); 2138 - goto next; 2139 - } 2140 - 2141 - spin_unlock(&bg->lock); 2142 - old_total = space_info->total_bytes; 2143 - spin_unlock(&space_info->lock); 2144 - 2145 - /* 2146 - * Get out fast, in case we're read-only or unmounting the 2147 - * filesystem. It is OK to drop block groups from the list even 2148 - * for the read-only case. As we did take the super write lock, 2149 - * "mount -o remount,ro" won't happen and read-only filesystem 2150 - * means it is forced read-only due to a fatal error. So, it 2151 - * never gets back to read-write to let us reclaim again. 2152 - */ 2153 - if (btrfs_need_cleaner_sleep(fs_info)) { 2154 - up_write(&space_info->groups_sem); 2155 - goto next; 2156 - } 2157 - 2158 - ret = inc_block_group_ro(bg, 0); 2159 - up_write(&space_info->groups_sem); 2160 - if (ret < 0) 2161 - goto next; 2162 - 2163 - /* 2164 - * The amount of bytes reclaimed corresponds to the sum of the 2165 - * "used" and "reserved" counters. 
We have set the block group 2166 - * to RO above, which prevents reservations from happening but 2167 - * we may have existing reservations for which allocation has 2168 - * not yet been done - btrfs_update_block_group() was not yet 2169 - * called, which is where we will transfer a reserved extent's 2170 - * size from the "reserved" counter to the "used" counter - this 2171 - * happens when running delayed references. When we relocate the 2172 - * chunk below, relocation first flushes delalloc, waits for 2173 - * ordered extent completion (which is where we create delayed 2174 - * references for data extents) and commits the current 2175 - * transaction (which runs delayed references), and only after 2176 - * it does the actual work to move extents out of the block 2177 - * group. So the reported amount of reclaimed bytes is 2178 - * effectively the sum of the 'used' and 'reserved' counters. 2179 - */ 2180 - spin_lock(&bg->lock); 2181 - used = bg->used; 2182 - reserved = bg->reserved; 2183 - spin_unlock(&bg->lock); 2184 - 2185 - trace_btrfs_reclaim_block_group(bg); 2186 - ret = btrfs_relocate_chunk(fs_info, bg->start, false); 2187 - if (ret) { 2188 - btrfs_dec_block_group_ro(bg); 2189 - btrfs_err(fs_info, "error relocating chunk %llu", 2190 - bg->start); 2191 - used = 0; 2192 - reserved = 0; 2193 - spin_lock(&space_info->lock); 2194 - space_info->reclaim_errors++; 2195 - spin_unlock(&space_info->lock); 2196 - } 2197 - spin_lock(&space_info->lock); 2198 - space_info->reclaim_count++; 2199 - space_info->reclaim_bytes += used; 2200 - space_info->reclaim_bytes += reserved; 2201 - if (space_info->total_bytes < old_total) 2202 - btrfs_set_periodic_reclaim_ready(space_info, true); 2203 - spin_unlock(&space_info->lock); 2204 - 2205 - next: 2206 1942 if (ret && !READ_ONCE(space_info->periodic_reclaim)) 2207 1943 btrfs_link_bg_list(bg, &retry_list); 2208 1944 btrfs_put_block_group(bg); ··· 2102 2074 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) 2103 2075 goto end; 2104 
2076 spin_lock(&fs_info->unused_bgs_lock); 2077 + if (reclaimed >= limit) 2078 + break; 2105 2079 } 2106 2080 spin_unlock(&fs_info->unused_bgs_lock); 2107 2081 mutex_unlock(&fs_info->reclaim_bgs_lock); ··· 2112 2082 list_splice_tail(&retry_list, &fs_info->reclaim_bgs); 2113 2083 spin_unlock(&fs_info->unused_bgs_lock); 2114 2084 btrfs_exclop_finish(fs_info); 2085 + } 2086 + 2087 + void btrfs_reclaim_bgs_work(struct work_struct *work) 2088 + { 2089 + struct btrfs_fs_info *fs_info = 2090 + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); 2091 + 2092 + btrfs_reclaim_block_groups(fs_info, -1); 2115 2093 } 2116 2094 2117 2095 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) ··· 2260 2222 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) 2261 2223 io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); 2262 2224 2263 - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); 2225 + buf = kzalloc_objs(u64, map->num_stripes, GFP_NOFS); 2264 2226 if (!buf) { 2265 2227 ret = -ENOMEM; 2266 2228 goto out; ··· 2576 2538 btrfs_mark_bg_unused(cache); 2577 2539 } 2578 2540 } else { 2579 - inc_block_group_ro(cache, 1); 2541 + inc_block_group_ro(cache, true); 2580 2542 } 2581 2543 2582 2544 return 0; ··· 2732 2694 list_for_each_entry(cache, 2733 2695 &space_info->block_groups[BTRFS_RAID_RAID0], 2734 2696 list) 2735 - inc_block_group_ro(cache, 1); 2697 + inc_block_group_ro(cache, true); 2736 2698 list_for_each_entry(cache, 2737 2699 &space_info->block_groups[BTRFS_RAID_SINGLE], 2738 2700 list) 2739 - inc_block_group_ro(cache, 1); 2701 + inc_block_group_ro(cache, true); 2740 2702 } 2741 2703 2742 2704 btrfs_init_global_block_rsv(info); ··· 3125 3087 */ 3126 3088 if (sb_rdonly(fs_info->sb)) { 3127 3089 mutex_lock(&fs_info->ro_block_group_mutex); 3128 - ret = inc_block_group_ro(cache, 0); 3090 + ret = inc_block_group_ro(cache, false); 3129 3091 mutex_unlock(&fs_info->ro_block_group_mutex); 3130 3092 return ret; 3131 3093 } ··· 3176 3138 } 3177 3139 } 3178 3140 3179 
- ret = inc_block_group_ro(cache, 0); 3141 + ret = inc_block_group_ro(cache, false); 3180 3142 if (!ret) 3181 3143 goto out; 3182 3144 if (ret == -ETXTBSY) ··· 3203 3165 if (ret < 0) 3204 3166 goto out; 3205 3167 3206 - ret = inc_block_group_ro(cache, 0); 3168 + ret = inc_block_group_ro(cache, false); 3207 3169 if (ret == -ETXTBSY) 3208 3170 goto unlock_out; 3209 3171 out: ··· 3343 3305 3344 3306 } 3345 3307 3346 - static int cache_save_setup(struct btrfs_block_group *block_group, 3347 - struct btrfs_trans_handle *trans, 3348 - struct btrfs_path *path) 3308 + static void cache_save_setup(struct btrfs_block_group *block_group, 3309 + struct btrfs_trans_handle *trans, 3310 + struct btrfs_path *path) 3349 3311 { 3350 3312 struct btrfs_fs_info *fs_info = block_group->fs_info; 3351 3313 struct inode *inode = NULL; ··· 3357 3319 int ret = 0; 3358 3320 3359 3321 if (!btrfs_test_opt(fs_info, SPACE_CACHE)) 3360 - return 0; 3322 + return; 3361 3323 3362 3324 /* 3363 3325 * If this block group is smaller than 100 megs don't bother caching the ··· 3367 3329 spin_lock(&block_group->lock); 3368 3330 block_group->disk_cache_state = BTRFS_DC_WRITTEN; 3369 3331 spin_unlock(&block_group->lock); 3370 - return 0; 3332 + return; 3371 3333 } 3372 3334 3373 3335 if (TRANS_ABORTED(trans)) 3374 - return 0; 3336 + return; 3375 3337 again: 3376 3338 inode = lookup_free_space_inode(block_group, path); 3377 3339 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { ··· 3381 3343 } 3382 3344 3383 3345 if (IS_ERR(inode)) { 3384 - BUG_ON(retries); 3346 + if (retries) { 3347 + ret = PTR_ERR(inode); 3348 + btrfs_err(fs_info, 3349 + "failed to lookup free space inode after creation for block group %llu: %d", 3350 + block_group->start, ret); 3351 + goto out_free; 3352 + } 3385 3353 retries++; 3386 3354 3387 3355 if (block_group->ro) ··· 3458 3414 * We hit an ENOSPC when setting up the cache in this transaction, just 3459 3415 * skip doing the setup, we've already cleared the cache so we're safe. 
3460 3416 */ 3461 - if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { 3462 - ret = -ENOSPC; 3417 + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) 3463 3418 goto out_put; 3464 - } 3465 3419 3466 3420 /* 3467 3421 * Try to preallocate enough space based on how big the block group is. ··· 3507 3465 spin_unlock(&block_group->lock); 3508 3466 3509 3467 extent_changeset_free(data_reserved); 3510 - return ret; 3511 3468 } 3512 3469 3513 3470 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)

+1

fs/btrfs/block-group.h

··· 350 350 struct btrfs_chunk_map *map); 351 351 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); 352 352 void btrfs_mark_bg_unused(struct btrfs_block_group *bg); 353 + void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit); 353 354 void btrfs_reclaim_bgs_work(struct work_struct *work); 354 355 void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); 355 356 void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);

+25

fs/btrfs/block-rsv.c

··· 541 541 BTRFS_RESERVE_NO_FLUSH); 542 542 if (!ret) 543 543 return block_rsv; 544 + 545 + /* 546 + * If we are being used for updating a log tree, fail immediately, which 547 + * makes the fsync fallback to a transaction commit. 548 + * 549 + * We don't want to consume from the global block reserve, as that is 550 + * precious space that may be needed to do updates to some trees for 551 + * which we don't reserve space during a transaction commit (update root 552 + * items in the root tree, device stat items in the device tree and 553 + * quota tree updates, see btrfs_init_root_block_rsv()), or to fallback 554 + * to in case we did not reserve enough space to run delayed items, 555 + * delayed references, or anything else we need in order to avoid a 556 + * transaction abort. 557 + * 558 + * We also don't want to do a reservation in flush emergency mode, as 559 + * we end up using metadata that could be critical to allow a 560 + * transaction to complete successfully and therefore increase the 561 + * chances for a transaction abort. 562 + * 563 + * Log trees are an optimization and should never consume from the 564 + * global reserve or be allowed overcommitting metadata. 565 + */ 566 + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) 567 + return ERR_PTR(ret); 568 + 544 569 /* 545 570 * If we couldn't reserve metadata bytes try and use some from 546 571 * the global reserve if its space type is the same as the global

+35 -10

fs/btrfs/compression.c

··· 180 180 /* 181 181 * Common wrappers for page allocation from compression wrappers 182 182 */ 183 - struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) 183 + struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp) 184 184 { 185 185 struct folio *folio = NULL; 186 186 ··· 200 200 return folio; 201 201 202 202 alloc: 203 - return folio_alloc(GFP_NOFS, fs_info->block_min_order); 203 + return folio_alloc(gfp, fs_info->block_min_order); 204 204 } 205 205 206 206 void btrfs_free_compr_folio(struct folio *folio) ··· 292 292 struct compressed_bio *cb = to_compressed_bio(bbio); 293 293 struct folio_iter fi; 294 294 295 - btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, 295 + btrfs_finish_ordered_extent(cb->bbio.ordered, cb->start, cb->len, 296 296 cb->bbio.bio.bi_status == BLK_STS_OK); 297 297 298 298 if (cb->writeback) ··· 330 330 cb->start = ordered->file_offset; 331 331 cb->len = ordered->num_bytes; 332 332 ASSERT(cb->bbio.bio.bi_iter.bi_size == ordered->disk_num_bytes); 333 - cb->compressed_len = ordered->disk_num_bytes; 334 333 cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; 335 334 cb->bbio.ordered = ordered; 336 335 ··· 368 369 static noinline int add_ra_bio_pages(struct inode *inode, 369 370 u64 compressed_end, 370 371 struct compressed_bio *cb, 371 - int *memstall, unsigned long *pflags) 372 + int *memstall, unsigned long *pflags, 373 + bool direct_reclaim) 372 374 { 373 375 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 374 376 pgoff_t end_index; ··· 377 377 u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; 378 378 u64 isize = i_size_read(inode); 379 379 int ret; 380 + gfp_t constraint_gfp, cache_gfp; 380 381 struct folio *folio; 381 382 struct extent_map *em; 382 383 struct address_space *mapping = inode->i_mapping; ··· 406 405 return 0; 407 406 408 407 end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; 408 + 409 + /* 410 + * Avoid direct reclaim 
when the caller does not allow it. Since 411 + * add_ra_bio_pages() is always speculative, suppress allocation warnings 412 + * in either case. 413 + */ 414 + if (!direct_reclaim) { 415 + constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 416 + cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 417 + } else { 418 + constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; 419 + cache_gfp = GFP_NOFS | __GFP_NOWARN; 420 + } 409 421 410 422 while (cur < compressed_end) { 411 423 pgoff_t page_end; ··· 449 435 continue; 450 436 } 451 437 452 - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), 438 + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), 453 439 0, NULL); 454 440 if (!folio) 455 441 break; 456 442 457 - if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { 443 + if (filemap_add_folio(mapping, folio, pg_index, cache_gfp)) { 458 444 /* There is already a page, skip to page end */ 459 445 cur += folio_size(folio); 460 446 folio_put(folio); ··· 547 533 unsigned int compressed_len; 548 534 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 549 535 u64 file_offset = bbio->file_offset; 536 + gfp_t gfp; 550 537 u64 em_len; 551 538 u64 em_start; 552 539 struct extent_map *em; 553 540 unsigned long pflags; 554 541 int memstall = 0; 555 542 int ret; 543 + 544 + /* 545 + * If this is a readahead bio, prevent direct reclaim. This is done to 546 + * avoid stalling on speculative allocations when memory pressure is 547 + * high. The demand fault will retry with GFP_NOFS and enter direct 548 + * reclaim if needed. 
549 + */ 550 + if (bbio->bio.bi_opf & REQ_RAHEAD) 551 + gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; 552 + else 553 + gfp = GFP_NOFS; 556 554 557 555 /* we need the actual starting offset of this extent in the file */ 558 556 read_lock(&em_tree->lock); ··· 586 560 em_start = em->start; 587 561 588 562 cb->len = bbio->bio.bi_iter.bi_size; 589 - cb->compressed_len = compressed_len; 590 563 cb->compress_type = btrfs_extent_map_compression(em); 591 564 cb->orig_bbio = bbio; 592 565 cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; ··· 596 571 struct folio *folio; 597 572 u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); 598 573 599 - folio = btrfs_alloc_compr_folio(fs_info); 574 + folio = btrfs_alloc_compr_folio(fs_info, gfp); 600 575 if (!folio) { 601 576 ret = -ENOMEM; 602 577 goto out_free_bio; ··· 612 587 ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); 613 588 614 589 add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, 615 - &pflags); 590 + &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD)); 616 591 617 592 cb->len = bbio->bio.bi_iter.bi_size; 618 593 cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector;

+4 -4

fs/btrfs/compression.h

··· 36 36 #define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) 37 37 static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); 38 38 39 + /* The max size for a single worker to compress. */ 40 + #define BTRFS_COMPRESSION_CHUNK_SIZE (SZ_512K) 41 + 39 42 /* Maximum size of data before compression */ 40 43 #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) 41 44 ··· 50 47 51 48 /* Number of bytes in the inode we're working on */ 52 49 unsigned int len; 53 - 54 - /* Number of bytes on disk */ 55 - unsigned int compressed_len; 56 50 57 51 /* The compression algorithm for this bio */ 58 52 u8 compress_type; ··· 98 98 99 99 int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); 100 100 101 - struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); 101 + struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp); 102 102 void btrfs_free_compr_folio(struct folio *folio); 103 103 104 104 struct workspace_manager {

+24 -23

fs/btrfs/ctree.c

··· 21 21 #include "fs.h" 22 22 #include "accessors.h" 23 23 #include "extent-tree.h" 24 + #include "extent_io.h" 24 25 #include "relocation.h" 25 26 #include "file-item.h" 26 27 ··· 591 590 btrfs_tree_unlock(buf); 592 591 free_extent_buffer_stale(buf); 593 592 btrfs_mark_buffer_dirty(trans, cow); 593 + 594 + btrfs_inhibit_eb_writeback(trans, cow); 595 + 594 596 *cow_ret = cow; 595 597 return 0; 596 598 ··· 603 599 return ret; 604 600 } 605 601 606 - static inline bool should_cow_block(const struct btrfs_trans_handle *trans, 602 + static inline bool should_cow_block(struct btrfs_trans_handle *trans, 607 603 const struct btrfs_root *root, 608 - const struct extent_buffer *buf) 604 + struct extent_buffer *buf) 609 605 { 610 606 if (btrfs_is_testing(root->fs_info)) 611 607 return false; ··· 639 635 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) 640 636 return true; 641 637 638 + btrfs_inhibit_eb_writeback(trans, buf); 642 639 return false; 643 640 } 644 641 ··· 767 762 768 763 while (low < high) { 769 764 const int unit_size = eb->folio_size; 770 - unsigned long oil; 765 + unsigned long oif; 771 766 unsigned long offset; 772 767 struct btrfs_disk_key *tmp; 773 768 struct btrfs_disk_key unaligned; 774 - int mid; 769 + u32 mid; 775 770 776 771 mid = (low + high) / 2; 777 772 offset = p + mid * item_size; 778 - oil = get_eb_offset_in_folio(eb, offset); 773 + oif = get_eb_offset_in_folio(eb, offset); 779 774 780 - if (oil + key_size <= unit_size) { 775 + if (oif + key_size <= unit_size) { 781 776 const unsigned long idx = get_eb_folio_index(eb, offset); 782 777 char *kaddr = folio_address(eb->folios[idx]); 783 778 784 - oil = get_eb_offset_in_folio(eb, offset); 785 - tmp = (struct btrfs_disk_key *)(kaddr + oil); 779 + tmp = (struct btrfs_disk_key *)(kaddr + oif); 786 780 } else { 787 781 read_extent_buffer(eb, &unaligned, offset, key_size); 788 782 tmp = &unaligned; ··· 826 822 { 827 823 int level = btrfs_header_level(parent); 828 824 struct btrfs_tree_parent_check 
check = { 0 }; 829 - struct extent_buffer *eb; 830 825 831 826 if (slot < 0 || slot >= btrfs_header_nritems(parent)) 832 827 return ERR_PTR(-ENOENT); ··· 838 835 check.has_first_key = true; 839 836 btrfs_node_key_to_cpu(parent, &check.first_key, slot); 840 837 841 - eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 842 - &check); 843 - if (IS_ERR(eb)) 844 - return eb; 845 - if (unlikely(!extent_buffer_uptodate(eb))) { 846 - free_extent_buffer(eb); 847 - return ERR_PTR(-EIO); 848 - } 849 - 850 - return eb; 838 + return read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), 839 + &check); 851 840 } 852 841 853 842 /* ··· 1498 1503 reada_for_search(fs_info, p, parent_level, slot, key->objectid); 1499 1504 1500 1505 /* first we do an atomic uptodate check */ 1501 - if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { 1506 + if (btrfs_buffer_uptodate(tmp, check.transid, NULL) > 0) { 1502 1507 /* 1503 1508 * Do extra check for first_key, eb can be stale due to 1504 1509 * being cached, read from scrub, or have multiple ··· 2101 2106 p->nodes[level + 1])) { 2102 2107 write_lock_level = level + 1; 2103 2108 btrfs_release_path(p); 2109 + trace_btrfs_search_slot_restart(root, level, "write_lock"); 2104 2110 goto again; 2105 2111 } 2106 2112 ··· 2164 2168 p->slots[level] = slot; 2165 2169 ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, 2166 2170 &write_lock_level); 2167 - if (ret2 == -EAGAIN) 2171 + if (ret2 == -EAGAIN) { 2172 + trace_btrfs_search_slot_restart(root, level, "setup_nodes"); 2168 2173 goto again; 2174 + } 2169 2175 if (ret2) { 2170 2176 ret = ret2; 2171 2177 goto done; ··· 2183 2185 if (slot == 0 && ins_len && write_lock_level < level + 1) { 2184 2186 write_lock_level = level + 1; 2185 2187 btrfs_release_path(p); 2188 + trace_btrfs_search_slot_restart(root, level, "slot_zero"); 2186 2189 goto again; 2187 2190 } 2188 2191 ··· 2197 2198 } 2198 2199 2199 2200 ret2 = read_block_for_search(root, p, &b, slot, 
key); 2200 - if (ret2 == -EAGAIN && !p->nowait) 2201 + if (ret2 == -EAGAIN && !p->nowait) { 2202 + trace_btrfs_search_slot_restart(root, level, "read_block"); 2201 2203 goto again; 2204 + } 2202 2205 if (ret2) { 2203 2206 ret = ret2; 2204 2207 goto done; ··· 3897 3896 goto err; 3898 3897 } 3899 3898 3900 - ret = split_leaf(trans, root, &key, path, ins_len, 1); 3899 + ret = split_leaf(trans, root, &key, path, ins_len, true); 3901 3900 if (ret) 3902 3901 goto err; 3903 3902

+1 -2

fs/btrfs/delayed-inode.c

··· 596 596 */ 597 597 if (!src_rsv || (!trans->bytes_reserved && 598 598 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { 599 - ret = btrfs_qgroup_reserve_meta(root, num_bytes, 600 - BTRFS_QGROUP_RSV_META_PREALLOC, true); 599 + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true, true); 601 600 if (ret < 0) 602 601 return ret; 603 602 ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes,

+28

fs/btrfs/delayed-ref.c

··· 207 207 * This will refill the delayed block_rsv up to 1 items size worth of space and 208 208 * will return -ENOSPC if we can't make the reservation. 209 209 */ 210 + static int btrfs_zoned_cap_metadata_reservation(struct btrfs_space_info *space_info) 211 + { 212 + struct btrfs_fs_info *fs_info = space_info->fs_info; 213 + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; 214 + u64 usable; 215 + u64 cap; 216 + int ret = 0; 217 + 218 + if (!btrfs_is_zoned(fs_info)) 219 + return 0; 220 + 221 + spin_lock(&space_info->lock); 222 + usable = space_info->total_bytes - space_info->bytes_zone_unusable; 223 + spin_unlock(&space_info->lock); 224 + cap = usable >> 1; 225 + 226 + spin_lock(&block_rsv->lock); 227 + if (block_rsv->size > cap) 228 + ret = -EAGAIN; 229 + spin_unlock(&block_rsv->lock); 230 + 231 + return ret; 232 + } 233 + 210 234 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, 211 235 enum btrfs_reserve_flush_enum flush) 212 236 { ··· 251 227 252 228 if (!num_bytes) 253 229 return 0; 230 + 231 + ret = btrfs_zoned_cap_metadata_reservation(space_info); 232 + if (ret) 233 + return ret; 254 234 255 235 ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush); 256 236 if (ret)

+2 -2

fs/btrfs/dev-replace.c

··· 697 697 /* the disk copy procedure reuses the scrub code */ 698 698 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, 699 699 btrfs_device_get_total_bytes(src_device), 700 - &dev_replace->scrub_progress, 0, 1); 700 + &dev_replace->scrub_progress, false, true); 701 701 702 702 ret = btrfs_dev_replace_finishing(fs_info, ret); 703 703 if (ret == -EINPROGRESS) ··· 1255 1255 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, 1256 1256 dev_replace->committed_cursor_left, 1257 1257 btrfs_device_get_total_bytes(dev_replace->srcdev), 1258 - &dev_replace->scrub_progress, 0, 1); 1258 + &dev_replace->scrub_progress, false, true); 1259 1259 ret = btrfs_dev_replace_finishing(fs_info, ret); 1260 1260 WARN_ON(ret && ret != -ECANCELED); 1261 1261

+1 -3

fs/btrfs/dir-item.c

··· 253 253 /* Nothing found, we're safe */ 254 254 if (ret == -ENOENT) 255 255 return 0; 256 - 257 - if (ret < 0) 258 - return ret; 256 + return ret; 259 257 } 260 258 261 259 /* we found an item, look for our name in the item */

+5 -6

fs/btrfs/direct-io.c

··· 107 107 test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) 108 108 btrfs_start_ordered_extent(ordered); 109 109 else 110 - ret = nowait ? -EAGAIN : -ENOTBLK; 110 + ret = -ENOTBLK; 111 111 btrfs_put_ordered_extent(ordered); 112 112 } else { 113 113 /* ··· 625 625 pos += submitted; 626 626 length -= submitted; 627 627 if (write) 628 - btrfs_finish_ordered_extent(dio_data->ordered, NULL, 628 + btrfs_finish_ordered_extent(dio_data->ordered, 629 629 pos, length, false); 630 630 else 631 631 btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, ··· 657 657 } 658 658 659 659 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 660 - btrfs_finish_ordered_extent(bbio->ordered, NULL, 661 - dip->file_offset, dip->bytes, 662 - !bio->bi_status); 660 + btrfs_finish_ordered_extent(bbio->ordered, dip->file_offset, 661 + dip->bytes, !bio->bi_status); 663 662 } else { 664 663 btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, 665 664 dip->file_offset + dip->bytes - 1, NULL); ··· 734 735 735 736 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); 736 737 if (ret) { 737 - btrfs_finish_ordered_extent(dio_data->ordered, NULL, 738 + btrfs_finish_ordered_extent(dio_data->ordered, 738 739 file_offset, dip->bytes, 739 740 !ret); 740 741 bio->bi_status = errno_to_blk_status(ret);

+112 -135

fs/btrfs/disk-io.c

··· 50 50 #include "relocation.h" 51 51 #include "scrub.h" 52 52 #include "super.h" 53 - #include "delayed-inode.h" 54 53 55 54 #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ 56 55 BTRFS_HEADER_FLAG_RELOC |\ ··· 109 110 * detect blocks that either didn't get written at all or got written 110 111 * in the wrong place. 111 112 */ 112 - int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) 113 + int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, 114 + const struct btrfs_tree_parent_check *check) 113 115 { 114 116 if (!extent_buffer_uptodate(eb)) 115 117 return 0; 116 118 117 - if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 119 + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) { 120 + /* 121 + * On a cache hit, the caller may still need tree parent 122 + * verification before reusing the buffer. 123 + */ 124 + if (unlikely(check && btrfs_verify_level_key(eb, check))) 125 + return -EUCLEAN; 118 126 return 1; 127 + } 119 128 120 - if (atomic) 121 - return -EAGAIN; 122 - 123 - if (!extent_buffer_uptodate(eb) || 124 - btrfs_header_generation(eb) != parent_transid) { 129 + if (btrfs_header_generation(eb) != parent_transid) { 125 130 btrfs_err_rl(eb->fs_info, 126 131 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", 127 132 eb->start, eb->read_mirror, ··· 733 730 } 734 731 735 732 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, 736 - struct btrfs_key *key) 733 + const struct btrfs_key *key) 737 734 { 738 735 struct rb_node *node; 739 736 struct btrfs_root *root = NULL; ··· 770 767 771 768 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) 772 769 { 773 - struct btrfs_key key = { 770 + const struct btrfs_key key = { 774 771 .objectid = BTRFS_CSUM_TREE_OBJECTID, 775 772 .type = BTRFS_ROOT_ITEM_KEY, 776 773 .offset = btrfs_global_root_id(fs_info, bytenr), ··· 781 778 782 779 struct btrfs_root 
*btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) 783 780 { 784 - struct btrfs_key key = { 781 + const struct btrfs_key key = { 785 782 .objectid = BTRFS_EXTENT_TREE_OBJECTID, 786 783 .type = BTRFS_ROOT_ITEM_KEY, 787 784 .offset = btrfs_global_root_id(fs_info, bytenr), ··· 997 994 root->node = NULL; 998 995 goto fail; 999 996 } 1000 - if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { 1001 - ret = -EIO; 997 + 998 + ret = btrfs_buffer_uptodate(root->node, generation, &check); 999 + if (unlikely(ret <= 0)) { 1000 + if (ret == 0) 1001 + ret = -EIO; 1002 1002 goto fail; 1003 1003 } 1004 1004 ··· 1556 1550 wake_up_process(fs_info->cleaner_kthread); 1557 1551 mutex_unlock(&fs_info->transaction_kthread_mutex); 1558 1552 1559 - if (BTRFS_FS_ERROR(fs_info)) 1553 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 1560 1554 btrfs_cleanup_transaction(fs_info); 1561 1555 if (!kthread_should_stop() && 1562 1556 (!btrfs_transaction_blocked(fs_info) || ··· 2031 2025 btrfs_put_root(log_tree_root); 2032 2026 return ret; 2033 2027 } 2034 - if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { 2035 - btrfs_err(fs_info, "failed to read log tree"); 2036 - btrfs_put_root(log_tree_root); 2037 - return -EIO; 2038 - } 2039 2028 2040 2029 /* returns with log_tree_root freed on success */ 2041 2030 ret = btrfs_recover_log_trees(log_tree_root); ··· 2300 2299 return -EUCLEAN; 2301 2300 } 2302 2301 2302 + /* It must hold at least one key and one chunk. 
*/ 2303 + if (unlikely(sys_array_size < sizeof(struct btrfs_disk_key) + 2304 + sizeof(struct btrfs_chunk))) { 2305 + btrfs_err(fs_info, "system chunk array too small %u < %zu", 2306 + sys_array_size, 2307 + sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); 2308 + return -EUCLEAN; 2309 + } 2310 + 2303 2311 while (cur < sys_array_size) { 2304 2312 struct btrfs_disk_key *disk_key; 2305 2313 struct btrfs_chunk *chunk; ··· 2375 2365 int ret = 0; 2376 2366 const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); 2377 2367 2378 - if (btrfs_super_magic(sb) != BTRFS_MAGIC) { 2368 + if (unlikely(btrfs_super_magic(sb) != BTRFS_MAGIC)) { 2379 2369 btrfs_err(fs_info, "no valid FS found"); 2380 2370 ret = -EINVAL; 2381 2371 } 2382 - if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { 2372 + if (unlikely(btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { 2383 2373 if (!ignore_flags) { 2384 2374 btrfs_err(fs_info, 2385 2375 "unrecognized or unsupported super flag 0x%llx", ··· 2391 2381 btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); 2392 2382 } 2393 2383 } 2394 - if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { 2384 + if (unlikely(btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL)) { 2395 2385 btrfs_err(fs_info, "tree_root level too big: %d >= %d", 2396 2386 btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); 2397 2387 ret = -EINVAL; 2398 2388 } 2399 - if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { 2389 + if (unlikely(btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL)) { 2400 2390 btrfs_err(fs_info, "chunk_root level too big: %d >= %d", 2401 2391 btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); 2402 2392 ret = -EINVAL; 2403 2393 } 2404 - if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { 2394 + if (unlikely(btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL)) { 2405 2395 btrfs_err(fs_info, "log_root level too big: %d >= %d", 2406 2396 btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); 2407 2397 ret = -EINVAL; ··· 2411 2401 * Check 
sectorsize and nodesize first, other check will need it. 2412 2402 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. 2413 2403 */ 2414 - if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || 2415 - sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { 2404 + if (unlikely(!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || 2405 + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE)) { 2416 2406 btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); 2417 2407 ret = -EINVAL; 2418 2408 } 2419 2409 2420 - if (!btrfs_supported_blocksize(sectorsize)) { 2410 + if (unlikely(!btrfs_supported_blocksize(sectorsize))) { 2421 2411 btrfs_err(fs_info, 2422 2412 "sectorsize %llu not yet supported for page size %lu", 2423 2413 sectorsize, PAGE_SIZE); 2424 2414 ret = -EINVAL; 2425 2415 } 2426 2416 2427 - if (!is_power_of_2(nodesize) || nodesize < sectorsize || 2428 - nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { 2417 + if (unlikely(!is_power_of_2(nodesize) || nodesize < sectorsize || 2418 + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE)) { 2429 2419 btrfs_err(fs_info, "invalid nodesize %llu", nodesize); 2430 2420 ret = -EINVAL; 2431 2421 } 2432 - if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { 2422 + if (unlikely(nodesize != le32_to_cpu(sb->__unused_leafsize))) { 2433 2423 btrfs_err(fs_info, "invalid leafsize %u, should be %llu", 2434 2424 le32_to_cpu(sb->__unused_leafsize), nodesize); 2435 2425 ret = -EINVAL; 2436 2426 } 2437 2427 2438 2428 /* Root alignment check */ 2439 - if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { 2429 + if (unlikely(!IS_ALIGNED(btrfs_super_root(sb), sectorsize))) { 2440 2430 btrfs_err(fs_info, "tree_root block unaligned: %llu", 2441 2431 btrfs_super_root(sb)); 2442 2432 ret = -EINVAL; 2443 2433 } 2444 - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { 2434 + if (unlikely(!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize))) { 2445 2435 btrfs_err(fs_info, "chunk_root block unaligned: %llu", 2446 2436 
btrfs_super_chunk_root(sb)); 2447 2437 ret = -EINVAL; 2448 2438 } 2449 - if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { 2439 + if (unlikely(!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize))) { 2450 2440 btrfs_err(fs_info, "log_root block unaligned: %llu", 2451 2441 btrfs_super_log_root(sb)); 2452 2442 ret = -EINVAL; 2453 2443 } 2454 2444 2455 - if (!fs_info->fs_devices->temp_fsid && 2456 - memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { 2445 + if (unlikely(!fs_info->fs_devices->temp_fsid && 2446 + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0)) { 2457 2447 btrfs_err(fs_info, 2458 2448 "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", 2459 2449 sb->fsid, fs_info->fs_devices->fsid); 2460 2450 ret = -EINVAL; 2461 2451 } 2462 2452 2463 - if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), 2464 - BTRFS_FSID_SIZE) != 0) { 2453 + if (unlikely(memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), 2454 + BTRFS_FSID_SIZE) != 0)) { 2465 2455 btrfs_err(fs_info, 2466 2456 "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", 2467 2457 btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); 2468 2458 ret = -EINVAL; 2469 2459 } 2470 2460 2471 - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, 2472 - BTRFS_FSID_SIZE) != 0) { 2461 + if (unlikely(memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, 2462 + BTRFS_FSID_SIZE) != 0)) { 2473 2463 btrfs_err(fs_info, 2474 2464 "dev_item UUID does not match metadata fsid: %pU != %pU", 2475 2465 fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); ··· 2480 2470 * Artificial requirement for block-group-tree to force newer features 2481 2471 * (free-space-tree, no-holes) so the test matrix is smaller. 
2482 2472 */ 2483 - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && 2484 - (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || 2485 - !btrfs_fs_incompat(fs_info, NO_HOLES))) { 2473 + if (unlikely(btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && 2474 + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || 2475 + !btrfs_fs_incompat(fs_info, NO_HOLES)))) { 2486 2476 btrfs_err(fs_info, 2487 2477 "block-group-tree feature requires free-space-tree and no-holes"); 2488 2478 ret = -EINVAL; ··· 2493 2483 * Reduce test matrix for remap tree by requiring block-group-tree 2494 2484 * and no-holes. Free-space-tree is a hard requirement. 2495 2485 */ 2496 - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || 2497 - !btrfs_fs_incompat(fs_info, NO_HOLES) || 2498 - !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { 2486 + if (unlikely(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || 2487 + !btrfs_fs_incompat(fs_info, NO_HOLES) || 2488 + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))) { 2499 2489 btrfs_err(fs_info, 2500 2490 "remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); 2501 2491 ret = -EINVAL; 2502 2492 } 2503 2493 2504 - if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 2494 + if (unlikely(btrfs_fs_incompat(fs_info, MIXED_GROUPS))) { 2505 2495 btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); 2506 2496 ret = -EINVAL; 2507 2497 } 2508 2498 2509 - if (btrfs_fs_incompat(fs_info, ZONED)) { 2499 + if (unlikely(btrfs_fs_incompat(fs_info, ZONED))) { 2510 2500 btrfs_err(fs_info, "remap-tree not supported with zoned devices"); 2511 2501 ret = -EINVAL; 2512 2502 } 2513 2503 2514 - if (sectorsize > PAGE_SIZE) { 2504 + if (unlikely(sectorsize > PAGE_SIZE)) { 2515 2505 btrfs_err(fs_info, "remap-tree not supported when block size > page size"); 2516 2506 ret = -EINVAL; 2517 2507 } ··· 2521 2511 * Hint to catch really bogus numbers, bitflips or so, more exact checks are 2522 2512 * done later 2523 2513 */ 2524 - if 
(btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { 2514 + if (unlikely(btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb))) { 2525 2515 btrfs_err(fs_info, "bytes_used is too small %llu", 2526 2516 btrfs_super_bytes_used(sb)); 2527 2517 ret = -EINVAL; 2528 2518 } 2529 - if (!is_power_of_2(btrfs_super_stripesize(sb))) { 2519 + if (unlikely(!is_power_of_2(btrfs_super_stripesize(sb)))) { 2530 2520 btrfs_err(fs_info, "invalid stripesize %u", 2531 2521 btrfs_super_stripesize(sb)); 2532 2522 ret = -EINVAL; 2533 2523 } 2534 - if (btrfs_super_num_devices(sb) > (1UL << 31)) 2524 + if (unlikely(btrfs_super_num_devices(sb) > (1UL << 31))) 2535 2525 btrfs_warn(fs_info, "suspicious number of devices: %llu", 2536 2526 btrfs_super_num_devices(sb)); 2537 - if (btrfs_super_num_devices(sb) == 0) { 2527 + if (unlikely(btrfs_super_num_devices(sb) == 0)) { 2538 2528 btrfs_err(fs_info, "number of devices is 0"); 2539 2529 ret = -EINVAL; 2540 2530 } 2541 2531 2542 - if (mirror_num >= 0 && 2543 - btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { 2532 + if (unlikely(mirror_num >= 0 && 2533 + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num))) { 2544 2534 btrfs_err(fs_info, "super offset mismatch %llu != %llu", 2545 2535 btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); 2546 2536 ret = -EINVAL; 2547 2537 } 2548 2538 2549 - if (ret) 2539 + if (unlikely(ret)) 2550 2540 return ret; 2551 2541 2552 2542 ret = validate_sys_chunk_array(fs_info, sb); 2553 2543 2554 2544 /* 2555 - * Obvious sys_chunk_array corruptions, it must hold at least one key 2556 - * and one chunk 2557 - */ 2558 - if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { 2559 - btrfs_err(fs_info, "system chunk array too big %u > %u", 2560 - btrfs_super_sys_array_size(sb), 2561 - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); 2562 - ret = -EINVAL; 2563 - } 2564 - if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) 2565 - + sizeof(struct btrfs_chunk)) { 2566 - btrfs_err(fs_info, 
"system chunk array too small %u < %zu", 2567 - btrfs_super_sys_array_size(sb), 2568 - sizeof(struct btrfs_disk_key) 2569 - + sizeof(struct btrfs_chunk)); 2570 - ret = -EINVAL; 2571 - } 2572 - 2573 - /* 2574 2545 * The generation is a global counter, we'll trust it more than the others 2575 2546 * but it's still possible that it's the one that's wrong. 2576 2547 */ 2577 - if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) 2548 + if (unlikely(btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))) 2578 2549 btrfs_warn(fs_info, 2579 2550 "suspicious: generation < chunk_root_generation: %llu < %llu", 2580 2551 btrfs_super_generation(sb), 2581 2552 btrfs_super_chunk_root_generation(sb)); 2582 - if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) 2583 - && btrfs_super_cache_generation(sb) != (u64)-1) 2553 + if (unlikely(btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && 2554 + btrfs_super_cache_generation(sb) != (u64)-1)) 2584 2555 btrfs_warn(fs_info, 2585 2556 "suspicious: generation < cache_generation: %llu < %llu", 2586 2557 btrfs_super_generation(sb), ··· 2592 2601 int ret; 2593 2602 2594 2603 ret = btrfs_validate_super(fs_info, sb, -1); 2595 - if (ret < 0) 2604 + if (unlikely(ret < 0)) 2596 2605 goto out; 2597 2606 if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { 2598 2607 ret = -EUCLEAN; ··· 2609 2618 goto out; 2610 2619 } 2611 2620 out: 2612 - if (ret < 0) 2621 + if (unlikely(ret < 0)) 2613 2622 btrfs_err(fs_info, 2614 2623 "super block corruption detected before writing it to disk"); 2615 2624 return ret; ··· 2629 2638 ret = PTR_ERR(root->node); 2630 2639 root->node = NULL; 2631 2640 return ret; 2632 - } 2633 - if (unlikely(!extent_buffer_uptodate(root->node))) { 2634 - free_extent_buffer(root->node); 2635 - root->node = NULL; 2636 - return -EIO; 2637 2641 } 2638 2642 2639 2643 btrfs_set_root_node(&root->root_item, root->node); ··· 3660 3674 3661 3675 if (fs_info->uuid_root && 
3662 3676 (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || 3663 - fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { 3677 + !test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))) { 3664 3678 btrfs_info(fs_info, "checking UUID tree"); 3665 3679 ret = btrfs_check_uuid_tree(fs_info); 3666 3680 if (ret) { ··· 3752 3766 * Write superblock @sb to the @device. Do not wait for completion, all the 3753 3767 * folios we use for writing are locked. 3754 3768 * 3755 - * Write @max_mirrors copies of the superblock, where 0 means default that fit 3756 - * the expected device size at commit time. Note that max_mirrors must be 3769 + * Write @max_mirrors copies of the superblock. Note that max_mirrors must be 3757 3770 * same for write and wait phases. 3758 3771 * 3759 3772 * Return number of errors when folio is not found or submission fails. ··· 3767 3782 u64 bytenr, bytenr_orig; 3768 3783 3769 3784 atomic_set(&device->sb_write_errors, 0); 3770 - 3771 - if (max_mirrors == 0) 3772 - max_mirrors = BTRFS_SUPER_MIRROR_MAX; 3773 3785 3774 3786 for (i = 0; i < max_mirrors; i++) { 3775 3787 struct folio *folio; ··· 3852 3870 int ret; 3853 3871 u64 bytenr; 3854 3872 3855 - if (max_mirrors == 0) 3856 - max_mirrors = BTRFS_SUPER_MIRROR_MAX; 3857 - 3858 3873 for (i = 0; i < max_mirrors; i++) { 3859 3874 struct folio *folio; 3860 3875 3861 3876 ret = btrfs_sb_log_location(device, i, READ, &bytenr); 3862 3877 if (ret == -ENOENT) { 3863 3878 break; 3864 - } else if (ret < 0) { 3879 + } else if (unlikely(ret < 0)) { 3865 3880 errors++; 3866 3881 if (i == 0) 3867 3882 primary_failed = true; ··· 3880 3901 } 3881 3902 3882 3903 errors += atomic_read(&device->sb_write_errors); 3883 - if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) 3884 - primary_failed = true; 3885 - if (primary_failed) { 3904 + 3905 + if (unlikely(primary_failed || errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)) { 3886 3906 btrfs_err(device->fs_info, "error writing primary super block to device %llu", 3887 3907 
device->devid); 3888 3908 return -1; ··· 3932 3954 3933 3955 wait_for_completion_io(&device->flush_wait); 3934 3956 3935 - if (bio->bi_status) { 3957 + if (unlikely(bio->bi_status)) { 3936 3958 set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); 3937 3959 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); 3938 3960 return true; ··· 3970 3992 list_for_each_entry(dev, head, dev_list) { 3971 3993 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 3972 3994 continue; 3973 - if (!dev->bdev) { 3995 + if (unlikely(!dev->bdev)) { 3974 3996 errors_wait++; 3975 3997 continue; 3976 3998 } ··· 3978 4000 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 3979 4001 continue; 3980 4002 3981 - if (wait_dev_flush(dev)) 4003 + if (unlikely(wait_dev_flush(dev))) 3982 4004 errors_wait++; 3983 4005 } 3984 4006 ··· 4021 4043 return min_tolerated; 4022 4044 } 4023 4045 4024 - int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) 4046 + int write_all_supers(struct btrfs_trans_handle *trans) 4025 4047 { 4048 + struct btrfs_fs_info *fs_info = trans->fs_info; 4026 4049 struct list_head *head; 4027 4050 struct btrfs_device *dev; 4028 4051 struct btrfs_super_block *sb; 4029 4052 struct btrfs_dev_item *dev_item; 4053 + int max_mirrors; 4030 4054 int ret; 4031 4055 int do_barriers; 4032 4056 int max_errors; 4033 4057 int total_errors = 0; 4034 - u64 flags; 4035 4058 4036 4059 do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); 4037 4060 4038 - /* 4039 - * max_mirrors == 0 indicates we're from commit_transaction, 4040 - * not from fsync where the tree roots in fs_info have not 4041 - * been consistent on disk. 4042 - */ 4043 - if (max_mirrors == 0) { 4061 + if (trans->transaction->state < TRANS_STATE_UNBLOCKED) { 4062 + /* We are called from fsync. */ 4063 + max_mirrors = 1; 4064 + } else { 4065 + /* We are called from transaction commit. 
*/ 4066 + max_mirrors = BTRFS_SUPER_MIRROR_MAX; 4044 4067 ret = backup_super_roots(fs_info); 4045 4068 if (ret < 0) 4046 4069 return ret; ··· 4056 4077 4057 4078 if (do_barriers) { 4058 4079 ret = barrier_all_devices(fs_info); 4059 - if (ret) { 4080 + if (unlikely(ret)) { 4060 4081 mutex_unlock( 4061 4082 &fs_info->fs_devices->device_list_mutex); 4062 - btrfs_handle_fs_error(fs_info, ret, 4063 - "errors while submitting device barriers."); 4083 + btrfs_abort_transaction(trans, ret); 4084 + btrfs_err(fs_info, "error while submitting device barriers"); 4064 4085 return ret; 4065 4086 } 4066 4087 } 4067 4088 4089 + btrfs_set_super_flags(sb, btrfs_super_flags(sb) | BTRFS_HEADER_FLAG_WRITTEN); 4090 + 4068 4091 list_for_each_entry(dev, head, dev_list) { 4069 - if (!dev->bdev) { 4092 + if (unlikely(!dev->bdev)) { 4070 4093 total_errors++; 4071 4094 continue; 4072 4095 } ··· 4090 4109 memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, 4091 4110 BTRFS_FSID_SIZE); 4092 4111 4093 - flags = btrfs_super_flags(sb); 4094 - btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); 4095 - 4096 4112 ret = btrfs_validate_write_super(fs_info, sb); 4097 4113 if (unlikely(ret < 0)) { 4098 4114 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4099 - btrfs_handle_fs_error(fs_info, -EUCLEAN, 4100 - "unexpected superblock corruption detected"); 4101 - return -EUCLEAN; 4115 + btrfs_abort_transaction(trans, ret); 4116 + btrfs_err(fs_info, 4117 + "unexpected superblock corruption before writing it"); 4118 + return ret; 4102 4119 } 4103 4120 4104 4121 ret = write_dev_supers(dev, sb, max_mirrors); 4105 - if (ret) 4122 + if (unlikely(ret)) 4106 4123 total_errors++; 4107 4124 } 4108 4125 if (unlikely(total_errors > max_errors)) { ··· 4109 4130 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4110 4131 4111 4132 /* FUA is masked off if unsupported and can't be the reason */ 4112 - btrfs_handle_fs_error(fs_info, -EIO, 4113 - "%d errors while writing supers", 4114 - total_errors); 
4133 + btrfs_abort_transaction(trans, -EIO); 4134 + btrfs_err(fs_info, "%d errors while writing supers", total_errors); 4115 4135 return -EIO; 4116 4136 } 4117 4137 4118 4138 total_errors = 0; 4119 4139 list_for_each_entry(dev, head, dev_list) { 4120 - if (!dev->bdev) 4140 + if (unlikely(!dev->bdev)) 4121 4141 continue; 4122 4142 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || 4123 4143 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) 4124 4144 continue; 4125 4145 4126 4146 ret = wait_dev_supers(dev, max_mirrors); 4127 - if (ret) 4147 + if (unlikely(ret)) 4128 4148 total_errors++; 4129 4149 } 4130 4150 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 4131 4151 if (unlikely(total_errors > max_errors)) { 4132 - btrfs_handle_fs_error(fs_info, -EIO, 4133 - "%d errors while writing supers", 4134 - total_errors); 4152 + btrfs_abort_transaction(trans, -EIO); 4153 + btrfs_err(fs_info, "%d errors while writing supers", total_errors); 4135 4154 return -EIO; 4136 4155 } 4137 4156 return 0; ··· 4148 4171 drop_ref = true; 4149 4172 spin_unlock(&fs_info->fs_roots_radix_lock); 4150 4173 4151 - if (BTRFS_FS_ERROR(fs_info)) { 4174 + if (unlikely(BTRFS_FS_ERROR(fs_info))) { 4152 4175 ASSERT(root->log_root == NULL); 4153 4176 if (root->reloc_root) { 4154 4177 btrfs_put_root(root->reloc_root); ··· 4434 4457 4435 4458 btrfs_put_block_group_cache(fs_info); 4436 4459 4437 - /* 4438 - * we must make sure there is not any read request to 4439 - * submit after we stopping all workers. 
4440 - */ 4441 - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 4442 - btrfs_stop_all_workers(fs_info); 4443 - 4444 4460 /* We shouldn't have any transaction open at this point */ 4445 4461 warn_about_uncommitted_trans(fs_info); 4446 4462 4447 4463 clear_bit(BTRFS_FS_OPEN, &fs_info->flags); 4448 4464 free_root_pointers(fs_info, true); 4449 4465 btrfs_free_fs_roots(fs_info); 4466 + 4467 + /* 4468 + * We must make sure there is not any read request to 4469 + * submit after we stop all workers. 4470 + */ 4471 + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 4472 + btrfs_stop_all_workers(fs_info); 4450 4473 4451 4474 /* 4452 4475 * We must free the block groups after dropping the fs_roots as we could

+4 -3

fs/btrfs/disk-io.h

··· 58 58 int btrfs_validate_super(const struct btrfs_fs_info *fs_info, 59 59 const struct btrfs_super_block *sb, int mirror_num); 60 60 int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); 61 - int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); 61 + int write_all_supers(struct btrfs_trans_handle *trans); 62 62 int btrfs_commit_super(struct btrfs_fs_info *fs_info); 63 63 struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, 64 64 const struct btrfs_key *key); ··· 76 76 int btrfs_global_root_insert(struct btrfs_root *root); 77 77 void btrfs_global_root_delete(struct btrfs_root *root); 78 78 struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, 79 - struct btrfs_key *key); 79 + const struct btrfs_key *key); 80 80 struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); 81 81 struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); 82 82 ··· 107 107 void btrfs_put_root(struct btrfs_root *root); 108 108 void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, 109 109 struct extent_buffer *buf); 110 - int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); 110 + int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, 111 + const struct btrfs_tree_parent_check *check); 111 112 int btrfs_read_extent_buffer(struct extent_buffer *buf, 112 113 const struct btrfs_tree_parent_check *check); 113 114

+139 -45

fs/btrfs/extent-io-tree.c

··· 185 185 186 186 static int add_extent_changeset(struct extent_state *state, u32 bits, 187 187 struct extent_changeset *changeset, 188 - int set) 188 + bool set) 189 189 { 190 + int ret; 191 + 190 192 if (!changeset) 191 193 return 0; 192 194 if (set && (state->state & bits) == bits) 193 195 return 0; 194 196 if (!set && (state->state & bits) == 0) 195 197 return 0; 196 - changeset->bytes_changed += state->end - state->start + 1; 197 198 198 - return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); 199 + changeset->bytes_changed += state->end - state->start + 1; 200 + if (!extent_changeset_tracks_ranges(changeset)) 201 + return 0; 202 + 203 + ret = ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); 204 + if (ret < 0) 205 + return ret; 206 + return 0; 199 207 } 200 208 201 209 static inline struct extent_state *next_state(struct extent_state *state) ··· 334 326 return tree_search_for_insert(tree, offset, NULL, NULL); 335 327 } 336 328 337 - static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, 338 - const struct extent_state *state, 339 - const char *opname, 340 - int err) 341 - { 342 - btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, 343 - "extent io tree error on %s state start %llu end %llu", 344 - opname, state->start, state->end); 345 - } 329 + #define extent_io_tree_panic(tree, state, opname, err) \ 330 + btrfs_panic(btrfs_extent_io_tree_to_fs_info((tree)), (err), \ 331 + "extent io tree error on %s state start %llu end %llu", \ 332 + (opname), (state)->start, (state)->end) 346 333 347 334 static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) 348 335 { ··· 397 394 if (tree->owner == IO_TREE_INODE_IO) 398 395 btrfs_set_delalloc_extent(tree->inode, state, bits); 399 396 400 - ret = add_extent_changeset(state, bits_to_set, changeset, 1); 401 - BUG_ON(ret < 0); 397 + ret = add_extent_changeset(state, bits_to_set, changeset, true); 398 + if 
(unlikely(ret)) 399 + extent_io_tree_panic(tree, state, "add_extent_changeset", ret); 402 400 state->state |= bits_to_set; 403 401 } 404 402 ··· 539 535 return 0; 540 536 } 541 537 538 + static inline void state_wake_up(struct extent_io_tree *tree, 539 + struct extent_state *state, u32 bits) 540 + { 541 + lockdep_assert_held(&tree->lock); 542 + 543 + if (!(bits & EXTENT_LOCK_BITS)) 544 + return; 545 + 546 + /* 547 + * No memory barriers because the tree's lock is held while: 548 + * 549 + * 1) Adding waiters to the queue. 550 + * 2) Waking up waiters. 551 + * 3) Removing waiters from queue. 552 + */ 553 + cond_wake_up_nomb(&state->wq); 554 + } 555 + 542 556 /* 543 557 * Use this during tree iteration to avoid doing next node searches when it's 544 558 * not needed (the current record ends at or after the target range's end). ··· 571 549 572 550 /* 573 551 * Utility function to clear some bits in an extent state struct. It will 574 - * optionally wake up anyone waiting on this state (wake == 1). 552 + * optionally wake up anyone waiting on this state. 
575 553 * 576 554 * If no bits are set on the state struct after clearing things, the 577 555 * struct is freed and removed from the tree 578 556 */ 579 557 static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 580 558 struct extent_state *state, 581 - u32 bits, int wake, u64 end, 559 + u32 bits, u64 end, 582 560 struct extent_changeset *changeset) 583 561 { 584 562 struct extent_state *next; ··· 588 566 if (tree->owner == IO_TREE_INODE_IO) 589 567 btrfs_clear_delalloc_extent(tree->inode, state, bits); 590 568 591 - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); 592 - BUG_ON(ret < 0); 569 + ret = add_extent_changeset(state, bits_to_clear, changeset, false); 570 + if (unlikely(ret)) 571 + extent_io_tree_panic(tree, state, "add_extent_changeset", ret); 593 572 state->state &= ~bits_to_clear; 594 - if (wake) 595 - wake_up(&state->wq); 573 + state_wake_up(tree, state, bits); 596 574 if (state->state == 0) { 575 + if (unlikely(!extent_state_in_tree(state))) 576 + extent_io_tree_panic(tree, state, "extent_state_in_tree", -EUCLEAN); 577 + 597 578 next = next_search_state(state, end); 598 - if (extent_state_in_tree(state)) { 599 - rb_erase(&state->rb_node, &tree->state); 600 - RB_CLEAR_NODE(&state->rb_node); 601 - btrfs_free_extent_state(state); 602 - } else { 603 - WARN_ON(1); 604 - } 579 + rb_erase(&state->rb_node, &tree->state); 580 + RB_CLEAR_NODE(&state->rb_node); 581 + btrfs_free_extent_state(state); 605 582 } else { 606 583 merge_state(tree, state); 607 584 next = next_search_state(state, end); ··· 637 616 u64 last_end; 638 617 int ret = 0; 639 618 bool clear; 640 - bool wake; 641 619 const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); 620 + const u32 bits_to_clear = (bits & ~EXTENT_CTLBITS); 642 621 gfp_t mask; 643 622 644 623 set_gfp_mask_from_bits(&bits, &mask); ··· 651 630 if (bits & EXTENT_DELALLOC) 652 631 bits |= EXTENT_NORESERVE; 653 632 654 - wake = (bits & EXTENT_LOCK_BITS); 655 633 clear = (bits & (EXTENT_LOCK_BITS | 
EXTENT_BOUNDARY)); 656 634 again: 657 635 if (!prealloc) { ··· 716 696 */ 717 697 718 698 if (state->start < start) { 699 + /* 700 + * If all bits are cleared, there's no point in allocating or 701 + * using the prealloc extent, split the state record, insert the 702 + * prealloc record and then remove this record. We can just 703 + * adjust this record and move on to the next without adding or 704 + * removing anything to the tree. 705 + */ 706 + if (state->end <= end && (state->state & ~bits_to_clear) == 0) { 707 + const u64 orig_start = state->start; 708 + 709 + if (tree->owner == IO_TREE_INODE_IO) 710 + btrfs_split_delalloc_extent(tree->inode, state, start); 711 + 712 + /* 713 + * Temporarily adjust this state's range to match the 714 + * range for which we are clearing bits. 715 + */ 716 + state->start = start; 717 + 718 + ret = add_extent_changeset(state, bits_to_clear, changeset, false); 719 + if (unlikely(ret < 0)) { 720 + extent_io_tree_panic(tree, state, 721 + "add_extent_changeset", ret); 722 + goto out; 723 + } 724 + 725 + if (tree->owner == IO_TREE_INODE_IO) 726 + btrfs_clear_delalloc_extent(tree->inode, state, bits); 727 + 728 + /* 729 + * Now adjust the range to the section for which no bits 730 + * are cleared. 
731 + */ 732 + state->start = orig_start; 733 + state->end = start - 1; 734 + 735 + state_wake_up(tree, state, bits); 736 + state = next_search_state(state, end); 737 + goto next; 738 + } 739 + 719 740 prealloc = alloc_extent_state_atomic(prealloc); 720 741 if (!prealloc) 721 742 goto search_again; 722 743 ret = split_state(tree, state, prealloc, start); 723 744 prealloc = NULL; 724 - if (ret) { 745 + if (unlikely(ret)) { 725 746 extent_io_tree_panic(tree, state, "split", ret); 726 747 goto out; 727 748 } 728 749 if (state->end <= end) { 729 - state = clear_state_bit(tree, state, bits, wake, end, 730 - changeset); 750 + state = clear_state_bit(tree, state, bits, end, changeset); 731 751 goto next; 732 752 } 733 753 if (need_resched()) ··· 784 724 * We need to split the extent, and clear the bit on the first half. 785 725 */ 786 726 if (state->start <= end && state->end > end) { 727 + /* 728 + * If all bits are cleared, there's no point in allocating or 729 + * using the prealloc extent, split the state record, insert the 730 + * prealloc record and then remove it. We can just adjust the 731 + * start offset of the current state and avoid all that. 732 + */ 733 + if ((state->state & ~bits_to_clear) == 0) { 734 + const u64 orig_end = state->end; 735 + 736 + if (tree->owner == IO_TREE_INODE_IO) 737 + btrfs_split_delalloc_extent(tree->inode, state, end + 1); 738 + 739 + /* 740 + * Temporarily adjust the end offset to match the 741 + * removed subrange to update the changeset. 
742 + */ 743 + state->end = end; 744 + 745 + ret = add_extent_changeset(state, bits_to_clear, changeset, false); 746 + if (unlikely(ret < 0)) { 747 + extent_io_tree_panic(tree, state, 748 + "add_extent_changeset", ret); 749 + goto out; 750 + } 751 + 752 + if (tree->owner == IO_TREE_INODE_IO) 753 + btrfs_clear_delalloc_extent(tree->inode, state, bits); 754 + 755 + state->start = end + 1; 756 + state->end = orig_end; 757 + 758 + state_wake_up(tree, state, bits); 759 + goto out; 760 + } 761 + 787 762 prealloc = alloc_extent_state_atomic(prealloc); 788 763 if (!prealloc) 789 764 goto search_again; 790 765 ret = split_state(tree, state, prealloc, end + 1); 791 - if (ret) { 766 + if (unlikely(ret)) { 792 767 extent_io_tree_panic(tree, state, "split", ret); 793 768 prealloc = NULL; 794 769 goto out; 795 770 } 796 771 797 - if (wake) 798 - wake_up(&state->wq); 772 + state_wake_up(tree, state, bits); 799 773 800 - clear_state_bit(tree, prealloc, bits, wake, end, changeset); 774 + clear_state_bit(tree, prealloc, bits, end, changeset); 801 775 802 776 prealloc = NULL; 803 777 goto out; 804 778 } 805 779 806 - state = clear_state_bit(tree, state, bits, wake, end, changeset); 780 + state = clear_state_bit(tree, state, bits, end, changeset); 807 781 next: 808 782 if (last_end >= end) 809 783 goto out; ··· 919 825 } 920 826 } 921 827 out: 828 + spin_unlock(&tree->lock); 922 829 /* This state is no longer useful, clear it and free it up. 
*/ 923 830 if (cached_state && *cached_state) { 924 831 state = *cached_state; 925 832 *cached_state = NULL; 926 833 btrfs_free_extent_state(state); 927 834 } 928 - spin_unlock(&tree->lock); 929 835 } 930 836 931 837 static void cache_state_if_flags(struct extent_state *state, ··· 1263 1169 if (!prealloc) 1264 1170 goto search_again; 1265 1171 ret = split_state(tree, state, prealloc, start); 1266 - if (ret) 1172 + if (unlikely(ret)) 1267 1173 extent_io_tree_panic(tree, state, "split", ret); 1268 1174 1269 1175 prealloc = NULL; ··· 1353 1259 if (!prealloc) 1354 1260 goto search_again; 1355 1261 ret = split_state(tree, state, prealloc, end + 1); 1356 - if (ret) { 1262 + if (unlikely(ret)) { 1357 1263 extent_io_tree_panic(tree, state, "split", ret); 1358 1264 prealloc = NULL; 1359 1265 goto out; ··· 1476 1382 if (state->start == start && state->end <= end) { 1477 1383 set_state_bits(tree, state, bits, NULL); 1478 1384 cache_state(state, cached_state); 1479 - state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); 1385 + state = clear_state_bit(tree, state, clear_bits, end, NULL); 1480 1386 if (last_end >= end) 1481 1387 goto out; 1482 1388 start = last_end + 1; ··· 1508 1414 } 1509 1415 ret = split_state(tree, state, prealloc, start); 1510 1416 prealloc = NULL; 1511 - if (ret) { 1417 + if (unlikely(ret)) { 1512 1418 extent_io_tree_panic(tree, state, "split", ret); 1513 1419 goto out; 1514 1420 } 1515 1421 if (state->end <= end) { 1516 1422 set_state_bits(tree, state, bits, NULL); 1517 1423 cache_state(state, cached_state); 1518 - state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); 1424 + state = clear_state_bit(tree, state, clear_bits, end, NULL); 1519 1425 if (last_end >= end) 1520 1426 goto out; 1521 1427 start = last_end + 1; ··· 1592 1498 } 1593 1499 1594 1500 ret = split_state(tree, state, prealloc, end + 1); 1595 - if (ret) { 1501 + if (unlikely(ret)) { 1596 1502 extent_io_tree_panic(tree, state, "split", ret); 1597 1503 prealloc = NULL; 1598 
1504 goto out; ··· 1600 1506 1601 1507 set_state_bits(tree, prealloc, bits, NULL); 1602 1508 cache_state(prealloc, cached_state); 1603 - clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); 1509 + clear_state_bit(tree, prealloc, clear_bits, end, NULL); 1604 1510 prealloc = NULL; 1605 1511 goto out; 1606 1512 }

+58 -61

fs/btrfs/extent-tree.c

··· 4013 4013 * Lock nesting 4014 4014 * ============ 4015 4015 * 4016 - * space_info::lock 4017 - * block_group::lock 4018 - * fs_info::treelog_bg_lock 4016 + * block_group::lock 4017 + * fs_info::treelog_bg_lock 4019 4018 */ 4020 4019 4021 4020 /* ··· 4027 4028 struct btrfs_block_group **bg_ret) 4028 4029 { 4029 4030 struct btrfs_fs_info *fs_info = block_group->fs_info; 4030 - struct btrfs_space_info *space_info = block_group->space_info; 4031 4031 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 4032 4032 u64 start = block_group->start; 4033 4033 u64 num_bytes = ffe_ctl->num_bytes; ··· 4087 4089 */ 4088 4090 } 4089 4091 4090 - spin_lock(&space_info->lock); 4091 4092 spin_lock(&block_group->lock); 4092 4093 spin_lock(&fs_info->treelog_bg_lock); 4093 4094 spin_lock(&fs_info->relocation_bg_lock); ··· 4188 4191 spin_unlock(&fs_info->relocation_bg_lock); 4189 4192 spin_unlock(&fs_info->treelog_bg_lock); 4190 4193 spin_unlock(&block_group->lock); 4191 - spin_unlock(&space_info->lock); 4192 4194 return ret; 4193 4195 } 4194 4196 ··· 4349 4353 return 1; 4350 4354 4351 4355 /* See the comments for btrfs_loop_type for an explanation of the phases. */ 4352 - if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { 4353 - ffe_ctl->index = 0; 4354 - /* 4355 - * We want to skip the LOOP_CACHING_WAIT step if we don't have 4356 - * any uncached bgs and we've already done a full search 4357 - * through. 4358 - */ 4359 - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && 4360 - (!ffe_ctl->orig_have_caching_bg && full_search)) 4361 - ffe_ctl->loop++; 4356 + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) 4357 + return -ENOSPC; 4358 + 4359 + ffe_ctl->index = 0; 4360 + /* 4361 + * We want to skip the LOOP_CACHING_WAIT step if we don't have any 4362 + * uncached bgs and we've already done a full search through. 
4363 + */ 4364 + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && 4365 + (!ffe_ctl->orig_have_caching_bg && full_search)) 4362 4366 ffe_ctl->loop++; 4367 + ffe_ctl->loop++; 4363 4368 4364 - if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 4365 - struct btrfs_trans_handle *trans; 4366 - int exist = 0; 4369 + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { 4370 + struct btrfs_trans_handle *trans; 4371 + bool have_trans = false; 4367 4372 4368 - /* Check if allocation policy allows to create a new chunk */ 4369 - ret = can_allocate_chunk(fs_info, ffe_ctl); 4370 - if (ret) 4371 - return ret; 4373 + /* Check if allocation policy allows to create a new chunk. */ 4374 + ret = can_allocate_chunk(fs_info, ffe_ctl); 4375 + if (ret) 4376 + return ret; 4372 4377 4373 - trans = current->journal_info; 4374 - if (trans) 4375 - exist = 1; 4376 - else 4377 - trans = btrfs_join_transaction(root); 4378 + trans = current->journal_info; 4379 + if (trans) 4380 + have_trans = true; 4381 + else 4382 + trans = btrfs_join_transaction(root); 4378 4383 4379 - if (IS_ERR(trans)) 4380 - return PTR_ERR(trans); 4384 + if (IS_ERR(trans)) 4385 + return PTR_ERR(trans); 4381 4386 4382 - ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, 4383 - CHUNK_ALLOC_FORCE_FOR_EXTENT); 4387 + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, 4388 + CHUNK_ALLOC_FORCE_FOR_EXTENT); 4384 4389 4385 - /* Do not bail out on ENOSPC since we can do more. */ 4386 - if (ret == -ENOSPC) { 4387 - ret = 0; 4388 - ffe_ctl->loop++; 4389 - } 4390 - else if (ret < 0) 4391 - btrfs_abort_transaction(trans, ret); 4392 - else 4393 - ret = 0; 4394 - if (!exist) 4395 - btrfs_end_transaction(trans); 4396 - if (ret) 4397 - return ret; 4390 + /* Do not bail out on ENOSPC since we can do more. 
*/ 4391 + if (ret == -ENOSPC) { 4392 + ret = 0; 4393 + ffe_ctl->loop++; 4394 + } else if (ret < 0) { 4395 + btrfs_abort_transaction(trans, ret); 4396 + } else { 4397 + ret = 0; 4398 4398 } 4399 4399 4400 - if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4401 - if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4402 - return -ENOSPC; 4400 + if (!have_trans) 4401 + btrfs_end_transaction(trans); 4403 4402 4404 - /* 4405 - * Don't loop again if we already have no empty_size and 4406 - * no empty_cluster. 4407 - */ 4408 - if (ffe_ctl->empty_size == 0 && 4409 - ffe_ctl->empty_cluster == 0) 4410 - return -ENOSPC; 4411 - ffe_ctl->empty_size = 0; 4412 - ffe_ctl->empty_cluster = 0; 4413 - } 4414 - return 1; 4403 + if (ret) 4404 + return ret; 4415 4405 } 4416 - return -ENOSPC; 4406 + 4407 + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { 4408 + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) 4409 + return -ENOSPC; 4410 + 4411 + /* 4412 + * Don't loop again if we already have no empty_size and 4413 + * no empty_cluster. 4414 + */ 4415 + if (ffe_ctl->empty_size == 0 && ffe_ctl->empty_cluster == 0) 4416 + return -ENOSPC; 4417 + ffe_ctl->empty_size = 0; 4418 + ffe_ctl->empty_cluster = 0; 4419 + } 4420 + 4421 + return 1; 4417 4422 } 4418 4423 4419 4424 static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, ··· 5781 5784 5782 5785 generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); 5783 5786 5784 - if (btrfs_buffer_uptodate(next, generation, false)) 5787 + if (btrfs_buffer_uptodate(next, generation, NULL)) 5785 5788 return 0; 5786 5789 5787 5790 check.level = level - 1;

+117 -69

fs/btrfs/extent_io.c

··· 14 14 #include <linux/pagevec.h> 15 15 #include <linux/prefetch.h> 16 16 #include <linux/fsverity.h> 17 + #include <linux/lockdep.h> 17 18 #include "extent_io.h" 18 19 #include "extent-io-tree.h" 19 20 #include "extent_map.h" ··· 521 520 struct bio *bio = &bbio->bio; 522 521 int error = blk_status_to_errno(bio->bi_status); 523 522 struct folio_iter fi; 524 - const u32 sectorsize = fs_info->sectorsize; 523 + u32 bio_size = 0; 525 524 526 525 ASSERT(!bio_flagged(bio, BIO_CLONED)); 527 526 bio_for_each_folio_all(fi, bio) { ··· 529 528 u64 start = folio_pos(folio) + fi.offset; 530 529 u32 len = fi.length; 531 530 532 - /* Our read/write should always be sector aligned. */ 533 - if (!IS_ALIGNED(fi.offset, sectorsize)) 534 - btrfs_err(fs_info, 535 - "partial page write in btrfs with offset %zu and length %zu", 536 - fi.offset, fi.length); 537 - else if (!IS_ALIGNED(fi.length, sectorsize)) 538 - btrfs_info(fs_info, 539 - "incomplete page write with offset %zu and length %zu", 540 - fi.offset, fi.length); 541 - 542 - btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, 543 - !error); 544 - if (error) 545 - mapping_set_error(folio->mapping, error); 531 + bio_size += len; 532 + ASSERT(btrfs_folio_test_ordered(fs_info, folio, start, len)); 533 + btrfs_folio_clear_ordered(fs_info, folio, start, len); 546 534 btrfs_folio_clear_writeback(fs_info, folio, start, len); 547 535 } 548 536 537 + if (error) 538 + mapping_set_error(bbio->inode->vfs_inode.i_mapping, error); 539 + 540 + btrfs_finish_ordered_extent(bbio->ordered, bbio->file_offset, bio_size, !error); 549 541 bio_put(bio); 550 542 } 551 543 ··· 1581 1587 u64 start = page_start + (start_bit << fs_info->sectorsize_bits); 1582 1588 u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; 1583 1589 1584 - btrfs_mark_ordered_io_finished(inode, folio, start, len, false); 1590 + btrfs_folio_clear_ordered(fs_info, folio, start, len); 1591 + btrfs_mark_ordered_io_finished(inode, start, len, false); 1585 1592 } 1586 
1593 return ret; 1587 1594 } ··· 1658 1663 * ordered extent. 1659 1664 */ 1660 1665 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1666 + btrfs_folio_clear_ordered(fs_info, folio, filepos, sectorsize); 1661 1667 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1662 1668 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1663 1669 ··· 1666 1670 * Since there is no bio submitted to finish the ordered 1667 1671 * extent, we have to manually finish this sector. 1668 1672 */ 1669 - btrfs_mark_ordered_io_finished(inode, folio, filepos, 1670 - fs_info->sectorsize, false); 1673 + btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize, 1674 + false); 1671 1675 return PTR_ERR(em); 1672 1676 } 1673 1677 ··· 1779 1783 spin_unlock(&inode->ordered_tree_lock); 1780 1784 btrfs_put_ordered_extent(ordered); 1781 1785 1782 - btrfs_mark_ordered_io_finished(inode, folio, cur, 1783 - fs_info->sectorsize, true); 1786 + btrfs_folio_clear_ordered(fs_info, folio, cur, fs_info->sectorsize); 1787 + btrfs_mark_ordered_io_finished(inode, cur, fs_info->sectorsize, true); 1784 1788 /* 1785 1789 * This range is beyond i_size, thus we don't need to 1786 1790 * bother writing back. ··· 1945 1949 * of time. 1946 1950 */ 1947 1951 spin_lock(&eb->refs_lock); 1948 - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1952 + if ((wbc->sync_mode == WB_SYNC_ALL || 1953 + atomic_read(&eb->writeback_inhibitors) == 0) && 1954 + test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 1949 1955 XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); 1950 1956 unsigned long flags; 1951 1957 ··· 2394 2396 index = 0; 2395 2397 goto retry; 2396 2398 } 2399 + 2397 2400 /* 2398 - * If something went wrong, don't allow any metadata write bio to be 2399 - * submitted. 2400 - * 2401 - * This would prevent use-after-free if we had dirty pages not 2402 - * cleaned up, which can still happen by fuzzed images. 
2403 - * 2404 - * - Bad extent tree 2405 - * Allowing existing tree block to be allocated for other trees. 2406 - * 2407 - * - Log tree operations 2408 - * Exiting tree blocks get allocated to log tree, bumps its 2409 - * generation, then get cleaned in tree re-balance. 2410 - * Such tree block will not be written back, since it's clean, 2411 - * thus no WRITTEN flag set. 2412 - * And after log writes back, this tree block is not traced by 2413 - * any dirty extent_io_tree. 2414 - * 2415 - * - Offending tree block gets re-dirtied from its original owner 2416 - * Since it has bumped generation, no WRITTEN flag, it can be 2417 - * reused without COWing. This tree block will not be traced 2418 - * by btrfs_transaction::dirty_pages. 2419 - * 2420 - * Now such dirty tree block will not be cleaned by any dirty 2421 - * extent io tree. Thus we don't want to submit such wild eb 2422 - * if the fs already has error. 2423 - * 2424 - * We can get ret > 0 from submit_extent_folio() indicating how many ebs 2425 - * were submitted. Reset it to 0 to avoid false alerts for the caller. 2401 + * Only btrfs_check_meta_write_pointer() can update @ret, 2402 + * and it only returns 0 or errors. 2426 2403 */ 2427 - if (ret > 0) 2428 - ret = 0; 2429 - if (!ret && BTRFS_FS_ERROR(fs_info)) 2404 + ASSERT(ret <= 0); 2405 + if (unlikely(!ret && BTRFS_FS_ERROR(fs_info))) 2430 2406 ret = -EROFS; 2431 2407 2432 2408 if (ctx.zoned_bg) ··· 2631 2659 if (IS_ERR(folio)) { 2632 2660 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); 2633 2661 cur_len = cur_end + 1 - cur; 2634 - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, 2635 - cur, cur_len, false); 2662 + btrfs_mark_ordered_io_finished(BTRFS_I(inode), cur, cur_len, false); 2636 2663 mapping_set_error(mapping, PTR_ERR(folio)); 2637 2664 cur = cur_end; 2638 2665 continue; ··· 2982 3011 kmem_cache_free(extent_buffer_cache, eb); 2983 3012 } 2984 3013 3014 + /* 3015 + * Inhibit writeback on buffer during transaction. 
3016 + * 3017 + * @trans: transaction handle that will own the inhibitor 3018 + * @eb: extent buffer to inhibit writeback on 3019 + * 3020 + * Attempt to track this extent buffer in the transaction's inhibited set. If 3021 + * memory allocation fails, the buffer is simply not tracked. It may be written 3022 + * back and need re-COW, which is the original behavior. This is acceptable 3023 + * since inhibiting writeback is an optimization. 3024 + */ 3025 + void btrfs_inhibit_eb_writeback(struct btrfs_trans_handle *trans, struct extent_buffer *eb) 3026 + { 3027 + unsigned long index = eb->start >> trans->fs_info->nodesize_bits; 3028 + void *old; 3029 + 3030 + lockdep_assert_held(&eb->lock); 3031 + /* Check if already inhibited by this handle. */ 3032 + old = xa_load(&trans->writeback_inhibited_ebs, index); 3033 + if (old == eb) 3034 + return; 3035 + 3036 + /* Take reference for the xarray entry. */ 3037 + refcount_inc(&eb->refs); 3038 + 3039 + old = xa_store(&trans->writeback_inhibited_ebs, index, eb, GFP_NOFS); 3040 + if (xa_is_err(old)) { 3041 + /* Allocation failed, just skip inhibiting this buffer. */ 3042 + free_extent_buffer(eb); 3043 + return; 3044 + } 3045 + 3046 + /* Handle replacement of different eb at same index. */ 3047 + if (old && old != eb) { 3048 + struct extent_buffer *old_eb = old; 3049 + 3050 + atomic_dec(&old_eb->writeback_inhibitors); 3051 + free_extent_buffer(old_eb); 3052 + } 3053 + 3054 + atomic_inc(&eb->writeback_inhibitors); 3055 + } 3056 + 3057 + /* 3058 + * Uninhibit writeback on all extent buffers. 
3059 + */ 3060 + void btrfs_uninhibit_all_eb_writeback(struct btrfs_trans_handle *trans) 3061 + { 3062 + struct extent_buffer *eb; 3063 + unsigned long index; 3064 + 3065 + xa_for_each(&trans->writeback_inhibited_ebs, index, eb) { 3066 + atomic_dec(&eb->writeback_inhibitors); 3067 + free_extent_buffer(eb); 3068 + } 3069 + xa_destroy(&trans->writeback_inhibited_ebs); 3070 + } 3071 + 2985 3072 static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, 2986 3073 u64 start) 2987 3074 { ··· 3050 3021 eb->len = fs_info->nodesize; 3051 3022 eb->fs_info = fs_info; 3052 3023 init_rwsem(&eb->lock); 3024 + atomic_set(&eb->writeback_inhibitors, 0); 3053 3025 3054 3026 btrfs_leak_debug_add_eb(eb); 3055 3027 ··· 3901 3871 struct btrfs_fs_info *fs_info = eb->fs_info; 3902 3872 struct btrfs_bio *bbio; 3903 3873 3904 - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3874 + if (extent_buffer_uptodate(eb)) { 3875 + int ret; 3876 + 3877 + ret = btrfs_buffer_uptodate(eb, 0, check); 3878 + if (unlikely(ret <= 0)) { 3879 + if (ret == 0) 3880 + ret = -EIO; 3881 + return ret; 3882 + } 3905 3883 return 0; 3884 + } 3906 3885 3907 3886 /* 3908 3887 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write ··· 3931 3892 * started and finished reading the same eb. In this case, UPTODATE 3932 3893 * will now be set, and we shouldn't read it in again. 
3933 3894 */ 3934 - if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { 3895 + if (unlikely(extent_buffer_uptodate(eb))) { 3896 + int ret; 3897 + 3935 3898 clear_extent_buffer_reading(eb); 3899 + ret = btrfs_buffer_uptodate(eb, 0, check); 3900 + if (unlikely(ret <= 0)) { 3901 + if (ret == 0) 3902 + ret = -EIO; 3903 + return ret; 3904 + } 3936 3905 return 0; 3937 3906 } 3938 3907 ··· 3976 3929 return ret; 3977 3930 3978 3931 wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); 3979 - if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) 3932 + if (unlikely(!extent_buffer_uptodate(eb))) 3980 3933 return -EIO; 3981 3934 return 0; 3982 3935 } ··· 4018 3971 size_t cur; 4019 3972 size_t offset; 4020 3973 char *dst = (char *)dstv; 4021 - unsigned long i = get_eb_folio_index(eb, start); 3974 + unsigned long i; 4022 3975 4023 3976 if (check_eb_range(eb, start, len)) { 4024 3977 /* ··· 4035 3988 } 4036 3989 4037 3990 offset = get_eb_offset_in_folio(eb, start); 4038 - 3991 + i = get_eb_folio_index(eb, start); 4039 3992 while (len > 0) { 4040 3993 char *kaddr; 4041 3994 ··· 4058 4011 size_t cur; 4059 4012 size_t offset; 4060 4013 char __user *dst = (char __user *)dstv; 4061 - unsigned long i = get_eb_folio_index(eb, start); 4014 + unsigned long i; 4062 4015 int ret = 0; 4063 4016 4064 - WARN_ON(start > eb->len); 4065 - WARN_ON(start + len > eb->start + eb->len); 4017 + if (check_eb_range(eb, start, len)) 4018 + return -EINVAL; 4066 4019 4067 4020 if (eb->addr) { 4068 4021 if (copy_to_user_nofault(dstv, eb->addr + start, len)) ··· 4071 4024 } 4072 4025 4073 4026 offset = get_eb_offset_in_folio(eb, start); 4074 - 4027 + i = get_eb_folio_index(eb, start); 4075 4028 while (len > 0) { 4076 4029 char *kaddr; 4077 4030 ··· 4099 4052 size_t offset; 4100 4053 char *kaddr; 4101 4054 char *ptr = (char *)ptrv; 4102 - unsigned long i = get_eb_folio_index(eb, start); 4055 + unsigned long i; 4103 4056 int ret = 0; 4104 4057 4105 4058 if 
(check_eb_range(eb, start, len)) ··· 4109 4062 return memcmp(ptrv, eb->addr + start, len); 4110 4063 4111 4064 offset = get_eb_offset_in_folio(eb, start); 4112 - 4065 + i = get_eb_folio_index(eb, start); 4113 4066 while (len > 0) { 4114 4067 cur = min(len, unit_size - offset); 4115 4068 kaddr = folio_address(eb->folios[i]); ··· 4169 4122 size_t offset; 4170 4123 char *kaddr; 4171 4124 const char *src = (const char *)srcv; 4172 - unsigned long i = get_eb_folio_index(eb, start); 4125 + unsigned long i; 4173 4126 /* For unmapped (dummy) ebs, no need to check their uptodate status. */ 4174 4127 const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); 4175 4128 ··· 4185 4138 } 4186 4139 4187 4140 offset = get_eb_offset_in_folio(eb, start); 4188 - 4141 + i = get_eb_folio_index(eb, start); 4189 4142 while (len > 0) { 4190 4143 if (check_uptodate) 4191 4144 assert_eb_folio_uptodate(eb, i); ··· 4271 4224 size_t cur; 4272 4225 size_t offset; 4273 4226 char *kaddr; 4274 - unsigned long i = get_eb_folio_index(dst, dst_offset); 4227 + unsigned long i; 4275 4228 4276 4229 if (check_eb_range(dst, dst_offset, len) || 4277 4230 check_eb_range(src, src_offset, len)) ··· 4281 4234 4282 4235 offset = get_eb_offset_in_folio(dst, dst_offset); 4283 4236 4237 + i = get_eb_folio_index(dst, dst_offset); 4284 4238 while (len > 0) { 4285 4239 assert_eb_folio_uptodate(dst, i); 4286 4240 ··· 4654 4606 if (IS_ERR(eb)) 4655 4607 return; 4656 4608 4657 - if (btrfs_buffer_uptodate(eb, gen, true)) { 4609 + if (btrfs_buffer_uptodate(eb, gen, NULL)) { 4658 4610 free_extent_buffer(eb); 4659 4611 return; 4660 4612 }

+29 -2

fs/btrfs/extent_io.h

··· 99 99 spinlock_t refs_lock; 100 100 refcount_t refs; 101 101 int read_mirror; 102 + /* Inhibit WB_SYNC_NONE writeback when > 0. */ 103 + atomic_t writeback_inhibitors; 102 104 /* >= 0 if eb belongs to a log tree, -1 otherwise */ 103 105 s8 log_index; 104 106 u8 folio_shift; ··· 198 196 ulist_init(&changeset->range_changed); 199 197 } 200 198 199 + /* 200 + * Sentinel value for range_changed.prealloc indicating that the changeset 201 + * only tracks bytes_changed and does not record individual ranges. This 202 + * avoids GFP_ATOMIC allocations inside add_extent_changeset() when the 203 + * caller doesn't need to iterate the changed ranges afterwards. 204 + */ 205 + #define EXTENT_CHANGESET_BYTES_ONLY ((struct ulist_node *)1) 206 + 207 + static inline void extent_changeset_init_bytes_only(struct extent_changeset *changeset) 208 + { 209 + changeset->bytes_changed = 0; 210 + changeset->range_changed.prealloc = EXTENT_CHANGESET_BYTES_ONLY; 211 + } 212 + 213 + static inline bool extent_changeset_tracks_ranges(const struct extent_changeset *changeset) 214 + { 215 + return changeset->range_changed.prealloc != EXTENT_CHANGESET_BYTES_ONLY; 216 + } 217 + 201 218 static inline struct extent_changeset *extent_changeset_alloc(void) 202 219 { 203 220 struct extent_changeset *ret; ··· 231 210 232 211 static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask) 233 212 { 213 + ASSERT(extent_changeset_tracks_ranges(changeset)); 234 214 ulist_prealloc(&changeset->range_changed, gfp_mask); 235 215 } 236 216 ··· 240 218 if (!changeset) 241 219 return; 242 220 changeset->bytes_changed = 0; 243 - ulist_release(&changeset->range_changed); 221 + if (extent_changeset_tracks_ranges(changeset)) 222 + ulist_release(&changeset->range_changed); 244 223 } 245 224 246 225 static inline void extent_changeset_free(struct extent_changeset *changeset) ··· 321 298 return num_extent_pages(eb); 322 299 } 323 300 324 - static inline int extent_buffer_uptodate(const 
struct extent_buffer *eb) 301 + static inline bool extent_buffer_uptodate(const struct extent_buffer *eb) 325 302 { 326 303 return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 327 304 } ··· 403 380 #else 404 381 #define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) 405 382 #endif 383 + 384 + void btrfs_inhibit_eb_writeback(struct btrfs_trans_handle *trans, 385 + struct extent_buffer *eb); 386 + void btrfs_uninhibit_all_eb_writeback(struct btrfs_trans_handle *trans); 406 387 407 388 #endif

+3 -3

fs/btrfs/file-item.c

··· 1097 1097 return 0; 1098 1098 } 1099 1099 1100 - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 1101 - struct btrfs_root *root, 1102 - struct btrfs_ordered_sum *sums) 1100 + int btrfs_insert_data_csums(struct btrfs_trans_handle *trans, 1101 + struct btrfs_root *root, 1102 + struct btrfs_ordered_sum *sums) 1103 1103 { 1104 1104 struct btrfs_fs_info *fs_info = root->fs_info; 1105 1105 struct btrfs_key file_key;

+3 -3

fs/btrfs/file-item.h

··· 61 61 struct btrfs_root *root, 62 62 struct btrfs_path *path, u64 objectid, 63 63 u64 bytenr, int mod); 64 - int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 65 - struct btrfs_root *root, 66 - struct btrfs_ordered_sum *sums); 64 + int btrfs_insert_data_csums(struct btrfs_trans_handle *trans, 65 + struct btrfs_root *root, 66 + struct btrfs_ordered_sum *sums); 67 67 int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async); 68 68 int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); 69 69 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,

+3 -3

fs/btrfs/file.c

··· 1445 1445 * have opened a file as writable, we have to stop this write operation 1446 1446 * to ensure consistency. 1447 1447 */ 1448 - if (BTRFS_FS_ERROR(inode->root->fs_info)) 1448 + if (unlikely(BTRFS_FS_ERROR(inode->root->fs_info))) 1449 1449 return -EROFS; 1450 1450 1451 1451 if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) ··· 3316 3316 *delalloc_start_ret = start; 3317 3317 delalloc_len = btrfs_count_range_bits(&inode->io_tree, 3318 3318 delalloc_start_ret, end, 3319 - len, EXTENT_DELALLOC, 1, 3320 - cached_state); 3319 + len, EXTENT_DELALLOC, 3320 + true, cached_state); 3321 3321 } else { 3322 3322 spin_unlock(&inode->lock); 3323 3323 }

+6 -3

fs/btrfs/fs.h

··· 27 27 #include <linux/sched.h> 28 28 #include <linux/rbtree.h> 29 29 #include <linux/xxhash.h> 30 + #include <linux/fserror.h> 30 31 #include <uapi/linux/btrfs.h> 31 32 #include <uapi/linux/btrfs_tree.h> 32 33 #include "extent-io-tree.h" ··· 967 966 #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ 968 967 struct inode *: (_inode)))->root->fs_info) 969 968 970 - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) 969 + static inline gfp_t btrfs_alloc_write_mask(const struct address_space *mapping) 971 970 { 972 971 return mapping_gfp_constraint(mapping, ~__GFP_FS); 973 972 } 974 973 975 974 /* Return the minimal folio size of the fs. */ 976 - static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) 975 + static inline unsigned int btrfs_min_folio_size(const struct btrfs_fs_info *fs_info) 977 976 { 978 977 return 1U << (PAGE_SHIFT + fs_info->block_min_order); 979 978 } ··· 1200 1199 * So here we only mark the fs error without flipping it RO. 1201 1200 */ 1202 1201 WRITE_ONCE(fs_info->fs_error, -EIO); 1203 - if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) 1202 + if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) { 1204 1203 btrfs_crit(fs_info, "emergency shutdown"); 1204 + fserror_report_shutdown(fs_info->sb, GFP_KERNEL); 1205 + } 1205 1206 } 1206 1207 1207 1208 /*

+157 -159

fs/btrfs/inode.c

··· 74 74 #include "delayed-inode.h" 75 75 76 76 #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) 77 - #define COW_FILE_RANGE_NO_INLINE (1UL << 1) 78 77 79 78 struct btrfs_iget_args { 80 79 u64 ino; ··· 423 424 folio_put(folio); 424 425 } 425 426 426 - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); 427 + return btrfs_mark_ordered_io_finished(inode, offset, bytes, false); 427 428 } 428 429 429 430 static int btrfs_dirty_inode(struct btrfs_inode *inode); ··· 621 622 * 622 623 * If being used directly, you must have already checked we're allowed to cow 623 624 * the range by getting true from can_cow_file_range_inline(). 625 + * 626 + * Return 0 if the inlined extent is created successfully. 627 + * Return <0 for critical error, and should be considered as a writeback error. 628 + * Return >0 if an inlined extent cannot be created (mostly due to lack of meta space). 624 629 */ 625 630 static noinline int __cow_file_range_inline(struct btrfs_inode *inode, 626 631 u64 size, size_t compressed_size, ··· 706 703 return ret; 707 704 } 708 705 709 - static noinline int cow_file_range_inline(struct btrfs_inode *inode, 710 - struct folio *locked_folio, 711 - u64 offset, u64 end, 712 - size_t compressed_size, 713 - int compress_type, 714 - struct folio *compressed_folio, 715 - bool update_i_size) 716 - { 717 - struct extent_state *cached = NULL; 718 - unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | 719 - EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; 720 - u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); 721 - int ret; 722 - 723 - if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) 724 - return 1; 725 - 726 - btrfs_lock_extent(&inode->io_tree, offset, end, &cached); 727 - ret = __cow_file_range_inline(inode, size, compressed_size, 728 - compress_type, compressed_folio, 729 - update_i_size); 730 - if (ret > 0) { 731 - btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); 732 - return ret; 733 
- } 734 - 735 - /* 736 - * In the successful case (ret == 0 here), cow_file_range will return 1. 737 - * 738 - * Quite a bit further up the callstack in extent_writepage(), ret == 1 739 - * is treated as a short circuited success and does not unlock the folio, 740 - * so we must do it here. 741 - * 742 - * In the failure case, the locked_folio does get unlocked by 743 - * btrfs_folio_end_all_writers, which asserts that it is still locked 744 - * at that point, so we must *not* unlock it here. 745 - * 746 - * The other two callsites in compress_file_range do not have a 747 - * locked_folio, so they are not relevant to this logic. 748 - */ 749 - if (ret == 0) 750 - locked_folio = NULL; 751 - 752 - extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, 753 - clear_flags, PAGE_UNLOCK | 754 - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); 755 - return ret; 756 - } 757 - 758 706 struct async_extent { 759 707 u64 start; 760 708 u64 ram_size; ··· 751 797 * options, defragmentation, properties or heuristics. 752 798 */ 753 799 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, 754 - u64 end) 800 + u64 end, bool check_inline) 755 801 { 756 802 struct btrfs_fs_info *fs_info = inode->root->fs_info; 757 803 ··· 765 811 * do not even bother try compression, as there will be no space saving 766 812 * and will always fallback to regular write later. 767 813 */ 768 - if (start != 0 && end + 1 - start <= fs_info->sectorsize) 814 + if (end + 1 - start <= fs_info->sectorsize && 815 + (!check_inline || (start > 0 || end + 1 < inode->disk_i_size))) 769 816 return 0; 817 + 770 818 /* Defrag ioctl takes precedence over mount options and properties. 
*/ 771 819 if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) 772 820 return 0; ··· 846 890 return page_folio(phys_to_page(paddr)); 847 891 } 848 892 849 - static void zero_last_folio(struct compressed_bio *cb) 850 - { 851 - struct bio *bio = &cb->bbio.bio; 852 - struct folio *last_folio = compressed_bio_last_folio(cb); 853 - const u32 bio_size = bio->bi_iter.bi_size; 854 - const u32 foffset = offset_in_folio(last_folio, bio_size); 855 - 856 - folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset); 857 - } 858 - 859 893 static void round_up_last_block(struct compressed_bio *cb, u32 blocksize) 860 894 { 861 895 struct bio *bio = &cb->bbio.bio; 862 896 struct folio *last_folio = compressed_bio_last_folio(cb); 863 897 const u32 bio_size = bio->bi_iter.bi_size; 864 898 const u32 foffset = offset_in_folio(last_folio, bio_size); 899 + const u32 padding_len = round_up(foffset, blocksize) - foffset; 865 900 bool ret; 866 901 867 902 if (IS_ALIGNED(bio_size, blocksize)) 868 903 return; 869 904 870 - ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset); 905 + folio_zero_range(last_folio, foffset, padding_len); 906 + ret = bio_add_folio(bio, last_folio, padding_len, foffset); 871 907 /* The remaining part should be merged thus never fail. 
*/ 872 908 ASSERT(ret); 873 909 } ··· 883 935 container_of(work, struct async_chunk, work); 884 936 struct btrfs_inode *inode = async_chunk->inode; 885 937 struct btrfs_fs_info *fs_info = inode->root->fs_info; 886 - struct address_space *mapping = inode->vfs_inode.i_mapping; 887 938 struct compressed_bio *cb = NULL; 888 - const u32 min_folio_size = btrfs_min_folio_size(fs_info); 889 939 u64 blocksize = fs_info->sectorsize; 890 940 u64 start = async_chunk->start; 891 941 u64 end = async_chunk->end; ··· 893 947 int ret = 0; 894 948 unsigned long total_compressed = 0; 895 949 unsigned long total_in = 0; 896 - unsigned int loff; 897 950 int compress_type = fs_info->compress_type; 898 951 int compress_level = fs_info->compress_level; 899 952 ··· 954 1009 * been flagged as NOCOMPRESS. This flag can change at any time if we 955 1010 * discover bad compression ratios. 956 1011 */ 957 - if (!inode_need_compress(inode, start, end)) 1012 + if (!inode_need_compress(inode, start, end, false)) 958 1013 goto cleanup_and_bail_uncompressed; 959 1014 960 1015 if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { ··· 976 1031 total_in = cur_len; 977 1032 978 1033 /* 979 - * Zero the tail end of the last folio, as we might be sending it down 980 - * to disk. 981 - */ 982 - loff = (total_compressed & (min_folio_size - 1)); 983 - if (loff) 984 - zero_last_folio(cb); 985 - 986 - /* 987 - * Try to create an inline extent. 988 - * 989 - * If we didn't compress the entire range, try to create an uncompressed 990 - * inline extent, else a compressed one. 991 - * 992 - * Check cow_file_range() for why we don't even try to create inline 993 - * extent for the subpage case. 
994 - */ 995 - if (total_in < actual_end) 996 - ret = cow_file_range_inline(inode, NULL, start, end, 0, 997 - BTRFS_COMPRESS_NONE, NULL, false); 998 - else 999 - ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, 1000 - compress_type, 1001 - bio_first_folio_all(&cb->bbio.bio), false); 1002 - if (ret <= 0) { 1003 - cleanup_compressed_bio(cb); 1004 - if (ret < 0) 1005 - mapping_set_error(mapping, -EIO); 1006 - return; 1007 - } 1008 - 1009 - /* 1010 1034 * We aren't doing an inline extent. Round the compressed size up to a 1011 1035 * block size boundary so the allocator does sane things. 1012 1036 */ 1013 - total_compressed = ALIGN(total_compressed, blocksize); 1014 1037 round_up_last_block(cb, blocksize); 1038 + total_compressed = cb->bbio.bio.bi_iter.bi_size; 1039 + ASSERT(IS_ALIGNED(total_compressed, blocksize)); 1015 1040 1016 1041 /* 1017 1042 * One last check to make sure the compression is really a win, compare ··· 1352 1437 * 1353 1438 * When this function fails, it unlocks all folios except @locked_folio. 1354 1439 * 1355 - * When this function successfully creates an inline extent, it returns 1 and 1356 - * unlocks all folios including locked_folio and starts I/O on them. 1357 - * (In reality inline extents are limited to a single block, so locked_folio is 1358 - * the only folio handled anyway). 
1359 - * 1360 1440 * When this function succeed and creates a normal extent, the folio locking 1361 1441 * status depends on the passed in flags: 1362 1442 * ··· 1395 1485 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); 1396 1486 1397 1487 inode_should_defrag(inode, start, end, num_bytes, SZ_64K); 1398 - 1399 - if (!(flags & COW_FILE_RANGE_NO_INLINE)) { 1400 - /* lets try to make an inline extent */ 1401 - ret = cow_file_range_inline(inode, locked_folio, start, end, 0, 1402 - BTRFS_COMPRESS_NONE, NULL, false); 1403 - if (ret <= 0) { 1404 - /* 1405 - * We succeeded, return 1 so the caller knows we're done 1406 - * with this page and already handled the IO. 1407 - * 1408 - * If there was an error then cow_file_range_inline() has 1409 - * already done the cleanup. 1410 - */ 1411 - if (ret == 0) 1412 - ret = 1; 1413 - goto done; 1414 - } 1415 - } 1416 - 1417 1488 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); 1418 1489 1419 1490 /* ··· 1472 1581 } 1473 1582 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, 1474 1583 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); 1475 - done: 1476 1584 if (done_offset) 1477 1585 *done_offset = end; 1478 1586 return ret; ··· 1591 1701 struct async_cow *ctx; 1592 1702 struct async_chunk *async_chunk; 1593 1703 unsigned long nr_pages; 1594 - u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); 1704 + u64 num_chunks = DIV_ROUND_UP(end - start, BTRFS_COMPRESSION_CHUNK_SIZE); 1595 1705 int i; 1596 1706 unsigned nofs_flag; 1597 1707 const blk_opf_t write_flags = wbc_to_write_flags(wbc); ··· 1608 1718 atomic_set(&ctx->num_chunks, num_chunks); 1609 1719 1610 1720 for (i = 0; i < num_chunks; i++) { 1611 - u64 cur_end = min(end, start + SZ_512K - 1); 1721 + u64 cur_end = min(end, start + BTRFS_COMPRESSION_CHUNK_SIZE - 1); 1612 1722 1613 1723 /* 1614 1724 * igrab is called higher up in the call chain, take only the ··· 1743 1853 */ 1744 1854 btrfs_lock_extent(io_tree, start, end, 
&cached_state); 1745 1855 count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, 1746 - EXTENT_NORESERVE, 0, NULL); 1856 + EXTENT_NORESERVE, false, NULL); 1747 1857 if (count > 0 || is_space_ino || is_reloc_ino) { 1748 1858 u64 bytes = count; 1749 1859 struct btrfs_fs_info *fs_info = inode->root->fs_info; ··· 1774 1884 * a locked folio, which can race with writeback. 1775 1885 */ 1776 1886 ret = cow_file_range(inode, locked_folio, start, end, NULL, 1777 - COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); 1887 + COW_FILE_RANGE_KEEP_LOCKED); 1778 1888 ASSERT(ret != 1); 1779 1889 return ret; 1780 1890 } ··· 1825 1935 int can_nocow = 0; 1826 1936 int ret = 0; 1827 1937 bool nowait = path->nowait; 1938 + 1939 + /* If there are pending snapshots for this root, we must do COW. */ 1940 + if (args->writeback_path && !is_freespace_inode && 1941 + atomic_read(&root->snapshot_force_cow)) 1942 + goto out; 1828 1943 1829 1944 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 1830 1945 extent_type = btrfs_file_extent_type(leaf, fi); ··· 1891 1996 btrfs_free_path(path); 1892 1997 path = NULL; 1893 1998 } 1894 - 1895 - /* If there are pending snapshots for this root, we must COW. */ 1896 - if (args->writeback_path && !is_freespace_inode && 1897 - atomic_read(&root->snapshot_force_cow)) 1898 - goto out; 1899 1999 1900 2000 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; 1901 2001 args->file_extent.offset += args->start - key->offset; ··· 2326 2436 } 2327 2437 2328 2438 /* 2439 + * Return 0 if an inlined extent is created successfully. 2440 + * Return <0 if critical error happened. 2441 + * Return >0 if an inline extent can not be created. 
2442 + */ 2443 + static int run_delalloc_inline(struct btrfs_inode *inode, struct folio *locked_folio) 2444 + { 2445 + struct btrfs_fs_info *fs_info = inode->root->fs_info; 2446 + struct compressed_bio *cb = NULL; 2447 + struct extent_state *cached = NULL; 2448 + const u64 i_size = i_size_read(&inode->vfs_inode); 2449 + const u32 blocksize = fs_info->sectorsize; 2450 + int compress_type = fs_info->compress_type; 2451 + int compress_level = fs_info->compress_level; 2452 + u32 compressed_size = 0; 2453 + int ret; 2454 + 2455 + ASSERT(folio_pos(locked_folio) == 0); 2456 + 2457 + if (btrfs_inode_can_compress(inode) && 2458 + inode_need_compress(inode, 0, blocksize, true)) { 2459 + if (inode->defrag_compress > 0 && 2460 + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { 2461 + compress_type = inode->defrag_compress; 2462 + compress_level = inode->defrag_compress_level; 2463 + } else if (inode->prop_compress) { 2464 + compress_type = inode->prop_compress; 2465 + } 2466 + cb = btrfs_compress_bio(inode, 0, blocksize, compress_type, compress_level, 0); 2467 + if (IS_ERR(cb)) { 2468 + cb = NULL; 2469 + /* Just fall back to non-compressed case. */ 2470 + } else { 2471 + compressed_size = cb->bbio.bio.bi_iter.bi_size; 2472 + } 2473 + } 2474 + if (!can_cow_file_range_inline(inode, 0, i_size, compressed_size)) { 2475 + if (cb) 2476 + cleanup_compressed_bio(cb); 2477 + return 1; 2478 + } 2479 + 2480 + btrfs_lock_extent(&inode->io_tree, 0, blocksize - 1, &cached); 2481 + if (cb) { 2482 + ret = __cow_file_range_inline(inode, i_size, compressed_size, compress_type, 2483 + bio_first_folio_all(&cb->bbio.bio), false); 2484 + cleanup_compressed_bio(cb); 2485 + cb = NULL; 2486 + } else { 2487 + ret = __cow_file_range_inline(inode, i_size, 0, BTRFS_COMPRESS_NONE, 2488 + NULL, false); 2489 + } 2490 + /* 2491 + * We failed to insert inline extent due to lack of meta space. 2492 + * Just unlock the extent io range and fallback to regular COW/NOCOW path. 
2493 + */ 2494 + if (ret > 0) { 2495 + btrfs_unlock_extent(&inode->io_tree, 0, blocksize - 1, &cached); 2496 + return ret; 2497 + } 2498 + 2499 + /* 2500 + * In the successful case (ret == 0 here), btrfs_run_delalloc_range() 2501 + * will return 1. 2502 + * 2503 + * Quite a bit further up the callstack in extent_writepage(), ret == 1 2504 + * is treated as a short circuited success and does not unlock the folio, 2505 + * so we must do it here. 2506 + * 2507 + * For failure case, the @locked_folio does get unlocked by 2508 + * btrfs_folio_end_lock_bitmap(), so we must *not* unlock it here. 2509 + * 2510 + * So if ret == 0, we let extent_clear_unlock_delalloc() to unlock the 2511 + * folio by passing NULL as @locked_folio. 2512 + * Otherwise pass @locked_folio as usual. 2513 + */ 2514 + if (ret == 0) 2515 + locked_folio = NULL; 2516 + extent_clear_unlock_delalloc(inode, 0, blocksize - 1, locked_folio, &cached, 2517 + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | 2518 + EXTENT_DO_ACCOUNTING | EXTENT_LOCKED, 2519 + PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); 2520 + return ret; 2521 + } 2522 + 2523 + /* 2329 2524 * Function to process delayed allocation (create CoW) for ranges which are 2330 2525 * being touched for the first time. 2331 2526 */ ··· 2426 2451 ASSERT(!(end <= folio_pos(locked_folio) || 2427 2452 start >= folio_next_pos(locked_folio))); 2428 2453 2454 + if (start == 0 && end + 1 <= inode->root->fs_info->sectorsize && 2455 + end + 1 >= inode->disk_i_size) { 2456 + int ret; 2457 + 2458 + ret = run_delalloc_inline(inode, locked_folio); 2459 + if (ret < 0) 2460 + return ret; 2461 + if (ret == 0) 2462 + return 1; 2463 + /* 2464 + * Continue regular handling if we can not create an 2465 + * inlined extent. 
2466 + */ 2467 + } 2468 + 2429 2469 if (should_nocow(inode, start, end)) 2430 2470 return run_delalloc_nocow(inode, locked_folio, start, end); 2431 2471 2432 2472 if (btrfs_inode_can_compress(inode) && 2433 - inode_need_compress(inode, start, end) && 2473 + inode_need_compress(inode, start, end, false) && 2434 2474 run_delalloc_compressed(inode, locked_folio, start, end, wbc)) 2435 2475 return 1; 2436 2476 ··· 2735 2745 } 2736 2746 2737 2747 /* 2738 - * given a list of ordered sums record them in the inode. This happens 2739 - * at IO completion time based on sums calculated at bio submission time. 2748 + * Given an ordered extent and insert all its checksums into the csum tree. 2749 + * 2750 + * This happens at IO completion time based on sums calculated at bio 2751 + * submission time. 2740 2752 */ 2741 2753 static int add_pending_csums(struct btrfs_trans_handle *trans, 2742 - struct list_head *list) 2754 + struct btrfs_ordered_extent *oe) 2743 2755 { 2744 2756 struct btrfs_ordered_sum *sum; 2745 2757 struct btrfs_root *csum_root = NULL; 2746 2758 int ret; 2747 2759 2748 - list_for_each_entry(sum, list, list) { 2760 + list_for_each_entry(sum, &oe->csum_list, list) { 2749 2761 if (!csum_root) { 2750 2762 csum_root = btrfs_csum_root(trans->fs_info, 2751 2763 sum->logical); ··· 2759 2767 } 2760 2768 } 2761 2769 trans->adding_csums = true; 2762 - ret = btrfs_csum_file_blocks(trans, csum_root, sum); 2770 + ret = btrfs_insert_data_csums(trans, csum_root, sum); 2763 2771 trans->adding_csums = false; 2764 2772 if (ret) 2765 2773 return ret; ··· 2948 2956 * to reflect the errors and clean the page. 
2949 2957 */ 2950 2958 mapping_set_error(folio->mapping, ret); 2951 - btrfs_mark_ordered_io_finished(inode, folio, page_start, 2959 + btrfs_folio_clear_ordered(fs_info, folio, page_start, 2960 + folio_size(folio)); 2961 + btrfs_mark_ordered_io_finished(inode, page_start, 2952 2962 folio_size(folio), !ret); 2953 2963 folio_clear_dirty_for_io(folio); 2954 2964 } ··· 3197 3203 bool freespace_inode; 3198 3204 bool truncated = false; 3199 3205 bool clear_reserved_extent = true; 3200 - unsigned int clear_bits = EXTENT_DEFRAG; 3206 + unsigned int clear_bits = 0; 3201 3207 3202 3208 start = ordered_extent->file_offset; 3203 3209 end = start + ordered_extent->num_bytes - 1; ··· 3207 3213 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && 3208 3214 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) 3209 3215 clear_bits |= EXTENT_DELALLOC_NEW; 3216 + 3217 + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) 3218 + clear_bits |= EXTENT_DEFRAG; 3210 3219 3211 3220 freespace_inode = btrfs_is_free_space_inode(inode); 3212 3221 if (!freespace_inode) ··· 3268 3271 3269 3272 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 3270 3273 /* Logic error */ 3271 - ASSERT(list_empty(&ordered_extent->list)); 3272 - if (unlikely(!list_empty(&ordered_extent->list))) { 3274 + ASSERT(list_empty(&ordered_extent->csum_list)); 3275 + if (unlikely(!list_empty(&ordered_extent->csum_list))) { 3273 3276 ret = -EINVAL; 3274 3277 btrfs_abort_transaction(trans, ret); 3275 3278 goto out; ··· 3318 3321 goto out; 3319 3322 } 3320 3323 3321 - ret = add_pending_csums(trans, &ordered_extent->list); 3324 + ret = add_pending_csums(trans, ordered_extent); 3322 3325 if (unlikely(ret)) { 3323 3326 btrfs_abort_transaction(trans, ret); 3324 3327 goto out; ··· 3342 3345 goto out; 3343 3346 } 3344 3347 out: 3345 - btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, 3346 - &cached_state); 3348 + if (clear_bits) 3349 + btrfs_clear_extent_bit(&inode->io_tree, start, end, 
clear_bits, 3350 + &cached_state); 3347 3351 3348 3352 if (trans) 3349 3353 btrfs_end_transaction(trans); ··· 3425 3427 * This needs to be done to make sure anybody waiting knows we are done 3426 3428 * updating everything for this ordered extent. 3427 3429 */ 3428 - btrfs_remove_ordered_extent(inode, ordered_extent); 3430 + btrfs_remove_ordered_extent(ordered_extent); 3429 3431 3430 3432 /* once for us */ 3431 3433 btrfs_put_ordered_extent(ordered_extent); ··· 4695 4697 dir_id = btrfs_super_root_dir(fs_info->super_copy); 4696 4698 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, 4697 4699 dir_id, &name, 0); 4698 - if (di && !IS_ERR(di)) { 4700 + if (!IS_ERR_OR_NULL(di)) { 4699 4701 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); 4700 4702 if (key.objectid == btrfs_root_id(root)) { 4701 4703 ret = -EPERM; ··· 5446 5448 * zero. Make sure any new writes to the file get on disk 5447 5449 * on close. 5448 5450 */ 5449 - if (newsize == 0) 5451 + if (newsize == 0 && oldsize != 0) 5450 5452 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, 5451 5453 &BTRFS_I(inode)->runtime_flags); 5452 5454 ··· 6857 6859 } 6858 6860 } else { 6859 6861 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 6860 - 0, BTRFS_I(inode)->dir_index); 6862 + false, BTRFS_I(inode)->dir_index); 6861 6863 if (unlikely(ret)) { 6862 6864 btrfs_abort_transaction(trans, ret); 6863 6865 goto discard; ··· 7073 7075 inode_set_ctime_current(inode); 7074 7076 7075 7077 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), 7076 - &fname.disk_name, 1, index); 7078 + &fname.disk_name, true, index); 7077 7079 if (ret) 7078 7080 goto fail; 7079 7081 ··· 8171 8173 if (!freespace_inode) 8172 8174 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); 8173 8175 8174 - btrfs_remove_ordered_extent(inode, ordered); 8176 + btrfs_remove_ordered_extent(ordered); 8175 8177 btrfs_put_ordered_extent(ordered); 8176 8178 btrfs_put_ordered_extent(ordered); 8177 8179 } ··· 8493 8495 } 8494 8496 8495 8497 ret = 
btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8496 - new_name, 0, old_idx); 8498 + new_name, false, old_idx); 8497 8499 if (unlikely(ret)) { 8498 8500 btrfs_abort_transaction(trans, ret); 8499 8501 goto out_fail; 8500 8502 } 8501 8503 8502 8504 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), 8503 - old_name, 0, new_idx); 8505 + old_name, false, new_idx); 8504 8506 if (unlikely(ret)) { 8505 8507 btrfs_abort_transaction(trans, ret); 8506 8508 goto out_fail; ··· 8791 8793 } 8792 8794 8793 8795 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), 8794 - &new_fname.disk_name, 0, index); 8796 + &new_fname.disk_name, false, index); 8795 8797 if (unlikely(ret)) { 8796 8798 btrfs_abort_transaction(trans, ret); 8797 8799 goto out_fail; ··· 8976 8978 { 8977 8979 struct btrfs_fs_info *fs_info = root->fs_info; 8978 8980 8979 - if (BTRFS_FS_ERROR(fs_info)) 8981 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 8980 8982 return -EROFS; 8981 8983 return start_delalloc_inodes(root, NULL, true, in_reclaim_context); 8982 8984 } ··· 8989 8991 LIST_HEAD(splice); 8990 8992 int ret; 8991 8993 8992 - if (BTRFS_FS_ERROR(fs_info)) 8994 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 8993 8995 return -EROFS; 8994 8996 8995 8997 mutex_lock(&fs_info->delalloc_root_mutex); ··· 9984 9986 size_t bytes = min(min_folio_size, iov_iter_count(from)); 9985 9987 char *kaddr; 9986 9988 9987 - folio = btrfs_alloc_compr_folio(fs_info); 9989 + folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 9988 9990 if (!folio) { 9989 9991 ret = -ENOMEM; 9990 9992 goto out_cb;

+5 -4

fs/btrfs/ioctl.c

··· 2897 2897 return -ENOMEM; 2898 2898 2899 2899 space_args.total_spaces = 0; 2900 - dest = kmalloc(alloc_size, GFP_KERNEL); 2900 + dest = kzalloc(alloc_size, GFP_KERNEL); 2901 2901 if (!dest) 2902 2902 return -ENOMEM; 2903 2903 dest_orig = dest; ··· 2953 2953 user_dest = (struct btrfs_ioctl_space_info __user *) 2954 2954 (arg + sizeof(struct btrfs_ioctl_space_args)); 2955 2955 2956 - if (copy_to_user(user_dest, dest_orig, alloc_size)) 2956 + if (copy_to_user(user_dest, dest_orig, 2957 + space_args.total_spaces * sizeof(*dest_orig))) 2957 2958 return -EFAULT; 2958 2959 2959 2960 out: ··· 3039 3038 3040 3039 ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, 3041 3040 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 3042 - 0); 3041 + false); 3043 3042 3044 3043 /* 3045 3044 * Copy scrub args to user space even if btrfs_scrub_dev() returned an ··· 3929 3928 ret = btrfs_uuid_tree_add(trans, sa->uuid, 3930 3929 BTRFS_UUID_KEY_RECEIVED_SUBVOL, 3931 3930 btrfs_root_id(root)); 3932 - if (unlikely(ret < 0 && ret != -EEXIST)) { 3931 + if (unlikely(ret < 0)) { 3933 3932 btrfs_abort_transaction(trans, ret); 3934 3933 btrfs_end_transaction(trans); 3935 3934 goto out;

+13 -28

fs/btrfs/lzo.c

··· 106 106 return ERR_PTR(-ENOMEM); 107 107 } 108 108 109 - static inline void write_compress_length(char *buf, size_t len) 110 - { 111 - __le32 dlen; 112 - 113 - dlen = cpu_to_le32(len); 114 - memcpy(buf, &dlen, LZO_LEN); 115 - } 116 - 117 - static inline size_t read_compress_length(const char *buf) 118 - { 119 - __le32 dlen; 120 - 121 - memcpy(&dlen, buf, LZO_LEN); 122 - return le32_to_cpu(dlen); 123 - } 124 - 125 109 /* 126 110 * Write data into @out_folio and queue it into @out_bio. 127 111 * ··· 202 218 ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); 203 219 204 220 if (!*out_folio) { 205 - *out_folio = btrfs_alloc_compr_folio(fs_info); 221 + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 206 222 if (!*out_folio) 207 223 return -ENOMEM; 208 224 } 209 225 210 226 /* Write the segment header first. */ 211 227 kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); 212 - write_compress_length(kaddr, compressed_size); 228 + put_unaligned_le32(compressed_size, kaddr); 213 229 kunmap_local(kaddr); 214 230 ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); 215 231 if (ret < 0) ··· 229 245 return -E2BIG; 230 246 231 247 if (!*out_folio) { 232 - *out_folio = btrfs_alloc_compr_folio(fs_info); 248 + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 233 249 if (!*out_folio) 234 250 return -ENOMEM; 235 251 } ··· 280 296 ASSERT(bio->bi_iter.bi_size == 0); 281 297 ASSERT(len); 282 298 283 - folio_out = btrfs_alloc_compr_folio(fs_info); 299 + folio_out = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 284 300 if (!folio_out) 285 301 return -ENOMEM; 286 302 ··· 346 362 347 363 /* Store the size of all chunks of compressed data */ 348 364 sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); 349 - write_compress_length(sizes_ptr, total_out); 365 + put_unaligned_le32(total_out, sizes_ptr); 350 366 kunmap_local(sizes_ptr); 351 367 out: 352 368 /* ··· 415 431 struct workspace 
*workspace = list_entry(ws, struct workspace, list); 416 432 struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; 417 433 const u32 sectorsize = fs_info->sectorsize; 434 + const u32 compressed_len = bio_get_size(&cb->bbio.bio); 418 435 struct folio_iter fi; 419 436 char *kaddr; 420 437 int ret; ··· 434 449 return -EINVAL; 435 450 ASSERT(folio_size(fi.folio) == btrfs_min_folio_size(fs_info)); 436 451 kaddr = kmap_local_folio(fi.folio, 0); 437 - len_in = read_compress_length(kaddr); 452 + len_in = get_unaligned_le32(kaddr); 438 453 kunmap_local(kaddr); 439 454 cur_in += LZO_LEN; 440 455 ··· 445 460 * and all sectors should be used. 446 461 * If this happens, it means the compressed extent is corrupted. 447 462 */ 448 - if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || 449 - round_up(len_in, sectorsize) < cb->compressed_len)) { 463 + if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, compressed_len) || 464 + round_up(len_in, sectorsize) < compressed_len)) { 450 465 struct btrfs_inode *inode = cb->bbio.inode; 451 466 452 467 btrfs_err(fs_info, 453 468 "lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u", 454 469 btrfs_root_id(inode->root), btrfs_ino(inode), 455 - cb->start, len_in, cb->compressed_len); 470 + cb->start, len_in, compressed_len); 456 471 return -EUCLEAN; 457 472 } 458 473 ··· 473 488 cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); 474 489 ASSERT(cur_folio); 475 490 kaddr = kmap_local_folio(cur_folio, 0); 476 - seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); 491 + seg_len = get_unaligned_le32(kaddr + offset_in_folio(cur_folio, cur_in)); 477 492 kunmap_local(kaddr); 478 493 cur_in += LZO_LEN; 479 494 ··· 544 559 if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) 545 560 return -EUCLEAN; 546 561 547 - in_len = read_compress_length(data_in); 562 + in_len = get_unaligned_le32(data_in); 548 563 if (unlikely(in_len 
!= srclen)) 549 564 return -EUCLEAN; 550 565 data_in += LZO_LEN; 551 566 552 - in_len = read_compress_length(data_in); 567 + in_len = get_unaligned_le32(data_in); 553 568 if (unlikely(in_len != srclen - LZO_LEN * 2)) 554 569 return -EUCLEAN; 555 570 data_in += LZO_LEN;

+1 -1

fs/btrfs/messages.c

··· 37 37 memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); 38 38 curr += sizeof(STATE_STRING_PREFACE) - 1; 39 39 40 - if (BTRFS_FS_ERROR(info)) { 40 + if (unlikely(BTRFS_FS_ERROR(info))) { 41 41 *curr++ = 'E'; 42 42 states_printed = true; 43 43 }

+4 -4

fs/btrfs/messages.h

··· 144 144 verify_assert_printk_format("check the format string" args); \ 145 145 if (!likely(cond)) { \ 146 146 if (("" __FIRST_ARG(args) [0]) == 0) { \ 147 - pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ 148 - #cond, (long)(cond), __FILE__, __LINE__); \ 147 + pr_err("assertion failed: %s, in %s:%d\n", \ 148 + #cond, __FILE__, __LINE__); \ 149 149 } else { \ 150 - pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ 151 - #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ 150 + pr_err("assertion failed: %s, in %s:%d (" __FIRST_ARG(args) ")\n", \ 151 + #cond, __FILE__, __LINE__ __REST_ARGS(args)); \ 152 152 } \ 153 153 BUG(); \ 154 154 } \

+13 -5

fs/btrfs/misc.h

··· 28 28 name = (1U << __ ## name ## _BIT), \ 29 29 __ ## name ## _SEQ = __ ## name ## _BIT 30 30 31 - static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) 31 + static inline phys_addr_t bio_iter_phys(const struct bio *bio, 32 + const struct bvec_iter *iter) 32 33 { 33 34 struct bio_vec bv = bio_iter_iovec(bio, *iter); 34 35 ··· 53 52 (paddr = bio_iter_phys((bio), (iter)), 1); \ 54 53 bio_advance_iter_single((bio), (iter), (blocksize))) 55 54 56 - /* Initialize a bvec_iter to the size of the specified bio. */ 57 - static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) 55 + /* Can only be called on a non-cloned bio. */ 56 + static inline u32 bio_get_size(struct bio *bio) 58 57 { 59 58 struct bio_vec *bvec; 60 - u32 bio_size = 0; 59 + u32 ret = 0; 61 60 int i; 62 61 63 62 bio_for_each_bvec_all(bvec, bio, i) 64 - bio_size += bvec->bv_len; 63 + ret += bvec->bv_len; 64 + return ret; 65 + } 66 + 67 + /* Initialize a bvec_iter to the size of the specified bio. */ 68 + static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) 69 + { 70 + const u32 bio_size = bio_get_size(bio); 65 71 66 72 return (struct bvec_iter) { 67 73 .bi_sector = 0,

+34 -35

fs/btrfs/ordered-data.c

··· 156 156 const bool is_nocow = (flags & 157 157 ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); 158 158 159 + /* Only one type flag can be set. */ 160 + ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS)); 161 + 162 + /* DIRECT cannot be set with COMPRESSED nor ENCODED. */ 163 + if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) { 164 + ASSERT(!test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); 165 + ASSERT(!test_bit(BTRFS_ORDERED_ENCODED, &flags)); 166 + } 167 + 168 + /* ENCODED must be set with COMPRESSED. */ 169 + if (test_bit(BTRFS_ORDERED_ENCODED, &flags)) 170 + ASSERT(test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); 171 + 159 172 /* 160 173 * For a NOCOW write we can free the qgroup reserve right now. For a COW 161 174 * one we transfer the reserved space from the inode's iotree into the ··· 210 197 entry->flags = flags; 211 198 refcount_set(&entry->refs, 1); 212 199 init_waitqueue_head(&entry->wait); 213 - INIT_LIST_HEAD(&entry->list); 200 + INIT_LIST_HEAD(&entry->csum_list); 214 201 INIT_LIST_HEAD(&entry->log_list); 215 202 INIT_LIST_HEAD(&entry->root_extent_list); 216 203 INIT_LIST_HEAD(&entry->work_list); ··· 253 240 spin_lock(&inode->ordered_tree_lock); 254 241 node = tree_insert(&inode->ordered_tree, entry->file_offset, 255 242 &entry->rb_node); 256 - if (unlikely(node)) 243 + if (unlikely(node)) { 244 + struct btrfs_ordered_extent *exist = 245 + rb_entry(node, struct btrfs_ordered_extent, rb_node); 246 + 257 247 btrfs_panic(fs_info, -EEXIST, 258 - "inconsistency in ordered tree at offset %llu", 259 - entry->file_offset); 248 + "overlapping ordered extents, existing oe file_offset %llu num_bytes %llu flags 0x%lx, new oe file_offset %llu num_bytes %llu flags 0x%lx", 249 + exist->file_offset, exist->num_bytes, exist->flags, 250 + entry->file_offset, entry->num_bytes, entry->flags); 251 + } 260 252 spin_unlock(&inode->ordered_tree_lock); 261 253 262 254 spin_lock(&root->ordered_extent_lock); ··· 347 329 struct btrfs_inode *inode = 
entry->inode; 348 330 349 331 spin_lock(&inode->ordered_tree_lock); 350 - list_add_tail(&sum->list, &entry->list); 332 + list_add_tail(&sum->list, &entry->csum_list); 351 333 spin_unlock(&inode->ordered_tree_lock); 352 334 } 353 335 ··· 366 348 } 367 349 368 350 static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, 369 - struct folio *folio, u64 file_offset, 370 - u64 len, bool uptodate) 351 + u64 file_offset, u64 len, bool uptodate) 371 352 { 372 353 struct btrfs_inode *inode = ordered->inode; 373 354 struct btrfs_fs_info *fs_info = inode->root->fs_info; 374 355 375 356 lockdep_assert_held(&inode->ordered_tree_lock); 376 - 377 - if (folio) { 378 - ASSERT(folio->mapping); 379 - ASSERT(folio_pos(folio) <= file_offset); 380 - ASSERT(file_offset + len <= folio_next_pos(folio)); 381 - 382 - /* 383 - * Ordered flag indicates whether we still have 384 - * pending io unfinished for the ordered extent. 385 - * 386 - * If it's not set, we need to skip to next range. 387 - */ 388 - if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) 389 - return false; 390 - btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); 391 - } 392 357 393 358 /* Now we're fine to update the accounting. 
*/ 394 359 if (WARN_ON_ONCE(len > ordered->bytes_left)) { ··· 386 385 } 387 386 388 387 if (!uptodate) 389 - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 388 + btrfs_mark_ordered_extent_error(ordered); 390 389 391 390 if (ordered->bytes_left) 392 391 return false; ··· 414 413 } 415 414 416 415 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, 417 - struct folio *folio, u64 file_offset, u64 len, 418 - bool uptodate) 416 + u64 file_offset, u64 len, bool uptodate) 419 417 { 420 418 struct btrfs_inode *inode = ordered->inode; 421 419 bool ret; ··· 422 422 trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); 423 423 424 424 spin_lock(&inode->ordered_tree_lock); 425 - ret = can_finish_ordered_extent(ordered, folio, file_offset, len, 425 + ret = can_finish_ordered_extent(ordered, file_offset, len, 426 426 uptodate); 427 427 spin_unlock(&inode->ordered_tree_lock); 428 428 ··· 475 475 * extent(s) covering it. 476 476 */ 477 477 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, 478 - struct folio *folio, u64 file_offset, 479 - u64 num_bytes, bool uptodate) 478 + u64 file_offset, u64 num_bytes, bool uptodate) 480 479 { 481 480 struct rb_node *node; 482 481 struct btrfs_ordered_extent *entry = NULL; ··· 535 536 len = this_end - cur; 536 537 ASSERT(len < U32_MAX); 537 538 538 - if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { 539 + if (can_finish_ordered_extent(entry, cur, len, uptodate)) { 539 540 spin_unlock(&inode->ordered_tree_lock); 540 541 btrfs_queue_ordered_fn(entry); 541 542 spin_lock(&inode->ordered_tree_lock); ··· 627 628 ASSERT(list_empty(&entry->log_list)); 628 629 ASSERT(RB_EMPTY_NODE(&entry->rb_node)); 629 630 btrfs_add_delayed_iput(entry->inode); 630 - list_for_each_entry_safe(sum, tmp, &entry->list, list) 631 + list_for_each_entry_safe(sum, tmp, &entry->csum_list, list) 631 632 kvfree(sum); 632 633 kmem_cache_free(btrfs_ordered_extent_cache, entry); 633 634 } ··· 637 638 * remove an ordered 
extent from the tree. No references are dropped 638 639 * and waiters are woken up. 639 640 */ 640 - void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, 641 - struct btrfs_ordered_extent *entry) 641 + void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry) 642 642 { 643 + struct btrfs_inode *btrfs_inode = entry->inode; 643 644 struct btrfs_root *root = btrfs_inode->root; 644 645 struct btrfs_fs_info *fs_info = root->fs_info; 645 646 struct rb_node *node; ··· 1322 1323 } 1323 1324 } 1324 1325 1325 - list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { 1326 + list_for_each_entry_safe(sum, tmpsum, &ordered->csum_list, list) { 1326 1327 if (offset == len) 1327 1328 break; 1328 - list_move_tail(&sum->list, &new->list); 1329 + list_move_tail(&sum->list, &new->csum_list); 1329 1330 offset += sum->len; 1330 1331 } 1331 1332

+35 -31

fs/btrfs/ordered-data.h

··· 47 47 * IO is done and any metadata is inserted into the tree. 48 48 */ 49 49 enum { 50 + /* Extra status bits for ordered extents */ 51 + 52 + /* Set when all the pages are written. */ 53 + BTRFS_ORDERED_IO_DONE, 54 + /* Set when removed from the tree. */ 55 + BTRFS_ORDERED_COMPLETE, 56 + /* We had an io error when writing this out. */ 57 + BTRFS_ORDERED_IOERR, 58 + /* Set when we have to truncate an extent. */ 59 + BTRFS_ORDERED_TRUNCATED, 60 + /* Used during fsync to track already logged extents. */ 61 + BTRFS_ORDERED_LOGGED, 62 + /* We have already logged all the csums of the ordered extent. */ 63 + BTRFS_ORDERED_LOGGED_CSUM, 64 + /* We wait for this extent to complete in the current transaction. */ 65 + BTRFS_ORDERED_PENDING, 66 + 50 67 /* 51 - * Different types for ordered extents, one and only one of the 4 types 68 + * Different types for ordered extents, one and only one of these types 52 69 * need to be set when creating ordered extent. 53 70 * 54 71 * REGULAR: For regular non-compressed COW write ··· 78 61 BTRFS_ORDERED_PREALLOC, 79 62 BTRFS_ORDERED_COMPRESSED, 80 63 64 + /* Extra bit for encoded write, must be set with COMPRESSED. */ 65 + BTRFS_ORDERED_ENCODED, 66 + 81 67 /* 82 68 * Extra bit for direct io, can only be set for 83 - * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. 69 + * REGULAR/NOCOW/PREALLOC. Must not be set for COMPRESSED nor ENCODED. 
84 70 */ 85 71 BTRFS_ORDERED_DIRECT, 86 72 87 - /* Extra status bits for ordered extents */ 88 - 89 - /* set when all the pages are written */ 90 - BTRFS_ORDERED_IO_DONE, 91 - /* set when removed from the tree */ 92 - BTRFS_ORDERED_COMPLETE, 93 - /* We had an io error when writing this out */ 94 - BTRFS_ORDERED_IOERR, 95 - /* Set when we have to truncate an extent */ 96 - BTRFS_ORDERED_TRUNCATED, 97 - /* Used during fsync to track already logged extents */ 98 - BTRFS_ORDERED_LOGGED, 99 - /* We have already logged all the csums of the ordered extent */ 100 - BTRFS_ORDERED_LOGGED_CSUM, 101 - /* We wait for this extent to complete in the current transaction */ 102 - BTRFS_ORDERED_PENDING, 103 - /* BTRFS_IOC_ENCODED_WRITE */ 104 - BTRFS_ORDERED_ENCODED, 73 + BTRFS_ORDERED_NR_FLAGS, 105 74 }; 75 + static_assert(BTRFS_ORDERED_NR_FLAGS <= BITS_PER_LONG); 76 + 77 + /* One and only one flag can be set. */ 78 + #define BTRFS_ORDERED_EXCLUSIVE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ 79 + (1UL << BTRFS_ORDERED_NOCOW) | \ 80 + (1UL << BTRFS_ORDERED_PREALLOC) | \ 81 + (1UL << BTRFS_ORDERED_COMPRESSED)) 106 82 107 83 /* BTRFS_ORDERED_* flags that specify the type of the extent. 
*/ 108 - #define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ 109 - (1UL << BTRFS_ORDERED_NOCOW) | \ 110 - (1UL << BTRFS_ORDERED_PREALLOC) | \ 111 - (1UL << BTRFS_ORDERED_COMPRESSED) | \ 84 + #define BTRFS_ORDERED_TYPE_FLAGS (BTRFS_ORDERED_EXCLUSIVE_FLAGS | \ 112 85 (1UL << BTRFS_ORDERED_DIRECT) | \ 113 86 (1UL << BTRFS_ORDERED_ENCODED)) 114 87 ··· 141 134 struct btrfs_inode *inode; 142 135 143 136 /* list of checksums for insertion when the extent io is done */ 144 - struct list_head list; 137 + struct list_head csum_list; 145 138 146 139 /* used for fast fsyncs */ 147 140 struct list_head log_list; ··· 168 161 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); 169 162 170 163 void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); 171 - void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, 172 - struct btrfs_ordered_extent *entry); 164 + void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry); 173 165 void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, 174 - struct folio *folio, u64 file_offset, u64 len, 175 - bool uptodate); 166 + u64 file_offset, u64 len, bool uptodate); 176 167 void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, 177 - struct folio *folio, u64 file_offset, 178 - u64 num_bytes, bool uptodate); 168 + u64 file_offset, u64 num_bytes, bool uptodate); 179 169 bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, 180 170 struct btrfs_ordered_extent **cached, 181 171 u64 file_offset, u64 io_size);

-4

fs/btrfs/print-tree.c

··· 626 626 next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); 627 627 if (IS_ERR(next)) 628 628 continue; 629 - if (!extent_buffer_uptodate(next)) { 630 - free_extent_buffer(next); 631 - continue; 632 - } 633 629 634 630 if (btrfs_is_leaf(next) && 635 631 level != 1)

+20 -22

fs/btrfs/qgroup.c

··· 2740 2740 } 2741 2741 } 2742 2742 2743 - #define UPDATE_NEW 0 2744 - #define UPDATE_OLD 1 2745 2743 /* 2746 2744 * Walk all of the roots that points to the bytenr and adjust their refcnts. 2747 2745 */ ··· 2978 2980 seq = fs_info->qgroup_seq; 2979 2981 2980 2982 /* Update old refcnts using old_roots */ 2981 - qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); 2983 + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, true); 2982 2984 2983 2985 /* Update new refcnts using new_roots */ 2984 - qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); 2986 + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, false); 2985 2987 2986 2988 qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, 2987 2989 num_bytes, seq); ··· 4324 4326 u64 freed = 0; 4325 4327 int ret; 4326 4328 4327 - extent_changeset_init(&changeset); 4329 + extent_changeset_init_bytes_only(&changeset); 4328 4330 len = round_up(start + len, root->fs_info->sectorsize); 4329 4331 start = round_down(start, root->fs_info->sectorsize); 4330 4332 ··· 4389 4391 WARN_ON(!free && reserved); 4390 4392 if (free && reserved) 4391 4393 return qgroup_free_reserved_data(inode, reserved, start, len, released); 4392 - extent_changeset_init(&changeset); 4394 + extent_changeset_init_bytes_only(&changeset); 4393 4395 ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, 4394 4396 EXTENT_QGROUP_RESERVED, &changeset); 4395 4397 if (ret < 0) ··· 4489 4491 return num_bytes; 4490 4492 } 4491 4493 4492 - int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4493 - enum btrfs_qgroup_rsv_type type, bool enforce) 4494 + static int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 4495 + enum btrfs_qgroup_rsv_type type, bool enforce) 4494 4496 { 4495 4497 struct btrfs_fs_info *fs_info = root->fs_info; 4496 4498 int ret; ··· 4516 4518 return ret; 4517 4519 } 4518 4520 4519 - int __btrfs_qgroup_reserve_meta(struct btrfs_root 
*root, int num_bytes, 4520 - enum btrfs_qgroup_rsv_type type, bool enforce, 4521 - bool noflush) 4521 + int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, 4522 + bool enforce, bool noflush) 4522 4523 { 4523 4524 int ret; 4524 4525 4525 - ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4526 + ret = btrfs_qgroup_reserve_meta(root, num_bytes, 4527 + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); 4526 4528 if ((ret <= 0 && ret != -EDQUOT) || noflush) 4527 4529 return ret; 4528 4530 4529 4531 ret = try_flush_qgroup(root); 4530 4532 if (ret < 0) 4531 4533 return ret; 4532 - return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); 4534 + return btrfs_qgroup_reserve_meta(root, num_bytes, 4535 + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); 4533 4536 } 4534 4537 4535 4538 /* ··· 4552 4553 BTRFS_QGROUP_RSV_META_PERTRANS); 4553 4554 } 4554 4555 4555 - void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 4556 - enum btrfs_qgroup_rsv_type type) 4556 + void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes) 4557 4557 { 4558 4558 struct btrfs_fs_info *fs_info = root->fs_info; 4559 4559 ··· 4565 4567 * which can lead to underflow. 4566 4568 * Here ensure we will only free what we really have reserved. 
4567 4569 */ 4568 - num_bytes = sub_root_meta_rsv(root, num_bytes, type); 4570 + num_bytes = sub_root_meta_rsv(root, num_bytes, 4571 + BTRFS_QGROUP_RSV_META_PREALLOC); 4569 4572 BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); 4570 - trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); 4571 - btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); 4573 + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, 4574 + BTRFS_QGROUP_RSV_META_PREALLOC); 4575 + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, 4576 + BTRFS_QGROUP_RSV_META_PREALLOC); 4572 4577 } 4573 4578 4574 4579 static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, ··· 4647 4646 4648 4647 WARN_ON(ret < 0); 4649 4648 if (WARN_ON(changeset.bytes_changed)) { 4649 + ASSERT(extent_changeset_tracks_ranges(&changeset)); 4650 4650 ULIST_ITER_INIT(&iter); 4651 4651 while ((unode = ulist_next(&changeset.range_changed, &iter))) { 4652 4652 btrfs_warn(inode->root->fs_info, ··· 4883 4881 if (IS_ERR(reloc_eb)) { 4884 4882 ret = PTR_ERR(reloc_eb); 4885 4883 reloc_eb = NULL; 4886 - goto free_out; 4887 - } 4888 - if (unlikely(!extent_buffer_uptodate(reloc_eb))) { 4889 - ret = -EIO; 4890 4884 goto free_out; 4891 4885 } 4892 4886

+3 -39

fs/btrfs/qgroup.h

··· 392 392 int btrfs_qgroup_free_data(struct btrfs_inode *inode, 393 393 struct extent_changeset *reserved, u64 start, 394 394 u64 len, u64 *freed); 395 - int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 396 - enum btrfs_qgroup_rsv_type type, bool enforce); 397 - int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, 398 - enum btrfs_qgroup_rsv_type type, bool enforce, 399 - bool noflush); 400 - /* Reserve metadata space for pertrans and prealloc type */ 401 - static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, 402 - int num_bytes, bool enforce) 403 - { 404 - return __btrfs_qgroup_reserve_meta(root, num_bytes, 405 - BTRFS_QGROUP_RSV_META_PERTRANS, 406 - enforce, false); 407 - } 408 - static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, 409 - int num_bytes, bool enforce, 410 - bool noflush) 411 - { 412 - return __btrfs_qgroup_reserve_meta(root, num_bytes, 413 - BTRFS_QGROUP_RSV_META_PREALLOC, 414 - enforce, noflush); 415 - } 416 - 417 - void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, 418 - enum btrfs_qgroup_rsv_type type); 419 - 420 - /* Free per-transaction meta reservation for error handling */ 421 - static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, 422 - int num_bytes) 423 - { 424 - __btrfs_qgroup_free_meta(root, num_bytes, 425 - BTRFS_QGROUP_RSV_META_PERTRANS); 426 - } 427 - 395 + int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, 396 + bool enforce, bool noflush); 428 397 /* Pre-allocated meta reservation can be freed at need */ 429 - static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, 430 - int num_bytes) 431 - { 432 - __btrfs_qgroup_free_meta(root, num_bytes, 433 - BTRFS_QGROUP_RSV_META_PREALLOC); 434 - } 398 + void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes); 435 399 436 400 void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); 437 
401 void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);

+1 -1

fs/btrfs/raid-stripe-tree.c

··· 300 300 int ret; 301 301 302 302 stripe_extent = kzalloc(item_size, GFP_NOFS); 303 - if (!unlikely(stripe_extent)) { 303 + if (unlikely(!stripe_extent)) { 304 304 btrfs_abort_transaction(trans, -ENOMEM); 305 305 btrfs_end_transaction(trans); 306 306 return -ENOMEM;

+6 -11

fs/btrfs/raid56.c

··· 1653 1653 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) 1654 1654 { 1655 1655 int total_sector_nr = get_bio_sector_nr(rbio, bio); 1656 - u32 bio_size = 0; 1657 - struct bio_vec *bvec; 1658 - int i; 1659 - 1660 - bio_for_each_bvec_all(bvec, bio, i) 1661 - bio_size += bvec->bv_len; 1656 + const u32 bio_size = bio_get_size(bio); 1662 1657 1663 1658 /* 1664 1659 * Since we can have multiple bios touching the error_bitmap, we cannot ··· 1661 1666 * 1662 1667 * Instead use set_bit() for each bit, as set_bit() itself is atomic. 1663 1668 */ 1664 - for (i = total_sector_nr; i < total_sector_nr + 1669 + for (int i = total_sector_nr; i < total_sector_nr + 1665 1670 (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) 1666 1671 set_bit(i, rbio->error_bitmap); 1667 1672 } ··· 2105 2110 * @unmap_array stores copy of pointers that does not get reordered 2106 2111 * during reconstruction so that kunmap_local works. 2107 2112 */ 2108 - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2109 - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2113 + pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); 2114 + unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); 2110 2115 if (!pointers || !unmap_array) { 2111 2116 ret = -ENOMEM; 2112 2117 goto out; ··· 2839 2844 * @unmap_array stores copy of pointers that does not get reordered 2840 2845 * during reconstruction so that kunmap_local works. 2841 2846 */ 2842 - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2843 - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); 2847 + pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); 2848 + unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); 2844 2849 if (!pointers || !unmap_array) { 2845 2850 ret = -ENOMEM; 2846 2851 goto out;

+47 -2

fs/btrfs/reflink.c

··· 322 322 323 323 ret = copy_inline_to_page(inode, new_key->offset, 324 324 inline_data, size, datal, comp_type); 325 + 326 + /* 327 + * If we copied the inline extent data to a page/folio beyond the i_size 328 + * of the destination inode, then we need to increase the i_size before 329 + * we start a transaction to update the inode item. This is to prevent a 330 + * deadlock when the flushoncommit mount option is used, which happens 331 + * like this: 332 + * 333 + * 1) Task A clones an inline extent from inode X to an offset of inode 334 + * Y that is beyond Y's current i_size. This means we copied the 335 + * inline extent's data to a folio of inode Y that is beyond its EOF, 336 + * using the call above to copy_inline_to_page(); 337 + * 338 + * 2) Task B starts a transaction commit and calls 339 + * btrfs_start_delalloc_flush() to flush delalloc; 340 + * 341 + * 3) The delalloc flushing sees the new dirty folio of inode Y and when 342 + * it attempts to flush it, it ends up at extent_writepage() and sees 343 + * that the offset of the folio is beyond the i_size of inode Y, so 344 + * it attempts to invalidate the folio by calling folio_invalidate(), 345 + * which ends up at btrfs' folio invalidate callback - 346 + * btrfs_invalidate_folio(). There it tries to lock the folio's range 347 + * in inode Y's extent io tree, but it blocks since it's currently 348 + * locked by task A - during reflink we lock the inodes and the 349 + * source and destination ranges after flushing all delalloc and 350 + * waiting for ordered extent completion - after that we don't expect 351 + * to have dirty folios in the ranges, the exception is if we have to 352 + * copy an inline extent's data (because the destination offset is 353 + * not zero); 354 + * 355 + * 4) Task A then does the 'goto out' below and attempts to start a 356 + * transaction to update the inode item, and then it's blocked since 357 + * the current transaction is in the TRANS_STATE_COMMIT_START state. 
358 + * Therefore task A has to wait for the current transaction to become 359 + * unblocked (its state >= TRANS_STATE_UNBLOCKED). 360 + * 361 + * This leads to a deadlock - the task committing the transaction 362 + * waiting for the delalloc flushing which is blocked during folio 363 + * invalidation on the inode's extent lock and the reflink task waiting 364 + * for the current transaction to be unblocked so that it can start a 365 + * new one to update the inode item (while holding the extent lock). 366 + */ 367 + if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode)) 368 + i_size_write(&inode->vfs_inode, new_key->offset + datal); 369 + 325 370 goto out; 326 371 } 327 372 ··· 691 646 */ 692 647 btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); 693 648 ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, 694 - ALIGN(len, bs), dst_loff, 1); 649 + ALIGN(len, bs), dst_loff, true); 695 650 btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); 696 651 697 652 btrfs_btree_balance_dirty(fs_info); ··· 792 747 */ 793 748 end = destoff + len - 1; 794 749 btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); 795 - ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); 750 + ret = btrfs_clone(src, inode, off, olen, len, destoff, false); 796 751 btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); 797 752 if (ret < 0) 798 753 return ret;

+2 -10

fs/btrfs/relocation.c

··· 2440 2440 eb = read_tree_block(fs_info, block->bytenr, &check); 2441 2441 if (IS_ERR(eb)) 2442 2442 return PTR_ERR(eb); 2443 - if (unlikely(!extent_buffer_uptodate(eb))) { 2444 - free_extent_buffer(eb); 2445 - return -EIO; 2446 - } 2443 + 2447 2444 if (block->level == 0) 2448 2445 btrfs_item_key_to_cpu(eb, &block->key, 0); 2449 2446 else ··· 3642 3645 btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); 3643 3646 3644 3647 /* get rid of pinned extents */ 3645 - trans = btrfs_join_transaction(rc->extent_root); 3646 - if (IS_ERR(trans)) { 3647 - err = PTR_ERR(trans); 3648 - goto out_free; 3649 - } 3650 - ret = btrfs_commit_transaction(trans); 3648 + ret = btrfs_commit_current_transaction(rc->extent_root); 3651 3649 if (ret && !err) 3652 3650 err = ret; 3653 3651 out_free:

+5 -19

fs/btrfs/scrub.c

··· 891 891 { 892 892 struct scrub_stripe *stripe = bbio->private; 893 893 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 894 - struct bio_vec *bvec; 895 894 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 896 - u32 bio_size = 0; 897 - int i; 895 + const u32 bio_size = bio_get_size(&bbio->bio); 898 896 899 897 ASSERT(sector_nr < stripe->nr_sectors); 900 - 901 - bio_for_each_bvec_all(bvec, &bbio->bio, i) 902 - bio_size += bvec->bv_len; 903 898 904 899 if (bbio->bio.bi_status) { 905 900 scrub_bitmap_set_io_error(stripe, sector_nr, ··· 1244 1249 static void scrub_read_endio(struct btrfs_bio *bbio) 1245 1250 { 1246 1251 struct scrub_stripe *stripe = bbio->private; 1247 - struct bio_vec *bvec; 1248 1252 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1249 1253 int num_sectors; 1250 - u32 bio_size = 0; 1251 - int i; 1254 + const u32 bio_size = bio_get_size(&bbio->bio); 1252 1255 1253 1256 ASSERT(sector_nr < stripe->nr_sectors); 1254 - bio_for_each_bvec_all(bvec, &bbio->bio, i) 1255 - bio_size += bvec->bv_len; 1256 1257 num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; 1257 1258 1258 1259 if (bbio->bio.bi_status) { ··· 1269 1278 { 1270 1279 struct scrub_stripe *stripe = bbio->private; 1271 1280 struct btrfs_fs_info *fs_info = stripe->bg->fs_info; 1272 - struct bio_vec *bvec; 1273 1281 int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); 1274 - u32 bio_size = 0; 1275 - int i; 1276 - 1277 - bio_for_each_bvec_all(bvec, &bbio->bio, i) 1278 - bio_size += bvec->bv_len; 1282 + const u32 bio_size = bio_get_size(&bbio->bio); 1279 1283 1280 1284 if (bbio->bio.bi_status) { 1281 1285 unsigned long flags; ··· 1279 1293 bitmap_set(&stripe->write_error_bitmap, sector_nr, 1280 1294 bio_size >> fs_info->sectorsize_bits); 1281 1295 spin_unlock_irqrestore(&stripe->write_error_lock, flags); 1282 - for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) 1296 + for (int i = 0; i < (bio_size 
>> fs_info->sectorsize_bits); i++) 1283 1297 btrfs_dev_stat_inc_and_print(stripe->dev, 1284 1298 BTRFS_DEV_STAT_WRITE_ERRS); 1285 1299 } ··· 2974 2988 struct page *page; 2975 2989 struct btrfs_fs_info *fs_info = sctx->fs_info; 2976 2990 2977 - if (BTRFS_FS_ERROR(fs_info)) 2991 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 2978 2992 return -EROFS; 2979 2993 2980 2994 page = alloc_page(GFP_KERNEL);

+3 -3

fs/btrfs/send.c

··· 7201 7201 sctx->right_path = right_path; 7202 7202 sctx->cmp_key = key; 7203 7203 7204 - ret = finish_inode_if_needed(sctx, 0); 7204 + ret = finish_inode_if_needed(sctx, false); 7205 7205 if (ret < 0) 7206 7206 return ret; 7207 7207 ··· 7328 7328 } 7329 7329 7330 7330 out_finish: 7331 - return finish_inode_if_needed(sctx, 1); 7331 + return finish_inode_if_needed(sctx, true); 7332 7332 } 7333 7333 7334 7334 static int replace_node_with_clone(struct btrfs_path *path, int level) ··· 7879 7879 ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); 7880 7880 if (ret < 0) 7881 7881 goto out; 7882 - ret = finish_inode_if_needed(sctx, 1); 7882 + ret = finish_inode_if_needed(sctx, true); 7883 7883 if (ret < 0) 7884 7884 goto out; 7885 7885 } else {

+47 -22

fs/btrfs/space-info.c

··· 129 129 * churn a lot and we can avoid making some extent tree modifications if we 130 130 * are able to delay for as long as possible. 131 131 * 132 + * RECLAIM_ZONES 133 + * This state only works for the zoned mode. In zoned mode, we cannot reuse 134 + * regions that have once been allocated and then been freed until we reset 135 + * the zone, due to the sequential write requirement. The RECLAIM_ZONES state 136 + * calls the reclaim machinery, evacuating the still valid data in these 137 + * block-groups and relocates it to the data_reloc_bg. Afterwards these 138 + * block-groups get deleted and the transaction is committed. This frees up 139 + * space to use for new allocations. 140 + * 132 141 * RESET_ZONES 133 142 * This state works only for the zoned mode. On the zoned mode, we cannot 134 143 * reuse once allocated then freed region until we reset the zone, due to ··· 212 203 213 204 #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) 214 205 206 + #define BTRFS_ZONED_SYNC_RECLAIM_BATCH (5) 207 + 215 208 /* 216 209 * Calculate chunk size depending on volume type (regular or zoned). 
217 210 */ ··· 287 276 sub_group->subgroup_id = id; 288 277 289 278 ret = btrfs_sysfs_add_space_info_type(sub_group); 290 - if (ret) { 291 - kfree(sub_group); 279 + if (ret) 292 280 parent->sub_group[index] = NULL; 293 - } 294 281 return ret; 295 282 } 296 283 ··· 320 311 321 312 ret = btrfs_sysfs_add_space_info_type(space_info); 322 313 if (ret) 323 - goto out_free; 314 + return ret; 324 315 325 316 list_add(&space_info->list, &info->space_info); 326 317 if (flags & BTRFS_BLOCK_GROUP_DATA) ··· 412 403 up_write(&space_info->groups_sem); 413 404 } 414 405 415 - struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 406 + struct btrfs_space_info *btrfs_find_space_info(const struct btrfs_fs_info *info, 416 407 u64 flags) 417 408 { 418 - struct list_head *head = &info->space_info; 409 + const struct list_head *head = &info->space_info; 419 410 struct btrfs_space_info *found; 420 411 421 412 flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; ··· 427 418 return NULL; 428 419 } 429 420 430 - static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) 421 + static u64 calc_effective_data_chunk_size(const struct btrfs_fs_info *fs_info) 431 422 { 432 423 struct btrfs_space_info *data_sinfo; 433 424 u64 data_chunk_size; ··· 453 444 enum btrfs_reserve_flush_enum flush) 454 445 { 455 446 struct btrfs_fs_info *fs_info = space_info->fs_info; 447 + bool has_per_profile; 456 448 u64 profile; 457 449 u64 avail; 458 450 u64 data_chunk_size; ··· 464 454 else 465 455 profile = btrfs_metadata_alloc_profile(fs_info); 466 456 467 - avail = atomic64_read(&fs_info->free_chunk_space); 457 + has_per_profile = btrfs_get_per_profile_avail(fs_info, profile, &avail); 458 + if (!has_per_profile) { 459 + avail = atomic64_read(&fs_info->free_chunk_space); 468 460 469 - /* 470 - * If we have dup, raid1 or raid10 then only half of the free 471 - * space is actually usable. 
For raid56, the space info used 472 - * doesn't include the parity drive, so we don't have to 473 - * change the math 474 - */ 475 - factor = btrfs_bg_type_to_factor(profile); 476 - avail = div_u64(avail, factor); 477 - if (avail == 0) 478 - return 0; 479 - 461 + /* 462 + * If we have dup, raid1 or raid10 then only half of the free 463 + * space is actually usable. For raid56, the space info used 464 + * doesn't include the parity drive, so we don't have to 465 + * change the math 466 + */ 467 + factor = btrfs_bg_type_to_factor(profile); 468 + avail = div_u64(avail, factor); 469 + if (avail == 0) 470 + return 0; 471 + } 480 472 data_chunk_size = calc_effective_data_chunk_size(fs_info); 481 473 482 474 /* ··· 501 489 /* 502 490 * If we aren't flushing all things, let us overcommit up to 503 491 * 1/2 of the space. If we can flush, don't let us overcommit 504 - * too much, let it overcommit up to 1/8 of the space. 492 + * too much, let it overcommit up to 1/64th of the space. 505 493 */ 506 - if (flush == BTRFS_RESERVE_FLUSH_ALL) 507 - avail >>= 3; 494 + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) 495 + avail >>= 6; 508 496 else 509 497 avail >>= 1; 510 498 ··· 913 901 914 902 if (ret > 0 || ret == -ENOSPC) 915 903 ret = 0; 904 + break; 905 + case RECLAIM_ZONES: 906 + if (btrfs_is_zoned(fs_info)) { 907 + btrfs_reclaim_sweep(fs_info); 908 + btrfs_delete_unused_bgs(fs_info); 909 + btrfs_reclaim_block_groups(fs_info, 910 + BTRFS_ZONED_SYNC_RECLAIM_BATCH); 911 + ASSERT(current->journal_info == NULL); 912 + ret = btrfs_commit_current_transaction(root); 913 + } else { 914 + ret = 0; 915 + } 916 916 break; 917 917 case RUN_DELAYED_IPUTS: 918 918 /* ··· 1424 1400 FLUSH_DELALLOC_FULL, 1425 1401 RUN_DELAYED_IPUTS, 1426 1402 COMMIT_TRANS, 1403 + RECLAIM_ZONES, 1427 1404 RESET_ZONES, 1428 1405 ALLOC_CHUNK_FORCE, 1429 1406 };

+20 -2

fs/btrfs/space-info.h

··· 21 21 * The higher the level, the more methods we try to reclaim space. 22 22 */ 23 23 enum btrfs_reserve_flush_enum { 24 - /* If we are in the transaction, we can't flush anything.*/ 24 + /* 25 + * Used when we can't flush or don't need: 26 + * 27 + * 1) We are holding a transaction handle open, so we can't flush as 28 + * that could deadlock. 29 + * 30 + * 2) For a nowait write we don't want to block when reserving delalloc. 31 + * 32 + * 3) Joining a transaction or attaching a transaction, we don't want 33 + * to wait and we don't need to reserve anything (any needed space 34 + * was reserved before in a dedicated block reserve, or we rely on 35 + * the global block reserve, see btrfs_init_root_block_rsv()). 36 + * 37 + * 4) Starting a transaction when we don't need to reserve space, as 38 + * we don't need it because we previously reserved in a dedicated 39 + * block reserve or rely on the global block reserve, like the above 40 + * case. 41 + */ 25 42 BTRFS_RESERVE_NO_FLUSH, 26 43 27 44 /* ··· 113 96 RUN_DELAYED_IPUTS = 10, 114 97 COMMIT_TRANS = 11, 115 98 RESET_ZONES = 12, 99 + RECLAIM_ZONES = 13, 116 100 }; 117 101 118 102 enum btrfs_space_info_sub_group { ··· 292 274 struct btrfs_block_group *block_group); 293 275 void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, 294 276 u64 chunk_size); 295 - struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, 277 + struct btrfs_space_info *btrfs_find_space_info(const struct btrfs_fs_info *info, 296 278 u64 flags); 297 279 void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 298 280 void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes,

+1 -5

fs/btrfs/super.c

··· 1299 1299 { 1300 1300 int ret; 1301 1301 1302 - if (BTRFS_FS_ERROR(fs_info)) { 1302 + if (unlikely(BTRFS_FS_ERROR(fs_info))) { 1303 1303 btrfs_err(fs_info, 1304 1304 "remounting read-write after error is not allowed"); 1305 1305 return -EINVAL; ··· 2423 2423 return 0; 2424 2424 } 2425 2425 2426 - #ifdef CONFIG_BTRFS_EXPERIMENTAL 2427 2426 static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) 2428 2427 { 2429 2428 struct btrfs_fs_info *fs_info = btrfs_sb(sb); ··· 2480 2481 2481 2482 btrfs_force_shutdown(fs_info); 2482 2483 } 2483 - #endif 2484 2484 2485 2485 static int btrfs_show_stats(struct seq_file *seq, struct dentry *root) 2486 2486 { ··· 2509 2511 .nr_cached_objects = btrfs_nr_cached_objects, 2510 2512 .free_cached_objects = btrfs_free_cached_objects, 2511 2513 .show_stats = btrfs_show_stats, 2512 - #ifdef CONFIG_BTRFS_EXPERIMENTAL 2513 2514 .remove_bdev = btrfs_remove_bdev, 2514 2515 .shutdown = btrfs_shutdown, 2515 - #endif 2516 2516 }; 2517 2517 2518 2518 static const struct file_operations btrfs_ctl_fops = {

+1 -1

fs/btrfs/super.h

··· 18 18 u64 subvol_objectid); 19 19 void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); 20 20 21 - static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) 21 + static inline struct btrfs_fs_info *btrfs_sb(const struct super_block *sb) 22 22 { 23 23 return sb->s_fs_info; 24 24 }

+3

fs/btrfs/tests/btrfs-tests.c

··· 303 303 } 304 304 } 305 305 ret = btrfs_test_extent_map(); 306 + if (ret) 307 + goto out; 308 + ret = btrfs_test_zoned(); 306 309 307 310 out: 308 311 btrfs_destroy_test_fs();

+10

fs/btrfs/tests/btrfs-tests.h

··· 63 63 struct btrfs_fs_info *fs_info); 64 64 void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); 65 65 struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); 66 + 67 + #ifdef CONFIG_BLK_DEV_ZONED 68 + int btrfs_test_zoned(void); 69 + #else 70 + static inline int btrfs_test_zoned(void) 71 + { 72 + return 0; 73 + } 74 + #endif 75 + 66 76 #else 67 77 static inline int btrfs_run_sanity_tests(void) 68 78 {

+675

fs/btrfs/tests/zoned-tests.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2026 Western Digital. All rights reserved. 4 + */ 5 + 6 + #include <linux/cleanup.h> 7 + #include <linux/sizes.h> 8 + 9 + #include "btrfs-tests.h" 10 + #include "../space-info.h" 11 + #include "../volumes.h" 12 + #include "../zoned.h" 13 + 14 + #define WP_MISSING_DEV ((u64)-1) 15 + #define WP_CONVENTIONAL ((u64)-2) 16 + #define ZONE_SIZE SZ_256M 17 + 18 + #define HALF_STRIPE_LEN (BTRFS_STRIPE_LEN >> 1) 19 + 20 + struct load_zone_info_test_vector { 21 + u64 raid_type; 22 + u64 num_stripes; 23 + u64 alloc_offsets[8]; 24 + u64 last_alloc; 25 + u64 bg_length; 26 + bool degraded; 27 + 28 + int expected_result; 29 + u64 expected_alloc_offset; 30 + 31 + const char *description; 32 + }; 33 + 34 + struct zone_info { 35 + u64 physical; 36 + u64 capacity; 37 + u64 alloc_offset; 38 + }; 39 + 40 + static int test_load_zone_info(struct btrfs_fs_info *fs_info, 41 + const struct load_zone_info_test_vector *test) 42 + { 43 + struct btrfs_block_group *bg __free(btrfs_free_dummy_block_group) = NULL; 44 + struct btrfs_chunk_map *map __free(btrfs_free_chunk_map) = NULL; 45 + struct zone_info AUTO_KFREE(zone_info); 46 + unsigned long AUTO_KFREE(active); 47 + int ret; 48 + 49 + bg = btrfs_alloc_dummy_block_group(fs_info, test->bg_length); 50 + if (!bg) { 51 + test_std_err(TEST_ALLOC_BLOCK_GROUP); 52 + return -ENOMEM; 53 + } 54 + 55 + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); 56 + if (!map) { 57 + test_std_err(TEST_ALLOC_EXTENT_MAP); 58 + return -ENOMEM; 59 + } 60 + 61 + zone_info = kzalloc_objs(*zone_info, test->num_stripes, GFP_KERNEL); 62 + if (!zone_info) { 63 + test_err("cannot allocate zone info"); 64 + return -ENOMEM; 65 + } 66 + 67 + active = bitmap_zalloc(test->num_stripes, GFP_KERNEL); 68 + if (!active) { 69 + test_err("cannot allocate active bitmap"); 70 + return -ENOMEM; 71 + } 72 + 73 + map->type = test->raid_type; 74 + map->num_stripes = test->num_stripes; 75 + if (test->raid_type == 
BTRFS_BLOCK_GROUP_RAID10) 76 + map->sub_stripes = 2; 77 + for (int i = 0; i < test->num_stripes; i++) { 78 + zone_info[i].physical = 0; 79 + zone_info[i].alloc_offset = test->alloc_offsets[i]; 80 + zone_info[i].capacity = ZONE_SIZE; 81 + if (zone_info[i].alloc_offset && zone_info[i].alloc_offset < ZONE_SIZE) 82 + __set_bit(i, active); 83 + } 84 + if (test->degraded) 85 + btrfs_set_opt(fs_info->mount_opt, DEGRADED); 86 + else 87 + btrfs_clear_opt(fs_info->mount_opt, DEGRADED); 88 + 89 + ret = btrfs_load_block_group_by_raid_type(bg, map, zone_info, active, 90 + test->last_alloc); 91 + 92 + if (ret != test->expected_result) { 93 + test_err("unexpected return value: ret %d expected %d", ret, 94 + test->expected_result); 95 + return -EINVAL; 96 + } 97 + 98 + if (!ret && bg->alloc_offset != test->expected_alloc_offset) { 99 + test_err("unexpected alloc_offset: alloc_offset %llu expected %llu", 100 + bg->alloc_offset, test->expected_alloc_offset); 101 + return -EINVAL; 102 + } 103 + 104 + return 0; 105 + } 106 + 107 + static const struct load_zone_info_test_vector load_zone_info_tests[] = { 108 + /* SINGLE */ 109 + { 110 + .description = "SINGLE: load write pointer from sequential zone", 111 + .raid_type = 0, 112 + .num_stripes = 1, 113 + .alloc_offsets = { 114 + SZ_1M, 115 + }, 116 + .expected_alloc_offset = SZ_1M, 117 + }, 118 + /* 119 + * SINGLE block group on a conventional zone sets last_alloc outside of 120 + * btrfs_load_block_group_*(). Do not test that case. 121 + */ 122 + 123 + /* DUP */ 124 + /* Normal case */ 125 + { 126 + .description = "DUP: having matching write pointers", 127 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 128 + .num_stripes = 2, 129 + .alloc_offsets = { 130 + SZ_1M, SZ_1M, 131 + }, 132 + .expected_alloc_offset = SZ_1M, 133 + }, 134 + /* 135 + * One sequential zone and one conventional zone, having matching 136 + * last_alloc. 
137 + */ 138 + { 139 + .description = "DUP: seq zone and conv zone, matching last_alloc", 140 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 141 + .num_stripes = 2, 142 + .alloc_offsets = { 143 + SZ_1M, WP_CONVENTIONAL, 144 + }, 145 + .last_alloc = SZ_1M, 146 + .expected_alloc_offset = SZ_1M, 147 + }, 148 + /* 149 + * One sequential and one conventional zone, but having smaller 150 + * last_alloc than write pointer. 151 + */ 152 + { 153 + .description = "DUP: seq zone and conv zone, smaller last_alloc", 154 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 155 + .num_stripes = 2, 156 + .alloc_offsets = { 157 + SZ_1M, WP_CONVENTIONAL, 158 + }, 159 + .last_alloc = 0, 160 + .expected_alloc_offset = SZ_1M, 161 + }, 162 + /* Error case: having different write pointers. */ 163 + { 164 + .description = "DUP: fail: different write pointers", 165 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 166 + .num_stripes = 2, 167 + .alloc_offsets = { 168 + SZ_1M, SZ_2M, 169 + }, 170 + .expected_result = -EIO, 171 + }, 172 + /* Error case: partial missing device should not happen on DUP. */ 173 + { 174 + .description = "DUP: fail: missing device", 175 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 176 + .num_stripes = 2, 177 + .alloc_offsets = { 178 + SZ_1M, WP_MISSING_DEV, 179 + }, 180 + .expected_result = -EIO, 181 + }, 182 + /* 183 + * Error case: one sequential and one conventional zone, but having larger 184 + * last_alloc than write pointer. 
185 + */ 186 + { 187 + .description = "DUP: fail: seq zone and conv zone, larger last_alloc", 188 + .raid_type = BTRFS_BLOCK_GROUP_DUP, 189 + .num_stripes = 2, 190 + .alloc_offsets = { 191 + SZ_1M, WP_CONVENTIONAL, 192 + }, 193 + .last_alloc = SZ_2M, 194 + .expected_result = -EIO, 195 + }, 196 + 197 + /* RAID1 */ 198 + /* Normal case */ 199 + { 200 + .description = "RAID1: having matching write pointers", 201 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 202 + .num_stripes = 2, 203 + .alloc_offsets = { 204 + SZ_1M, SZ_1M, 205 + }, 206 + .expected_alloc_offset = SZ_1M, 207 + }, 208 + /* 209 + * One sequential zone and one conventional zone, having matching 210 + * last_alloc. 211 + */ 212 + { 213 + .description = "RAID1: seq zone and conv zone, matching last_alloc", 214 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 215 + .num_stripes = 2, 216 + .alloc_offsets = { 217 + SZ_1M, WP_CONVENTIONAL, 218 + }, 219 + .last_alloc = SZ_1M, 220 + .expected_alloc_offset = SZ_1M, 221 + }, 222 + /* 223 + * One sequential and one conventional zone, but having smaller 224 + * last_alloc than write pointer. 225 + */ 226 + { 227 + .description = "RAID1: seq zone and conv zone, smaller last_alloc", 228 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 229 + .num_stripes = 2, 230 + .alloc_offsets = { 231 + SZ_1M, WP_CONVENTIONAL, 232 + }, 233 + .last_alloc = 0, 234 + .expected_alloc_offset = SZ_1M, 235 + }, 236 + /* Partial missing device should be recovered on DEGRADED mount */ 237 + { 238 + .description = "RAID1: fail: missing device on DEGRADED", 239 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 240 + .num_stripes = 2, 241 + .alloc_offsets = { 242 + SZ_1M, WP_MISSING_DEV, 243 + }, 244 + .degraded = true, 245 + .expected_alloc_offset = SZ_1M, 246 + }, 247 + /* Error case: having different write pointers. 
*/ 248 + { 249 + .description = "RAID1: fail: different write pointers", 250 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 251 + .num_stripes = 2, 252 + .alloc_offsets = { 253 + SZ_1M, SZ_2M, 254 + }, 255 + .expected_result = -EIO, 256 + }, 257 + /* 258 + * Partial missing device is not allowed on non-DEGRADED mount never happen 259 + * as it is rejected beforehand. 260 + */ 261 + /* 262 + * Error case: one sequential and one conventional zone, but having larger 263 + * last_alloc than write pointer. 264 + */ 265 + { 266 + .description = "RAID1: fail: seq zone and conv zone, larger last_alloc", 267 + .raid_type = BTRFS_BLOCK_GROUP_RAID1, 268 + .num_stripes = 2, 269 + .alloc_offsets = { 270 + SZ_1M, WP_CONVENTIONAL, 271 + }, 272 + .last_alloc = SZ_2M, 273 + .expected_result = -EIO, 274 + }, 275 + 276 + /* RAID0 */ 277 + /* Normal case */ 278 + { 279 + .description = "RAID0: initial partial write", 280 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 281 + .num_stripes = 4, 282 + .alloc_offsets = { 283 + HALF_STRIPE_LEN, 0, 0, 0, 284 + }, 285 + .expected_alloc_offset = HALF_STRIPE_LEN, 286 + }, 287 + { 288 + .description = "RAID0: while in second stripe", 289 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 290 + .num_stripes = 4, 291 + .alloc_offsets = { 292 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, 293 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 294 + }, 295 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5 + HALF_STRIPE_LEN, 296 + }, 297 + { 298 + .description = "RAID0: one stripe advanced", 299 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 300 + .num_stripes = 2, 301 + .alloc_offsets = { 302 + SZ_1M + BTRFS_STRIPE_LEN, SZ_1M, 303 + }, 304 + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, 305 + }, 306 + /* Error case: having different write pointers. 
*/ 307 + { 308 + .description = "RAID0: fail: disordered stripes", 309 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 310 + .num_stripes = 4, 311 + .alloc_offsets = { 312 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN * 2, 313 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 314 + }, 315 + .expected_result = -EIO, 316 + }, 317 + { 318 + .description = "RAID0: fail: far distance", 319 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 320 + .num_stripes = 4, 321 + .alloc_offsets = { 322 + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN, 323 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 324 + }, 325 + .expected_result = -EIO, 326 + }, 327 + { 328 + .description = "RAID0: fail: too many partial write", 329 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 330 + .num_stripes = 4, 331 + .alloc_offsets = { 332 + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 0, 0, 333 + }, 334 + .expected_result = -EIO, 335 + }, 336 + /* 337 + * Error case: Partial missing device is not allowed even on non-DEGRADED 338 + * mount. 339 + */ 340 + { 341 + .description = "RAID0: fail: missing device on DEGRADED", 342 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 343 + .num_stripes = 2, 344 + .alloc_offsets = { 345 + SZ_1M, WP_MISSING_DEV, 346 + }, 347 + .degraded = true, 348 + .expected_result = -EIO, 349 + }, 350 + 351 + /* 352 + * One sequential zone and one conventional zone, having matching 353 + * last_alloc. 
354 + */ 355 + { 356 + .description = "RAID0: seq zone and conv zone, partially written stripe", 357 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 358 + .num_stripes = 2, 359 + .alloc_offsets = { 360 + SZ_1M, WP_CONVENTIONAL, 361 + }, 362 + .last_alloc = SZ_2M - SZ_4K, 363 + .expected_alloc_offset = SZ_2M - SZ_4K, 364 + }, 365 + { 366 + .description = "RAID0: conv zone and seq zone, partially written stripe", 367 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 368 + .num_stripes = 2, 369 + .alloc_offsets = { 370 + WP_CONVENTIONAL, SZ_1M, 371 + }, 372 + .last_alloc = SZ_2M + SZ_4K, 373 + .expected_alloc_offset = SZ_2M + SZ_4K, 374 + }, 375 + /* 376 + * Error case: one sequential and one conventional zone, but having larger 377 + * last_alloc than write pointer. 378 + */ 379 + { 380 + .description = "RAID0: fail: seq zone and conv zone, larger last_alloc", 381 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 382 + .num_stripes = 2, 383 + .alloc_offsets = { 384 + SZ_1M, WP_CONVENTIONAL, 385 + }, 386 + .last_alloc = SZ_2M + BTRFS_STRIPE_LEN * 2, 387 + .expected_result = -EIO, 388 + }, 389 + 390 + /* RAID0, 4 stripes with seq zones and conv zones. */ 391 + { 392 + .description = "RAID0: stripes [2, 2, ?, ?] last_alloc = 6", 393 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 394 + .num_stripes = 4, 395 + .alloc_offsets = { 396 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 397 + WP_CONVENTIONAL, WP_CONVENTIONAL, 398 + }, 399 + .last_alloc = BTRFS_STRIPE_LEN * 6, 400 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 6, 401 + }, 402 + { 403 + .description = "RAID0: stripes [2, 2, ?, ?] last_alloc = 7.5", 404 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 405 + .num_stripes = 4, 406 + .alloc_offsets = { 407 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 408 + WP_CONVENTIONAL, WP_CONVENTIONAL, 409 + }, 410 + .last_alloc = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, 411 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, 412 + }, 413 + { 414 + .description = "RAID0: stripes [3, ?, ?, ?] 
last_alloc = 1", 415 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 416 + .num_stripes = 4, 417 + .alloc_offsets = { 418 + BTRFS_STRIPE_LEN * 3, WP_CONVENTIONAL, 419 + WP_CONVENTIONAL, WP_CONVENTIONAL, 420 + }, 421 + .last_alloc = BTRFS_STRIPE_LEN, 422 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 9, 423 + }, 424 + { 425 + .description = "RAID0: stripes [2, ?, 1, ?] last_alloc = 5", 426 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 427 + .num_stripes = 4, 428 + .alloc_offsets = { 429 + BTRFS_STRIPE_LEN * 2, WP_CONVENTIONAL, 430 + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, 431 + }, 432 + .last_alloc = BTRFS_STRIPE_LEN * 5, 433 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5, 434 + }, 435 + { 436 + .description = "RAID0: fail: stripes [2, ?, 1, ?] last_alloc = 7", 437 + .raid_type = BTRFS_BLOCK_GROUP_RAID0, 438 + .num_stripes = 4, 439 + .alloc_offsets = { 440 + BTRFS_STRIPE_LEN * 2, WP_CONVENTIONAL, 441 + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, 442 + }, 443 + .last_alloc = BTRFS_STRIPE_LEN * 7, 444 + .expected_result = -EIO, 445 + }, 446 + 447 + /* RAID10 */ 448 + /* Normal case */ 449 + { 450 + .description = "RAID10: initial partial write", 451 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 452 + .num_stripes = 4, 453 + .alloc_offsets = { 454 + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 0, 0, 455 + }, 456 + .expected_alloc_offset = HALF_STRIPE_LEN, 457 + }, 458 + { 459 + .description = "RAID10: while in second stripe", 460 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 461 + .num_stripes = 8, 462 + .alloc_offsets = { 463 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 464 + BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, 465 + BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, 466 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 467 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 468 + }, 469 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5 + HALF_STRIPE_LEN, 470 + }, 471 + { 472 + .description = "RAID10: one stripe advanced", 473 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 474 + .num_stripes = 4, 475 + .alloc_offsets = { 476 + SZ_1M + BTRFS_STRIPE_LEN, SZ_1M + 
BTRFS_STRIPE_LEN, 477 + SZ_1M, SZ_1M, 478 + }, 479 + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, 480 + }, 481 + { 482 + .description = "RAID10: one stripe advanced, with conventional zone", 483 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 484 + .num_stripes = 4, 485 + .alloc_offsets = { 486 + SZ_1M + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, 487 + WP_CONVENTIONAL, SZ_1M, 488 + }, 489 + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, 490 + }, 491 + /* Error case: having different write pointers. */ 492 + { 493 + .description = "RAID10: fail: disordered stripes", 494 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 495 + .num_stripes = 8, 496 + .alloc_offsets = { 497 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 498 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 499 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 500 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 501 + }, 502 + .expected_result = -EIO, 503 + }, 504 + { 505 + .description = "RAID10: fail: far distance", 506 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 507 + .num_stripes = 8, 508 + .alloc_offsets = { 509 + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN * 3, 510 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 511 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 512 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 513 + }, 514 + .expected_result = -EIO, 515 + }, 516 + { 517 + .description = "RAID10: fail: too many partial write", 518 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 519 + .num_stripes = 8, 520 + .alloc_offsets = { 521 + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 522 + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 523 + 0, 0, 0, 0, 524 + }, 525 + .expected_result = -EIO, 526 + }, 527 + /* 528 + * Error case: Partial missing device in RAID0 level is not allowed even on 529 + * non-DEGRADED mount. 
530 + */ 531 + { 532 + .description = "RAID10: fail: missing device on DEGRADED", 533 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 534 + .num_stripes = 4, 535 + .alloc_offsets = { 536 + SZ_1M, SZ_1M, 537 + WP_MISSING_DEV, WP_MISSING_DEV, 538 + }, 539 + .degraded = true, 540 + .expected_result = -EIO, 541 + }, 542 + 543 + /* 544 + * One sequential zone and one conventional zone, having matching 545 + * last_alloc. 546 + */ 547 + { 548 + .description = "RAID10: seq zone and conv zone, partially written stripe", 549 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 550 + .num_stripes = 4, 551 + .alloc_offsets = { 552 + SZ_1M, SZ_1M, 553 + WP_CONVENTIONAL, WP_CONVENTIONAL, 554 + }, 555 + .last_alloc = SZ_2M - SZ_4K, 556 + .expected_alloc_offset = SZ_2M - SZ_4K, 557 + }, 558 + { 559 + .description = "RAID10: conv zone and seq zone, partially written stripe", 560 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 561 + .num_stripes = 4, 562 + .alloc_offsets = { 563 + WP_CONVENTIONAL, WP_CONVENTIONAL, 564 + SZ_1M, SZ_1M, 565 + }, 566 + .last_alloc = SZ_2M + SZ_4K, 567 + .expected_alloc_offset = SZ_2M + SZ_4K, 568 + }, 569 + /* 570 + * Error case: one sequential and one conventional zone, but having larger 571 + * last_alloc than write pointer. 572 + */ 573 + { 574 + .description = "RAID10: fail: seq zone and conv zone, larger last_alloc", 575 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 576 + .num_stripes = 4, 577 + .alloc_offsets = { 578 + SZ_1M, SZ_1M, 579 + WP_CONVENTIONAL, WP_CONVENTIONAL, 580 + }, 581 + .last_alloc = SZ_2M + BTRFS_STRIPE_LEN * 2, 582 + .expected_result = -EIO, 583 + }, 584 + 585 + /* RAID10, 4 stripes with seq zones and conv zones. */ 586 + { 587 + .description = "RAID10: stripes [2, 2, ?, ?] 
last_alloc = 6", 588 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 589 + .num_stripes = 8, 590 + .alloc_offsets = { 591 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 592 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 593 + WP_CONVENTIONAL, WP_CONVENTIONAL, 594 + WP_CONVENTIONAL, WP_CONVENTIONAL, 595 + }, 596 + .last_alloc = BTRFS_STRIPE_LEN * 6, 597 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 6, 598 + }, 599 + { 600 + .description = "RAID10: stripes [2, 2, ?, ?] last_alloc = 7.5", 601 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 602 + .num_stripes = 8, 603 + .alloc_offsets = { 604 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 605 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 606 + WP_CONVENTIONAL, WP_CONVENTIONAL, 607 + WP_CONVENTIONAL, WP_CONVENTIONAL, 608 + }, 609 + .last_alloc = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, 610 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, 611 + }, 612 + { 613 + .description = "RAID10: stripes [3, ?, ?, ?] last_alloc = 1", 614 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 615 + .num_stripes = 8, 616 + .alloc_offsets = { 617 + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN * 3, 618 + WP_CONVENTIONAL, WP_CONVENTIONAL, 619 + WP_CONVENTIONAL, WP_CONVENTIONAL, 620 + WP_CONVENTIONAL, WP_CONVENTIONAL, 621 + }, 622 + .last_alloc = BTRFS_STRIPE_LEN, 623 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 9, 624 + }, 625 + { 626 + .description = "RAID10: stripes [2, ?, 1, ?] last_alloc = 5", 627 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 628 + .num_stripes = 8, 629 + .alloc_offsets = { 630 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 631 + WP_CONVENTIONAL, WP_CONVENTIONAL, 632 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 633 + WP_CONVENTIONAL, WP_CONVENTIONAL, 634 + }, 635 + .last_alloc = BTRFS_STRIPE_LEN * 5, 636 + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5, 637 + }, 638 + { 639 + .description = "RAID10: fail: stripes [2, ?, 1, ?] 
last_alloc = 7", 640 + .raid_type = BTRFS_BLOCK_GROUP_RAID10, 641 + .num_stripes = 8, 642 + .alloc_offsets = { 643 + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, 644 + WP_CONVENTIONAL, WP_CONVENTIONAL, 645 + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, 646 + WP_CONVENTIONAL, WP_CONVENTIONAL, 647 + }, 648 + .last_alloc = BTRFS_STRIPE_LEN * 7, 649 + .expected_result = -EIO, 650 + }, 651 + }; 652 + 653 + int btrfs_test_zoned(void) 654 + { 655 + struct btrfs_fs_info *fs_info __free(btrfs_free_dummy_fs_info) = NULL; 656 + int ret; 657 + 658 + test_msg("running zoned tests (error messages are expected)"); 659 + 660 + fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); 661 + if (!fs_info) { 662 + test_std_err(TEST_ALLOC_FS_INFO); 663 + return -ENOMEM; 664 + } 665 + 666 + for (int i = 0; i < ARRAY_SIZE(load_zone_info_tests); i++) { 667 + ret = test_load_zone_info(fs_info, &load_zone_info_tests[i]); 668 + if (ret) { 669 + test_err("test case \"%s\" failed", load_zone_info_tests[i].description); 670 + return ret; 671 + } 672 + } 673 + 674 + return 0; 675 + }

+52 -38

fs/btrfs/transaction.c

··· 15 15 #include "misc.h" 16 16 #include "ctree.h" 17 17 #include "disk-io.h" 18 + #include "extent_io.h" 18 19 #include "transaction.h" 19 20 #include "locking.h" 20 21 #include "tree-log.h" ··· 275 274 spin_lock(&fs_info->trans_lock); 276 275 loop: 277 276 /* The file system has been taken offline. No new transactions. */ 278 - if (BTRFS_FS_ERROR(fs_info)) { 277 + if (unlikely(BTRFS_FS_ERROR(fs_info))) { 279 278 spin_unlock(&fs_info->trans_lock); 280 279 return -EROFS; 281 280 } ··· 333 332 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 334 333 kfree(cur_trans); 335 334 goto loop; 336 - } else if (BTRFS_FS_ERROR(fs_info)) { 335 + } else if (unlikely(BTRFS_FS_ERROR(fs_info))) { 337 336 spin_unlock(&fs_info->trans_lock); 338 337 btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); 339 338 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); ··· 504 503 return 0; 505 504 506 505 mutex_lock(&fs_info->reloc_mutex); 507 - ret = record_root_in_trans(trans, root, 0); 506 + ret = record_root_in_trans(trans, root, false); 508 507 mutex_unlock(&fs_info->reloc_mutex); 509 508 510 509 return ret; ··· 612 611 bool do_chunk_alloc = false; 613 612 int ret; 614 613 615 - if (BTRFS_FS_ERROR(fs_info)) 614 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 616 615 return ERR_PTR(-EROFS); 617 616 618 617 if (current->journal_info) { ··· 679 678 * here. 
680 679 */ 681 680 ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); 681 + if (ret == -EAGAIN) { 682 + ASSERT(btrfs_is_zoned(fs_info)); 683 + ret = btrfs_commit_current_transaction(root); 684 + if (ret) 685 + goto reserve_fail; 686 + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); 687 + } 688 + 682 689 if (ret) 683 690 goto reserve_fail; 684 691 } ··· 696 687 ret = -ENOMEM; 697 688 goto alloc_fail; 698 689 } 690 + 691 + xa_init(&h->writeback_inhibited_ebs); 699 692 700 693 /* 701 694 * If we are JOIN_NOLOCK we're already committing a transaction and ··· 1095 1084 if (trans->type & __TRANS_FREEZABLE) 1096 1085 sb_end_intwrite(info->sb); 1097 1086 1087 + /* 1088 + * Uninhibit extent buffer writeback before decrementing num_writers, 1089 + * since the decrement wakes the committing thread which needs all 1090 + * buffers uninhibited to write them to disk. 1091 + */ 1092 + btrfs_uninhibit_all_eb_writeback(trans); 1093 + 1098 1094 WARN_ON(cur_trans != info->running_transaction); 1099 1095 WARN_ON(atomic_read(&cur_trans->num_writers) < 1); 1100 1096 atomic_dec(&cur_trans->num_writers); ··· 1120 1102 if (throttle) 1121 1103 btrfs_run_delayed_iputs(info); 1122 1104 1123 - if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { 1105 + if (unlikely(TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info))) { 1124 1106 wake_up_process(info->transaction_kthread); 1125 1107 if (TRANS_ABORTED(trans)) 1126 1108 ret = trans->aborted; ··· 1589 1571 * recorded root will never be updated again, causing an outdated root 1590 1572 * item. 
1591 1573 */ 1592 - ret = record_root_in_trans(trans, src, 1); 1574 + ret = record_root_in_trans(trans, src, true); 1593 1575 if (ret) 1594 1576 return ret; 1595 1577 ··· 1612 1594 1613 1595 ret = commit_fs_roots(trans); 1614 1596 if (ret) 1615 - goto out; 1597 + return ret; 1616 1598 ret = btrfs_qgroup_account_extents(trans); 1617 1599 if (ret < 0) 1618 - goto out; 1600 + return ret; 1619 1601 1620 1602 /* Now qgroup are all updated, we can inherit it to new qgroups */ 1621 1603 ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, 1622 1604 btrfs_root_id(parent), inherit); 1623 1605 if (ret < 0) 1624 - goto out; 1606 + return ret; 1625 1607 1626 1608 /* 1627 1609 * Now we do a simplified commit transaction, which will: ··· 1637 1619 */ 1638 1620 ret = commit_cowonly_roots(trans); 1639 1621 if (ret) 1640 - goto out; 1622 + return ret; 1641 1623 switch_commit_roots(trans); 1642 1624 ret = btrfs_write_and_wait_transaction(trans); 1643 - if (unlikely(ret)) 1625 + if (unlikely(ret)) { 1644 1626 btrfs_err(fs_info, 1645 1627 "error while writing out transaction during qgroup snapshot accounting: %d", ret); 1628 + return ret; 1629 + } 1646 1630 1647 - out: 1648 1631 /* 1649 1632 * Force parent root to be updated, as we recorded it before so its 1650 1633 * last_trans == cur_transid. 
1651 1634 * Or it won't be committed again onto disk after later 1652 1635 * insert_dir_item() 1653 1636 */ 1654 - if (!ret) 1655 - ret = record_root_in_trans(trans, parent, 1); 1656 - return ret; 1637 + return record_root_in_trans(trans, parent, true); 1657 1638 } 1658 1639 1659 1640 /* ··· 1679 1662 BTRFS_PATH_AUTO_FREE(path); 1680 1663 struct btrfs_dir_item *dir_item; 1681 1664 struct extent_buffer *tmp; 1682 - struct extent_buffer *old; 1665 + struct extent_buffer *root_eb; 1683 1666 struct timespec64 cur_time; 1684 1667 int ret = 0; 1685 1668 u64 to_reserve = 0; ··· 1736 1719 trans->transid, 1737 1720 trans->bytes_reserved, 1); 1738 1721 parent_root = parent_inode->root; 1739 - ret = record_root_in_trans(trans, parent_root, 0); 1722 + ret = record_root_in_trans(trans, parent_root, false); 1740 1723 if (unlikely(ret)) 1741 1724 goto fail; 1742 1725 cur_time = current_time(&parent_inode->vfs_inode); ··· 1754 1737 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, 1755 1738 btrfs_ino(parent_inode), 1756 1739 &fname.disk_name, 0); 1757 - if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) { 1740 + if (!IS_ERR_OR_NULL(dir_item)) { 1758 1741 pending->error = -EEXIST; 1759 1742 goto dir_item_existed; 1760 1743 } else if (IS_ERR(dir_item)) { ··· 1784 1767 goto fail; 1785 1768 } 1786 1769 1787 - ret = record_root_in_trans(trans, root, 0); 1770 + ret = record_root_in_trans(trans, root, false); 1788 1771 if (unlikely(ret)) { 1789 1772 btrfs_abort_transaction(trans, ret); 1790 1773 goto fail; ··· 1817 1800 btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); 1818 1801 btrfs_set_root_otransid(new_root_item, trans->transid); 1819 1802 1820 - old = btrfs_lock_root_node(root); 1821 - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, 1822 - BTRFS_NESTING_COW); 1823 - if (unlikely(ret)) { 1824 - btrfs_tree_unlock(old); 1825 - free_extent_buffer(old); 1826 - btrfs_abort_transaction(trans, ret); 1827 - goto fail; 1828 - } 1829 - 1830 - ret = 
btrfs_copy_root(trans, root, old, &tmp, objectid); 1831 - /* clean up in any case */ 1832 - btrfs_tree_unlock(old); 1833 - free_extent_buffer(old); 1803 + root_eb = btrfs_lock_root_node(root); 1804 + ret = btrfs_copy_root(trans, root, root_eb, &tmp, objectid); 1805 + btrfs_tree_unlock(root_eb); 1806 + free_extent_buffer(root_eb); 1834 1807 if (unlikely(ret)) { 1835 1808 btrfs_abort_transaction(trans, ret); 1836 1809 goto fail; ··· 1928 1921 */ 1929 1922 if (ret == -EOVERFLOW) 1930 1923 ret = 0; 1931 - if (unlikely(ret && ret != -EEXIST)) { 1924 + if (unlikely(ret)) { 1932 1925 btrfs_abort_transaction(trans, ret); 1933 1926 goto fail; 1934 1927 } ··· 2134 2127 if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) 2135 2128 btrfs_scrub_cancel(fs_info); 2136 2129 2130 + btrfs_uninhibit_all_eb_writeback(trans); 2137 2131 kmem_cache_free(btrfs_trans_handle_cachep, trans); 2138 2132 } 2139 2133 ··· 2351 2343 * abort to prevent writing a new superblock that reflects a 2352 2344 * corrupt state (pointing to trees with unwritten nodes/leafs). 2353 2345 */ 2354 - if (BTRFS_FS_ERROR(fs_info)) { 2346 + if (unlikely(BTRFS_FS_ERROR(fs_info))) { 2355 2347 spin_unlock(&fs_info->trans_lock); 2356 2348 ret = -EROFS; 2357 2349 goto lockdep_release; ··· 2574 2566 fs_info->cleaner_kthread) 2575 2567 wake_up_process(fs_info->cleaner_kthread); 2576 2568 2569 + /* 2570 + * Uninhibit writeback on all extent buffers inhibited during this 2571 + * transaction before writing them to disk. Inhibiting prevented 2572 + * writeback while the transaction was building, but now we need 2573 + * them written. 
2574 + */ 2575 + btrfs_uninhibit_all_eb_writeback(trans); 2576 + 2577 2577 ret = btrfs_write_and_wait_transaction(trans); 2578 2578 if (unlikely(ret)) { 2579 2579 btrfs_err(fs_info, "error while writing out transaction: %d", ret); ··· 2589 2573 goto scrub_continue; 2590 2574 } 2591 2575 2592 - ret = write_all_supers(fs_info, 0); 2576 + ret = write_all_supers(trans); 2593 2577 /* 2594 2578 * the super is written, we can safely allow the tree-loggers 2595 2579 * to go about their business ··· 2657 2641 btrfs_trans_release_chunk_metadata(trans); 2658 2642 trans->block_rsv = NULL; 2659 2643 btrfs_warn(fs_info, "Skipping commit of aborted transaction."); 2660 - if (current->journal_info == trans) 2661 - current->journal_info = NULL; 2662 2644 cleanup_transaction(trans, ret); 2663 2645 2664 2646 return ret;

+3

fs/btrfs/transaction.h

··· 12 12 #include <linux/time64.h> 13 13 #include <linux/mutex.h> 14 14 #include <linux/wait.h> 15 + #include <linux/xarray.h> 15 16 #include "btrfs_inode.h" 16 17 #include "delayed-ref.h" 17 18 ··· 163 162 struct btrfs_fs_info *fs_info; 164 163 struct list_head new_bgs; 165 164 struct btrfs_block_rsv delayed_rsv; 165 + /* Extent buffers with writeback inhibited by this handle. */ 166 + struct xarray writeback_inhibited_ebs; 166 167 }; 167 168 168 169 /*

+247

fs/btrfs/tree-checker.c

··· 777 777 BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); 778 778 return -EUCLEAN; 779 779 } 780 + 781 + if (unlikely(!btrfs_fs_incompat(fs_info, REMAP_TREE) && 782 + type == BTRFS_BLOCK_GROUP_METADATA_REMAP)) { 783 + block_group_err(leaf, slot, 784 + "invalid type, METADATA_REMAP set but REMAP_TREE incompat flag not set"); 785 + return -EUCLEAN; 786 + } 787 + 788 + if (unlikely(!btrfs_fs_incompat(fs_info, REMAP_TREE) && 789 + flags & BTRFS_BLOCK_GROUP_REMAPPED)) { 790 + block_group_err(leaf, slot, 791 + "invalid flags, REMAPPED set but REMAP_TREE incompat flag not set"); 792 + return -EUCLEAN; 793 + } 794 + 795 + if (item_size == sizeof(struct btrfs_block_group_item_v2)) { 796 + struct btrfs_block_group_item_v2 *bgi2; 797 + u64 remap_bytes; 798 + u32 identity_remap_count; 799 + 800 + bgi2 = btrfs_item_ptr(leaf, slot, struct btrfs_block_group_item_v2); 801 + remap_bytes = btrfs_block_group_v2_remap_bytes(leaf, bgi2); 802 + 803 + if (unlikely(remap_bytes > key->offset)) { 804 + block_group_err(leaf, slot, 805 + "invalid remap_bytes, have %llu expect [0, %llu]", 806 + remap_bytes, key->offset); 807 + return -EUCLEAN; 808 + } 809 + 810 + identity_remap_count = btrfs_block_group_v2_identity_remap_count(leaf, bgi2); 811 + if (unlikely((u64)identity_remap_count > 812 + key->offset >> fs_info->sectorsize_bits)) { 813 + block_group_err(leaf, slot, 814 + "invalid identity_remap_count, have %u expect [0, %llu]", 815 + identity_remap_count, 816 + key->offset >> fs_info->sectorsize_bits); 817 + return -EUCLEAN; 818 + } 819 + } 820 + 780 821 return 0; 781 822 } 782 823 ··· 1038 997 "mixed chunk type in non-mixed mode: 0x%llx", type); 1039 998 return -EUCLEAN; 1040 999 } 1000 + } 1001 + 1002 + if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA_REMAP) && 1003 + !(features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE))) { 1004 + chunk_err(fs_info, leaf, chunk, logical, 1005 + "METADATA_REMAP chunk type without REMAP_TREE incompat bit"); 1006 + return -EUCLEAN; 1007 + } 1008 + 1009 + 
if (unlikely(remapped && 1010 + !(features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE))) { 1011 + chunk_err(fs_info, leaf, chunk, logical, 1012 + "REMAPPED chunk flag without REMAP_TREE incompat bit"); 1013 + return -EUCLEAN; 1041 1014 } 1042 1015 1043 1016 if (!remapped && ··· 1934 1879 return 0; 1935 1880 } 1936 1881 1882 + static int check_remap_key(const struct extent_buffer *leaf, 1883 + const struct btrfs_key *key, int slot) 1884 + { 1885 + const u32 item_size = btrfs_item_size(leaf, slot); 1886 + const u32 sectorsize = leaf->fs_info->sectorsize; 1887 + u64 end; 1888 + 1889 + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, REMAP_TREE))) { 1890 + generic_err(leaf, slot, 1891 + "remap key type %u present but REMAP_TREE incompat bit unset", 1892 + key->type); 1893 + return -EUCLEAN; 1894 + } 1895 + 1896 + switch (key->type) { 1897 + case BTRFS_IDENTITY_REMAP_KEY: 1898 + if (unlikely(item_size != 0)) { 1899 + generic_err(leaf, slot, 1900 + "invalid item size for IDENTITY_REMAP, have %u expect 0", 1901 + item_size); 1902 + return -EUCLEAN; 1903 + } 1904 + break; 1905 + case BTRFS_REMAP_KEY: 1906 + case BTRFS_REMAP_BACKREF_KEY: 1907 + if (unlikely(item_size != sizeof(struct btrfs_remap_item))) { 1908 + generic_err(leaf, slot, 1909 + "invalid item size for remap key type %u, have %u expect %zu", 1910 + key->type, item_size, 1911 + sizeof(struct btrfs_remap_item)); 1912 + return -EUCLEAN; 1913 + } 1914 + break; 1915 + } 1916 + 1917 + if (unlikely(key->offset == 0)) { 1918 + generic_err(leaf, slot, 1919 + "invalid remap key length, have 0 expect nonzero"); 1920 + return -EUCLEAN; 1921 + } 1922 + 1923 + if (unlikely(!IS_ALIGNED(key->objectid, sectorsize))) { 1924 + generic_err(leaf, slot, 1925 + "invalid remap key objectid, have %llu expect aligned to %u", 1926 + key->objectid, sectorsize); 1927 + return -EUCLEAN; 1928 + } 1929 + 1930 + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { 1931 + generic_err(leaf, slot, 1932 + "invalid remap key offset (length), have %llu 
expect aligned to %u", 1933 + key->offset, sectorsize); 1934 + return -EUCLEAN; 1935 + } 1936 + 1937 + if (unlikely(check_add_overflow(key->objectid, key->offset, &end))) { 1938 + generic_err(leaf, slot, 1939 + "remap key overflow, objectid %llu + offset %llu wraps", 1940 + key->objectid, key->offset); 1941 + return -EUCLEAN; 1942 + } 1943 + 1944 + return 0; 1945 + } 1946 + 1937 1947 static int check_dev_extent_item(const struct extent_buffer *leaf, 1938 1948 const struct btrfs_key *key, 1939 1949 int slot, ··· 2065 1945 return 0; 2066 1946 } 2067 1947 1948 + static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *key, 1949 + int slot) 1950 + { 1951 + struct btrfs_fs_info *fs_info = leaf->fs_info; 1952 + struct btrfs_free_space_info *fsi; 1953 + const u32 blocksize = fs_info->sectorsize; 1954 + u32 flags; 1955 + 1956 + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { 1957 + generic_err(leaf, slot, 1958 + "free space info key objectid is not aligned to %u, has " BTRFS_KEY_FMT, 1959 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 1960 + return -EUCLEAN; 1961 + } 1962 + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { 1963 + generic_err(leaf, slot, 1964 + "free space info key offset is not aligned to %u, has " BTRFS_KEY_FMT, 1965 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 1966 + return -EUCLEAN; 1967 + } 1968 + if (unlikely(btrfs_item_size(leaf, slot) != 1969 + sizeof(struct btrfs_free_space_info))) { 1970 + generic_err(leaf, slot, 1971 + "invalid item size for free space info, has %u expect %zu", 1972 + btrfs_item_size(leaf, slot), 1973 + sizeof(struct btrfs_free_space_info)); 1974 + return -EUCLEAN; 1975 + } 1976 + fsi = btrfs_item_ptr(leaf, slot, struct btrfs_free_space_info); 1977 + flags = btrfs_free_space_flags(leaf, fsi); 1978 + if (unlikely(flags & ~BTRFS_FREE_SPACE_FLAGS_MASK)) { 1979 + generic_err(leaf, slot, 1980 + "unknown flags for free space info, has 0x%x valid mask 0x%lx", 1981 + flags, BTRFS_FREE_SPACE_FLAGS_MASK); 1982 + return 
-EUCLEAN; 1983 + } 1984 + if (unlikely(btrfs_free_space_extent_count(leaf, fsi) > 1985 + key->offset >> fs_info->sectorsize_bits)) { 1986 + generic_err(leaf, slot, 1987 + "suspicious extent count, has %u max valid %llu", 1988 + btrfs_free_space_extent_count(leaf, fsi), 1989 + key->offset >> fs_info->sectorsize_bits); 1990 + return -EUCLEAN; 1991 + } 1992 + return 0; 1993 + } 1994 + 1995 + static int check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot) 1996 + { 1997 + struct btrfs_fs_info *fs_info = leaf->fs_info; 1998 + const u32 blocksize = fs_info->sectorsize; 1999 + 2000 + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { 2001 + generic_err(leaf, slot, 2002 + "free space extent key objectid is not aligned to %u, has " BTRFS_KEY_FMT, 2003 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 2004 + return -EUCLEAN; 2005 + } 2006 + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { 2007 + generic_err(leaf, slot, 2008 + "free space extent key offset is not aligned to %u, has " BTRFS_KEY_FMT, 2009 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 2010 + return -EUCLEAN; 2011 + } 2012 + if (unlikely(btrfs_item_size(leaf, slot) != 0)) { 2013 + generic_err(leaf, slot, 2014 + "invalid item size for free space info, has %u expect 0", 2015 + btrfs_item_size(leaf, slot)); 2016 + return -EUCLEAN; 2017 + } 2018 + return 0; 2019 + } 2020 + 2021 + static int check_free_space_bitmap(struct extent_buffer *leaf, 2022 + struct btrfs_key *key, int slot) 2023 + { 2024 + struct btrfs_fs_info *fs_info = leaf->fs_info; 2025 + const u32 blocksize = fs_info->sectorsize; 2026 + u32 expected_item_size; 2027 + 2028 + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { 2029 + generic_err(leaf, slot, 2030 + "free space bitmap key objectid is not aligned to %u, has " BTRFS_KEY_FMT, 2031 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 2032 + return -EUCLEAN; 2033 + } 2034 + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { 2035 + generic_err(leaf, slot, 2036 + "free space bitmap 
key offset is not aligned to %u, has " BTRFS_KEY_FMT, 2037 + blocksize, BTRFS_KEY_FMT_VALUE(key)); 2038 + return -EUCLEAN; 2039 + } 2040 + if (unlikely(key->offset == 0)) { 2041 + generic_err(leaf, slot, "free space bitmap length is 0"); 2042 + return -EUCLEAN; 2043 + } 2044 + /* 2045 + * The item must hold exactly the right number of bitmap bytes for the 2046 + * range described by key->offset. A mismatch means the item was 2047 + * truncated or the key is corrupt; either way the bitmap data is not 2048 + * safe to access. 2049 + */ 2050 + expected_item_size = DIV_ROUND_UP(key->offset >> fs_info->sectorsize_bits, 2051 + BITS_PER_BYTE); 2052 + if (unlikely(btrfs_item_size(leaf, slot) != expected_item_size)) { 2053 + generic_err(leaf, slot, 2054 + "invalid item size for free space bitmap, has %u expect %u", 2055 + btrfs_item_size(leaf, slot), expected_item_size); 2056 + return -EUCLEAN; 2057 + } 2058 + return 0; 2059 + } 2060 + 2068 2061 /* 2069 2062 * Common point to switch the item-specific validation. 2070 2063 */ ··· 2240 2007 break; 2241 2008 case BTRFS_RAID_STRIPE_KEY: 2242 2009 ret = check_raid_stripe_extent(leaf, key, slot); 2010 + break; 2011 + case BTRFS_FREE_SPACE_INFO_KEY: 2012 + ret = check_free_space_info(leaf, key, slot); 2013 + break; 2014 + case BTRFS_FREE_SPACE_EXTENT_KEY: 2015 + ret = check_free_space_extent(leaf, key, slot); 2016 + break; 2017 + case BTRFS_FREE_SPACE_BITMAP_KEY: 2018 + ret = check_free_space_bitmap(leaf, key, slot); 2019 + break; 2020 + case BTRFS_IDENTITY_REMAP_KEY: 2021 + case BTRFS_REMAP_KEY: 2022 + case BTRFS_REMAP_BACKREF_KEY: 2023 + ret = check_remap_key(leaf, key, slot); 2243 2024 break; 2244 2025 } 2245 2026

+35 -40

fs/btrfs/tree-log.c

··· 457 457 return ret; 458 458 } 459 459 460 - if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { 460 + if (btrfs_buffer_uptodate(eb, gen, NULL) && level == 0) { 461 461 ret = btrfs_exclude_logged_extents(eb); 462 462 if (ret) 463 463 btrfs_abort_transaction(trans, ret); ··· 1003 1003 btrfs_root_id(root)); 1004 1004 } 1005 1005 if (!ret) { 1006 - ret = btrfs_csum_file_blocks(trans, csum_root, sums); 1006 + ret = btrfs_insert_data_csums(trans, csum_root, sums); 1007 1007 if (ret) 1008 1008 btrfs_abort_log_replay(wc, ret, 1009 1009 "failed to add csums for range [%llu, %llu) inode %llu root %llu", ··· 1711 1711 } 1712 1712 1713 1713 /* insert our name */ 1714 - ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); 1714 + ret = btrfs_add_link(trans, dir, inode, &name, false, ref_index); 1715 1715 if (ret) { 1716 1716 btrfs_abort_log_replay(wc, ret, 1717 1717 "failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", ··· 2059 2059 return PTR_ERR(dir); 2060 2060 } 2061 2061 2062 - ret = btrfs_add_link(trans, dir, inode, name, 1, index); 2062 + ret = btrfs_add_link(trans, dir, inode, name, true, index); 2063 2063 2064 2064 /* FIXME, put inode into FIXUP list */ 2065 2065 ··· 3566 3566 * writing the super here would result in transid mismatches. If there 3567 3567 * is an error here just bail. 
3568 3568 */ 3569 - if (BTRFS_FS_ERROR(fs_info)) { 3569 + if (unlikely(BTRFS_FS_ERROR(fs_info))) { 3570 3570 ret = -EIO; 3571 3571 btrfs_set_log_full_commit(trans); 3572 3572 btrfs_abort_transaction(trans, ret); ··· 3576 3576 3577 3577 btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); 3578 3578 btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); 3579 - ret = write_all_supers(fs_info, 1); 3579 + ret = write_all_supers(trans); 3580 3580 mutex_unlock(&fs_info->tree_log_mutex); 3581 3581 if (unlikely(ret)) { 3582 3582 btrfs_set_log_full_commit(trans); ··· 3681 3681 * free all the extents used by the tree log. This should be called 3682 3682 * at commit time of the full transaction 3683 3683 */ 3684 - int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3684 + void btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) 3685 3685 { 3686 3686 if (root->log_root) { 3687 3687 free_log_tree(trans, root->log_root); 3688 3688 root->log_root = NULL; 3689 3689 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); 3690 3690 } 3691 - return 0; 3692 3691 } 3693 3692 3694 - int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 3695 - struct btrfs_fs_info *fs_info) 3693 + void btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) 3696 3694 { 3697 3695 if (fs_info->log_root_tree) { 3698 3696 free_log_tree(trans, fs_info->log_root_tree); 3699 3697 fs_info->log_root_tree = NULL; 3700 3698 clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); 3701 3699 } 3702 - return 0; 3703 3700 } 3704 3701 3705 3702 static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, ··· 4610 4613 static void fill_inode_item(struct btrfs_trans_handle *trans, 4611 4614 struct extent_buffer *leaf, 4612 4615 struct btrfs_inode_item *item, 4613 - struct inode *inode, bool log_inode_only, 4616 + struct btrfs_inode *inode, bool log_inode_only, 4614 4617 u64 
logged_isize) 4615 4618 { 4616 - u64 gen = BTRFS_I(inode)->generation; 4619 + struct inode *vfs_inode = &inode->vfs_inode; 4620 + u64 gen = inode->generation; 4617 4621 u64 flags; 4618 4622 4619 4623 if (log_inode_only) { ··· 4629 4631 * and one can set it to 0 since that only happens on eviction 4630 4632 * and we are holding a ref on the inode. 4631 4633 */ 4632 - ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); 4633 - if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) 4634 + ASSERT(data_race(inode->logged_trans) > 0); 4635 + if (data_race(inode->logged_trans) < trans->transid) 4634 4636 gen = 0; 4635 4637 4636 4638 btrfs_set_inode_size(leaf, item, logged_isize); 4637 4639 } else { 4638 - btrfs_set_inode_size(leaf, item, inode->i_size); 4640 + btrfs_set_inode_size(leaf, item, vfs_inode->i_size); 4639 4641 } 4640 4642 4641 4643 btrfs_set_inode_generation(leaf, item, gen); 4642 4644 4643 - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); 4644 - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); 4645 - btrfs_set_inode_mode(leaf, item, inode->i_mode); 4646 - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); 4645 + btrfs_set_inode_uid(leaf, item, i_uid_read(vfs_inode)); 4646 + btrfs_set_inode_gid(leaf, item, i_gid_read(vfs_inode)); 4647 + btrfs_set_inode_mode(leaf, item, vfs_inode->i_mode); 4648 + btrfs_set_inode_nlink(leaf, item, vfs_inode->i_nlink); 4647 4649 4648 - btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); 4649 - btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); 4650 + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(vfs_inode)); 4651 + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(vfs_inode)); 4650 4652 4651 - btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); 4652 - btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); 4653 + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(vfs_inode)); 4654 + 
btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(vfs_inode)); 4653 4655 4654 - btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); 4655 - btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); 4656 + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(vfs_inode)); 4657 + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(vfs_inode)); 4656 4658 4657 - btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); 4658 - btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); 4659 + btrfs_set_timespec_sec(leaf, &item->otime, inode->i_otime_sec); 4660 + btrfs_set_timespec_nsec(leaf, &item->otime, inode->i_otime_nsec); 4659 4661 4660 4662 /* 4661 4663 * We do not need to set the nbytes field, in fact during a fast fsync ··· 4666 4668 * inode item in subvolume tree as needed (see overwrite_item()). 4667 4669 */ 4668 4670 4669 - btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); 4671 + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(vfs_inode)); 4670 4672 btrfs_set_inode_transid(leaf, item, trans->transid); 4671 - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 4672 - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, 4673 - BTRFS_I(inode)->ro_flags); 4673 + btrfs_set_inode_rdev(leaf, item, vfs_inode->i_rdev); 4674 + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); 4674 4675 btrfs_set_inode_flags(leaf, item, flags); 4675 4676 btrfs_set_inode_block_group(leaf, item, 0); 4676 4677 } ··· 4716 4719 return ret; 4717 4720 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4718 4721 struct btrfs_inode_item); 4719 - fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, 4720 - false, 0); 4722 + fill_inode_item(trans, path->nodes[0], inode_item, inode, false, 0); 4721 4723 btrfs_release_path(path); 4722 4724 return 0; 4723 4725 } ··· 4736 4740 * worry about logging checksum items with overlapping 
ranges. 4737 4741 */ 4738 4742 if (inode->last_reflink_trans < trans->transid) 4739 - return btrfs_csum_file_blocks(trans, log_root, sums); 4743 + return btrfs_insert_data_csums(trans, log_root, sums); 4740 4744 4741 4745 /* 4742 4746 * Serialize logging for checksums. This is to avoid racing with the ··· 4759 4763 */ 4760 4764 ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); 4761 4765 if (!ret) 4762 - ret = btrfs_csum_file_blocks(trans, log_root, sums); 4766 + ret = btrfs_insert_data_csums(trans, log_root, sums); 4763 4767 4764 4768 btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, 4765 4769 &cached_state); ··· 4985 4989 inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, 4986 4990 struct btrfs_inode_item); 4987 4991 fill_inode_item(trans, dst_path->nodes[0], inode_item, 4988 - &inode->vfs_inode, 4989 - inode_only == LOG_INODE_EXISTS, 4992 + inode, inode_only == LOG_INODE_EXISTS, 4990 4993 logged_isize); 4991 4994 } else { 4992 4995 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, ··· 5083 5088 if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) 5084 5089 continue; 5085 5090 5086 - list_for_each_entry(sums, &ordered->list, list) { 5091 + list_for_each_entry(sums, &ordered->csum_list, list) { 5087 5092 ret = log_csums(trans, inode, log_root, sums); 5088 5093 if (ret) 5089 5094 return ret; ··· 5798 5803 name_str.len = this_name_len; 5799 5804 di = btrfs_lookup_dir_item(NULL, inode->root, search_path, 5800 5805 parent, &name_str, 0); 5801 - if (di && !IS_ERR(di)) { 5806 + if (!IS_ERR_OR_NULL(di)) { 5802 5807 struct btrfs_key di_key; 5803 5808 5804 5809 btrfs_dir_item_key_to_cpu(search_path->nodes[0],

+2 -3

fs/btrfs/tree-log.h

··· 71 71 72 72 int btrfs_sync_log(struct btrfs_trans_handle *trans, 73 73 struct btrfs_root *root, struct btrfs_log_ctx *ctx); 74 - int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 75 - int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, 76 - struct btrfs_fs_info *fs_info); 74 + void btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 75 + void btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); 77 76 int btrfs_recover_log_trees(struct btrfs_root *tree_root); 78 77 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 79 78 struct dentry *dentry,

+3 -5

fs/btrfs/tree-mod-log.c

··· 1042 1042 check.owner_root = btrfs_root_id(root); 1043 1043 1044 1044 old = read_tree_block(fs_info, logical, &check); 1045 - if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { 1046 - if (!IS_ERR(old)) 1047 - free_extent_buffer(old); 1045 + if (WARN_ON(IS_ERR(old))) { 1048 1046 btrfs_warn(fs_info, 1049 - "failed to read tree block %llu from get_old_root", 1050 - logical); 1047 + "failed to read tree block %llu from get_old_root: %ld", 1048 + logical, PTR_ERR(old)); 1051 1049 } else { 1052 1050 struct tree_mod_elem *tm2; 1053 1051

+2 -5

fs/btrfs/uuid-tree.c

··· 35 35 struct btrfs_key key; 36 36 37 37 if (WARN_ON_ONCE(!uuid_root)) 38 - return -ENOENT; 38 + return -EINVAL; 39 39 40 40 path = btrfs_alloc_path(); 41 41 if (!path) ··· 91 91 ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu); 92 92 if (ret != -ENOENT) 93 93 return ret; 94 - 95 - if (WARN_ON_ONCE(!uuid_root)) 96 - return -EINVAL; 97 94 98 95 btrfs_uuid_to_key(uuid, type, &key); 99 96 ··· 513 516 514 517 out: 515 518 btrfs_free_path(path); 516 - if (trans && !IS_ERR(trans)) 519 + if (!IS_ERR_OR_NULL(trans)) 517 520 btrfs_end_transaction(trans); 518 521 if (ret) 519 522 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);

+220 -5

fs/btrfs/volumes.c

··· 392 392 INIT_LIST_HEAD(&fs_devs->alloc_list); 393 393 INIT_LIST_HEAD(&fs_devs->fs_list); 394 394 INIT_LIST_HEAD(&fs_devs->seed_list); 395 + spin_lock_init(&fs_devs->per_profile_lock); 395 396 396 397 if (fsid) { 397 398 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); ··· 2340 2339 mutex_lock(&fs_info->chunk_mutex); 2341 2340 list_del_init(&device->dev_alloc_list); 2342 2341 device->fs_devices->rw_devices--; 2342 + btrfs_update_per_profile_avail(fs_info); 2343 2343 mutex_unlock(&fs_info->chunk_mutex); 2344 2344 } 2345 2345 ··· 2452 2450 list_add(&device->dev_alloc_list, 2453 2451 &fs_devices->alloc_list); 2454 2452 device->fs_devices->rw_devices++; 2453 + btrfs_update_per_profile_avail(fs_info); 2455 2454 mutex_unlock(&fs_info->chunk_mutex); 2456 2455 } 2457 2456 return ret; ··· 2940 2937 */ 2941 2938 btrfs_clear_space_info_full(fs_info); 2942 2939 2940 + btrfs_update_per_profile_avail(fs_info); 2943 2941 mutex_unlock(&fs_info->chunk_mutex); 2944 2942 2945 2943 /* Add sysfs device entry */ ··· 2951 2947 if (seeding_dev) { 2952 2948 mutex_lock(&fs_info->chunk_mutex); 2953 2949 ret = init_first_rw_device(trans); 2950 + btrfs_update_per_profile_avail(fs_info); 2954 2951 mutex_unlock(&fs_info->chunk_mutex); 2955 2952 if (unlikely(ret)) { 2956 2953 btrfs_abort_transaction(trans, ret); ··· 3034 3029 orig_super_total_bytes); 3035 3030 btrfs_set_super_num_devices(fs_info->super_copy, 3036 3031 orig_super_num_devices); 3032 + btrfs_update_per_profile_avail(fs_info); 3037 3033 mutex_unlock(&fs_info->chunk_mutex); 3038 3034 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 3039 3035 error_trans: ··· 3127 3121 if (list_empty(&device->post_commit_list)) 3128 3122 list_add_tail(&device->post_commit_list, 3129 3123 &trans->transaction->dev_update_list); 3124 + btrfs_update_per_profile_avail(fs_info); 3130 3125 mutex_unlock(&fs_info->chunk_mutex); 3131 3126 3132 3127 btrfs_reserve_chunk_metadata(trans, false); ··· 3504 3497 } 3505 3498 } 3506 3499 3500 + 
btrfs_update_per_profile_avail(fs_info); 3507 3501 mutex_unlock(&fs_info->chunk_mutex); 3508 3502 trans->removing_chunk = false; 3509 3503 ··· 3602 3594 * If we had a transaction abort, stop all running scrubs. 3603 3595 * See transaction.c:cleanup_transaction() why we do it here. 3604 3596 */ 3605 - if (BTRFS_FS_ERROR(fs_info)) 3597 + if (unlikely(BTRFS_FS_ERROR(fs_info))) 3606 3598 btrfs_scrub_cancel(fs_info); 3607 3599 return ret; 3608 3600 } ··· 5208 5200 atomic64_sub(free_diff, &fs_info->free_chunk_space); 5209 5201 } 5210 5202 5203 + btrfs_update_per_profile_avail(fs_info); 5211 5204 /* 5212 5205 * Once the device's size has been set to the new size, ensure all 5213 5206 * in-memory chunks are synced to disk so that the loop below sees them ··· 5324 5315 WARN_ON(diff > old_total); 5325 5316 btrfs_set_super_total_bytes(super_copy, 5326 5317 round_down(old_total - diff, fs_info->sectorsize)); 5318 + btrfs_update_per_profile_avail(fs_info); 5327 5319 mutex_unlock(&fs_info->chunk_mutex); 5328 5320 5329 5321 btrfs_reserve_chunk_metadata(trans, false); ··· 5395 5385 if (di_a->total_avail < di_b->total_avail) 5396 5386 return 1; 5397 5387 return 0; 5388 + } 5389 + 5390 + /* 5391 + * Return 0 if we allocated any virtual(*) chunk, and restore the size to 5392 + * @allocated. 5393 + * Return -ENOSPC if we have no more space to allocate virtual chunk 5394 + * 5395 + * *: A virtual chunk is a chunk that only exists during per-profile available 5396 + * estimation. 5397 + * Those numbers won't really take on-disk space, but only to emulate 5398 + * chunk allocator behavior to get accurate estimation on available space. 5399 + * 5400 + * Another difference is, a virtual chunk has no size limit and doesn't care 5401 + * about holes in the device tree, allowing us to exhaust device space 5402 + * much faster. 
5403 + */ 5404 + static int alloc_virtual_chunk(struct btrfs_fs_info *fs_info, 5405 + struct btrfs_device_info *devices_info, 5406 + enum btrfs_raid_types type, 5407 + u64 *allocated) 5408 + { 5409 + const struct btrfs_raid_attr *raid_attr = &btrfs_raid_array[type]; 5410 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 5411 + struct btrfs_device *device; 5412 + u64 stripe_size; 5413 + int ndevs = 0; 5414 + 5415 + lockdep_assert_held(&fs_info->chunk_mutex); 5416 + 5417 + /* Go through devices to collect their unallocated space. */ 5418 + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 5419 + u64 avail; 5420 + 5421 + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, 5422 + &device->dev_state) || 5423 + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) 5424 + continue; 5425 + 5426 + if (device->total_bytes > device->bytes_used + 5427 + device->per_profile_allocated) 5428 + avail = device->total_bytes - device->bytes_used - 5429 + device->per_profile_allocated; 5430 + else 5431 + avail = 0; 5432 + 5433 + avail = round_down(avail, fs_info->sectorsize); 5434 + 5435 + /* And exclude the [0, 1M) reserved space. */ 5436 + if (avail > BTRFS_DEVICE_RANGE_RESERVED) 5437 + avail -= BTRFS_DEVICE_RANGE_RESERVED; 5438 + else 5439 + avail = 0; 5440 + 5441 + /* 5442 + * Not enough to support a single stripe, this device 5443 + * can not be utilized for chunk allocation. 5444 + */ 5445 + if (avail < BTRFS_STRIPE_LEN) 5446 + continue; 5447 + 5448 + /* 5449 + * Unlike chunk allocator, we don't care about stripe or hole 5450 + * size, so here we use @avail directly. 
5451 + */ 5452 + devices_info[ndevs].dev_offset = 0; 5453 + devices_info[ndevs].total_avail = avail; 5454 + devices_info[ndevs].max_avail = avail; 5455 + devices_info[ndevs].dev = device; 5456 + ++ndevs; 5457 + } 5458 + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), 5459 + btrfs_cmp_device_info, NULL); 5460 + ndevs = rounddown(ndevs, raid_attr->devs_increment); 5461 + if (ndevs < raid_attr->devs_min) 5462 + return -ENOSPC; 5463 + if (raid_attr->devs_max) 5464 + ndevs = min(ndevs, (int)raid_attr->devs_max); 5465 + else 5466 + ndevs = min(ndevs, (int)BTRFS_MAX_DEVS(fs_info)); 5467 + 5468 + /* 5469 + * Stripe size will be determined by the device with the least 5470 + * unallocated space. 5471 + */ 5472 + stripe_size = devices_info[ndevs - 1].total_avail; 5473 + 5474 + for (int i = 0; i < ndevs; i++) 5475 + devices_info[i].dev->per_profile_allocated += stripe_size; 5476 + *allocated = div_u64(stripe_size * (ndevs - raid_attr->nparity), 5477 + raid_attr->ncopies); 5478 + return 0; 5479 + } 5480 + 5481 + static int calc_one_profile_avail(struct btrfs_fs_info *fs_info, 5482 + enum btrfs_raid_types type, 5483 + u64 *result_ret) 5484 + { 5485 + struct btrfs_device_info *devices_info = NULL; 5486 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 5487 + struct btrfs_device *device; 5488 + u64 allocated; 5489 + u64 result = 0; 5490 + int ret = 0; 5491 + 5492 + lockdep_assert_held(&fs_info->chunk_mutex); 5493 + ASSERT(type >= 0 && type < BTRFS_NR_RAID_TYPES); 5494 + 5495 + /* Not enough devices, quick exit, just update the result. */ 5496 + if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min) { 5497 + ret = -ENOSPC; 5498 + goto out; 5499 + } 5500 + 5501 + devices_info = kzalloc_objs(*devices_info, fs_devices->rw_devices, GFP_NOFS); 5502 + if (!devices_info) { 5503 + ret = -ENOMEM; 5504 + goto out; 5505 + } 5506 + /* Clear virtual chunk used space for each device. 
*/ 5507 + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) 5508 + device->per_profile_allocated = 0; 5509 + 5510 + while (!alloc_virtual_chunk(fs_info, devices_info, type, &allocated)) 5511 + result += allocated; 5512 + 5513 + out: 5514 + kfree(devices_info); 5515 + if (ret < 0 && ret != -ENOSPC) 5516 + return ret; 5517 + *result_ret = result; 5518 + return 0; 5519 + } 5520 + 5521 + /* Update the per-profile available space array. */ 5522 + void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info) 5523 + { 5524 + u64 results[BTRFS_NR_RAID_TYPES]; 5525 + int ret; 5526 + 5527 + /* 5528 + * Zoned is more complex as we can not simply get the amount of 5529 + * available space for each device. 5530 + */ 5531 + if (btrfs_is_zoned(fs_info)) 5532 + goto error; 5533 + 5534 + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) { 5535 + ret = calc_one_profile_avail(fs_info, i, &results[i]); 5536 + if (ret < 0) 5537 + goto error; 5538 + } 5539 + 5540 + spin_lock(&fs_info->fs_devices->per_profile_lock); 5541 + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) 5542 + fs_info->fs_devices->per_profile_avail[i] = results[i]; 5543 + spin_unlock(&fs_info->fs_devices->per_profile_lock); 5544 + return; 5545 + error: 5546 + spin_lock(&fs_info->fs_devices->per_profile_lock); 5547 + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) 5548 + fs_info->fs_devices->per_profile_avail[i] = U64_MAX; 5549 + spin_unlock(&fs_info->fs_devices->per_profile_lock); 5398 5550 } 5399 5551 5400 5552 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) ··· 6036 5864 check_raid56_incompat_flag(info, type); 6037 5865 check_raid1c34_incompat_flag(info, type); 6038 5866 5867 + btrfs_update_per_profile_avail(info); 5868 + 6039 5869 return block_group; 6040 5870 } 6041 5871 ··· 6075 5901 ctl.space_info = space_info; 6076 5902 init_alloc_chunk_ctl(fs_devices, &ctl); 6077 5903 6078 - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), 6079 - GFP_NOFS); 5904 + 
devices_info = kzalloc_objs(*devices_info, fs_devices->rw_devices, GFP_NOFS); 6080 5905 if (!devices_info) 6081 5906 return ERR_PTR(-ENOMEM); 6082 5907 ··· 8250 8077 struct btrfs_device *device; 8251 8078 int stats_cnt; 8252 8079 int ret = 0; 8080 + bool need_update_dev_stats = false; 8081 + 8082 + /* 8083 + * Do an initial pass using RCU to see if we need to update any dev 8084 + * stats item. This is to avoid taking the device_list_mutex which is 8085 + * acquired by the fitrim operation and can take a while since it does 8086 + * discard operations while holding that mutex. Most of the time, if 8087 + * we are on a healthy filesystem, we don't have new stat updates, so 8088 + * this avoids blocking on that mutex, which is specially important 8089 + * because we are called during the critical section of a transaction 8090 + * commit, therefore blocking new transactions from starting while 8091 + * discard is running. 8092 + * 8093 + * Also note that adding/removing devices also requires starting a 8094 + * transaction, and since we are called from the critical section of a 8095 + * transaction commit, no one can be concurrently adding or removing a 8096 + * device. 
8097 + */ 8098 + rcu_read_lock(); 8099 + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { 8100 + if (device->dev_stats_valid && 8101 + atomic_read(&device->dev_stats_ccnt) != 0) { 8102 + need_update_dev_stats = true; 8103 + break; 8104 + } 8105 + } 8106 + rcu_read_unlock(); 8107 + 8108 + if (!need_update_dev_stats) 8109 + return 0; 8253 8110 8254 8111 mutex_lock(&fs_devices->device_list_mutex); 8255 8112 list_for_each_entry(device, &fs_devices->devices, dev_list) { ··· 8642 8439 } 8643 8440 8644 8441 /* Ensure all chunks have corresponding dev extents */ 8645 - return verify_chunk_dev_extent_mapping(fs_info); 8442 + ret = verify_chunk_dev_extent_mapping(fs_info); 8443 + if (ret < 0) 8444 + return ret; 8445 + 8446 + mutex_lock(&fs_info->chunk_mutex); 8447 + btrfs_update_per_profile_avail(fs_info); 8448 + mutex_unlock(&fs_info->chunk_mutex); 8449 + return 0; 8646 8450 } 8647 8451 8648 8452 /* ··· 8667 8457 8668 8458 mutex_lock(&uuid_mutex); 8669 8459 list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list) { 8670 - if (!test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) { 8460 + /* 8461 + * Replace target dev item (devid 0) is not inserted into chunk tree. 8462 + * So skip the DEV_STATE_ITEM check. 8463 + */ 8464 + if (dev->devid != BTRFS_DEV_REPLACE_DEVID && 8465 + !test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) { 8671 8466 btrfs_err(fs_info, 8672 8467 "devid %llu path %s is registered but not found in chunk tree", 8673 8468 dev->devid, btrfs_dev_name(dev));

+34

fs/btrfs/volumes.h

··· 22 22 #include <uapi/linux/btrfs_tree.h> 23 23 #include "messages.h" 24 24 #include "extent-io-tree.h" 25 + #include "fs.h" 25 26 26 27 struct block_device; 27 28 struct bdev_handle; ··· 214 213 215 214 /* Bandwidth limit for scrub, in bytes */ 216 215 u64 scrub_speed_max; 216 + 217 + /* 218 + * A temporary number of allocated space during per-profile 219 + * available space calculation. 220 + */ 221 + u64 per_profile_allocated; 217 222 }; 218 223 219 224 /* ··· 465 458 /* Device to be used for reading in case of RAID1. */ 466 459 u64 read_devid; 467 460 #endif 461 + 462 + /* 463 + * Each value indicates the available space for that profile. 464 + * U64_MAX means the estimation is unavailable. 465 + * 466 + * Protected by per_profile_lock; 467 + */ 468 + u64 per_profile_avail[BTRFS_NR_RAID_TYPES]; 469 + spinlock_t per_profile_lock; 468 470 }; 469 471 470 472 #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ ··· 903 887 const char *btrfs_bg_type_to_raid_name(u64 flags); 904 888 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); 905 889 bool btrfs_verify_dev_items(const struct btrfs_fs_info *fs_info); 890 + void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info); 891 + 892 + static inline bool btrfs_get_per_profile_avail(struct btrfs_fs_info *fs_info, 893 + u64 profile, u64 *avail_ret) 894 + { 895 + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(profile); 896 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 897 + bool uptodate = false; 898 + 899 + spin_lock(&fs_devices->per_profile_lock); 900 + if (fs_devices->per_profile_avail[index] != U64_MAX) { 901 + uptodate = true; 902 + *avail_ret = fs_devices->per_profile_avail[index]; 903 + } 904 + spin_unlock(&fs_info->fs_devices->per_profile_lock); 905 + return uptodate; 906 + } 907 + 906 908 bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); 907 909 908 910 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);

+10 -16

fs/btrfs/zlib.c

··· 71 71 72 72 struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) 73 73 { 74 - const u32 blocksize = fs_info->sectorsize; 75 74 struct workspace *workspace; 76 75 int workspacesize; 77 76 ··· 90 91 workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; 91 92 } 92 93 if (!workspace->buf) { 93 - workspace->buf = kmalloc(blocksize, GFP_KERNEL); 94 - workspace->buf_size = blocksize; 94 + workspace->buf = kmalloc(fs_info->sectorsize, GFP_KERNEL); 95 + workspace->buf_size = fs_info->sectorsize; 95 96 } 96 97 if (!workspace->strm.workspace || !workspace->buf) 97 98 goto fail; ··· 156 157 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 157 158 int ret; 158 159 char *data_in = NULL; 159 - char *cfolio_out; 160 160 struct folio *in_folio = NULL; 161 161 struct folio *out_folio = NULL; 162 - const u32 blocksize = fs_info->sectorsize; 163 162 const u64 orig_end = start + len; 164 163 165 164 ret = zlib_deflateInit(&workspace->strm, workspace->level); ··· 172 175 workspace->strm.total_in = 0; 173 176 workspace->strm.total_out = 0; 174 177 175 - out_folio = btrfs_alloc_compr_folio(fs_info); 178 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 176 179 if (out_folio == NULL) { 177 180 ret = -ENOMEM; 178 181 goto out; 179 182 } 180 - cfolio_out = folio_address(out_folio); 181 183 182 184 workspace->strm.next_in = workspace->buf; 183 185 workspace->strm.avail_in = 0; 184 - workspace->strm.next_out = cfolio_out; 186 + workspace->strm.next_out = folio_address(out_folio); 185 187 workspace->strm.avail_out = min_folio_size; 186 188 187 189 while (workspace->strm.total_in < len) { ··· 238 242 } 239 243 240 244 /* We're making it bigger, give up. 
*/ 241 - if (workspace->strm.total_in > blocksize * 2 && 245 + if (workspace->strm.total_in > fs_info->sectorsize * 2 && 242 246 workspace->strm.total_in < workspace->strm.total_out) { 243 247 ret = -E2BIG; 244 248 goto out; ··· 254 258 goto out; 255 259 } 256 260 257 - out_folio = btrfs_alloc_compr_folio(fs_info); 261 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 258 262 if (out_folio == NULL) { 259 263 ret = -ENOMEM; 260 264 goto out; 261 265 } 262 - cfolio_out = folio_address(out_folio); 263 266 workspace->strm.avail_out = min_folio_size; 264 - workspace->strm.next_out = cfolio_out; 267 + workspace->strm.next_out = folio_address(out_folio); 265 268 } 266 269 /* We're all done. */ 267 270 if (workspace->strm.total_in >= len) ··· 291 296 goto out; 292 297 } 293 298 /* Get another folio for the stream end. */ 294 - out_folio = btrfs_alloc_compr_folio(fs_info); 299 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 295 300 if (out_folio == NULL) { 296 301 ret = -ENOMEM; 297 302 goto out; 298 303 } 299 - cfolio_out = folio_address(out_folio); 300 304 workspace->strm.avail_out = min_folio_size; 301 - workspace->strm.next_out = cfolio_out; 305 + workspace->strm.next_out = folio_address(out_folio); 302 306 } 303 307 } 304 308 /* Queue the remaining part of the folio. */ ··· 345 351 int wbits = MAX_WBITS; 346 352 char *data_in; 347 353 size_t total_out = 0; 348 - size_t srclen = cb->compressed_len; 354 + const size_t srclen = bio_get_size(&cb->bbio.bio); 349 355 unsigned long buf_start; 350 356 351 357 bio_first_folio(&fi, &cb->bbio.bio, 0);

+9 -8

fs/btrfs/zoned.c

··· 1699 1699 return -EINVAL; 1700 1700 } 1701 1701 1702 - raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs), 1703 - GFP_NOFS); 1702 + raid0_allocs = kzalloc_objs(*raid0_allocs, map->num_stripes / map->sub_stripes, GFP_NOFS); 1704 1703 if (!raid0_allocs) 1705 1704 return -ENOMEM; 1706 1705 ··· 1917 1918 1918 1919 cache->physical_map = map; 1919 1920 1920 - zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); 1921 + zone_info = kzalloc_objs(*zone_info, map->num_stripes, GFP_NOFS); 1921 1922 if (!zone_info) { 1922 1923 ret = -ENOMEM; 1923 1924 goto out; ··· 2122 2123 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 2123 2124 return; 2124 2125 2125 - ASSERT(!list_empty(&ordered->list)); 2126 - /* The ordered->list can be empty in the above pre-alloc case. */ 2127 - sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); 2126 + ASSERT(!list_empty(&ordered->csum_list)); 2127 + sum = list_first_entry(&ordered->csum_list, struct btrfs_ordered_sum, list); 2128 2128 logical = sum->logical; 2129 2129 len = sum->len; 2130 2130 ··· 2134 2136 continue; 2135 2137 } 2136 2138 if (!btrfs_zoned_split_ordered(ordered, logical, len)) { 2137 - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); 2139 + btrfs_mark_ordered_extent_error(ordered); 2138 2140 btrfs_err(fs_info, "failed to split ordered extent"); 2139 2141 goto out; 2140 2142 } ··· 2154 2156 */ 2155 2157 if ((inode->flags & BTRFS_INODE_NODATASUM) || 2156 2158 test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) { 2157 - while ((sum = list_first_entry_or_null(&ordered->list, 2159 + while ((sum = list_first_entry_or_null(&ordered->csum_list, 2158 2160 typeof(*sum), list))) { 2159 2161 list_del(&sum->list); 2160 2162 kfree(sum); ··· 2382 2384 int i; 2383 2385 2384 2386 if (!btrfs_is_zoned(block_group->fs_info)) 2387 + return true; 2388 + 2389 + if (unlikely(btrfs_is_testing(fs_info))) 2385 2390 return true; 2386 2391 2387 2392 map = block_group->physical_map;

+8 -12

fs/btrfs/zstd.c

··· 370 370 371 371 struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) 372 372 { 373 - const u32 blocksize = fs_info->sectorsize; 374 373 struct workspace *workspace; 375 374 376 375 workspace = kzalloc_obj(*workspace); ··· 382 383 workspace->req_level = level; 383 384 workspace->last_used = jiffies; 384 385 workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); 385 - workspace->buf = kmalloc(blocksize, GFP_KERNEL); 386 + workspace->buf = kmalloc(fs_info->sectorsize, GFP_KERNEL); 386 387 if (!workspace->mem || !workspace->buf) 387 388 goto fail; 388 389 ··· 413 414 const u64 start = cb->start; 414 415 const u32 len = cb->len; 415 416 const u64 end = start + len; 416 - const u32 blocksize = fs_info->sectorsize; 417 417 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 418 418 419 419 workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); ··· 437 439 workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); 438 440 439 441 /* Allocate and map in the output buffer. */ 440 - out_folio = btrfs_alloc_compr_folio(fs_info); 442 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 441 443 if (out_folio == NULL) { 442 444 ret = -ENOMEM; 443 445 goto out; ··· 461 463 } 462 464 463 465 /* Check to see if we are making it bigger. 
*/ 464 - if (tot_in + workspace->in_buf.pos > blocksize * 2 && 466 + if (tot_in + workspace->in_buf.pos > fs_info->sectorsize * 2 && 465 467 tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { 466 468 ret = -E2BIG; 467 469 goto out; ··· 480 482 goto out; 481 483 } 482 484 483 - out_folio = btrfs_alloc_compr_folio(fs_info); 485 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 484 486 if (out_folio == NULL) { 485 487 ret = -ENOMEM; 486 488 goto out; ··· 553 555 ret = -E2BIG; 554 556 goto out; 555 557 } 556 - out_folio = btrfs_alloc_compr_folio(fs_info); 558 + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); 557 559 if (out_folio == NULL) { 558 560 ret = -ENOMEM; 559 561 goto out; ··· 585 587 struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 586 588 struct workspace *workspace = list_entry(ws, struct workspace, list); 587 589 struct folio_iter fi; 588 - size_t srclen = cb->compressed_len; 590 + size_t srclen = bio_get_size(&cb->bbio.bio); 589 591 zstd_dstream *stream; 590 592 int ret = 0; 591 - const u32 blocksize = fs_info->sectorsize; 592 593 const unsigned int min_folio_size = btrfs_min_folio_size(fs_info); 593 594 unsigned long folio_in_index = 0; 594 595 unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); ··· 617 620 618 621 workspace->out_buf.dst = workspace->buf; 619 622 workspace->out_buf.pos = 0; 620 - workspace->out_buf.size = blocksize; 623 + workspace->out_buf.size = fs_info->sectorsize; 621 624 622 625 while (1) { 623 626 size_t ret2; ··· 679 682 { 680 683 struct workspace *workspace = list_entry(ws, struct workspace, list); 681 684 struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); 682 - const u32 sectorsize = fs_info->sectorsize; 683 685 zstd_dstream *stream; 684 686 int ret = 0; 685 687 unsigned long to_copy = 0; ··· 702 706 703 707 workspace->out_buf.dst = workspace->buf; 704 708 workspace->out_buf.pos = 0; 705 - workspace->out_buf.size = sectorsize; 709 + 
workspace->out_buf.size = fs_info->sectorsize; 706 710 707 711 /* 708 712 * Since both input and output buffers should not exceed one sector,

+24

include/trace/events/btrfs.h

··· 1113 1113 __entry->cow_level) 1114 1114 ); 1115 1115 1116 + TRACE_EVENT(btrfs_search_slot_restart, 1117 + 1118 + TP_PROTO(const struct btrfs_root *root, int level, 1119 + const char *reason), 1120 + 1121 + TP_ARGS(root, level, reason), 1122 + 1123 + TP_STRUCT__entry_btrfs( 1124 + __field( u64, root_objectid ) 1125 + __field( int, level ) 1126 + __string( reason, reason ) 1127 + ), 1128 + 1129 + TP_fast_assign_btrfs(root->fs_info, 1130 + __entry->root_objectid = btrfs_root_id(root); 1131 + __entry->level = level; 1132 + __assign_str(reason); 1133 + ), 1134 + 1135 + TP_printk_btrfs("root=%llu(%s) level=%d reason=%s", 1136 + show_root_type(__entry->root_objectid), 1137 + __entry->level, __get_str(reason)) 1138 + ); 1139 + 1116 1140 TRACE_EVENT(btrfs_space_reservation, 1117 1141 1118 1142 TP_PROTO(const struct btrfs_fs_info *fs_info, const char *type, u64 val,

+2 -1

include/uapi/linux/btrfs_tree.h

··· 1245 1245 __le32 flags; 1246 1246 } __attribute__ ((__packed__)); 1247 1247 1248 - #define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) 1248 + #define BTRFS_FREE_SPACE_USING_BITMAPS (1UL << 0) 1249 + #define BTRFS_FREE_SPACE_FLAGS_MASK (BTRFS_FREE_SPACE_USING_BITMAPS) 1249 1250 1250 1251 #define BTRFS_QGROUP_LEVEL_SHIFT 48 1251 1252 static inline __u16 btrfs_qgroup_level(__u64 qgroupid)

Configure Feed

Configure Feed