Merge tag 'for-6.20-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

+5 -8

fs/btrfs/Kconfig

··· 4 4 tristate "Btrfs filesystem support" 5 5 select BLK_CGROUP_PUNT_BIO 6 6 select CRC32 7 - select CRYPTO 8 - select CRYPTO_CRC32C 9 - select CRYPTO_XXHASH 10 - select CRYPTO_SHA256 11 - select CRYPTO_BLAKE2B 7 + select CRYPTO_LIB_BLAKE2B 8 + select CRYPTO_LIB_SHA256 12 9 select ZLIB_INFLATE 13 10 select ZLIB_DEFLATE 14 11 select LZO_COMPRESS ··· 15 18 select FS_IOMAP 16 19 select RAID6_PQ 17 20 select XOR_BLOCKS 21 + select XXHASH 18 22 depends on PAGE_SIZE_LESS_THAN_256KB 19 23 20 24 help ··· 104 106 105 107 - send stream protocol v3 - fs-verity support 106 108 107 - - checksum offload mode - sysfs knob to affect when checksums are 108 - calculated (at IO time, or in a thread) 109 - 110 109 - raid-stripe-tree - additional mapping of extents to devices to 111 110 support RAID1* profiles on zoned devices, 112 111 RAID56 not yet supported ··· 115 120 - shutdown ioctl and auto-degradation support 116 121 117 122 - asynchronous checksum generation for data writes 123 + 124 + - remap-tree - logical address remapping tree 118 125 119 126 If unsure, say N.

+2 -1

fs/btrfs/Makefile

··· 44 44 tests/extent-buffer-tests.o tests/btrfs-tests.o \ 45 45 tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ 46 46 tests/free-space-tree-tests.o tests/extent-map-tests.o \ 47 - tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o 47 + tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \ 48 + tests/chunk-allocation-tests.o

+30

fs/btrfs/accessors.h

··· 240 240 BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, 241 241 struct btrfs_block_group_item, flags, 64); 242 242 243 + /* struct btrfs_block_group_item_v2 */ 244 + BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_used, struct btrfs_block_group_item_v2, 245 + used, 64); 246 + BTRFS_SETGET_FUNCS(block_group_v2_used, struct btrfs_block_group_item_v2, used, 64); 247 + BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_chunk_objectid, 248 + struct btrfs_block_group_item_v2, chunk_objectid, 64); 249 + BTRFS_SETGET_FUNCS(block_group_v2_chunk_objectid, 250 + struct btrfs_block_group_item_v2, chunk_objectid, 64); 251 + BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_flags, 252 + struct btrfs_block_group_item_v2, flags, 64); 253 + BTRFS_SETGET_FUNCS(block_group_v2_flags, struct btrfs_block_group_item_v2, flags, 64); 254 + BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_remap_bytes, 255 + struct btrfs_block_group_item_v2, remap_bytes, 64); 256 + BTRFS_SETGET_FUNCS(block_group_v2_remap_bytes, struct btrfs_block_group_item_v2, 257 + remap_bytes, 64); 258 + BTRFS_SETGET_STACK_FUNCS(stack_block_group_v2_identity_remap_count, 259 + struct btrfs_block_group_item_v2, identity_remap_count, 32); 260 + BTRFS_SETGET_FUNCS(block_group_v2_identity_remap_count, struct btrfs_block_group_item_v2, 261 + identity_remap_count, 32); 262 + 243 263 /* struct btrfs_free_space_info */ 244 264 BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, 245 265 extent_count, 32); ··· 883 863 uuid_tree_generation, 64); 884 864 BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, 885 865 nr_global_roots, 64); 866 + BTRFS_SETGET_STACK_FUNCS(super_remap_root, struct btrfs_super_block, 867 + remap_root, 64); 868 + BTRFS_SETGET_STACK_FUNCS(super_remap_root_generation, struct btrfs_super_block, 869 + remap_root_generation, 64); 870 + BTRFS_SETGET_STACK_FUNCS(super_remap_root_level, struct btrfs_super_block, 871 + remap_root_level, 8); 886 872 887 873 /* struct 
btrfs_file_extent_item */ 888 874 BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, ··· 1035 1009 struct btrfs_verity_descriptor_item, encryption, 8); 1036 1010 BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, 1037 1011 struct btrfs_verity_descriptor_item, size, 64); 1012 + 1013 + BTRFS_SETGET_FUNCS(remap_address, struct btrfs_remap_item, address, 64); 1014 + BTRFS_SETGET_STACK_FUNCS(stack_remap_address, struct btrfs_remap_item, 1015 + address, 64); 1038 1016 1039 1017 /* Cast into the data area of the leaf. */ 1040 1018 #define btrfs_item_ptr(leaf, slot, type) \

+1 -3

fs/btrfs/backref.c

··· 3609 3609 } 3610 3610 3611 3611 rb_node = rb_simple_insert(&cache->rb_root, &upper->simple_node); 3612 - if (unlikely(rb_node)) { 3612 + if (unlikely(rb_node)) 3613 3613 btrfs_backref_panic(cache->fs_info, upper->bytenr, -EEXIST); 3614 - return -EUCLEAN; 3615 - } 3616 3614 3617 3615 list_add_tail(&edge->list[UPPER], &upper->lower); 3618 3616

+17 -18

fs/btrfs/bio.c

··· 97 97 bbio->orig_logical = orig_bbio->orig_logical; 98 98 orig_bbio->orig_logical += map_length; 99 99 } 100 + 100 101 bbio->csum_search_commit_root = orig_bbio->csum_search_commit_root; 102 + bbio->can_use_append = orig_bbio->can_use_append; 103 + bbio->is_scrub = orig_bbio->is_scrub; 104 + bbio->is_remap = orig_bbio->is_remap; 105 + bbio->async_csum = orig_bbio->async_csum; 106 + 101 107 atomic_inc(&orig_bbio->pending_ios); 102 108 return bbio; 103 109 } ··· 486 480 487 481 static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) 488 482 { 483 + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 484 + 489 485 if (!dev || !dev->bdev || 490 486 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 491 487 (btrfs_op(bio) == BTRFS_MAP_WRITE && ··· 502 494 * For zone append writing, bi_sector must point the beginning of the 503 495 * zone 504 496 */ 505 - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 506 - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 497 + if (btrfs_bio(bio)->can_use_append && btrfs_dev_is_sequential(dev, physical)) { 507 498 u64 zone_start = round_down(physical, dev->fs_info->zone_size); 508 499 509 500 ASSERT(btrfs_dev_is_sequential(dev, physical)); 510 501 bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; 502 + bio->bi_opf &= ~REQ_OP_WRITE; 503 + bio->bi_opf |= REQ_OP_ZONE_APPEND; 511 504 } 512 505 btrfs_debug(dev->fs_info, 513 506 "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", ··· 671 662 bool auto_csum_mode = true; 672 663 673 664 #ifdef CONFIG_BTRFS_EXPERIMENTAL 674 - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 675 - enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode); 676 - 677 - if (csum_mode == BTRFS_OFFLOAD_CSUM_FORCE_ON) 678 - return true; 679 665 /* 680 666 * Write bios will calculate checksum and submit bio at the same time. 
681 667 * Unless explicitly required don't offload serial csum calculate and bio ··· 751 747 u64 logical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 752 748 u64 length = bio->bi_iter.bi_size; 753 749 u64 map_length = length; 754 - bool use_append = btrfs_use_zone_append(bbio); 755 750 struct btrfs_io_context *bioc = NULL; 756 751 struct btrfs_io_stripe smap; 757 752 blk_status_t status; ··· 778 775 if (bio_op(bio) == REQ_OP_WRITE && is_data_bbio(bbio)) 779 776 bbio->orig_logical = logical; 780 777 778 + bbio->can_use_append = btrfs_use_zone_append(bbio); 779 + 781 780 map_length = min(map_length, length); 782 - if (use_append) 781 + if (bbio->can_use_append) 783 782 map_length = btrfs_append_map_length(bbio, map_length); 784 783 785 784 if (map_length < length) { ··· 810 805 } 811 806 812 807 if (btrfs_op(bio) == BTRFS_MAP_WRITE) { 813 - if (use_append) { 814 - bio->bi_opf &= ~REQ_OP_WRITE; 815 - bio->bi_opf |= REQ_OP_ZONE_APPEND; 816 - } 817 - 818 808 if (is_data_bbio(bbio) && bioc && bioc->use_rst) { 819 809 /* 820 810 * No locking for the list update, as we only add to ··· 827 827 */ 828 828 if (!(inode->flags & BTRFS_INODE_NODATASUM) && 829 829 !test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state) && 830 - !btrfs_is_data_reloc_root(inode->root)) { 830 + !btrfs_is_data_reloc_root(inode->root) && !bbio->is_remap) { 831 831 if (should_async_write(bbio) && 832 832 btrfs_wq_submit_bio(bbio, bioc, &smap, mirror_num)) 833 833 goto done; ··· 836 836 status = errno_to_blk_status(ret); 837 837 if (status) 838 838 goto fail; 839 - } else if (use_append || 840 - (btrfs_is_zoned(fs_info) && inode && 841 - inode->flags & BTRFS_INODE_NODATASUM)) { 839 + } else if (bbio->can_use_append || 840 + (btrfs_is_zoned(fs_info) && inode->flags & BTRFS_INODE_NODATASUM)) { 842 841 ret = btrfs_alloc_dummy_sum(bbio); 843 842 status = errno_to_blk_status(ret); 844 843 if (status)

+13 -6

fs/btrfs/bio.h

··· 68 68 struct btrfs_tree_parent_check parent_check; 69 69 }; 70 70 71 + /* For internal use in read end I/O handling */ 72 + struct work_struct end_io_work; 73 + 71 74 /* End I/O information supplied to btrfs_bio_alloc */ 72 75 btrfs_bio_end_io_t end_io; 73 76 void *private; 74 77 75 - /* For internal use in read end I/O handling */ 76 - unsigned int mirror_num; 77 78 atomic_t pending_ios; 78 - struct work_struct end_io_work; 79 + u16 mirror_num; 79 80 80 81 /* Save the first error status of split bio. */ 81 82 blk_status_t status; 82 83 83 84 /* Use the commit root to look up csums (data read bio only). */ 84 - bool csum_search_commit_root; 85 + bool csum_search_commit_root:1; 85 86 86 87 /* 87 88 * Since scrub will reuse btree inode, we need this flag to distinguish 88 89 * scrub bios. 89 90 */ 90 - bool is_scrub; 91 + bool is_scrub:1; 92 + 93 + /* Whether the bio is coming from copy_remapped_data_io(). */ 94 + bool is_remap:1; 91 95 92 96 /* Whether the csum generation for data write is async. */ 93 - bool async_csum; 97 + bool async_csum:1; 98 + 99 + /* Whether the bio is written using zone append. */ 100 + bool can_use_append:1; 94 101 95 102 /* 96 103 * This member must come last, bio_alloc_bioset will allocate enough

+324 -131

fs/btrfs/block-group.c

··· 239 239 240 240 while (n) { 241 241 cache = rb_entry(n, struct btrfs_block_group, cache_node); 242 - end = cache->start + cache->length - 1; 242 + end = btrfs_block_group_end(cache) - 1; 243 243 start = cache->start; 244 244 245 245 if (bytenr < start) { ··· 292 292 293 293 /* If our block group was removed, we need a full search. */ 294 294 if (RB_EMPTY_NODE(&cache->cache_node)) { 295 - const u64 next_bytenr = cache->start + cache->length; 295 + const u64 next_bytenr = btrfs_block_group_end(cache); 296 296 297 297 read_unlock(&fs_info->block_group_cache_lock); 298 298 btrfs_put_block_group(cache); ··· 575 575 /* 576 576 * Get an arbitrary extent item index / max_index through the block group 577 577 * 578 - * @block_group the block group to sample from 578 + * @caching_ctl the caching control containing the block group to sample from 579 579 * @index: the integral step through the block group to grab from 580 580 * @max_index: the granularity of the sampling 581 581 * @key: return value parameter for the item we find 582 + * @path: path to use for searching in the extent tree 582 583 * 583 584 * Pre-conditions on indices: 584 585 * 0 <= index <= max_index 585 586 * 0 < max_index 586 587 * 587 - * Returns: 0 on success, 1 if the search didn't yield a useful item, negative 588 - * error code on error. 588 + * Returns: 0 on success, 1 if the search didn't yield a useful item. 
589 589 */ 590 590 static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, 591 - struct btrfs_block_group *block_group, 592 591 int index, int max_index, 593 - struct btrfs_key *found_key) 592 + struct btrfs_key *found_key, 593 + struct btrfs_path *path) 594 594 { 595 + struct btrfs_block_group *block_group = caching_ctl->block_group; 595 596 struct btrfs_fs_info *fs_info = block_group->fs_info; 596 597 struct btrfs_root *extent_root; 597 598 u64 search_offset; 598 - u64 search_end = block_group->start + block_group->length; 599 - BTRFS_PATH_AUTO_FREE(path); 599 + const u64 search_end = btrfs_block_group_end(block_group); 600 600 struct btrfs_key search_key; 601 601 int ret = 0; 602 602 ··· 606 606 lockdep_assert_held(&caching_ctl->mutex); 607 607 lockdep_assert_held_read(&fs_info->commit_root_sem); 608 608 609 - path = btrfs_alloc_path(); 610 - if (!path) 611 - return -ENOMEM; 612 - 613 - extent_root = btrfs_extent_root(fs_info, max_t(u64, block_group->start, 614 - BTRFS_SUPER_INFO_OFFSET)); 615 - 616 - path->skip_locking = true; 617 - path->search_commit_root = true; 618 - path->reada = READA_FORWARD; 609 + extent_root = btrfs_extent_root(fs_info, block_group->start); 610 + if (unlikely(!extent_root)) { 611 + btrfs_err(fs_info, 612 + "missing extent root for block group at offset %llu", 613 + block_group->start); 614 + return -EUCLEAN; 615 + } 619 616 620 617 search_offset = index * div_u64(block_group->length, max_index); 621 618 search_key.objectid = block_group->start + search_offset; ··· 670 673 * 3, we can either read every file extent, or admit that this is best effort 671 674 * anyway and try to stay fast. 672 675 * 673 - * Returns: 0 on success, negative error code on error. 676 + * No errors are returned since failing to determine the size class is not a 677 + * critical error, size classes are just an optimization. 
674 678 */ 675 - static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, 676 - struct btrfs_block_group *block_group) 679 + static void load_block_group_size_class(struct btrfs_caching_control *caching_ctl) 677 680 { 681 + BTRFS_PATH_AUTO_RELEASE(path); 682 + struct btrfs_block_group *block_group = caching_ctl->block_group; 678 683 struct btrfs_fs_info *fs_info = block_group->fs_info; 679 684 struct btrfs_key key; 680 685 int i; 681 686 u64 min_size = block_group->length; 682 687 enum btrfs_block_group_size_class size_class = BTRFS_BG_SZ_NONE; 683 - int ret; 688 + 689 + /* 690 + * Since we run in workqueue context, we allocate the path on stack to 691 + * avoid memory allocation failure, as the stack in a work queue task 692 + * is not deep. 693 + */ 694 + ASSERT(current_work() == &caching_ctl->work.normal_work); 684 695 685 696 if (!btrfs_block_group_should_use_size_class(block_group)) 686 - return 0; 697 + return; 698 + 699 + path.skip_locking = true; 700 + path.search_commit_root = true; 701 + path.reada = READA_FORWARD; 687 702 688 703 lockdep_assert_held(&caching_ctl->mutex); 689 704 lockdep_assert_held_read(&fs_info->commit_root_sem); 690 705 for (i = 0; i < 5; ++i) { 691 - ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); 706 + int ret; 707 + 708 + ret = sample_block_group_extent_item(caching_ctl, i, 5, &key, &path); 692 709 if (ret < 0) 693 - goto out; 710 + return; 711 + btrfs_release_path(&path); 694 712 if (ret > 0) 695 713 continue; 696 714 min_size = min_t(u64, min_size, key.offset); ··· 716 704 block_group->size_class = size_class; 717 705 spin_unlock(&block_group->lock); 718 706 } 719 - out: 720 - return ret; 721 707 } 722 708 723 709 static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) 724 710 { 725 711 struct btrfs_block_group *block_group = caching_ctl->block_group; 712 + const u64 block_group_end = btrfs_block_group_end(block_group); 726 713 struct btrfs_fs_info *fs_info = 
block_group->fs_info; 727 714 struct btrfs_root *extent_root; 728 715 BTRFS_PATH_AUTO_FREE(path); ··· 766 755 next: 767 756 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); 768 757 if (ret < 0) 769 - goto out; 758 + return ret; 770 759 771 760 leaf = path->nodes[0]; 772 761 nritems = btrfs_header_nritems(leaf); 773 762 774 763 while (1) { 775 - if (btrfs_fs_closing(fs_info) > 1) { 764 + if (btrfs_fs_closing_done(fs_info)) { 776 765 last = (u64)-1; 777 766 break; 778 767 } ··· 797 786 798 787 ret = btrfs_next_leaf(extent_root, path); 799 788 if (ret < 0) 800 - goto out; 789 + return ret; 801 790 if (ret) 802 791 break; 803 792 leaf = path->nodes[0]; ··· 818 807 continue; 819 808 } 820 809 821 - if (key.objectid >= block_group->start + block_group->length) 810 + if (key.objectid >= block_group_end) 822 811 break; 823 812 824 813 if (key.type == BTRFS_EXTENT_ITEM_KEY || ··· 828 817 ret = btrfs_add_new_free_space(block_group, last, 829 818 key.objectid, &space_added); 830 819 if (ret) 831 - goto out; 820 + return ret; 832 821 total_found += space_added; 833 822 if (key.type == BTRFS_METADATA_ITEM_KEY) 834 823 last = key.objectid + ··· 847 836 path->slots[0]++; 848 837 } 849 838 850 - ret = btrfs_add_new_free_space(block_group, last, 851 - block_group->start + block_group->length, 852 - NULL); 853 - out: 854 - return ret; 839 + return btrfs_add_new_free_space(block_group, last, block_group_end, NULL); 855 840 } 856 841 857 842 static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) 858 843 { 859 844 btrfs_clear_extent_bit(&bg->fs_info->excluded_extents, bg->start, 860 - bg->start + bg->length - 1, EXTENT_DIRTY, NULL); 845 + btrfs_block_group_end(bg) - 1, EXTENT_DIRTY, NULL); 861 846 } 862 847 863 848 static noinline void caching_thread(struct btrfs_work *work) ··· 870 863 mutex_lock(&caching_ctl->mutex); 871 864 down_read(&fs_info->commit_root_sem); 872 865 873 - load_block_group_size_class(caching_ctl, block_group); 866 + 
load_block_group_size_class(caching_ctl); 874 867 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { 875 868 ret = load_free_space_cache(block_group); 876 869 if (ret == 1) { ··· 938 931 939 932 /* Allocator for zoned filesystems does not use the cache at all */ 940 933 if (btrfs_is_zoned(fs_info)) 934 + return 0; 935 + 936 + /* 937 + * No allocations can be done from remapped block groups, so they have 938 + * no entries in the free-space tree. 939 + */ 940 + if (cache->flags & BTRFS_BLOCK_GROUP_REMAPPED) 941 941 return 0; 942 942 943 943 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS); ··· 1071 1057 if (ret < 0) 1072 1058 return ret; 1073 1059 1074 - ret = btrfs_del_item(trans, root, path); 1075 - return ret; 1060 + return btrfs_del_item(trans, root, path); 1061 + } 1062 + 1063 + void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg) 1064 + { 1065 + int factor = btrfs_bg_type_to_factor(bg->flags); 1066 + 1067 + spin_lock(&bg->space_info->lock); 1068 + if (btrfs_test_opt(bg->fs_info, ENOSPC_DEBUG)) { 1069 + WARN_ON(bg->space_info->total_bytes < bg->length); 1070 + WARN_ON(bg->space_info->bytes_readonly < bg->length - bg->zone_unusable); 1071 + WARN_ON(bg->space_info->bytes_zone_unusable < bg->zone_unusable); 1072 + WARN_ON(bg->space_info->disk_total < bg->length * factor); 1073 + } 1074 + bg->space_info->total_bytes -= bg->length; 1075 + bg->space_info->bytes_readonly -= (bg->length - bg->zone_unusable); 1076 + btrfs_space_info_update_bytes_zone_unusable(bg->space_info, -bg->zone_unusable); 1077 + bg->space_info->disk_total -= bg->length * factor; 1078 + spin_unlock(&bg->space_info->lock); 1076 1079 } 1077 1080 1078 1081 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, ··· 1103 1072 struct kobject *kobj = NULL; 1104 1073 int ret; 1105 1074 int index; 1106 - int factor; 1107 1075 struct btrfs_caching_control *caching_ctl = NULL; 1108 1076 bool remove_map; 1109 1077 bool remove_rsv = false; 1110 1078 1111 1079 block_group = 
btrfs_lookup_block_group(fs_info, map->start); 1112 - if (!block_group) 1080 + if (unlikely(!block_group)) { 1081 + btrfs_abort_transaction(trans, -ENOENT); 1113 1082 return -ENOENT; 1083 + } 1114 1084 1115 - BUG_ON(!block_group->ro); 1085 + if (unlikely(!block_group->ro && 1086 + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { 1087 + ret = -EUCLEAN; 1088 + btrfs_abort_transaction(trans, ret); 1089 + goto out; 1090 + } 1116 1091 1117 1092 trace_btrfs_remove_block_group(block_group); 1118 1093 /* ··· 1130 1093 block_group->length); 1131 1094 1132 1095 index = btrfs_bg_flags_to_raid_index(block_group->flags); 1133 - factor = btrfs_bg_type_to_factor(block_group->flags); 1134 1096 1135 1097 /* make sure this block group isn't part of an allocation cluster */ 1136 1098 cluster = &fs_info->data_alloc_cluster; ··· 1150 1114 btrfs_clear_data_reloc_bg(block_group); 1151 1115 1152 1116 path = btrfs_alloc_path(); 1153 - if (!path) { 1117 + if (unlikely(!path)) { 1154 1118 ret = -ENOMEM; 1119 + btrfs_abort_transaction(trans, ret); 1155 1120 goto out; 1156 1121 } 1157 1122 ··· 1188 1151 mutex_unlock(&trans->transaction->cache_write_mutex); 1189 1152 1190 1153 ret = btrfs_remove_free_space_inode(trans, inode, block_group); 1191 - if (ret) 1154 + if (unlikely(ret)) { 1155 + btrfs_abort_transaction(trans, ret); 1192 1156 goto out; 1157 + } 1193 1158 1194 1159 write_lock(&fs_info->block_group_cache_lock); 1195 1160 rb_erase_cached(&block_group->cache_node, ··· 1256 1217 1257 1218 spin_lock(&block_group->space_info->lock); 1258 1219 list_del_init(&block_group->ro_list); 1259 - 1260 - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { 1261 - WARN_ON(block_group->space_info->total_bytes 1262 - < block_group->length); 1263 - WARN_ON(block_group->space_info->bytes_readonly 1264 - < block_group->length - block_group->zone_unusable); 1265 - WARN_ON(block_group->space_info->bytes_zone_unusable 1266 - < block_group->zone_unusable); 1267 - WARN_ON(block_group->space_info->disk_total 1268 - 
< block_group->length * factor); 1269 - } 1270 - block_group->space_info->total_bytes -= block_group->length; 1271 - block_group->space_info->bytes_readonly -= 1272 - (block_group->length - block_group->zone_unusable); 1273 - btrfs_space_info_update_bytes_zone_unusable(block_group->space_info, 1274 - -block_group->zone_unusable); 1275 - block_group->space_info->disk_total -= block_group->length * factor; 1276 - 1277 1220 spin_unlock(&block_group->space_info->lock); 1221 + 1222 + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) 1223 + btrfs_remove_bg_from_sinfo(block_group); 1278 1224 1279 1225 /* 1280 1226 * Remove the free space for the block group from the free space tree ··· 1271 1247 * deletes the block group item from the extent tree, allowing for 1272 1248 * another task to attempt to create another block group with the same 1273 1249 * item key (and failing with -EEXIST and a transaction abort). 1250 + * 1251 + * If the REMAPPED flag has been set the block group's free space 1252 + * has already been removed, so we can skip the call to 1253 + * btrfs_remove_block_group_free_space(). 
1274 1254 */ 1275 - ret = btrfs_remove_block_group_free_space(trans, block_group); 1276 - if (ret) 1277 - goto out; 1255 + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { 1256 + ret = btrfs_remove_block_group_free_space(trans, block_group); 1257 + if (unlikely(ret)) { 1258 + btrfs_abort_transaction(trans, ret); 1259 + goto out; 1260 + } 1261 + } 1278 1262 1279 1263 ret = remove_block_group_item(trans, path, block_group); 1280 - if (ret < 0) 1264 + if (unlikely(ret < 0)) { 1265 + btrfs_abort_transaction(trans, ret); 1281 1266 goto out; 1267 + } 1282 1268 1283 1269 spin_lock(&block_group->lock); 1284 1270 /* ··· 1411 1377 goto out; 1412 1378 } 1413 1379 1414 - num_bytes = cache->length - cache->reserved - cache->pinned - 1415 - cache->bytes_super - cache->zone_unusable - cache->used; 1380 + num_bytes = btrfs_block_group_available_space(cache); 1416 1381 1417 1382 /* 1418 1383 * Data never overcommits, even in mixed mode, so do just the straight ··· 1597 1564 1598 1565 spin_lock(&space_info->lock); 1599 1566 spin_lock(&block_group->lock); 1600 - if (btrfs_is_block_group_used(block_group) || block_group->ro || 1601 - list_is_singular(&block_group->list)) { 1567 + if (btrfs_is_block_group_used(block_group) || 1568 + (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) || 1569 + list_is_singular(&block_group->list) || 1570 + test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &block_group->runtime_flags)) { 1602 1571 /* 1603 1572 * We want to bail if we made new allocations or have 1604 1573 * outstanding allocations in this block group. We do ··· 1641 1606 * needing to allocate extents from the block group. 
1642 1607 */ 1643 1608 used = btrfs_space_info_used(space_info, true); 1644 - if ((space_info->total_bytes - block_group->length < used && 1645 - block_group->zone_unusable < block_group->length) || 1646 - has_unwritten_metadata(block_group)) { 1609 + if (((space_info->total_bytes - block_group->length < used && 1610 + block_group->zone_unusable < block_group->length) || 1611 + has_unwritten_metadata(block_group)) && 1612 + !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { 1647 1613 /* 1648 1614 * Add a reference for the list, compensate for the ref 1649 1615 * drop under the "next" label for the ··· 1809 1773 btrfs_get_block_group(bg); 1810 1774 trace_btrfs_add_unused_block_group(bg); 1811 1775 list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 1776 + } else if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED && 1777 + bg->identity_remap_count == 0) { 1778 + /* Leave fully remapped block groups on the fully_remapped_bgs list. */ 1812 1779 } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { 1813 1780 /* Pull out the block group from the reclaim_bgs list. 
*/ 1814 1781 trace_btrfs_add_unused_block_group(bg); ··· 1844 1805 1845 1806 static inline bool btrfs_should_reclaim(const struct btrfs_fs_info *fs_info) 1846 1807 { 1808 + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1809 + return false; 1810 + 1811 + if (btrfs_fs_closing(fs_info)) 1812 + return false; 1813 + 1847 1814 if (btrfs_is_zoned(fs_info)) 1848 1815 return btrfs_zoned_should_reclaim(fs_info); 1849 1816 return true; ··· 1884 1839 struct btrfs_space_info *space_info; 1885 1840 LIST_HEAD(retry_list); 1886 1841 1887 - if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) 1888 - return; 1889 - 1890 - if (btrfs_fs_closing(fs_info)) 1891 - return; 1892 - 1893 1842 if (!btrfs_should_reclaim(fs_info)) 1894 1843 return; 1895 1844 ··· 1911 1872 while (!list_empty(&fs_info->reclaim_bgs)) { 1912 1873 u64 used; 1913 1874 u64 reserved; 1875 + u64 old_total; 1914 1876 int ret = 0; 1915 1877 1916 1878 bg = list_first_entry(&fs_info->reclaim_bgs, ··· 1977 1937 } 1978 1938 1979 1939 spin_unlock(&bg->lock); 1940 + old_total = space_info->total_bytes; 1980 1941 spin_unlock(&space_info->lock); 1981 1942 1982 1943 /* ··· 2030 1989 reserved = 0; 2031 1990 spin_lock(&space_info->lock); 2032 1991 space_info->reclaim_errors++; 2033 - if (READ_ONCE(space_info->periodic_reclaim)) 2034 - space_info->periodic_reclaim_ready = false; 2035 1992 spin_unlock(&space_info->lock); 2036 1993 } 2037 1994 spin_lock(&space_info->lock); 2038 1995 space_info->reclaim_count++; 2039 1996 space_info->reclaim_bytes += used; 2040 1997 space_info->reclaim_bytes += reserved; 1998 + if (space_info->total_bytes < old_total) 1999 + btrfs_set_periodic_reclaim_ready(space_info, true); 2041 2000 spin_unlock(&space_info->lock); 2042 2001 2043 2002 next: ··· 2290 2249 2291 2250 while (nr--) { 2292 2251 u64 len = min_t(u64, stripe_len, 2293 - cache->start + cache->length - logical[nr]); 2252 + btrfs_block_group_end(cache) - logical[nr]); 2294 2253 2295 2254 cache->bytes_super += len; 2296 2255 ret = 
btrfs_set_extent_bit(&fs_info->excluded_extents, ··· 2307 2266 return 0; 2308 2267 } 2309 2268 2310 - static struct btrfs_block_group *btrfs_create_block_group_cache( 2269 + static struct btrfs_block_group *btrfs_create_block_group( 2311 2270 struct btrfs_fs_info *fs_info, u64 start) 2312 2271 { 2313 2272 struct btrfs_block_group *cache; ··· 2401 2360 } 2402 2361 2403 2362 static int read_one_block_group(struct btrfs_fs_info *info, 2404 - struct btrfs_block_group_item *bgi, 2363 + struct btrfs_block_group_item_v2 *bgi, 2405 2364 const struct btrfs_key *key, 2406 2365 int need_clear) 2407 2366 { ··· 2411 2370 2412 2371 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY); 2413 2372 2414 - cache = btrfs_create_block_group_cache(info, key->objectid); 2373 + cache = btrfs_create_block_group(info, key->objectid); 2415 2374 if (!cache) 2416 2375 return -ENOMEM; 2417 2376 2418 2377 cache->length = key->offset; 2419 - cache->used = btrfs_stack_block_group_used(bgi); 2420 - cache->commit_used = cache->used; 2421 - cache->flags = btrfs_stack_block_group_flags(bgi); 2422 - cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); 2378 + cache->used = btrfs_stack_block_group_v2_used(bgi); 2379 + cache->last_used = cache->used; 2380 + cache->flags = btrfs_stack_block_group_v2_flags(bgi); 2381 + cache->last_flags = cache->flags; 2382 + cache->global_root_id = btrfs_stack_block_group_v2_chunk_objectid(bgi); 2423 2383 cache->space_info = btrfs_find_space_info(info, cache->flags); 2384 + cache->remap_bytes = btrfs_stack_block_group_v2_remap_bytes(bgi); 2385 + cache->last_remap_bytes = cache->remap_bytes; 2386 + cache->identity_remap_count = btrfs_stack_block_group_v2_identity_remap_count(bgi); 2387 + cache->last_identity_remap_count = cache->identity_remap_count; 2424 2388 2425 2389 btrfs_set_free_space_tree_thresholds(cache); 2426 2390 ··· 2490 2444 } else if (cache->length == cache->used) { 2491 2445 cache->cached = BTRFS_CACHE_FINISHED; 2492 2446 
btrfs_free_excluded_extents(cache); 2493 - } else if (cache->used == 0) { 2447 + } else if (cache->used == 0 && cache->remap_bytes == 0) { 2494 2448 cache->cached = BTRFS_CACHE_FINISHED; 2495 2449 ret = btrfs_add_new_free_space(cache, cache->start, 2496 - cache->start + cache->length, NULL); 2450 + btrfs_block_group_end(cache), NULL); 2497 2451 btrfs_free_excluded_extents(cache); 2498 2452 if (ret) 2499 2453 goto error; ··· 2510 2464 2511 2465 set_avail_alloc_bits(info, cache->flags); 2512 2466 if (btrfs_chunk_writeable(info, cache->start)) { 2513 - if (cache->used == 0) { 2467 + if (cache->used == 0 && cache->remap_bytes == 0) { 2514 2468 ASSERT(list_empty(&cache->bg_list)); 2515 2469 if (btrfs_test_opt(info, DISCARD_ASYNC)) 2516 2470 btrfs_discard_queue_work(&info->discard_ctl, cache); ··· 2537 2491 struct btrfs_block_group *bg; 2538 2492 2539 2493 map = rb_entry(node, struct btrfs_chunk_map, rb_node); 2540 - bg = btrfs_create_block_group_cache(fs_info, map->start); 2494 + bg = btrfs_create_block_group(fs_info, map->start); 2541 2495 if (!bg) { 2542 2496 ret = -ENOMEM; 2543 2497 break; ··· 2614 2568 need_clear = 1; 2615 2569 2616 2570 while (1) { 2617 - struct btrfs_block_group_item bgi; 2571 + struct btrfs_block_group_item_v2 bgi; 2618 2572 struct extent_buffer *leaf; 2619 2573 int slot; 2574 + size_t size; 2620 2575 2621 2576 ret = find_first_block_group(info, path, &key); 2622 2577 if (ret > 0) ··· 2628 2581 leaf = path->nodes[0]; 2629 2582 slot = path->slots[0]; 2630 2583 2584 + if (btrfs_fs_incompat(info, REMAP_TREE)) { 2585 + size = sizeof(struct btrfs_block_group_item_v2); 2586 + } else { 2587 + size = sizeof(struct btrfs_block_group_item); 2588 + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, 0); 2589 + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 0); 2590 + } 2591 + 2631 2592 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot), 2632 - sizeof(bgi)); 2593 + size); 2633 2594 2634 2595 btrfs_item_key_to_cpu(leaf, &key, slot); 2635 
2596 btrfs_release_path(path); ··· 2707 2652 struct btrfs_block_group *block_group) 2708 2653 { 2709 2654 struct btrfs_fs_info *fs_info = trans->fs_info; 2710 - struct btrfs_block_group_item bgi; 2655 + struct btrfs_block_group_item_v2 bgi; 2711 2656 struct btrfs_root *root = btrfs_block_group_root(fs_info); 2712 2657 struct btrfs_key key; 2713 - u64 old_commit_used; 2658 + u64 old_last_used; 2659 + size_t size; 2714 2660 int ret; 2715 2661 2716 2662 spin_lock(&block_group->lock); 2717 - btrfs_set_stack_block_group_used(&bgi, block_group->used); 2718 - btrfs_set_stack_block_group_chunk_objectid(&bgi, 2719 - block_group->global_root_id); 2720 - btrfs_set_stack_block_group_flags(&bgi, block_group->flags); 2721 - old_commit_used = block_group->commit_used; 2722 - block_group->commit_used = block_group->used; 2663 + btrfs_set_stack_block_group_v2_used(&bgi, block_group->used); 2664 + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, block_group->global_root_id); 2665 + btrfs_set_stack_block_group_v2_flags(&bgi, block_group->flags); 2666 + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, block_group->remap_bytes); 2667 + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, block_group->identity_remap_count); 2668 + old_last_used = block_group->last_used; 2669 + block_group->last_used = block_group->used; 2670 + block_group->last_remap_bytes = block_group->remap_bytes; 2671 + block_group->last_identity_remap_count = block_group->identity_remap_count; 2672 + block_group->last_flags = block_group->flags; 2723 2673 key.objectid = block_group->start; 2724 2674 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 2725 2675 key.offset = block_group->length; 2726 2676 spin_unlock(&block_group->lock); 2727 2677 2728 - ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); 2678 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) 2679 + size = sizeof(struct btrfs_block_group_item_v2); 2680 + else 2681 + size = sizeof(struct btrfs_block_group_item); 2682 + 2683 + ret = 
btrfs_insert_item(trans, root, &key, &bgi, size); 2729 2684 if (ret < 0) { 2730 2685 spin_lock(&block_group->lock); 2731 - block_group->commit_used = old_commit_used; 2686 + block_group->last_used = old_last_used; 2732 2687 spin_unlock(&block_group->lock); 2733 2688 } 2734 2689 ··· 2951 2886 2952 2887 btrfs_set_log_full_commit(trans); 2953 2888 2954 - cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2889 + cache = btrfs_create_block_group(fs_info, chunk_offset); 2955 2890 if (!cache) 2956 2891 return ERR_PTR(-ENOMEM); 2957 2892 ··· 3155 3090 void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) 3156 3091 { 3157 3092 struct btrfs_space_info *sinfo = cache->space_info; 3158 - u64 num_bytes; 3159 3093 3160 3094 BUG_ON(!cache->ro); 3161 3095 ··· 3170 3106 btrfs_space_info_update_bytes_zone_unusable(sinfo, cache->zone_unusable); 3171 3107 sinfo->bytes_readonly -= cache->zone_unusable; 3172 3108 } 3173 - num_bytes = cache->length - cache->reserved - 3174 - cache->pinned - cache->bytes_super - 3175 - cache->zone_unusable - cache->used; 3176 - sinfo->bytes_readonly -= num_bytes; 3109 + sinfo->bytes_readonly -= btrfs_block_group_available_space(cache); 3177 3110 list_del_init(&cache->ro_list); 3178 3111 } 3179 3112 spin_unlock(&cache->lock); ··· 3186 3125 struct btrfs_root *root = btrfs_block_group_root(fs_info); 3187 3126 unsigned long bi; 3188 3127 struct extent_buffer *leaf; 3189 - struct btrfs_block_group_item bgi; 3128 + struct btrfs_block_group_item_v2 bgi; 3190 3129 struct btrfs_key key; 3191 - u64 old_commit_used; 3192 - u64 used; 3130 + u64 old_last_used, old_last_remap_bytes; 3131 + u32 old_last_identity_remap_count; 3132 + u64 used, remap_bytes; 3133 + u32 identity_remap_count; 3193 3134 3194 3135 /* 3195 3136 * Block group items update can be triggered out of commit transaction ··· 3200 3137 * may be changed. 
3201 3138 */ 3202 3139 spin_lock(&cache->lock); 3203 - old_commit_used = cache->commit_used; 3140 + old_last_used = cache->last_used; 3141 + old_last_remap_bytes = cache->last_remap_bytes; 3142 + old_last_identity_remap_count = cache->last_identity_remap_count; 3204 3143 used = cache->used; 3205 - /* No change in used bytes, can safely skip it. */ 3206 - if (cache->commit_used == used) { 3144 + remap_bytes = cache->remap_bytes; 3145 + identity_remap_count = cache->identity_remap_count; 3146 + /* No change in values, can safely skip it. */ 3147 + if (cache->last_used == used && 3148 + cache->last_remap_bytes == remap_bytes && 3149 + cache->last_identity_remap_count == identity_remap_count && 3150 + cache->last_flags == cache->flags) { 3207 3151 spin_unlock(&cache->lock); 3208 3152 return 0; 3209 3153 } 3210 - cache->commit_used = used; 3154 + cache->last_used = used; 3155 + cache->last_remap_bytes = remap_bytes; 3156 + cache->last_identity_remap_count = identity_remap_count; 3157 + cache->last_flags = cache->flags; 3211 3158 spin_unlock(&cache->lock); 3212 3159 3213 3160 key.objectid = cache->start; ··· 3233 3160 3234 3161 leaf = path->nodes[0]; 3235 3162 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3236 - btrfs_set_stack_block_group_used(&bgi, used); 3237 - btrfs_set_stack_block_group_chunk_objectid(&bgi, 3238 - cache->global_root_id); 3239 - btrfs_set_stack_block_group_flags(&bgi, cache->flags); 3240 - write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); 3163 + btrfs_set_stack_block_group_v2_used(&bgi, used); 3164 + btrfs_set_stack_block_group_v2_chunk_objectid(&bgi, cache->global_root_id); 3165 + btrfs_set_stack_block_group_v2_flags(&bgi, cache->flags); 3166 + 3167 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 3168 + btrfs_set_stack_block_group_v2_remap_bytes(&bgi, cache->remap_bytes); 3169 + btrfs_set_stack_block_group_v2_identity_remap_count(&bgi, 3170 + cache->identity_remap_count); 3171 + write_extent_buffer(leaf, &bgi, bi, 3172 + sizeof(struct 
btrfs_block_group_item_v2)); 3173 + } else { 3174 + write_extent_buffer(leaf, &bgi, bi, 3175 + sizeof(struct btrfs_block_group_item)); 3176 + } 3177 + 3241 3178 fail: 3242 3179 btrfs_release_path(path); 3243 3180 /* 3244 - * We didn't update the block group item, need to revert commit_used 3181 + * We didn't update the block group item, need to revert last_used 3245 3182 * unless the block group item didn't exist yet - this is to prevent a 3246 3183 * race with a concurrent insertion of the block group item, with 3247 3184 * insert_block_group_item(), that happened just after we attempted to 3248 - * update. In that case we would reset commit_used to 0 just after the 3185 + * update. In that case we would reset last_used to 0 just after the 3249 3186 * insertion set it to a value greater than 0 - if the block group later 3250 3187 * becomes with 0 used bytes, we would incorrectly skip its update. 3251 3188 */ 3252 3189 if (ret < 0 && ret != -ENOENT) { 3253 3190 spin_lock(&cache->lock); 3254 - cache->commit_used = old_commit_used; 3191 + cache->last_used = old_last_used; 3192 + cache->last_remap_bytes = old_last_remap_bytes; 3193 + cache->last_identity_remap_count = old_last_identity_remap_count; 3255 3194 spin_unlock(&cache->lock); 3256 3195 } 3257 3196 return ret; ··· 3786 3701 return -ENOENT; 3787 3702 3788 3703 /* An extent can not span multiple block groups. 
*/ 3789 - ASSERT(bytenr + num_bytes <= cache->start + cache->length); 3704 + ASSERT(bytenr + num_bytes <= btrfs_block_group_end(cache)); 3790 3705 3791 3706 space_info = cache->space_info; 3792 3707 factor = btrfs_bg_type_to_factor(cache->flags); ··· 4615 4530 list_del_init(&block_group->bg_list); 4616 4531 btrfs_put_block_group(block_group); 4617 4532 } 4533 + 4534 + while (!list_empty(&info->fully_remapped_bgs)) { 4535 + block_group = list_first_entry(&info->fully_remapped_bgs, 4536 + struct btrfs_block_group, bg_list); 4537 + list_del_init(&block_group->bg_list); 4538 + btrfs_put_block_group(block_group); 4539 + } 4618 4540 spin_unlock(&info->unused_bgs_lock); 4619 4541 4620 4542 spin_lock(&info->zone_active_bgs_lock); ··· 4772 4680 enum btrfs_block_group_size_class size_class, 4773 4681 bool force_wrong_size_class) 4774 4682 { 4683 + lockdep_assert_held(&bg->lock); 4775 4684 ASSERT(size_class != BTRFS_BG_SZ_NONE); 4776 4685 4777 4686 /* The new allocation is in the right size class, do nothing */ ··· 4809 4716 if (!btrfs_is_block_group_data_only(bg)) 4810 4717 return false; 4811 4718 return true; 4719 + } 4720 + 4721 + void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, 4722 + struct btrfs_trans_handle *trans) 4723 + { 4724 + struct btrfs_fs_info *fs_info = trans->fs_info; 4725 + 4726 + 4727 + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 4728 + spin_lock(&bg->lock); 4729 + set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); 4730 + spin_unlock(&bg->lock); 4731 + 4732 + btrfs_discard_queue_work(&fs_info->discard_ctl, bg); 4733 + } else { 4734 + spin_lock(&fs_info->unused_bgs_lock); 4735 + /* 4736 + * The block group might already be on the unused_bgs list, 4737 + * remove it if it is. It'll get readded after 4738 + * btrfs_handle_fully_remapped_bgs() finishes. 
4739 + */ 4740 + if (!list_empty(&bg->bg_list)) 4741 + list_del(&bg->bg_list); 4742 + else 4743 + btrfs_get_block_group(bg); 4744 + 4745 + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); 4746 + spin_unlock(&fs_info->unused_bgs_lock); 4747 + } 4748 + } 4749 + 4750 + /* 4751 + * Compare the block group and chunk trees, and find any fully-remapped block 4752 + * groups which haven't yet had their chunk stripes and device extents removed, 4753 + * and put them on the fully_remapped_bgs list so this gets done. 4754 + * 4755 + * This happens when a block group becomes fully remapped, i.e. its last 4756 + * identity mapping is removed, and the volume is unmounted before async 4757 + * discard has finished. It's important this gets done as until it is the 4758 + * chunk's stripes are dead space. 4759 + */ 4760 + int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info) 4761 + { 4762 + struct rb_node *node_bg, *node_chunk; 4763 + 4764 + node_bg = rb_first_cached(&fs_info->block_group_cache_tree); 4765 + node_chunk = rb_first_cached(&fs_info->mapping_tree); 4766 + 4767 + while (node_bg && node_chunk) { 4768 + struct btrfs_block_group *bg; 4769 + struct btrfs_chunk_map *map; 4770 + 4771 + bg = rb_entry(node_bg, struct btrfs_block_group, cache_node); 4772 + map = rb_entry(node_chunk, struct btrfs_chunk_map, rb_node); 4773 + 4774 + ASSERT(bg->start == map->start); 4775 + 4776 + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) 4777 + goto next; 4778 + 4779 + if (bg->identity_remap_count != 0) 4780 + goto next; 4781 + 4782 + if (map->num_stripes == 0) 4783 + goto next; 4784 + 4785 + spin_lock(&fs_info->unused_bgs_lock); 4786 + 4787 + if (list_empty(&bg->bg_list)) { 4788 + btrfs_get_block_group(bg); 4789 + list_add_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); 4790 + } else { 4791 + list_move_tail(&bg->bg_list, &fs_info->fully_remapped_bgs); 4792 + } 4793 + 4794 + spin_unlock(&fs_info->unused_bgs_lock); 4795 + 4796 + /* 4797 + * Ideally we'd want to 
call btrfs_discard_queue_work() here, 4798 + * but it'd do nothing as the discard worker hasn't been 4799 + * started yet. 4800 + * 4801 + * The block group will get added to the discard list when 4802 + * btrfs_handle_fully_remapped_bgs() gets called, when we 4803 + * commit the first transaction. 4804 + */ 4805 + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { 4806 + spin_lock(&bg->lock); 4807 + set_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags); 4808 + spin_unlock(&bg->lock); 4809 + } 4810 + 4811 + next: 4812 + node_bg = rb_next(node_bg); 4813 + node_chunk = rb_next(node_chunk); 4814 + } 4815 + 4816 + ASSERT(!node_bg && !node_chunk); 4817 + 4818 + return 0; 4812 4819 }

+28 -3

fs/btrfs/block-group.h

··· 49 49 BTRFS_DISCARD_EXTENTS, 50 50 BTRFS_DISCARD_BITMAPS, 51 51 BTRFS_DISCARD_RESET_CURSOR, 52 + BTRFS_DISCARD_FULLY_REMAPPED, 52 53 }; 53 54 54 55 /* ··· 93 92 * transaction. 94 93 */ 95 94 BLOCK_GROUP_FLAG_NEW, 95 + BLOCK_GROUP_FLAG_FULLY_REMAPPED, 96 + BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, 96 97 }; 97 98 98 99 enum btrfs_caching_type { ··· 132 129 u64 flags; 133 130 u64 cache_generation; 134 131 u64 global_root_id; 132 + u64 remap_bytes; 133 + u32 identity_remap_count; 135 134 136 135 /* 137 136 * The last committed used bytes of this block group, if the above @used 138 - * is still the same as @commit_used, we don't need to update block 137 + * is still the same as @last_used, we don't need to update block 139 138 * group item of this block group. 140 139 */ 141 - u64 commit_used; 140 + u64 last_used; 141 + /* The last committed remap_bytes value of this block group. */ 142 + u64 last_remap_bytes; 143 + /* The last commited identity_remap_count value of this block group. */ 144 + u32 last_identity_remap_count; 145 + /* The last committed flags value for this block group. */ 146 + u64 last_flags; 147 + 142 148 /* 143 149 * If the free space extent count exceeds this number, convert the block 144 150 * group to bitmaps. 
··· 294 282 { 295 283 lockdep_assert_held(&bg->lock); 296 284 297 - return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); 285 + return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0 || 286 + bg->remap_bytes > 0); 298 287 } 299 288 300 289 static inline bool btrfs_is_block_group_data_only(const struct btrfs_block_group *block_group) ··· 306 293 */ 307 294 return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && 308 295 !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); 296 + } 297 + 298 + static inline u64 btrfs_block_group_available_space(const struct btrfs_block_group *bg) 299 + { 300 + lockdep_assert_held(&bg->lock); 301 + 302 + return (bg->length - bg->used - bg->pinned - bg->reserved - 303 + bg->bytes_super - bg->zone_unusable); 309 304 } 310 305 311 306 #ifdef CONFIG_BTRFS_DEBUG ··· 345 324 struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( 346 325 struct btrfs_fs_info *fs_info, 347 326 const u64 chunk_offset); 327 + void btrfs_remove_bg_from_sinfo(struct btrfs_block_group *bg); 348 328 int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 349 329 struct btrfs_chunk_map *map); 350 330 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); ··· 417 395 enum btrfs_block_group_size_class size_class, 418 396 bool force_wrong_size_class); 419 397 bool btrfs_block_group_should_use_size_class(const struct btrfs_block_group *bg); 398 + void btrfs_mark_bg_fully_remapped(struct btrfs_block_group *bg, 399 + struct btrfs_trans_handle *trans); 400 + int btrfs_populate_fully_remapped_bgs_list(struct btrfs_fs_info *fs_info); 420 401 421 402 #endif /* BTRFS_BLOCK_GROUP_H */

+8

fs/btrfs/block-rsv.c

··· 419 419 case BTRFS_TREE_LOG_OBJECTID: 420 420 root->block_rsv = &fs_info->treelog_rsv; 421 421 break; 422 + case BTRFS_REMAP_TREE_OBJECTID: 423 + root->block_rsv = &fs_info->remap_block_rsv; 424 + break; 422 425 default: 423 426 root->block_rsv = NULL; 424 427 break; ··· 434 431 435 432 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 436 433 fs_info->chunk_block_rsv.space_info = space_info; 434 + 435 + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA_REMAP); 436 + fs_info->remap_block_rsv.space_info = space_info; 437 437 438 438 space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 439 439 fs_info->global_block_rsv.space_info = space_info; ··· 464 458 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 465 459 WARN_ON(fs_info->chunk_block_rsv.size > 0); 466 460 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 461 + WARN_ON(fs_info->remap_block_rsv.size > 0); 462 + WARN_ON(fs_info->remap_block_rsv.reserved > 0); 467 463 WARN_ON(fs_info->delayed_block_rsv.size > 0); 468 464 WARN_ON(fs_info->delayed_block_rsv.reserved > 0); 469 465 WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);

+1

fs/btrfs/block-rsv.h

··· 22 22 BTRFS_BLOCK_RSV_DELALLOC, 23 23 BTRFS_BLOCK_RSV_TRANS, 24 24 BTRFS_BLOCK_RSV_CHUNK, 25 + BTRFS_BLOCK_RSV_REMAP, 25 26 BTRFS_BLOCK_RSV_DELOPS, 26 27 BTRFS_BLOCK_RSV_DELREFS, 27 28 BTRFS_BLOCK_RSV_TREELOG,

+101 -116

fs/btrfs/compression.c

··· 21 21 #include <linux/sched/mm.h> 22 22 #include <linux/log2.h> 23 23 #include <linux/shrinker.h> 24 - #include <crypto/hash.h> 25 24 #include "misc.h" 26 25 #include "ctree.h" 27 26 #include "fs.h" ··· 86 87 return false; 87 88 } 88 89 89 - static int compression_compress_pages(int type, struct list_head *ws, 90 - struct btrfs_inode *inode, u64 start, 91 - struct folio **folios, unsigned long *out_folios, 92 - unsigned long *total_in, unsigned long *total_out) 93 - { 94 - switch (type) { 95 - case BTRFS_COMPRESS_ZLIB: 96 - return zlib_compress_folios(ws, inode, start, folios, 97 - out_folios, total_in, total_out); 98 - case BTRFS_COMPRESS_LZO: 99 - return lzo_compress_folios(ws, inode, start, folios, 100 - out_folios, total_in, total_out); 101 - case BTRFS_COMPRESS_ZSTD: 102 - return zstd_compress_folios(ws, inode, start, folios, 103 - out_folios, total_in, total_out); 104 - case BTRFS_COMPRESS_NONE: 105 - default: 106 - /* 107 - * This can happen when compression races with remount setting 108 - * it to 'no compress', while caller doesn't call 109 - * inode_need_compress() to check if we really need to 110 - * compress. 111 - * 112 - * Not a big deal, just need to inform caller that we 113 - * haven't allocated any pages yet. 
114 - */ 115 - *out_folios = 0; 116 - return -E2BIG; 117 - } 118 - } 119 - 120 90 static int compression_decompress_bio(struct list_head *ws, 121 91 struct compressed_bio *cb) 122 92 { ··· 122 154 */ 123 155 BUG(); 124 156 } 125 - } 126 - 127 - static void btrfs_free_compressed_folios(struct compressed_bio *cb) 128 - { 129 - for (unsigned int i = 0; i < cb->nr_folios; i++) 130 - btrfs_free_compr_folio(cb->compressed_folios[i]); 131 - kfree(cb->compressed_folios); 132 157 } 133 158 134 159 static int btrfs_decompress_bio(struct compressed_bio *cb); ··· 232 271 { 233 272 struct compressed_bio *cb = to_compressed_bio(bbio); 234 273 blk_status_t status = bbio->bio.bi_status; 274 + struct folio_iter fi; 235 275 236 276 if (!status) 237 277 status = errno_to_blk_status(btrfs_decompress_bio(cb)); 238 278 239 - btrfs_free_compressed_folios(cb); 240 279 btrfs_bio_end_io(cb->orig_bbio, status); 280 + bio_for_each_folio_all(fi, &bbio->bio) 281 + btrfs_free_compr_folio(fi.folio); 241 282 bio_put(&bbio->bio); 242 283 } 243 284 ··· 290 327 static void end_bbio_compressed_write(struct btrfs_bio *bbio) 291 328 { 292 329 struct compressed_bio *cb = to_compressed_bio(bbio); 330 + struct folio_iter fi; 293 331 294 332 btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, 295 333 cb->bbio.bio.bi_status == BLK_STS_OK); ··· 298 334 if (cb->writeback) 299 335 end_compressed_writeback(cb); 300 336 /* Note, our inode could be gone now. 
*/ 301 - btrfs_free_compressed_folios(cb); 337 + bio_for_each_folio_all(fi, &bbio->bio) 338 + btrfs_free_compr_folio(fi.folio); 302 339 bio_put(&cb->bbio.bio); 303 - } 304 - 305 - static void btrfs_add_compressed_bio_folios(struct compressed_bio *cb) 306 - { 307 - struct bio *bio = &cb->bbio.bio; 308 - u32 offset = 0; 309 - unsigned int findex = 0; 310 - 311 - while (offset < cb->compressed_len) { 312 - struct folio *folio = cb->compressed_folios[findex]; 313 - u32 len = min_t(u32, cb->compressed_len - offset, folio_size(folio)); 314 - int ret; 315 - 316 - /* Maximum compressed extent is smaller than bio size limit. */ 317 - ret = bio_add_folio(bio, folio, len, 0); 318 - ASSERT(ret); 319 - offset += len; 320 - findex++; 321 - } 322 340 } 323 341 324 342 /* ··· 313 367 * the end io hooks. 314 368 */ 315 369 void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, 316 - struct folio **compressed_folios, 317 - unsigned int nr_folios, 318 - blk_opf_t write_flags, 319 - bool writeback) 370 + struct compressed_bio *cb) 320 371 { 321 372 struct btrfs_inode *inode = ordered->inode; 322 373 struct btrfs_fs_info *fs_info = inode->root->fs_info; 323 - struct compressed_bio *cb; 324 374 325 375 ASSERT(IS_ALIGNED(ordered->file_offset, fs_info->sectorsize)); 326 376 ASSERT(IS_ALIGNED(ordered->num_bytes, fs_info->sectorsize)); 377 + ASSERT(cb->writeback); 327 378 328 - cb = alloc_compressed_bio(inode, ordered->file_offset, 329 - REQ_OP_WRITE | write_flags, 330 - end_bbio_compressed_write); 331 379 cb->start = ordered->file_offset; 332 380 cb->len = ordered->num_bytes; 333 - cb->compressed_folios = compressed_folios; 334 381 cb->compressed_len = ordered->disk_num_bytes; 335 - cb->writeback = writeback; 336 - cb->nr_folios = nr_folios; 337 382 cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; 338 383 cb->bbio.ordered = ordered; 339 - btrfs_add_compressed_bio_folios(cb); 340 384 341 385 btrfs_submit_bbio(&cb->bbio, 0); 386 + } 387 + 388 + /* 389 
+ * Allocate a compressed write bio for @inode file offset @start length @len. 390 + * 391 + * The caller still needs to properly queue all folios and populate involved 392 + * members. 393 + */ 394 + struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, 395 + u64 start, u64 len) 396 + { 397 + struct compressed_bio *cb; 398 + 399 + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE, end_bbio_compressed_write); 400 + cb->start = start; 401 + cb->len = len; 402 + cb->writeback = true; 403 + 404 + return cb; 342 405 } 343 406 344 407 /* ··· 475 520 folio_put(folio); 476 521 break; 477 522 } 478 - add_size = min(em->start + em->len, page_end + 1) - cur; 523 + add_size = min(btrfs_extent_map_end(em), page_end + 1) - cur; 479 524 btrfs_free_extent_map(em); 480 525 btrfs_unlock_extent(tree, cur, page_end, NULL); 481 526 ··· 526 571 struct extent_map_tree *em_tree = &inode->extent_tree; 527 572 struct compressed_bio *cb; 528 573 unsigned int compressed_len; 574 + const u32 min_folio_size = btrfs_min_folio_size(fs_info); 529 575 u64 file_offset = bbio->file_offset; 530 576 u64 em_len; 531 577 u64 em_start; 532 578 struct extent_map *em; 533 579 unsigned long pflags; 534 580 int memstall = 0; 535 - blk_status_t status; 536 581 int ret; 537 582 538 583 /* we need the actual starting offset of this extent in the file */ ··· 540 585 em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); 541 586 read_unlock(&em_tree->lock); 542 587 if (!em) { 543 - status = BLK_STS_IOERR; 588 + ret = -EIO; 544 589 goto out; 545 590 } 546 591 ··· 562 607 563 608 btrfs_free_extent_map(em); 564 609 565 - cb->nr_folios = DIV_ROUND_UP(compressed_len, btrfs_min_folio_size(fs_info)); 566 - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); 567 - if (!cb->compressed_folios) { 568 - status = BLK_STS_RESOURCE; 569 - goto out_free_bio; 570 - } 610 + for (int i = 0; i * min_folio_size < compressed_len; i++) { 611 + struct folio 
*folio; 612 + u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); 571 613 572 - ret = btrfs_alloc_folio_array(cb->nr_folios, fs_info->block_min_order, 573 - cb->compressed_folios); 574 - if (ret) { 575 - status = BLK_STS_RESOURCE; 576 - goto out_free_compressed_pages; 614 + folio = btrfs_alloc_compr_folio(fs_info); 615 + if (!folio) { 616 + ret = -ENOMEM; 617 + goto out_free_bio; 618 + } 619 + 620 + ret = bio_add_folio(&cb->bbio.bio, folio, cur_len, 0); 621 + if (unlikely(!ret)) { 622 + folio_put(folio); 623 + ret = -EINVAL; 624 + goto out_free_bio; 625 + } 577 626 } 627 + ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); 578 628 579 629 add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, 580 630 &pflags); 581 631 582 - /* include any pages we added in add_ra-bio_pages */ 583 632 cb->len = bbio->bio.bi_iter.bi_size; 584 633 cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; 585 - btrfs_add_compressed_bio_folios(cb); 586 634 587 635 if (memstall) 588 636 psi_memstall_leave(&pflags); ··· 593 635 btrfs_submit_bbio(&cb->bbio, 0); 594 636 return; 595 637 596 - out_free_compressed_pages: 597 - kfree(cb->compressed_folios); 598 638 out_free_bio: 599 - bio_put(&cb->bbio.bio); 639 + cleanup_compressed_bio(cb); 600 640 out: 601 - btrfs_bio_end_io(bbio, status); 641 + btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); 602 642 } 603 643 604 644 /* ··· 983 1027 } 984 1028 985 1029 /* 986 - * Given an address space and start and length, compress the bytes into @pages 987 - * that are allocated on demand. 1030 + * Given an address space and start and length, compress the page cache 1031 + * contents into @cb. 
988 1032 * 989 - * @type_level is encoded algorithm and level, where level 0 means whatever 990 - * default the algorithm chooses and is opaque here; 991 - * - compression algo are 0-3 992 - * - the level are bits 4-7 1033 + * @type_level: is encoded algorithm and level, where level 0 means whatever 1034 + * default the algorithm chooses and is opaque here; 1035 + * - compression algo are 0-3 1036 + * - the level are bits 4-7 993 1037 * 994 - * @out_folios is an in/out parameter, holds maximum number of folios to allocate 995 - * and returns number of actually allocated folios 1038 + * @cb->bbio.bio.bi_iter.bi_size will indicate the compressed data size. 1039 + * The bi_size may not be sectorsize aligned, thus the caller still need 1040 + * to do the round up before submission. 996 1041 * 997 - * @total_in is used to return the number of bytes actually read. It 998 - * may be smaller than the input length if we had to exit early because we 999 - * ran out of room in the folios array or because we cross the 1000 - * max_out threshold. 1001 - * 1002 - * @total_out is an in/out parameter, must be set to the input length and will 1003 - * be also used to return the total number of compressed bytes 1042 + * This function will allocate compressed folios with btrfs_alloc_compr_folio(), 1043 + * thus callers must make sure the endio function and error handling are using 1044 + * btrfs_free_compr_folio() to release those folios. 1045 + * This is already done in end_bbio_compressed_write() and cleanup_compressed_bio(). 
1004 1046 */ 1005 - int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, 1006 - u64 start, struct folio **folios, unsigned long *out_folios, 1007 - unsigned long *total_in, unsigned long *total_out) 1047 + struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, 1048 + u64 start, u32 len, unsigned int type, 1049 + int level, blk_opf_t write_flags) 1008 1050 { 1009 1051 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1010 - const unsigned long orig_len = *total_out; 1011 1052 struct list_head *workspace; 1053 + struct compressed_bio *cb; 1012 1054 int ret; 1055 + 1056 + cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, 1057 + end_bbio_compressed_write); 1058 + cb->start = start; 1059 + cb->len = len; 1060 + cb->writeback = true; 1061 + cb->compress_type = type; 1013 1062 1014 1063 level = btrfs_compress_set_level(type, level); 1015 1064 workspace = get_workspace(fs_info, type, level); 1016 - ret = compression_compress_pages(type, workspace, inode, start, folios, 1017 - out_folios, total_in, total_out); 1018 - /* The total read-in bytes should be no larger than the input. */ 1019 - ASSERT(*total_in <= orig_len); 1065 + switch (type) { 1066 + case BTRFS_COMPRESS_ZLIB: 1067 + ret = zlib_compress_bio(workspace, cb); 1068 + break; 1069 + case BTRFS_COMPRESS_LZO: 1070 + ret = lzo_compress_bio(workspace, cb); 1071 + break; 1072 + case BTRFS_COMPRESS_ZSTD: 1073 + ret = zstd_compress_bio(workspace, cb); 1074 + break; 1075 + case BTRFS_COMPRESS_NONE: 1076 + default: 1077 + /* 1078 + * This can happen when compression races with remount setting 1079 + * it to 'no compress', while caller doesn't call 1080 + * inode_need_compress() to check if we really need to 1081 + * compress. 1082 + * 1083 + * Not a big deal, just need to inform caller that we 1084 + * haven't allocated any pages yet. 
1085 + */ 1086 + ret = -E2BIG; 1087 + } 1088 + 1020 1089 put_workspace(fs_info, type, workspace); 1021 - return ret; 1090 + if (ret < 0) { 1091 + cleanup_compressed_bio(cb); 1092 + return ERR_PTR(ret); 1093 + } 1094 + return cb; 1022 1095 } 1023 1096 1024 1097 static int btrfs_decompress_bio(struct compressed_bio *cb)

+19 -21

fs/btrfs/compression.h

··· 42 42 #define BTRFS_ZLIB_DEFAULT_LEVEL 3 43 43 44 44 struct compressed_bio { 45 - /* Number of compressed folios in the array. */ 46 - unsigned int nr_folios; 47 - 48 - /* The folios with the compressed data on them. */ 49 - struct folio **compressed_folios; 50 - 51 45 /* starting offset in the inode for our pages */ 52 46 u64 start; 53 47 ··· 85 91 void __cold btrfs_exit_compress(void); 86 92 87 93 bool btrfs_compress_level_valid(unsigned int type, int level); 88 - int btrfs_compress_folios(unsigned int type, int level, struct btrfs_inode *inode, 89 - u64 start, struct folio **folios, unsigned long *out_folios, 90 - unsigned long *total_in, unsigned long *total_out); 91 94 int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, 92 95 unsigned long dest_pgoff, size_t srclen, size_t destlen); 93 96 int btrfs_decompress_buf2page(const char *buf, u32 buf_len, 94 97 struct compressed_bio *cb, u32 decompressed); 95 98 99 + struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, 100 + u64 start, u64 len); 96 101 void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, 97 - struct folio **compressed_folios, 98 - unsigned int nr_folios, blk_opf_t write_flags, 99 - bool writeback); 102 + struct compressed_bio *cb); 100 103 void btrfs_submit_compressed_read(struct btrfs_bio *bbio); 101 104 102 105 int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); ··· 137 146 138 147 int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, 139 148 struct folio **in_folio_ret); 149 + struct compressed_bio *btrfs_compress_bio(struct btrfs_inode *inode, 150 + u64 start, u32 len, unsigned int type, 151 + int level, blk_opf_t write_flags); 140 152 141 - int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 142 - u64 start, struct folio **folios, unsigned long *out_folios, 143 - unsigned long *total_in, unsigned long *total_out); 153 + static inline void 
cleanup_compressed_bio(struct compressed_bio *cb) 154 + { 155 + struct bio *bio = &cb->bbio.bio; 156 + struct folio_iter fi; 157 + 158 + bio_for_each_folio_all(fi, bio) 159 + btrfs_free_compr_folio(fi.folio); 160 + bio_put(bio); 161 + } 162 + 163 + int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb); 144 164 int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); 145 165 int zlib_decompress(struct list_head *ws, const u8 *data_in, 146 166 struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, ··· 160 158 void zlib_free_workspace(struct list_head *ws); 161 159 struct list_head *zlib_get_workspace(struct btrfs_fs_info *fs_info, unsigned int level); 162 160 163 - int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 164 - u64 start, struct folio **folios, unsigned long *out_folios, 165 - unsigned long *total_in, unsigned long *total_out); 161 + int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb); 166 162 int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); 167 163 int lzo_decompress(struct list_head *ws, const u8 *data_in, 168 164 struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen, ··· 168 168 struct list_head *lzo_alloc_workspace(struct btrfs_fs_info *fs_info); 169 169 void lzo_free_workspace(struct list_head *ws); 170 170 171 - int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 172 - u64 start, struct folio **folios, unsigned long *out_folios, 173 - unsigned long *total_in, unsigned long *total_out); 171 + int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb); 174 172 int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); 175 173 int zstd_decompress(struct list_head *ws, const u8 *data_in, 176 174 struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,

+17 -32

fs/btrfs/ctree.c

··· 249 249 int ret = 0; 250 250 int level; 251 251 struct btrfs_disk_key disk_key; 252 + const bool is_reloc_root = (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID); 252 253 u64 reloc_src_root = 0; 253 254 254 255 WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && ··· 263 262 else 264 263 btrfs_node_key(buf, &disk_key, 0); 265 264 266 - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 265 + if (is_reloc_root) 267 266 reloc_src_root = btrfs_header_owner(buf); 268 267 cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, 269 268 &disk_key, level, buf->start, 0, ··· 277 276 btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); 278 277 btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN | 279 278 BTRFS_HEADER_FLAG_RELOC); 280 - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) 279 + if (is_reloc_root) 281 280 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC); 282 281 else 283 282 btrfs_set_header_owner(cow, new_root_objectid); ··· 292 291 return ret; 293 292 } 294 293 295 - if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 296 - ret = btrfs_inc_ref(trans, root, cow, 1); 297 - if (unlikely(ret)) 298 - btrfs_abort_transaction(trans, ret); 299 - } else { 300 - ret = btrfs_inc_ref(trans, root, cow, 0); 301 - if (unlikely(ret)) 302 - btrfs_abort_transaction(trans, ret); 303 - } 304 - if (ret) { 294 + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 295 + if (unlikely(ret)) { 296 + btrfs_abort_transaction(trans, ret); 305 297 btrfs_tree_unlock(cow); 306 298 free_extent_buffer(cow); 307 299 return ret; ··· 356 362 u64 owner; 357 363 u64 flags; 358 364 int ret; 365 + const bool is_reloc_root = (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID); 359 366 360 367 /* 361 368 * Backrefs update rules: ··· 392 397 } 393 398 } else { 394 399 refs = 1; 395 - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID || 396 - btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV) 400 + if (is_reloc_root || btrfs_header_backref_rev(buf) < 
BTRFS_MIXED_BACKREF_REV) 397 401 flags = BTRFS_BLOCK_FLAG_FULL_BACKREF; 398 402 else 399 403 flags = 0; ··· 411 417 } 412 418 413 419 if (refs > 1) { 414 - if ((owner == btrfs_root_id(root) || 415 - btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) && 420 + if ((owner == btrfs_root_id(root) || is_reloc_root) && 416 421 !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { 417 - ret = btrfs_inc_ref(trans, root, buf, 1); 422 + ret = btrfs_inc_ref(trans, root, buf, true); 418 423 if (ret) 419 424 return ret; 420 425 421 - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) { 422 - ret = btrfs_dec_ref(trans, root, buf, 0); 426 + if (is_reloc_root) { 427 + ret = btrfs_dec_ref(trans, root, buf, false); 423 428 if (ret) 424 429 return ret; 425 - ret = btrfs_inc_ref(trans, root, cow, 1); 430 + ret = btrfs_inc_ref(trans, root, cow, true); 426 431 if (ret) 427 432 return ret; 428 433 } ··· 430 437 if (ret) 431 438 return ret; 432 439 } else { 433 - 434 - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 435 - ret = btrfs_inc_ref(trans, root, cow, 1); 436 - else 437 - ret = btrfs_inc_ref(trans, root, cow, 0); 440 + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 438 441 if (ret) 439 442 return ret; 440 443 } 441 444 } else { 442 445 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 443 - if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) 444 - ret = btrfs_inc_ref(trans, root, cow, 1); 445 - else 446 - ret = btrfs_inc_ref(trans, root, cow, 0); 446 + ret = btrfs_inc_ref(trans, root, cow, is_reloc_root); 447 447 if (ret) 448 448 return ret; 449 - ret = btrfs_dec_ref(trans, root, buf, 1); 449 + ret = btrfs_dec_ref(trans, root, buf, true); 450 450 if (ret) 451 451 return ret; 452 452 } ··· 4002 4016 if (ret) 4003 4017 return ret; 4004 4018 4005 - ret = split_item(trans, path, new_key, split_offset); 4006 - return ret; 4019 + return split_item(trans, path, new_key, split_offset); 4007 4020 } 4008 4021 4009 4022 /*

+9

fs/btrfs/ctree.h

··· 86 86 struct btrfs_path *path_name __free(btrfs_free_path) = NULL 87 87 88 88 /* 89 + * This defines an on-stack path that will be auto released when exiting the scope. 90 + * 91 + * It is compatible with any existing manual btrfs_release_path() calls. 92 + */ 93 + #define BTRFS_PATH_AUTO_RELEASE(path_name) \ 94 + struct btrfs_path path_name __free(btrfs_release_path) = { 0 } 95 + 96 + /* 89 97 * The state of btrfs root 90 98 */ 91 99 enum { ··· 609 601 struct btrfs_path *btrfs_alloc_path(void); 610 602 void btrfs_free_path(struct btrfs_path *p); 611 603 DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T)) 604 + DEFINE_FREE(btrfs_release_path, struct btrfs_path, btrfs_release_path(&_T)) 612 605 613 606 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 614 607 struct btrfs_path *path, int slot, int nr);

+4 -6

fs/btrfs/defrag.c

··· 609 609 { 610 610 struct btrfs_root *root = inode->root; 611 611 struct btrfs_file_extent_item *fi; 612 - struct btrfs_path path = { 0 }; 612 + BTRFS_PATH_AUTO_RELEASE(path); 613 613 struct extent_map *em; 614 614 struct btrfs_key key; 615 615 u64 ino = btrfs_ino(inode); ··· 720 720 if (ret > 0) 721 721 goto not_found; 722 722 } 723 - btrfs_release_path(&path); 724 723 return em; 725 724 726 725 not_found: 727 - btrfs_release_path(&path); 728 726 btrfs_free_extent_map(em); 729 727 return NULL; 730 728 731 729 err: 732 - btrfs_release_path(&path); 733 730 btrfs_free_extent_map(em); 734 731 return ERR_PTR(ret); 735 732 } ··· 792 795 { 793 796 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); 794 797 struct extent_map *next; 798 + const u64 em_end = btrfs_extent_map_end(em); 795 799 bool ret = false; 796 800 797 801 /* This is the last extent */ 798 - if (em->start + em->len >= i_size_read(inode)) 802 + if (em_end >= i_size_read(inode)) 799 803 return false; 800 804 801 805 /* ··· 805 807 * one will not be a target. 806 808 * This will just cause extra IO without really reducing the fragments. 807 809 */ 808 - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); 810 + next = defrag_lookup_extent(inode, em_end, newer_than, locked); 809 811 /* No more em or hole */ 810 812 if (!next || next->disk_bytenr >= EXTENT_MAP_LAST_BYTE) 811 813 goto out;

+18 -35

fs/btrfs/delayed-inode.c

··· 232 232 } 233 233 234 234 static struct btrfs_delayed_node *btrfs_first_delayed_node( 235 - struct btrfs_delayed_root *delayed_root, 235 + struct btrfs_fs_info *fs_info, 236 236 struct btrfs_ref_tracker *tracker) 237 237 { 238 238 struct btrfs_delayed_node *node; 239 239 240 - spin_lock(&delayed_root->lock); 241 - node = list_first_entry_or_null(&delayed_root->node_list, 240 + spin_lock(&fs_info->delayed_root.lock); 241 + node = list_first_entry_or_null(&fs_info->delayed_root.node_list, 242 242 struct btrfs_delayed_node, n_list); 243 243 if (node) { 244 244 refcount_inc(&node->refs); 245 245 btrfs_delayed_node_ref_tracker_alloc(node, tracker, GFP_ATOMIC); 246 246 } 247 - spin_unlock(&delayed_root->lock); 247 + spin_unlock(&fs_info->delayed_root.lock); 248 248 249 249 return node; 250 250 } ··· 257 257 struct list_head *p; 258 258 struct btrfs_delayed_node *next = NULL; 259 259 260 - delayed_root = node->root->fs_info->delayed_root; 260 + delayed_root = &node->root->fs_info->delayed_root; 261 261 spin_lock(&delayed_root->lock); 262 262 if (!test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { 263 263 /* not in the list */ ··· 287 287 if (!delayed_node) 288 288 return; 289 289 290 - delayed_root = delayed_node->root->fs_info->delayed_root; 290 + delayed_root = &delayed_node->root->fs_info->delayed_root; 291 291 292 292 mutex_lock(&delayed_node->mutex); 293 293 if (delayed_node->count) ··· 425 425 delayed_node->index_cnt = ins->index + 1; 426 426 427 427 delayed_node->count++; 428 - atomic_inc(&delayed_node->root->fs_info->delayed_root->items); 428 + atomic_inc(&delayed_node->root->fs_info->delayed_root.items); 429 429 return 0; 430 430 } 431 431 ··· 443 443 { 444 444 struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; 445 445 struct rb_root_cached *root; 446 - struct btrfs_delayed_root *delayed_root; 447 446 448 447 /* Not inserted, ignore it. 
*/ 449 448 if (RB_EMPTY_NODE(&delayed_item->rb_node)) ··· 450 451 451 452 /* If it's in a rbtree, then we need to have delayed node locked. */ 452 453 lockdep_assert_held(&delayed_node->mutex); 453 - 454 - delayed_root = delayed_node->root->fs_info->delayed_root; 455 454 456 455 if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) 457 456 root = &delayed_node->ins_root; ··· 459 462 rb_erase_cached(&delayed_item->rb_node, root); 460 463 RB_CLEAR_NODE(&delayed_item->rb_node); 461 464 delayed_node->count--; 462 - 463 - finish_one_item(delayed_root); 465 + finish_one_item(&delayed_node->root->fs_info->delayed_root); 464 466 } 465 467 466 468 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) ··· 976 980 977 981 static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node) 978 982 { 979 - struct btrfs_delayed_root *delayed_root; 980 - 981 983 if (delayed_node && 982 984 test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { 983 985 ASSERT(delayed_node->root); 984 986 clear_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); 985 987 delayed_node->count--; 986 - 987 - delayed_root = delayed_node->root->fs_info->delayed_root; 988 - finish_one_item(delayed_root); 988 + finish_one_item(&delayed_node->root->fs_info->delayed_root); 989 989 } 990 990 } 991 991 992 992 static void btrfs_release_delayed_iref(struct btrfs_delayed_node *delayed_node) 993 993 { 994 - 995 994 if (test_and_clear_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { 996 - struct btrfs_delayed_root *delayed_root; 997 - 998 995 ASSERT(delayed_node->root); 999 996 delayed_node->count--; 1000 - 1001 - delayed_root = delayed_node->root->fs_info->delayed_root; 1002 - finish_one_item(delayed_root); 997 + finish_one_item(&delayed_node->root->fs_info->delayed_root); 1003 998 } 1004 999 } 1005 1000 ··· 1124 1137 ret = btrfs_record_root_in_trans(trans, node->root); 1125 1138 if (ret) 1126 1139 return ret; 1127 - ret = 
btrfs_update_delayed_inode(trans, node->root, path, node); 1128 - return ret; 1140 + 1141 + return btrfs_update_delayed_inode(trans, node->root, path, node); 1129 1142 } 1130 1143 1131 1144 /* ··· 1137 1150 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) 1138 1151 { 1139 1152 struct btrfs_fs_info *fs_info = trans->fs_info; 1140 - struct btrfs_delayed_root *delayed_root; 1141 1153 struct btrfs_delayed_node *curr_node, *prev_node; 1142 1154 struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; 1143 1155 struct btrfs_path *path; ··· 1154 1168 block_rsv = trans->block_rsv; 1155 1169 trans->block_rsv = &fs_info->delayed_block_rsv; 1156 1170 1157 - delayed_root = fs_info->delayed_root; 1158 - 1159 - curr_node = btrfs_first_delayed_node(delayed_root, &curr_delayed_node_tracker); 1171 + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); 1160 1172 while (curr_node && (!count || nr--)) { 1161 1173 ret = __btrfs_commit_inode_delayed_items(trans, path, 1162 1174 curr_node); ··· 1401 1417 struct btrfs_ref_tracker delayed_node_tracker; 1402 1418 struct btrfs_delayed_node *node; 1403 1419 1404 - node = btrfs_first_delayed_node( fs_info->delayed_root, &delayed_node_tracker); 1420 + node = btrfs_first_delayed_node(fs_info, &delayed_node_tracker); 1405 1421 if (WARN_ON(node)) { 1406 1422 btrfs_delayed_node_ref_tracker_free(node, 1407 1423 &delayed_node_tracker); ··· 1424 1440 1425 1441 void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) 1426 1442 { 1427 - struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; 1443 + struct btrfs_delayed_root *delayed_root = &fs_info->delayed_root; 1428 1444 1429 1445 if ((atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) || 1430 1446 btrfs_workqueue_normal_congested(fs_info->delayed_workers)) ··· 1954 1970 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1955 1971 set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, 
&delayed_node->flags); 1956 1972 delayed_node->count++; 1957 - atomic_inc(&root->fs_info->delayed_root->items); 1973 + atomic_inc(&root->fs_info->delayed_root.items); 1958 1974 release_node: 1959 1975 mutex_unlock(&delayed_node->mutex); 1960 1976 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); ··· 1996 2012 mutex_lock(&delayed_node->mutex); 1997 2013 if (!test_and_set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags)) { 1998 2014 delayed_node->count++; 1999 - atomic_inc(&fs_info->delayed_root->items); 2015 + atomic_inc(&fs_info->delayed_root.items); 2000 2016 } 2001 2017 mutex_unlock(&delayed_node->mutex); 2002 2018 btrfs_release_delayed_node(delayed_node, &delayed_node_tracker); ··· 2102 2118 struct btrfs_delayed_node *curr_node, *prev_node; 2103 2119 struct btrfs_ref_tracker curr_delayed_node_tracker, prev_delayed_node_tracker; 2104 2120 2105 - curr_node = btrfs_first_delayed_node(fs_info->delayed_root, 2106 - &curr_delayed_node_tracker); 2121 + curr_node = btrfs_first_delayed_node(fs_info, &curr_delayed_node_tracker); 2107 2122 while (curr_node) { 2108 2123 __btrfs_kill_delayed_node(curr_node); 2109 2124

-15

fs/btrfs/delayed-inode.h

··· 30 30 BTRFS_DELAYED_DELETION_ITEM 31 31 }; 32 32 33 - struct btrfs_delayed_root { 34 - spinlock_t lock; 35 - struct list_head node_list; 36 - /* 37 - * Used for delayed nodes which is waiting to be dealt with by the 38 - * worker. If the delayed node is inserted into the work queue, we 39 - * drop it from this list. 40 - */ 41 - struct list_head prepare_list; 42 - atomic_t items; /* for delayed items */ 43 - atomic_t items_seq; /* for delayed items */ 44 - int nodes; /* for delayed nodes */ 45 - wait_queue_head_t wait; 46 - }; 47 - 48 33 struct btrfs_ref_tracker_dir { 49 34 #ifdef CONFIG_BTRFS_DEBUG 50 35 struct ref_tracker_dir dir;

+14 -15

fs/btrfs/direct-io.c

··· 763 763 struct btrfs_dio_data data = { 0 }; 764 764 765 765 return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 766 - IOMAP_DIO_PARTIAL, &data, done_before); 766 + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); 767 767 } 768 768 769 769 static struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, ··· 772 772 struct btrfs_dio_data data = { 0 }; 773 773 774 774 return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 775 - IOMAP_DIO_PARTIAL, &data, done_before); 775 + IOMAP_DIO_PARTIAL | IOMAP_DIO_FSBLOCK_ALIGNED, &data, done_before); 776 776 } 777 777 778 778 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, ··· 785 785 786 786 if (iov_iter_alignment(iter) & blocksize_mask) 787 787 return -EINVAL; 788 - 789 - /* 790 - * For bs > ps support, we heavily rely on large folios to make sure no 791 - * block will cross large folio boundaries. 792 - * 793 - * But memory provided by direct IO is only virtually contiguous, not 794 - * physically contiguous, and will break the btrfs' large folio requirement. 795 - * 796 - * So for bs > ps support, all direct IOs should fallback to buffered ones. 797 - */ 798 - if (fs_info->sectorsize > PAGE_SIZE) 799 - return -EINVAL; 800 - 801 788 return 0; 802 789 } 803 790 ··· 801 814 ssize_t ret; 802 815 unsigned int ilock_flags = 0; 803 816 struct iomap_dio *dio; 817 + const u64 data_profile = btrfs_data_alloc_profile(fs_info) & 818 + BTRFS_BLOCK_GROUP_PROFILE_MASK; 804 819 805 820 if (iocb->ki_flags & IOCB_NOWAIT) 806 821 ilock_flags |= BTRFS_ILOCK_TRY; ··· 815 826 */ 816 827 if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) 817 828 ilock_flags |= BTRFS_ILOCK_SHARED; 829 + 830 + /* 831 + * If our data profile has duplication (either extra mirrors or RAID56), 832 + * we can not trust the direct IO buffer, the content may change during 833 + * writeback and cause different contents written to different mirrors. 
834 + * 835 + * Thus only RAID0 and SINGLE can go true zero-copy direct IO. 836 + */ 837 + if (data_profile != BTRFS_BLOCK_GROUP_RAID0 && data_profile != 0) 838 + goto buffered; 818 839 819 840 relock: 820 841 ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);

+45 -7

fs/btrfs/discard.c

··· 216 216 } 217 217 218 218 /* 219 + * Check whether a block group is empty. 220 + * 221 + * "Empty" here means that there are no extents physically located within the 222 + * device extents corresponding to this block group. 223 + * 224 + * For a remapped block group, this means that all of its identity remaps have 225 + * been removed. For a non-remapped block group, this means that no extents 226 + * have an address within its range, and that nothing has been remapped to be 227 + * within it. 228 + */ 229 + static bool block_group_is_empty(const struct btrfs_block_group *bg) 230 + { 231 + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) 232 + return bg->identity_remap_count == 0; 233 + 234 + return bg->used == 0 && bg->remap_bytes == 0; 235 + } 236 + 237 + /* 219 238 * Look up next block group and set it for use. 220 239 * 221 240 * @discard_ctl: discard control ··· 260 241 block_group = find_next_block_group(discard_ctl, now); 261 242 262 243 if (block_group && now >= block_group->discard_eligible_time) { 244 + const bool empty = block_group_is_empty(block_group); 245 + 263 246 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && 264 - block_group->used != 0) { 247 + !empty) { 265 248 if (btrfs_is_block_group_data_only(block_group)) { 266 249 __add_to_discard_list(discard_ctl, block_group); 267 250 /* ··· 288 267 } 289 268 if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { 290 269 block_group->discard_cursor = block_group->start; 291 - block_group->discard_state = BTRFS_DISCARD_EXTENTS; 270 + 271 + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && empty) { 272 + block_group->discard_state = BTRFS_DISCARD_FULLY_REMAPPED; 273 + } else { 274 + block_group->discard_state = BTRFS_DISCARD_EXTENTS; 275 + } 292 276 } 293 277 } 294 278 if (block_group) { ··· 399 373 if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) 400 374 return; 401 375 402 - if (block_group->used == 0) 376 + if (block_group_is_empty(block_group)) 403 
377 add_to_discard_unused_list(discard_ctl, block_group); 404 378 else 405 379 add_to_discard_list(discard_ctl, block_group); ··· 496 470 { 497 471 remove_from_discard_list(discard_ctl, block_group); 498 472 499 - if (block_group->used == 0) { 473 + if (block_group_is_empty(block_group)) { 500 474 if (btrfs_is_free_space_trimmed(block_group)) 501 475 btrfs_mark_bg_unused(block_group); 502 476 else ··· 550 524 /* Perform discarding */ 551 525 minlen = discard_minlen[discard_index]; 552 526 553 - if (discard_state == BTRFS_DISCARD_BITMAPS) { 527 + switch (discard_state) { 528 + case BTRFS_DISCARD_BITMAPS: { 554 529 u64 maxlen = 0; 555 530 556 531 /* ··· 568 541 btrfs_block_group_end(block_group), 569 542 minlen, maxlen, true); 570 543 discard_ctl->discard_bitmap_bytes += trimmed; 571 - } else { 544 + 545 + break; 546 + } 547 + 548 + case BTRFS_DISCARD_FULLY_REMAPPED: 549 + btrfs_trim_fully_remapped_block_group(block_group); 550 + break; 551 + 552 + default: 572 553 btrfs_trim_block_group_extents(block_group, &trimmed, 573 554 block_group->discard_cursor, 574 555 btrfs_block_group_end(block_group), 575 556 minlen, true); 576 557 discard_ctl->discard_extent_bytes += trimmed; 558 + 559 + break; 577 560 } 578 561 579 562 /* Determine next steps for a block_group */ 580 563 if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { 581 - if (discard_state == BTRFS_DISCARD_BITMAPS) { 564 + if (discard_state == BTRFS_DISCARD_BITMAPS || 565 + discard_state == BTRFS_DISCARD_FULLY_REMAPPED) { 582 566 btrfs_finish_discard_pass(discard_ctl, block_group); 583 567 } else { 584 568 block_group->discard_cursor = block_group->start;

+167 -117

fs/btrfs/disk-io.c

··· 18 18 #include <linux/crc32c.h> 19 19 #include <linux/sched/mm.h> 20 20 #include <linux/unaligned.h> 21 - #include <crypto/hash.h> 22 21 #include "ctree.h" 23 22 #include "disk-io.h" 24 23 #include "transaction.h" 25 24 #include "btrfs_inode.h" 25 + #include "delayed-inode.h" 26 26 #include "bio.h" 27 27 #include "print-tree.h" 28 28 #include "locking.h" ··· 62 62 static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); 63 63 static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); 64 64 65 - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) 66 - { 67 - if (fs_info->csum_shash) 68 - crypto_free_shash(fs_info->csum_shash); 69 - } 70 - 71 65 /* 72 66 * Compute the csum of a btree block and store the result to provided buffer. 73 67 */ ··· 70 76 struct btrfs_fs_info *fs_info = buf->fs_info; 71 77 int num_pages; 72 78 u32 first_page_part; 73 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 79 + struct btrfs_csum_ctx csum; 74 80 char *kaddr; 75 81 int i; 76 82 77 - shash->tfm = fs_info->csum_shash; 78 - crypto_shash_init(shash); 83 + btrfs_csum_init(&csum, fs_info->csum_type); 79 84 80 85 if (buf->addr) { 81 86 /* Pages are contiguous, handle them as a big one. */ ··· 87 94 num_pages = num_extent_pages(buf); 88 95 } 89 96 90 - crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, 91 - first_page_part - BTRFS_CSUM_SIZE); 97 + btrfs_csum_update(&csum, kaddr + BTRFS_CSUM_SIZE, 98 + first_page_part - BTRFS_CSUM_SIZE); 92 99 93 100 /* 94 101 * Multiple single-page folios case would reach here. 95 102 * 96 103 * nodesize <= PAGE_SIZE and large folio all handled by above 97 - * crypto_shash_update() already. 104 + * btrfs_csum_update() already. 
98 105 */ 99 106 for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) { 100 107 kaddr = folio_address(buf->folios[i]); 101 - crypto_shash_update(shash, kaddr, PAGE_SIZE); 108 + btrfs_csum_update(&csum, kaddr, PAGE_SIZE); 102 109 } 103 110 memset(result, 0, BTRFS_CSUM_SIZE); 104 - crypto_shash_final(shash, result); 111 + btrfs_csum_final(&csum, result); 105 112 } 106 113 107 114 /* ··· 153 160 int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, 154 161 const struct btrfs_super_block *disk_sb) 155 162 { 156 - char result[BTRFS_CSUM_SIZE]; 157 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 158 - 159 - shash->tfm = fs_info->csum_shash; 163 + u8 result[BTRFS_CSUM_SIZE]; 160 164 161 165 /* 162 166 * The super_block structure does not span the whole 163 167 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is 164 168 * filled with zeros and is included in the checksum. 165 169 */ 166 - crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, 167 - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); 170 + btrfs_csum(fs_info->csum_type, (const u8 *)disk_sb + BTRFS_CSUM_SIZE, 171 + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result); 168 172 169 173 if (memcmp(disk_sb->csum, result, fs_info->csum_size)) 170 174 return 1; ··· 176 186 const u32 step = min(fs_info->nodesize, PAGE_SIZE); 177 187 const u32 nr_steps = eb->len / step; 178 188 phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 179 - int ret = 0; 180 189 181 190 if (sb_rdonly(fs_info->sb)) 182 191 return -EROFS; ··· 197 208 paddrs[i] = page_to_phys(&folio->page) + offset_in_page(eb->start); 198 209 } 199 210 200 - ret = btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, eb->start, 201 - paddrs, step, mirror_num); 202 - return ret; 211 + return btrfs_repair_io_failure(fs_info, 0, eb->start, eb->len, 212 + eb->start, paddrs, step, mirror_num); 203 213 } 204 214 205 215 /* ··· 370 382 btrfs_err_rl(fs_info, 371 383 "bad tree block start, mirror %u want %llu have %llu", 372 
384 eb->read_mirror, eb->start, found_start); 373 - ret = -EIO; 374 - goto out; 385 + return -EIO; 375 386 } 376 387 if (unlikely(check_tree_block_fsid(eb))) { 377 388 btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", 378 389 eb->start, eb->read_mirror); 379 - ret = -EIO; 380 - goto out; 390 + return -EIO; 381 391 } 382 392 found_level = btrfs_header_level(eb); 383 393 if (unlikely(found_level >= BTRFS_MAX_LEVEL)) { 384 394 btrfs_err(fs_info, 385 395 "bad tree block level, mirror %u level %d on logical %llu", 386 396 eb->read_mirror, btrfs_header_level(eb), eb->start); 387 - ret = -EIO; 388 - goto out; 397 + return -EIO; 389 398 } 390 399 391 400 csum_tree_block(eb, result); ··· 397 412 BTRFS_CSUM_FMT_VALUE(csum_size, result), 398 413 btrfs_header_level(eb), 399 414 ignore_csum ? ", ignored" : ""); 400 - if (unlikely(!ignore_csum)) { 401 - ret = -EUCLEAN; 402 - goto out; 403 - } 415 + if (unlikely(!ignore_csum)) 416 + return -EUCLEAN; 404 417 } 405 418 406 419 if (unlikely(found_level != check->level)) { 407 420 btrfs_err(fs_info, 408 421 "level verify failed on logical %llu mirror %u wanted %u found %u", 409 422 eb->start, eb->read_mirror, check->level, found_level); 410 - ret = -EIO; 411 - goto out; 423 + return -EIO; 412 424 } 413 425 if (unlikely(check->transid && 414 426 btrfs_header_generation(eb) != check->transid)) { ··· 413 431 "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", 414 432 eb->start, eb->read_mirror, check->transid, 415 433 btrfs_header_generation(eb)); 416 - ret = -EIO; 417 - goto out; 434 + return -EIO; 418 435 } 419 436 if (check->has_first_key) { 420 437 const struct btrfs_key *expect_key = &check->first_key; ··· 431 450 expect_key->type, expect_key->offset, 432 451 found_key.objectid, found_key.type, 433 452 found_key.offset); 434 - ret = -EUCLEAN; 435 - goto out; 453 + return -EUCLEAN; 436 454 } 437 455 } 438 456 if (check->owner_root) { 439 457 ret = btrfs_check_eb_owner(eb, check->owner_root); 
440 458 if (ret < 0) 441 - goto out; 459 + return ret; 442 460 } 443 461 444 462 /* If this is a leaf block and it is corrupt, just return -EIO. */ ··· 451 471 btrfs_err(fs_info, 452 472 "read time tree block corruption detected on logical %llu mirror %u", 453 473 eb->start, eb->read_mirror); 454 - out: 455 474 return ret; 456 475 } 457 476 ··· 794 815 struct extent_buffer *leaf; 795 816 struct btrfs_root *tree_root = fs_info->tree_root; 796 817 struct btrfs_root *root; 797 - struct btrfs_key key; 798 818 unsigned int nofs_flag; 799 819 int ret = 0; 800 820 ··· 842 864 843 865 btrfs_tree_unlock(leaf); 844 866 845 - key.objectid = objectid; 846 - key.type = BTRFS_ROOT_ITEM_KEY; 847 - key.offset = 0; 848 - ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item); 867 + ret = btrfs_insert_root(trans, tree_root, &root->root_key, &root->root_item); 849 868 if (ret) 850 869 goto fail; 851 870 ··· 1128 1153 return btrfs_grab_root(btrfs_global_root(fs_info, &key)); 1129 1154 case BTRFS_RAID_STRIPE_TREE_OBJECTID: 1130 1155 return btrfs_grab_root(fs_info->stripe_root); 1156 + case BTRFS_REMAP_TREE_OBJECTID: 1157 + return btrfs_grab_root(fs_info->remap_root); 1131 1158 default: 1132 1159 return NULL; 1133 1160 } ··· 1206 1229 ASSERT(percpu_counter_sum_positive(em_counter) == 0); 1207 1230 percpu_counter_destroy(em_counter); 1208 1231 percpu_counter_destroy(&fs_info->dev_replace.bio_counter); 1209 - btrfs_free_csum_hash(fs_info); 1210 1232 btrfs_free_stripe_hash_table(fs_info); 1211 1233 btrfs_free_ref_cache(fs_info); 1212 1234 kfree(fs_info->balance_ctl); 1213 - kfree(fs_info->delayed_root); 1214 1235 free_global_roots(fs_info); 1215 1236 btrfs_put_root(fs_info->tree_root); 1216 1237 btrfs_put_root(fs_info->chunk_root); ··· 1219 1244 btrfs_put_root(fs_info->data_reloc_root); 1220 1245 btrfs_put_root(fs_info->block_group_root); 1221 1246 btrfs_put_root(fs_info->stripe_root); 1247 + btrfs_put_root(fs_info->remap_root); 1222 1248 btrfs_check_leaked_roots(fs_info); 1223 
1249 btrfs_extent_buffer_leak_debug_check(fs_info); 1224 1250 kfree(fs_info->super_copy); ··· 1463 1487 * needn't do anything special here. 1464 1488 */ 1465 1489 btrfs_run_defrag_inodes(fs_info); 1490 + 1491 + if (btrfs_fs_incompat(fs_info, REMAP_TREE) && 1492 + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) 1493 + btrfs_handle_fully_remapped_bgs(fs_info); 1466 1494 1467 1495 /* 1468 1496 * Acquires fs_info->reclaim_bgs_lock to avoid racing ··· 1776 1796 free_root_extent_buffers(info->data_reloc_root); 1777 1797 free_root_extent_buffers(info->block_group_root); 1778 1798 free_root_extent_buffers(info->stripe_root); 1799 + free_root_extent_buffers(info->remap_root); 1779 1800 if (free_chunk_root) 1780 1801 free_root_extent_buffers(info->chunk_root); 1781 1802 } ··· 1964 1983 return 0; 1965 1984 } 1966 1985 1967 - static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) 1986 + static void btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) 1968 1987 { 1969 - struct crypto_shash *csum_shash; 1970 - const char *csum_driver = btrfs_super_csum_driver(csum_type); 1971 - 1972 - csum_shash = crypto_alloc_shash(csum_driver, 0, 0); 1973 - 1974 - if (IS_ERR(csum_shash)) { 1975 - btrfs_err(fs_info, "error allocating %s hash for checksum", 1976 - csum_driver); 1977 - return PTR_ERR(csum_shash); 1978 - } 1979 - 1980 - fs_info->csum_shash = csum_shash; 1981 - 1982 1988 /* Check if the checksum implementation is a fast accelerated one. 
*/ 1983 1989 switch (csum_type) { 1984 1990 case BTRFS_CSUM_TYPE_CRC32: ··· 1979 2011 break; 1980 2012 } 1981 2013 1982 - btrfs_info(fs_info, "using %s (%s) checksum algorithm", 1983 - btrfs_super_csum_name(csum_type), 1984 - crypto_shash_driver_name(csum_shash)); 1985 - return 0; 2014 + btrfs_info(fs_info, "using %s checksum algorithm", 2015 + btrfs_super_csum_name(csum_type)); 1986 2016 } 1987 2017 1988 2018 static int btrfs_replay_log(struct btrfs_fs_info *fs_info, ··· 2138 2172 return ret; 2139 2173 if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) 2140 2174 return ret; 2141 - ret = load_global_roots_objectid(tree_root, path, 2142 - BTRFS_FREE_SPACE_TREE_OBJECTID, 2143 - "free space"); 2144 2175 2145 - return ret; 2176 + return load_global_roots_objectid(tree_root, path, 2177 + BTRFS_FREE_SPACE_TREE_OBJECTID, 2178 + "free space"); 2146 2179 } 2147 2180 2148 2181 static int btrfs_read_roots(struct btrfs_fs_info *fs_info) ··· 2190 2225 if (ret) 2191 2226 goto out; 2192 2227 2193 - /* 2194 - * This tree can share blocks with some other fs tree during relocation 2195 - * and we need a proper setup by btrfs_get_fs_root 2196 - */ 2197 - root = btrfs_get_fs_root(tree_root->fs_info, 2198 - BTRFS_DATA_RELOC_TREE_OBJECTID, true); 2199 - if (IS_ERR(root)) { 2200 - if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { 2201 - location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 2202 - ret = PTR_ERR(root); 2203 - goto out; 2228 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 2229 + /* The remap_root has already been loaded in load_important_roots(). */ 2230 + root = fs_info->remap_root; 2231 + 2232 + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2233 + 2234 + root->root_key.objectid = BTRFS_REMAP_TREE_OBJECTID; 2235 + root->root_key.type = BTRFS_ROOT_ITEM_KEY; 2236 + root->root_key.offset = 0; 2237 + 2238 + /* Check that data reloc tree doesn't also exist. 
*/ 2239 + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 2240 + root = btrfs_read_tree_root(fs_info->tree_root, &location); 2241 + if (!IS_ERR(root)) { 2242 + btrfs_err(fs_info, "data reloc tree exists when remap-tree enabled"); 2243 + btrfs_put_root(root); 2244 + return -EIO; 2245 + } else if (PTR_ERR(root) != -ENOENT) { 2246 + btrfs_warn(fs_info, "error %ld when checking for data reloc tree", 2247 + PTR_ERR(root)); 2204 2248 } 2205 2249 } else { 2206 - set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2207 - fs_info->data_reloc_root = root; 2250 + /* 2251 + * This tree can share blocks with some other fs tree during 2252 + * relocation and we need a proper setup by btrfs_get_fs_root(). 2253 + */ 2254 + root = btrfs_get_fs_root(tree_root->fs_info, 2255 + BTRFS_DATA_RELOC_TREE_OBJECTID, true); 2256 + if (IS_ERR(root)) { 2257 + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { 2258 + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; 2259 + ret = PTR_ERR(root); 2260 + goto out; 2261 + } 2262 + } else { 2263 + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); 2264 + fs_info->data_reloc_root = root; 2265 + } 2208 2266 } 2209 2267 2210 2268 location.objectid = BTRFS_QUOTA_TREE_OBJECTID; ··· 2467 2479 ret = -EINVAL; 2468 2480 } 2469 2481 2482 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 2483 + /* 2484 + * Reduce test matrix for remap tree by requiring block-group-tree 2485 + * and no-holes. Free-space-tree is a hard requirement. 
2486 + */ 2487 + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || 2488 + !btrfs_fs_incompat(fs_info, NO_HOLES) || 2489 + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { 2490 + btrfs_err(fs_info, 2491 + "remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); 2492 + ret = -EINVAL; 2493 + } 2494 + 2495 + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 2496 + btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); 2497 + ret = -EINVAL; 2498 + } 2499 + 2500 + if (btrfs_fs_incompat(fs_info, ZONED)) { 2501 + btrfs_err(fs_info, "remap-tree not supported with zoned devices"); 2502 + ret = -EINVAL; 2503 + } 2504 + 2505 + if (sectorsize > PAGE_SIZE) { 2506 + btrfs_err(fs_info, "remap-tree not supported when block size > page size"); 2507 + ret = -EINVAL; 2508 + } 2509 + } 2510 + 2470 2511 /* 2471 2512 * Hint to catch really bogus numbers, bitflips or so, more exact checks are 2472 2513 * done later ··· 2654 2637 btrfs_warn(fs_info, "couldn't read tree root"); 2655 2638 return ret; 2656 2639 } 2640 + 2641 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 2642 + bytenr = btrfs_super_remap_root(sb); 2643 + gen = btrfs_super_remap_root_generation(sb); 2644 + level = btrfs_super_remap_root_level(sb); 2645 + ret = load_super_root(fs_info->remap_root, bytenr, gen, level); 2646 + if (ret) { 2647 + btrfs_warn(fs_info, "couldn't read remap root"); 2648 + return ret; 2649 + } 2650 + } 2651 + 2657 2652 return 0; 2658 2653 } 2659 2654 ··· 2802 2773 INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); 2803 2774 INIT_LIST_HEAD(&fs_info->unused_bgs); 2804 2775 INIT_LIST_HEAD(&fs_info->reclaim_bgs); 2776 + INIT_LIST_HEAD(&fs_info->fully_remapped_bgs); 2805 2777 INIT_LIST_HEAD(&fs_info->zone_active_bgs); 2806 2778 #ifdef CONFIG_BTRFS_DEBUG 2807 2779 INIT_LIST_HEAD(&fs_info->allocated_roots); ··· 2815 2785 BTRFS_BLOCK_RSV_GLOBAL); 2816 2786 btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); 2817 2787 
btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); 2788 + btrfs_init_block_rsv(&fs_info->remap_block_rsv, BTRFS_BLOCK_RSV_REMAP); 2818 2789 btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); 2819 2790 btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); 2820 2791 btrfs_init_block_rsv(&fs_info->delayed_block_rsv, ··· 2858 2827 mutex_init(&fs_info->chunk_mutex); 2859 2828 mutex_init(&fs_info->transaction_kthread_mutex); 2860 2829 mutex_init(&fs_info->cleaner_mutex); 2830 + mutex_init(&fs_info->remap_mutex); 2861 2831 mutex_init(&fs_info->ro_block_group_mutex); 2862 2832 init_rwsem(&fs_info->commit_root_sem); 2863 2833 init_rwsem(&fs_info->cleanup_work_sem); ··· 2933 2901 if (ret) 2934 2902 return ret; 2935 2903 2936 - fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root), 2937 - GFP_KERNEL); 2938 - if (!fs_info->delayed_root) 2939 - return -ENOMEM; 2940 - btrfs_init_delayed_root(fs_info->delayed_root); 2904 + btrfs_init_delayed_root(&fs_info->delayed_root); 2941 2905 2942 2906 if (sb_rdonly(sb)) 2943 2907 set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); ··· 3046 3018 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 3047 3019 btrfs_warn(fs_info, 3048 3020 "'clear_cache' option is ignored with extent tree v2"); 3021 + else if (btrfs_fs_incompat(fs_info, REMAP_TREE)) 3022 + btrfs_warn(fs_info, "'clear_cache' option is ignored with remap tree"); 3049 3023 else 3050 3024 rebuild_free_space_tree = true; 3051 3025 } else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && ··· 3062 3032 if (ret) { 3063 3033 btrfs_warn(fs_info, 3064 3034 "failed to rebuild free space tree: %d", ret); 3065 - goto out; 3035 + return ret; 3066 3036 } 3067 3037 } 3068 3038 ··· 3073 3043 if (ret) { 3074 3044 btrfs_warn(fs_info, 3075 3045 "failed to disable free space tree: %d", ret); 3076 - goto out; 3046 + return ret; 3077 3047 } 3078 3048 } 3079 3049 3050 + /* 3051 + * Before btrfs-progs v6.16.1 mkfs.btrfs can leave free space 
entries 3052 + * for deleted temporary chunks. Delete them if they exist. 3053 + */ 3054 + ret = btrfs_delete_orphan_free_space_entries(fs_info); 3055 + if (ret < 0) { 3056 + btrfs_err(fs_info, "failed to delete orphan free space tree entries: %d", ret); 3057 + return ret; 3058 + } 3080 3059 /* 3081 3060 * btrfs_find_orphan_roots() is responsible for finding all the dead 3082 3061 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load ··· 3099 3060 */ 3100 3061 ret = btrfs_find_orphan_roots(fs_info); 3101 3062 if (ret) 3102 - goto out; 3063 + return ret; 3103 3064 3104 3065 ret = btrfs_cleanup_fs_roots(fs_info); 3105 3066 if (ret) 3106 - goto out; 3067 + return ret; 3107 3068 3108 3069 down_read(&fs_info->cleanup_work_sem); 3109 3070 if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || 3110 3071 (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { 3111 3072 up_read(&fs_info->cleanup_work_sem); 3112 - goto out; 3073 + return ret; 3113 3074 } 3114 3075 up_read(&fs_info->cleanup_work_sem); 3115 3076 ··· 3118 3079 mutex_unlock(&fs_info->cleaner_mutex); 3119 3080 if (ret < 0) { 3120 3081 btrfs_warn(fs_info, "failed to recover relocation: %d", ret); 3121 - goto out; 3082 + return ret; 3122 3083 } 3123 3084 3124 3085 if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) && ··· 3128 3089 if (ret) { 3129 3090 btrfs_warn(fs_info, 3130 3091 "failed to create free space tree: %d", ret); 3131 - goto out; 3092 + return ret; 3132 3093 } 3133 3094 } 3134 3095 3135 3096 if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) { 3136 3097 ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); 3137 3098 if (ret) 3138 - goto out; 3099 + return ret; 3139 3100 } 3140 3101 3141 3102 ret = btrfs_resume_balance_async(fs_info); 3142 3103 if (ret) 3143 - goto out; 3104 + return ret; 3144 3105 3145 3106 ret = btrfs_resume_dev_replace_async(fs_info); 3146 3107 if (ret) { 3147 3108 btrfs_warn(fs_info, "failed to resume dev_replace"); 3148 - goto out; 3109 + return ret; 3149 3110 
} 3150 3111 3151 3112 btrfs_qgroup_rescan_resume(fs_info); ··· 3156 3117 if (ret) { 3157 3118 btrfs_warn(fs_info, 3158 3119 "failed to create the UUID tree %d", ret); 3159 - goto out; 3120 + return ret; 3160 3121 } 3161 3122 } 3162 3123 3163 - out: 3164 - return ret; 3124 + return 0; 3165 3125 } 3166 3126 3167 3127 /* ··· 3291 3253 struct btrfs_fs_info *fs_info = btrfs_sb(sb); 3292 3254 struct btrfs_root *tree_root; 3293 3255 struct btrfs_root *chunk_root; 3256 + struct btrfs_root *remap_root; 3294 3257 int ret; 3295 3258 int level; 3296 3259 ··· 3341 3302 } 3342 3303 3343 3304 fs_info->csum_size = btrfs_super_csum_size(disk_super); 3305 + fs_info->csum_type = csum_type; 3344 3306 3345 - ret = btrfs_init_csum_hash(fs_info, csum_type); 3346 - if (ret) { 3347 - btrfs_release_disk_super(disk_super); 3348 - goto fail_alloc; 3349 - } 3307 + btrfs_init_csum_hash(fs_info, csum_type); 3350 3308 3351 3309 /* 3352 3310 * We want to check superblock checksum, the type is stored inside. ··· 3425 3389 ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); 3426 3390 if (ret < 0) 3427 3391 goto fail_alloc; 3392 + 3393 + if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { 3394 + remap_root = btrfs_alloc_root(fs_info, BTRFS_REMAP_TREE_OBJECTID, 3395 + GFP_KERNEL); 3396 + fs_info->remap_root = remap_root; 3397 + if (!remap_root) { 3398 + ret = -ENOMEM; 3399 + goto fail_alloc; 3400 + } 3401 + } 3428 3402 3429 3403 /* 3430 3404 * At this point our mount options are validated, if we set ->max_inline ··· 3585 3539 if (ret) { 3586 3540 btrfs_err(fs_info, "failed to read block groups: %d", ret); 3587 3541 goto fail_sysfs; 3542 + } 3543 + 3544 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 3545 + ret = btrfs_populate_fully_remapped_bgs_list(fs_info); 3546 + if (ret) { 3547 + btrfs_err(fs_info, "failed to populate fully_remapped_bgs list: %d", ret); 3548 + goto fail_sysfs; 3549 + } 3588 3550 } 3589 3551 3590 3552 btrfs_zoned_reserve_data_reloc_bg(fs_info); ··· 
3763 3709 { 3764 3710 struct btrfs_fs_info *fs_info = device->fs_info; 3765 3711 struct address_space *mapping = device->bdev->bd_mapping; 3766 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3767 3712 int i; 3768 3713 int ret; 3769 3714 u64 bytenr, bytenr_orig; ··· 3771 3718 3772 3719 if (max_mirrors == 0) 3773 3720 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 3774 - 3775 - shash->tfm = fs_info->csum_shash; 3776 3721 3777 3722 for (i = 0; i < max_mirrors; i++) { 3778 3723 struct folio *folio; ··· 3795 3744 3796 3745 btrfs_set_super_bytenr(sb, bytenr_orig); 3797 3746 3798 - crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE, 3799 - BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, 3800 - sb->csum); 3747 + btrfs_csum(fs_info->csum_type, (const u8 *)sb + BTRFS_CSUM_SIZE, 3748 + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, sb->csum); 3801 3749 3802 3750 folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT, 3803 3751 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, ··· 3916 3866 { 3917 3867 struct bio *bio = &device->flush_bio; 3918 3868 3919 - device->last_flush_error = BLK_STS_OK; 3869 + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); 3920 3870 3921 3871 bio_init(bio, device->bdev, NULL, 0, 3922 3872 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); ··· 3941 3891 wait_for_completion_io(&device->flush_wait); 3942 3892 3943 3893 if (bio->bi_status) { 3944 - device->last_flush_error = bio->bi_status; 3894 + set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); 3945 3895 btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); 3946 3896 return true; 3947 3897 } ··· 3991 3941 } 3992 3942 3993 3943 /* 3994 - * Checks last_flush_error of disks in order to determine the device 3944 + * Checks flush failure of disks in order to determine the device 3995 3945 * state. 3996 3946 */ 3997 3947 if (unlikely(errors_wait && !btrfs_check_rw_degradable(info, NULL)))

+2 -5

fs/btrfs/extent-io-tree.c

··· 187 187 struct extent_changeset *changeset, 188 188 int set) 189 189 { 190 - int ret; 191 - 192 190 if (!changeset) 193 191 return 0; 194 192 if (set && (state->state & bits) == bits) ··· 194 196 if (!set && (state->state & bits) == 0) 195 197 return 0; 196 198 changeset->bytes_changed += state->end - state->start + 1; 197 - ret = ulist_add(&changeset->range_changed, state->start, state->end, 198 - GFP_ATOMIC); 199 - return ret; 199 + 200 + return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); 200 201 } 201 202 202 203 static inline struct extent_state *next_state(struct extent_state *state)

+348 -109

fs/btrfs/extent-tree.c

··· 41 41 #include "tree-checker.h" 42 42 #include "raid-stripe-tree.h" 43 43 #include "delayed-inode.h" 44 + #include "relocation.h" 44 45 45 46 #undef SCRAMBLE_DELAYED_REFS 46 47 ··· 477 476 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 478 477 if (key.objectid != bytenr || 479 478 key.type != BTRFS_EXTENT_DATA_REF_KEY) 480 - goto fail; 479 + return ret; 481 480 482 481 ref = btrfs_item_ptr(leaf, path->slots[0], 483 482 struct btrfs_extent_data_ref); ··· 488 487 btrfs_release_path(path); 489 488 goto again; 490 489 } 491 - ret = 0; 492 - break; 490 + return 0; 493 491 } 494 492 path->slots[0]++; 495 493 } 496 - fail: 494 + 497 495 return ret; 498 496 } 499 497 ··· 1380 1380 } 1381 1381 1382 1382 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 1383 - u64 num_bytes, u64 *actual_bytes) 1383 + u64 num_bytes, u64 *actual_bytes, bool do_remap) 1384 1384 { 1385 1385 int ret = 0; 1386 1386 u64 discarded_bytes = 0; ··· 1398 1398 int i; 1399 1399 1400 1400 num_bytes = end - cur; 1401 - stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); 1401 + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes, 1402 + do_remap); 1402 1403 if (IS_ERR(stripes)) { 1403 1404 ret = PTR_ERR(stripes); 1404 1405 if (ret == -EOPNOTSUPP) ··· 1552 1551 1553 1552 btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, 1554 1553 BTRFS_QGROUP_RSV_DATA); 1554 + } 1555 + 1556 + static int drop_remap_tree_ref(struct btrfs_trans_handle *trans, 1557 + const struct btrfs_delayed_ref_node *node) 1558 + { 1559 + u64 bytenr = node->bytenr; 1560 + u64 num_bytes = node->num_bytes; 1561 + int ret; 1562 + 1563 + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); 1564 + if (unlikely(ret)) { 1565 + btrfs_abort_transaction(trans, ret); 1566 + return ret; 1567 + } 1568 + 1569 + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); 1570 + if (unlikely(ret)) { 1571 + btrfs_abort_transaction(trans, ret); 1572 + return ret; 1573 + } 1574 
+ 1575 + return 0; 1555 1576 } 1556 1577 1557 1578 static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ··· 1770 1747 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 1771 1748 ret = __btrfs_inc_extent_ref(trans, node, extent_op); 1772 1749 } else if (node->action == BTRFS_DROP_DELAYED_REF) { 1773 - ret = __btrfs_free_extent(trans, href, node, extent_op); 1750 + if (node->ref_root == BTRFS_REMAP_TREE_OBJECTID) 1751 + ret = drop_remap_tree_ref(trans, node); 1752 + else 1753 + ret = __btrfs_free_extent(trans, href, node, extent_op); 1774 1754 } else { 1775 1755 BUG(); 1776 1756 } ··· 1787 1761 struct btrfs_delayed_extent_op *extent_op, 1788 1762 bool insert_reserved) 1789 1763 { 1764 + struct btrfs_fs_info *fs_info = trans->fs_info; 1790 1765 int ret = 0; 1791 1766 1792 1767 if (TRANS_ABORTED(trans)) { 1793 1768 if (insert_reserved) { 1794 1769 btrfs_pin_extent(trans, node->bytenr, node->num_bytes); 1795 - free_head_ref_squota_rsv(trans->fs_info, href); 1770 + free_head_ref_squota_rsv(fs_info, href); 1796 1771 } 1797 1772 return 0; 1798 1773 } 1799 1774 1800 1775 if (node->type == BTRFS_TREE_BLOCK_REF_KEY || 1801 - node->type == BTRFS_SHARED_BLOCK_REF_KEY) 1776 + node->type == BTRFS_SHARED_BLOCK_REF_KEY) { 1802 1777 ret = run_delayed_tree_ref(trans, href, node, extent_op, 1803 1778 insert_reserved); 1804 - else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1805 - node->type == BTRFS_SHARED_DATA_REF_KEY) 1779 + } else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || 1780 + node->type == BTRFS_SHARED_DATA_REF_KEY) { 1806 1781 ret = run_delayed_data_ref(trans, href, node, extent_op, 1807 1782 insert_reserved); 1808 - else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) 1809 - ret = 0; 1810 - else 1811 - BUG(); 1812 - if (ret && insert_reserved) 1813 - btrfs_pin_extent(trans, node->bytenr, node->num_bytes); 1814 - if (ret < 0) 1815 - btrfs_err(trans->fs_info, 1783 + } else if (unlikely(node->type != BTRFS_EXTENT_OWNER_REF_KEY)) { 1784 + ret = -EUCLEAN; 1785 
+ btrfs_err(fs_info, "unexpected delayed ref node type: %u", node->type); 1786 + } 1787 + 1788 + if (unlikely(ret)) { 1789 + if (insert_reserved) 1790 + btrfs_pin_extent(trans, node->bytenr, node->num_bytes); 1791 + btrfs_err(fs_info, 1816 1792 "failed to run delayed ref for logical %llu num_bytes %llu type %u action %u ref_mod %d: %d", 1817 1793 node->bytenr, node->num_bytes, node->type, 1818 1794 node->action, node->ref_mod, ret); 1795 + } 1796 + 1819 1797 return ret; 1820 1798 } 1821 1799 ··· 2500 2470 int i; 2501 2471 int action; 2502 2472 int level; 2503 - int ret = 0; 2473 + int ret; 2504 2474 2505 2475 if (btrfs_is_testing(fs_info)) 2506 2476 return 0; ··· 2552 2522 else 2553 2523 ret = btrfs_free_extent(trans, &ref); 2554 2524 if (ret) 2555 - goto fail; 2525 + return ret; 2556 2526 } else { 2557 2527 /* We don't know the owning_root, leave as 0. */ 2558 2528 ref.bytenr = btrfs_node_blockptr(buf, i); ··· 2565 2535 else 2566 2536 ret = btrfs_free_extent(trans, &ref); 2567 2537 if (ret) 2568 - goto fail; 2538 + return ret; 2569 2539 } 2570 2540 } 2571 2541 return 0; 2572 - fail: 2573 - return ret; 2574 2542 } 2575 2543 2576 2544 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, ··· 2587 2559 { 2588 2560 struct btrfs_fs_info *fs_info = root->fs_info; 2589 2561 u64 flags; 2590 - u64 ret; 2591 2562 2592 2563 if (data) 2593 2564 flags = BTRFS_BLOCK_GROUP_DATA; 2594 2565 else if (root == fs_info->chunk_root) 2595 2566 flags = BTRFS_BLOCK_GROUP_SYSTEM; 2567 + else if (root == fs_info->remap_root) 2568 + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; 2596 2569 else 2597 2570 flags = BTRFS_BLOCK_GROUP_METADATA; 2598 2571 2599 - ret = btrfs_get_alloc_profile(fs_info, flags); 2600 - return ret; 2572 + return btrfs_get_alloc_profile(fs_info, flags); 2601 2573 } 2602 2574 2603 2575 static u64 first_logical_byte(struct btrfs_fs_info *fs_info) ··· 2781 2753 u64 len; 2782 2754 bool readonly; 2783 2755 2784 - if (!cache || 2785 - start >= cache->start + 
cache->length) { 2756 + if (!cache || start >= btrfs_block_group_end(cache)) { 2786 2757 if (cache) 2787 2758 btrfs_put_block_group(cache); 2788 2759 total_unpinned = 0; ··· 2797 2770 empty_cluster <<= 1; 2798 2771 } 2799 2772 2800 - len = cache->start + cache->length - start; 2773 + len = btrfs_block_group_end(cache) - start; 2801 2774 len = min(len, end + 1 - start); 2802 2775 2803 2776 if (return_free_space) ··· 2846 2819 return 0; 2847 2820 } 2848 2821 2822 + /* 2823 + * Complete the remapping of a block group by removing its chunk stripes and 2824 + * device extents, and adding it to the unused list if there's no longer any 2825 + * extents nominally within it. 2826 + */ 2827 + int btrfs_complete_bg_remapping(struct btrfs_block_group *bg) 2828 + { 2829 + struct btrfs_fs_info *fs_info = bg->fs_info; 2830 + struct btrfs_chunk_map *map; 2831 + int ret; 2832 + 2833 + map = btrfs_get_chunk_map(fs_info, bg->start, 1); 2834 + if (IS_ERR(map)) 2835 + return PTR_ERR(map); 2836 + 2837 + ret = btrfs_last_identity_remap_gone(map, bg); 2838 + if (ret) { 2839 + btrfs_free_chunk_map(map); 2840 + return ret; 2841 + } 2842 + 2843 + /* 2844 + * Set num_stripes to 0, so that btrfs_remove_dev_extents() won't run a 2845 + * second time. 
2846 + */ 2847 + map->num_stripes = 0; 2848 + 2849 + btrfs_free_chunk_map(map); 2850 + 2851 + if (bg->used == 0) { 2852 + spin_lock(&fs_info->unused_bgs_lock); 2853 + if (!list_empty(&bg->bg_list)) { 2854 + list_del_init(&bg->bg_list); 2855 + btrfs_put_block_group(bg); 2856 + } 2857 + spin_unlock(&fs_info->unused_bgs_lock); 2858 + 2859 + btrfs_mark_bg_unused(bg); 2860 + } 2861 + 2862 + return 0; 2863 + } 2864 + 2865 + void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info) 2866 + { 2867 + struct btrfs_block_group *bg; 2868 + int ret; 2869 + 2870 + spin_lock(&fs_info->unused_bgs_lock); 2871 + while (!list_empty(&fs_info->fully_remapped_bgs)) { 2872 + bg = list_first_entry(&fs_info->fully_remapped_bgs, 2873 + struct btrfs_block_group, bg_list); 2874 + list_del_init(&bg->bg_list); 2875 + spin_unlock(&fs_info->unused_bgs_lock); 2876 + 2877 + btrfs_discard_extent(fs_info, bg->start, bg->length, NULL, false); 2878 + 2879 + ret = btrfs_complete_bg_remapping(bg); 2880 + if (ret) { 2881 + btrfs_put_block_group(bg); 2882 + return; 2883 + } 2884 + 2885 + btrfs_put_block_group(bg); 2886 + spin_lock(&fs_info->unused_bgs_lock); 2887 + } 2888 + spin_unlock(&fs_info->unused_bgs_lock); 2889 + } 2890 + 2849 2891 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) 2850 2892 { 2851 2893 struct btrfs_fs_info *fs_info = trans->fs_info; ··· 2935 2839 2936 2840 if (btrfs_test_opt(fs_info, DISCARD_SYNC)) 2937 2841 ret = btrfs_discard_extent(fs_info, start, 2938 - end + 1 - start, NULL); 2842 + end + 1 - start, NULL, true); 2939 2843 2940 2844 next_state = btrfs_next_extent_state(unpin, cached_state); 2941 2845 btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ··· 2993 2897 ret = -EROFS; 2994 2898 if (!TRANS_ABORTED(trans)) 2995 2899 ret = btrfs_discard_extent(fs_info, block_group->start, 2996 - block_group->length, NULL); 2900 + block_group->length, NULL, true); 2997 2901 2998 2902 /* 2999 2903 * Not strictly necessary to lock, as the block_group should 
be ··· 3067 2971 } 3068 2972 3069 2973 static int do_free_extent_accounting(struct btrfs_trans_handle *trans, 3070 - u64 bytenr, struct btrfs_squota_delta *delta) 2974 + u64 bytenr, struct btrfs_squota_delta *delta, 2975 + struct btrfs_path *path) 3071 2976 { 3072 2977 int ret; 2978 + bool remapped = false; 3073 2979 u64 num_bytes = delta->num_bytes; 2980 + 2981 + /* Returns 1 on success and 0 on no-op. */ 2982 + ret = btrfs_remove_extent_from_remap_tree(trans, path, bytenr, num_bytes); 2983 + if (unlikely(ret < 0)) { 2984 + btrfs_abort_transaction(trans, ret); 2985 + return ret; 2986 + } else if (ret == 1) { 2987 + remapped = true; 2988 + } 3074 2989 3075 2990 if (delta->is_data) { 3076 2991 struct btrfs_root *csum_root; ··· 3106 2999 return ret; 3107 3000 } 3108 3001 3109 - ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); 3110 - if (unlikely(ret)) { 3111 - btrfs_abort_transaction(trans, ret); 3112 - return ret; 3002 + /* If remapped, FST has already been taken care of in remove_range_from_remap_tree(). 
*/ 3003 + if (!remapped) { 3004 + ret = btrfs_add_to_free_space_tree(trans, bytenr, num_bytes); 3005 + if (unlikely(ret)) { 3006 + btrfs_abort_transaction(trans, ret); 3007 + return ret; 3008 + } 3113 3009 } 3114 3010 3115 3011 ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); ··· 3471 3361 } 3472 3362 btrfs_release_path(path); 3473 3363 3474 - ret = do_free_extent_accounting(trans, bytenr, &delta); 3364 + ret = do_free_extent_accounting(trans, bytenr, &delta, path); 3475 3365 } 3476 3366 btrfs_release_path(path); 3477 3367 ··· 3572 3462 return 0; 3573 3463 3574 3464 if (btrfs_header_generation(buf) != trans->transid) 3575 - goto out; 3465 + return 0; 3576 3466 3577 3467 if (root_id != BTRFS_TREE_LOG_OBJECTID) { 3578 3468 ret = check_ref_cleanup(trans, buf->start); 3579 3469 if (!ret) 3580 - goto out; 3470 + return 0; 3581 3471 } 3582 3472 3583 3473 bg = btrfs_lookup_block_group(fs_info, buf->start); ··· 3585 3475 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 3586 3476 pin_down_extent(trans, bg, buf->start, buf->len, true); 3587 3477 btrfs_put_block_group(bg); 3588 - goto out; 3478 + return 0; 3589 3479 } 3590 3480 3591 3481 /* ··· 3609 3499 || btrfs_is_zoned(fs_info)) { 3610 3500 pin_down_extent(trans, bg, buf->start, buf->len, true); 3611 3501 btrfs_put_block_group(bg); 3612 - goto out; 3502 + return 0; 3613 3503 } 3614 3504 3615 3505 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); ··· 3619 3509 btrfs_put_block_group(bg); 3620 3510 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); 3621 3511 3622 - out: 3623 3512 return 0; 3624 3513 } 3625 3514 ··· 4300 4191 else 4301 4192 trans = btrfs_join_transaction(root); 4302 4193 4303 - if (IS_ERR(trans)) { 4304 - ret = PTR_ERR(trans); 4305 - return ret; 4306 - } 4194 + if (IS_ERR(trans)) 4195 + return PTR_ERR(trans); 4307 4196 4308 4197 ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, 4309 4198 CHUNK_ALLOC_FORCE_FOR_EXTENT); ··· 4395 4288 struct find_free_extent_ctl 
*ffe_ctl, 4396 4289 struct btrfs_space_info *space_info) 4397 4290 { 4291 + struct btrfs_block_group *block_group; 4292 + 4398 4293 if (ffe_ctl->for_treelog) { 4399 4294 spin_lock(&fs_info->treelog_bg_lock); 4400 4295 if (fs_info->treelog_bg) 4401 4296 ffe_ctl->hint_byte = fs_info->treelog_bg; 4402 4297 spin_unlock(&fs_info->treelog_bg_lock); 4403 - } else if (ffe_ctl->for_data_reloc) { 4298 + return 0; 4299 + } 4300 + 4301 + if (ffe_ctl->for_data_reloc) { 4404 4302 spin_lock(&fs_info->relocation_bg_lock); 4405 4303 if (fs_info->data_reloc_bg) 4406 4304 ffe_ctl->hint_byte = fs_info->data_reloc_bg; 4407 4305 spin_unlock(&fs_info->relocation_bg_lock); 4408 - } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { 4409 - struct btrfs_block_group *block_group; 4410 - 4411 - spin_lock(&fs_info->zone_active_bgs_lock); 4412 - list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { 4413 - /* 4414 - * No lock is OK here because avail is monotonically 4415 - * decreasing, and this is just a hint. 4416 - */ 4417 - u64 avail = block_group->zone_capacity - block_group->alloc_offset; 4418 - 4419 - if (block_group_bits(block_group, ffe_ctl->flags) && 4420 - block_group->space_info == space_info && 4421 - avail >= ffe_ctl->num_bytes) { 4422 - ffe_ctl->hint_byte = block_group->start; 4423 - break; 4424 - } 4425 - } 4426 - spin_unlock(&fs_info->zone_active_bgs_lock); 4306 + return 0; 4427 4307 } 4308 + 4309 + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) 4310 + return 0; 4311 + 4312 + spin_lock(&fs_info->zone_active_bgs_lock); 4313 + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { 4314 + /* 4315 + * No lock is OK here because avail is monotonically 4316 + * decreasing, and this is just a hint. 
4317 + */ 4318 + u64 avail = block_group->zone_capacity - block_group->alloc_offset; 4319 + 4320 + if (block_group_bits(block_group, ffe_ctl->flags) && 4321 + block_group->space_info == space_info && 4322 + avail >= ffe_ctl->num_bytes) { 4323 + ffe_ctl->hint_byte = block_group->start; 4324 + break; 4325 + } 4326 + } 4327 + spin_unlock(&fs_info->zone_active_bgs_lock); 4428 4328 4429 4329 return 0; 4430 4330 } ··· 4555 4441 block_group->cached != BTRFS_CACHE_NO) { 4556 4442 down_read(&space_info->groups_sem); 4557 4443 if (list_empty(&block_group->list) || 4558 - block_group->ro) { 4444 + block_group->ro || 4445 + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { 4559 4446 /* 4560 4447 * someone is removing this block group, 4561 4448 * we can't jump into the have_block_group ··· 4590 4475 4591 4476 ffe_ctl->hinted = false; 4592 4477 /* If the block group is read-only, we can skip it entirely. */ 4593 - if (unlikely(block_group->ro)) { 4478 + if (unlikely(block_group->ro || 4479 + (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED))) { 4594 4480 if (ffe_ctl->for_treelog) 4595 4481 btrfs_clear_treelog_bg(block_group); 4596 4482 if (ffe_ctl->for_data_reloc) ··· 4678 4562 4679 4563 /* move on to the next group */ 4680 4564 if (ffe_ctl->search_start + ffe_ctl->num_bytes > 4681 - block_group->start + block_group->length) { 4565 + btrfs_block_group_end(block_group)) { 4682 4566 btrfs_add_free_space_unused(block_group, 4683 4567 ffe_ctl->found_offset, 4684 4568 ffe_ctl->num_bytes); ··· 4999 4883 int level = btrfs_delayed_ref_owner(node); 5000 4884 bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); 5001 4885 4886 + if (unlikely(node->ref_root == BTRFS_REMAP_TREE_OBJECTID)) 4887 + goto skip; 4888 + 5002 4889 extent_key.objectid = node->bytenr; 5003 4890 if (skinny_metadata) { 5004 4891 /* The owner of a tree block is the level. 
*/ ··· 5054 4935 5055 4936 btrfs_free_path(path); 5056 4937 4938 + skip: 5057 4939 return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); 5058 4940 } 5059 4941 ··· 5383 5263 * @root: the root we are currently deleting 5384 5264 * @wc: the walk control for this deletion 5385 5265 * @eb: the parent eb that we're currently visiting 5386 - * @refs: the number of refs for wc->level - 1 5387 5266 * @flags: the flags for wc->level - 1 5388 5267 * @slot: the slot in the eb that we're currently checking 5389 5268 * ··· 5577 5458 /* wc->stage == UPDATE_BACKREF */ 5578 5459 if (!(wc->flags[level] & flag)) { 5579 5460 ASSERT(path->locks[level]); 5580 - ret = btrfs_inc_ref(trans, root, eb, 1); 5461 + ret = btrfs_inc_ref(trans, root, eb, true); 5581 5462 if (unlikely(ret)) { 5582 5463 btrfs_abort_transaction(trans, ret); 5583 5464 return ret; 5584 5465 } 5585 - ret = btrfs_dec_ref(trans, root, eb, 0); 5466 + ret = btrfs_dec_ref(trans, root, eb, false); 5586 5467 if (unlikely(ret)) { 5587 5468 btrfs_abort_transaction(trans, ret); 5588 5469 return ret; ··· 5983 5864 5984 5865 if (wc->refs[level] == 1) { 5985 5866 if (level == 0) { 5986 - if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) { 5987 - ret = btrfs_dec_ref(trans, root, eb, 1); 5988 - if (ret) { 5989 - btrfs_abort_transaction(trans, ret); 5990 - return ret; 5991 - } 5992 - } else { 5993 - ret = btrfs_dec_ref(trans, root, eb, 0); 5994 - if (unlikely(ret)) { 5995 - btrfs_abort_transaction(trans, ret); 5996 - return ret; 5997 - } 5867 + const bool full_backref = (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF); 5868 + 5869 + ret = btrfs_dec_ref(trans, root, eb, full_backref); 5870 + if (unlikely(ret)) { 5871 + btrfs_abort_transaction(trans, ret); 5872 + return ret; 5998 5873 } 5999 5874 if (btrfs_is_fstree(btrfs_root_id(root))) { 6000 5875 ret = btrfs_qgroup_trace_leaf_items(trans, eb); ··· 6513 6400 * it while performing the free space search since we have already 6514 6401 * held back allocations. 
6515 6402 */ 6516 - static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) 6403 + static int btrfs_trim_free_extents_throttle(struct btrfs_device *device, 6404 + u64 *trimmed, u64 pos, u64 *ret_next_pos) 6517 6405 { 6518 - u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; 6519 6406 int ret; 6407 + u64 start = pos; 6408 + u64 trim_len = 0; 6520 6409 6521 6410 *trimmed = 0; 6522 6411 ··· 6538 6423 6539 6424 while (1) { 6540 6425 struct btrfs_fs_info *fs_info = device->fs_info; 6426 + u64 cur_start; 6427 + u64 end; 6428 + u64 len; 6541 6429 u64 bytes; 6542 6430 6543 6431 ret = mutex_lock_interruptible(&fs_info->chunk_mutex); 6544 6432 if (ret) 6545 6433 break; 6546 6434 6435 + cur_start = start; 6547 6436 btrfs_find_first_clear_extent_bit(&device->alloc_state, start, 6548 6437 &start, &end, 6549 6438 CHUNK_TRIMMED | CHUNK_ALLOCATED); 6439 + start = max(start, cur_start); 6550 6440 6551 6441 /* Check if there are any CHUNK_* bits left */ 6552 6442 if (start > device->total_bytes) { ··· 6577 6457 end = min(end, device->total_bytes - 1); 6578 6458 6579 6459 len = end - start + 1; 6460 + len = min(len, BTRFS_MAX_TRIM_LENGTH); 6580 6461 6581 6462 /* We didn't find any extents */ 6582 6463 if (!len) { ··· 6598 6477 6599 6478 start += len; 6600 6479 *trimmed += bytes; 6480 + trim_len += len; 6481 + if (trim_len >= BTRFS_MAX_TRIM_LENGTH) { 6482 + *ret_next_pos = start; 6483 + ret = -EAGAIN; 6484 + break; 6485 + } 6601 6486 6602 6487 if (btrfs_trim_interrupted()) { 6603 6488 ret = -ERESTARTSYS; ··· 6616 6489 return ret; 6617 6490 } 6618 6491 6492 + static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed, 6493 + u64 *dev_failed, int *dev_ret) 6494 + { 6495 + struct btrfs_device *dev; 6496 + struct btrfs_device *working_dev = NULL; 6497 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6498 + u8 uuid[BTRFS_UUID_SIZE]; 6499 + u64 start = BTRFS_DEVICE_RANGE_RESERVED; 6500 + 6501 + *trimmed = 0; 6502 + *dev_failed = 
0; 6503 + *dev_ret = 0; 6504 + 6505 + /* Find the device with the smallest UUID to start. */ 6506 + mutex_lock(&fs_devices->device_list_mutex); 6507 + list_for_each_entry(dev, &fs_devices->devices, dev_list) { 6508 + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 6509 + continue; 6510 + if (!working_dev || 6511 + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) 6512 + working_dev = dev; 6513 + } 6514 + if (working_dev) 6515 + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); 6516 + mutex_unlock(&fs_devices->device_list_mutex); 6517 + 6518 + if (!working_dev) 6519 + return 0; 6520 + 6521 + while (1) { 6522 + u64 group_trimmed = 0; 6523 + u64 next_pos = 0; 6524 + int ret = 0; 6525 + 6526 + mutex_lock(&fs_devices->device_list_mutex); 6527 + 6528 + /* Find and trim the current device. */ 6529 + list_for_each_entry(dev, &fs_devices->devices, dev_list) { 6530 + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 6531 + continue; 6532 + if (dev == working_dev) { 6533 + ret = btrfs_trim_free_extents_throttle(working_dev, 6534 + &group_trimmed, start, &next_pos); 6535 + break; 6536 + } 6537 + } 6538 + 6539 + /* Throttle: continue the same device from the new position. */ 6540 + if (ret == -EAGAIN && next_pos > start) { 6541 + mutex_unlock(&fs_devices->device_list_mutex); 6542 + *trimmed += group_trimmed; 6543 + start = next_pos; 6544 + cond_resched(); 6545 + continue; 6546 + } 6547 + 6548 + /* User interrupted. */ 6549 + if (ret == -ERESTARTSYS || ret == -EINTR) { 6550 + mutex_unlock(&fs_devices->device_list_mutex); 6551 + *trimmed += group_trimmed; 6552 + return ret; 6553 + } 6554 + 6555 + /* 6556 + * Device completed (ret == 0), failed, or EAGAIN with no progress. 6557 + * Record error if any, then move to next device. 6558 + */ 6559 + if (ret == -EAGAIN) { 6560 + /* No progress - log and skip device. 
*/ 6561 + btrfs_warn(fs_info, 6562 + "trim throttle: no progress, offset=%llu device %s, skipping", 6563 + start, btrfs_dev_name(working_dev)); 6564 + (*dev_failed)++; 6565 + if (!*dev_ret) 6566 + *dev_ret = ret; 6567 + } else if (ret) { 6568 + /* Device failed with error. */ 6569 + (*dev_failed)++; 6570 + if (!*dev_ret) 6571 + *dev_ret = ret; 6572 + } 6573 + 6574 + /* 6575 + * Find next device: smallest UUID larger than current. 6576 + * Devices added during trim with smaller UUID will be skipped. 6577 + */ 6578 + working_dev = NULL; 6579 + list_for_each_entry(dev, &fs_devices->devices, dev_list) { 6580 + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) 6581 + continue; 6582 + /* Must larger than current UUID. */ 6583 + if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0) 6584 + continue; 6585 + /* Find the smallest. */ 6586 + if (!working_dev || 6587 + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) 6588 + working_dev = dev; 6589 + } 6590 + if (working_dev) 6591 + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); 6592 + 6593 + mutex_unlock(&fs_devices->device_list_mutex); 6594 + 6595 + *trimmed += group_trimmed; 6596 + start = BTRFS_DEVICE_RANGE_RESERVED; 6597 + 6598 + /* No more devices. */ 6599 + if (!working_dev) 6600 + break; 6601 + 6602 + cond_resched(); 6603 + } 6604 + 6605 + return 0; 6606 + } 6607 + 6619 6608 /* 6620 6609 * Trim the whole filesystem by: 6621 6610 * 1) trimming the free space in each block group 6622 6611 * 2) trimming the unallocated space on each device 6623 6612 * 6624 6613 * This will also continue trimming even if a block group or device encounters 6625 - * an error. The return value will be the last error, or 0 if nothing bad 6614 + * an error. The return value will be the first error, or 0 if nothing bad 6626 6615 * happens. 
6627 6616 */ 6628 6617 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) 6629 6618 { 6630 - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 6631 6619 struct btrfs_block_group *cache = NULL; 6632 - struct btrfs_device *device; 6633 6620 u64 group_trimmed; 6634 6621 u64 range_end = U64_MAX; 6635 6622 u64 start; ··· 6774 6533 } 6775 6534 6776 6535 start = max(range->start, cache->start); 6777 - end = min(range_end, cache->start + cache->length); 6536 + end = min(range_end, btrfs_block_group_end(cache)); 6778 6537 6779 6538 if (end - start >= range->minlen) { 6780 6539 if (!btrfs_block_group_done(cache)) { 6781 6540 ret = btrfs_cache_block_group(cache, true); 6782 6541 if (ret) { 6783 6542 bg_failed++; 6784 - bg_ret = ret; 6543 + if (!bg_ret) 6544 + bg_ret = ret; 6785 6545 continue; 6786 6546 } 6787 6547 } ··· 6793 6551 range->minlen); 6794 6552 6795 6553 trimmed += group_trimmed; 6554 + if (ret == -ERESTARTSYS || ret == -EINTR) { 6555 + btrfs_put_block_group(cache); 6556 + break; 6557 + } 6796 6558 if (ret) { 6797 6559 bg_failed++; 6798 - bg_ret = ret; 6560 + if (!bg_ret) 6561 + bg_ret = ret; 6799 6562 continue; 6800 6563 } 6801 6564 } ··· 6808 6561 6809 6562 if (bg_failed) 6810 6563 btrfs_warn(fs_info, 6811 - "failed to trim %llu block group(s), last error %d", 6564 + "failed to trim %llu block group(s), first error %d", 6812 6565 bg_failed, bg_ret); 6813 6566 6814 - mutex_lock(&fs_devices->device_list_mutex); 6815 - list_for_each_entry(device, &fs_devices->devices, dev_list) { 6816 - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) 6817 - continue; 6567 + if (ret == -ERESTARTSYS || ret == -EINTR) 6568 + return ret; 6818 6569 6819 - ret = btrfs_trim_free_extents(device, &group_trimmed); 6820 - 6821 - trimmed += group_trimmed; 6822 - if (ret) { 6823 - dev_failed++; 6824 - dev_ret = ret; 6825 - break; 6826 - } 6827 - } 6828 - mutex_unlock(&fs_devices->device_list_mutex); 6570 + ret = btrfs_trim_free_extents(fs_info, 
&group_trimmed, &dev_failed, &dev_ret); 6571 + trimmed += group_trimmed; 6829 6572 6830 6573 if (dev_failed) 6831 6574 btrfs_warn(fs_info, 6832 - "failed to trim %llu device(s), last error %d", 6575 + "failed to trim %llu device(s), first error %d", 6833 6576 dev_failed, dev_ret); 6834 6577 range->len = trimmed; 6578 + if (ret == -ERESTARTSYS || ret == -EINTR) 6579 + return ret; 6835 6580 if (bg_ret) 6836 6581 return bg_ret; 6837 6582 return dev_ret;

+3 -1

fs/btrfs/extent-tree.h

··· 161 161 struct extent_buffer *parent); 162 162 void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); 163 163 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, 164 - u64 num_bytes, u64 *actual_bytes); 164 + u64 num_bytes, u64 *actual_bytes, bool do_remap); 165 165 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); 166 + void btrfs_handle_fully_remapped_bgs(struct btrfs_fs_info *fs_info); 167 + int btrfs_complete_bg_remapping(struct btrfs_block_group *bg); 166 168 167 169 #endif

+48 -29

fs/btrfs/extent_io.c

··· 440 440 loops = 1; 441 441 goto again; 442 442 } else { 443 - found = false; 444 - goto out_failed; 443 + return false; 445 444 } 446 445 } 447 446 ··· 460 461 } 461 462 *start = delalloc_start; 462 463 *end = delalloc_end; 463 - out_failed: 464 + 464 465 return found; 465 466 } 466 467 ··· 969 970 { 970 971 const u64 ra_pos = readahead_pos(ractl); 971 972 const u64 ra_end = ra_pos + readahead_length(ractl); 972 - const u64 em_end = em->start + em->len; 973 + const u64 em_end = btrfs_extent_map_end(em); 973 974 974 975 /* No expansion for holes and inline extents. */ 975 976 if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) ··· 997 998 u64 start = folio_pos(folio); 998 999 const u64 end = start + folio_size(folio) - 1; 999 1000 u64 extent_offset; 1001 + u64 locked_end; 1000 1002 u64 last_byte = i_size_read(inode); 1001 1003 struct extent_map *em; 1002 1004 int ret = 0; 1003 1005 const size_t blocksize = fs_info->sectorsize; 1006 + 1007 + if (bio_ctrl->ractl) 1008 + locked_end = readahead_pos(bio_ctrl->ractl) + readahead_length(bio_ctrl->ractl) - 1; 1009 + else 1010 + locked_end = end; 1004 1011 1005 1012 ret = set_folio_extent_mapped(folio); 1006 1013 if (ret < 0) { ··· 1041 1036 end_folio_read(folio, true, cur, blocksize); 1042 1037 continue; 1043 1038 } 1044 - em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); 1039 + /* 1040 + * Search extent map for the whole locked range. 1041 + * This will allow btrfs_get_extent() to return a larger hole 1042 + * when possible. 1043 + * This can reduce duplicated btrfs_get_extent() calls for large 1044 + * holes. 
1045 + */ 1046 + em = get_extent_map(BTRFS_I(inode), folio, cur, locked_end - cur + 1, em_cached); 1045 1047 if (IS_ERR(em)) { 1046 1048 end_folio_read(folio, false, cur, end + 1 - cur); 1047 1049 return PTR_ERR(em); ··· 1438 1426 u64 delalloc_start = page_start; 1439 1427 u64 delalloc_end = page_end; 1440 1428 u64 delalloc_to_write = 0; 1429 + unsigned int start_bit; 1430 + unsigned int end_bit; 1441 1431 int ret = 0; 1442 - int bit; 1443 1432 1444 1433 /* Save the dirty bitmap as our submission bitmap will be a subset of it. */ 1445 1434 if (btrfs_is_subpage(fs_info, folio)) { ··· 1450 1437 bio_ctrl->submit_bitmap = 1; 1451 1438 } 1452 1439 1453 - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { 1454 - u64 start = page_start + (bit << fs_info->sectorsize_bits); 1440 + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, 1441 + blocks_per_folio) { 1442 + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); 1443 + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; 1455 1444 1456 - btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); 1445 + btrfs_folio_set_lock(fs_info, folio, start, len); 1457 1446 } 1458 1447 1459 1448 /* Lock all (subpage) delalloc ranges inside the folio first. 
*/ ··· 1572 1557 fs_info->sectorsize_bits, 1573 1558 blocks_per_folio); 1574 1559 1575 - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) 1576 - btrfs_mark_ordered_io_finished(inode, folio, 1577 - page_start + (bit << fs_info->sectorsize_bits), 1578 - fs_info->sectorsize, false); 1560 + for_each_set_bitrange(start_bit, end_bit, &bio_ctrl->submit_bitmap, 1561 + bitmap_size) { 1562 + u64 start = page_start + (start_bit << fs_info->sectorsize_bits); 1563 + u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; 1564 + 1565 + btrfs_mark_ordered_io_finished(inode, folio, start, len, false); 1566 + } 1579 1567 return ret; 1580 1568 } 1581 1569 out: ··· 1616 1598 1617 1599 /* 1618 1600 * Return 0 if we have submitted or queued the sector for submission. 1619 - * Return <0 for critical errors, and the sector will have its dirty flag cleared. 1601 + * Return <0 for critical errors, and the involved sector will be cleaned up. 1620 1602 * 1621 1603 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1622 1604 */ ··· 1641 1623 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1642 1624 if (IS_ERR(em)) { 1643 1625 /* 1626 + * bio_ctrl may contain a bio crossing several folios. 1627 + * Submit it immediately so that the bio has a chance 1628 + * to finish normally, other than marked as error. 1629 + */ 1630 + submit_one_bio(bio_ctrl); 1631 + 1632 + /* 1644 1633 * When submission failed, we should still clear the folio dirty. 1645 1634 * Or the folio will be written back again but without any 1646 1635 * ordered extent. ··· 1655 1630 btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1656 1631 btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1657 1632 btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1633 + 1634 + /* 1635 + * Since there is no bio submitted to finish the ordered 1636 + * extent, we have to manually finish this sector. 
1637 + */ 1638 + btrfs_mark_ordered_io_finished(inode, folio, filepos, 1639 + fs_info->sectorsize, false); 1658 1640 return PTR_ERR(em); 1659 1641 } 1660 1642 ··· 1746 1714 return ret; 1747 1715 } 1748 1716 1749 - for (cur = start; cur < end; cur += fs_info->sectorsize) 1750 - set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); 1717 + bitmap_set(&range_bitmap, (start - folio_pos(folio)) >> fs_info->sectorsize_bits, 1718 + len >> fs_info->sectorsize_bits); 1751 1719 bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, 1752 1720 blocks_per_folio); 1753 1721 ··· 1788 1756 } 1789 1757 ret = submit_one_sector(inode, folio, cur, bio_ctrl, i_size); 1790 1758 if (unlikely(ret < 0)) { 1791 - /* 1792 - * bio_ctrl may contain a bio crossing several folios. 1793 - * Submit it immediately so that the bio has a chance 1794 - * to finish normally, other than marked as error. 1795 - */ 1796 - submit_one_bio(bio_ctrl); 1797 - /* 1798 - * Failed to grab the extent map which should be very rare. 1799 - * Since there is no bio submitted to finish the ordered 1800 - * extent, we have to manually finish this sector. 1801 - */ 1802 - btrfs_mark_ordered_io_finished(inode, folio, cur, 1803 - fs_info->sectorsize, false); 1804 1759 if (!found_error) 1805 1760 found_error = ret; 1806 1761 continue;

+12

fs/btrfs/extent_map.c

··· 319 319 /* Internal sanity checks for btrfs debug builds. */ 320 320 static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map *em) 321 321 { 322 + const u32 blocksize = fs_info->sectorsize; 323 + 322 324 if (!IS_ENABLED(CONFIG_BTRFS_DEBUG)) 323 325 return; 326 + 327 + if (!IS_ALIGNED(em->start, blocksize) || 328 + !IS_ALIGNED(em->len, blocksize)) 329 + dump_extent_map(fs_info, "unaligned start offset or length members", em); 330 + 324 331 if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { 325 332 if (em->disk_num_bytes == 0) 326 333 dump_extent_map(fs_info, "zero disk_num_bytes", em); ··· 341 334 dump_extent_map(fs_info, 342 335 "ram_bytes mismatch with disk_num_bytes for non-compressed em", 343 336 em); 337 + if (!IS_ALIGNED(em->disk_bytenr, blocksize) || 338 + !IS_ALIGNED(em->disk_num_bytes, blocksize) || 339 + !IS_ALIGNED(em->offset, blocksize) || 340 + !IS_ALIGNED(em->ram_bytes, blocksize)) 341 + dump_extent_map(fs_info, "unaligned members", em); 344 342 } else if (em->offset) { 345 343 dump_extent_map(fs_info, "non-zero offset for hole/inline", em); 346 344 }

+8 -12

fs/btrfs/file-item.c

··· 8 8 #include <linux/pagemap.h> 9 9 #include <linux/highmem.h> 10 10 #include <linux/sched/mm.h> 11 - #include <crypto/hash.h> 12 11 #include "messages.h" 13 12 #include "ctree.h" 14 13 #include "disk-io.h" ··· 768 769 { 769 770 struct btrfs_inode *inode = bbio->inode; 770 771 struct btrfs_fs_info *fs_info = inode->root->fs_info; 771 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 772 772 struct bio *bio = &bbio->bio; 773 773 struct btrfs_ordered_sum *sums = bbio->sums; 774 774 struct bvec_iter iter = *src; ··· 778 780 phys_addr_t paddrs[BTRFS_MAX_BLOCKSIZE / PAGE_SIZE]; 779 781 u32 offset = 0; 780 782 int index = 0; 781 - 782 - shash->tfm = fs_info->csum_shash; 783 783 784 784 btrfs_bio_for_each_block(paddr, bio, &iter, step) { 785 785 paddrs[(offset / step) % nr_steps] = paddr; ··· 1134 1138 } 1135 1139 ret = PTR_ERR(item); 1136 1140 if (ret != -EFBIG && ret != -ENOENT) 1137 - goto out; 1141 + return ret; 1138 1142 1139 1143 if (ret == -EFBIG) { 1140 1144 u32 item_size; ··· 1150 1154 /* We didn't find a csum item, insert one. 
*/ 1151 1155 ret = find_next_csum_offset(root, path, &next_offset); 1152 1156 if (ret < 0) 1153 - goto out; 1157 + return ret; 1154 1158 found_next = 1; 1155 1159 goto insert; 1156 1160 } ··· 1178 1182 csum_size, 1); 1179 1183 path->search_for_extension = false; 1180 1184 if (ret < 0) 1181 - goto out; 1185 + return ret; 1182 1186 1183 1187 if (ret > 0) { 1184 1188 if (path->slots[0] == 0) ··· 1234 1238 btrfs_header_nritems(path->nodes[0])) { 1235 1239 ret = find_next_csum_offset(root, path, &next_offset); 1236 1240 if (ret < 0) 1237 - goto out; 1241 + return ret; 1238 1242 found_next = 1; 1239 1243 goto insert; 1240 1244 } 1241 1245 1242 1246 ret = find_next_csum_offset(root, path, &next_offset); 1243 1247 if (ret < 0) 1244 - goto out; 1248 + return ret; 1245 1249 1246 1250 tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; 1247 1251 if (tmp <= INT_MAX) ··· 1282 1286 ret = btrfs_insert_empty_item(trans, root, path, &file_key, 1283 1287 ins_size); 1284 1288 if (ret < 0) 1285 - goto out; 1289 + return ret; 1286 1290 leaf = path->nodes[0]; 1287 1291 csum: 1288 1292 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); ··· 1307 1311 cond_resched(); 1308 1312 goto again; 1309 1313 } 1310 - out: 1311 - return ret; 1314 + 1315 + return 0; 1312 1316 } 1313 1317 1314 1318 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,

+29 -31

fs/btrfs/file.c

··· 566 566 int del_nr = 0; 567 567 int del_slot = 0; 568 568 int recow; 569 - int ret = 0; 569 + int ret; 570 570 u64 ino = btrfs_ino(inode); 571 571 572 572 path = btrfs_alloc_path(); ··· 581 581 582 582 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 583 583 if (ret < 0) 584 - goto out; 584 + return ret; 585 585 if (ret > 0 && path->slots[0] > 0) 586 586 path->slots[0]--; 587 587 ··· 590 590 if (unlikely(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)) { 591 591 ret = -EINVAL; 592 592 btrfs_abort_transaction(trans, ret); 593 - goto out; 593 + return ret; 594 594 } 595 595 fi = btrfs_item_ptr(leaf, path->slots[0], 596 596 struct btrfs_file_extent_item); 597 597 if (unlikely(btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC)) { 598 598 ret = -EINVAL; 599 599 btrfs_abort_transaction(trans, ret); 600 - goto out; 600 + return ret; 601 601 } 602 602 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); 603 603 if (unlikely(key.offset > start || extent_end < end)) { 604 604 ret = -EINVAL; 605 605 btrfs_abort_transaction(trans, ret); 606 - goto out; 606 + return ret; 607 607 } 608 608 609 609 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); ··· 633 633 trans->transid); 634 634 btrfs_set_file_extent_num_bytes(leaf, fi, 635 635 end - other_start); 636 - goto out; 636 + return 0; 637 637 } 638 638 } 639 639 ··· 661 661 other_end - start); 662 662 btrfs_set_file_extent_offset(leaf, fi, 663 663 start - orig_offset); 664 - goto out; 664 + return 0; 665 665 } 666 666 } 667 667 ··· 677 677 } 678 678 if (unlikely(ret < 0)) { 679 679 btrfs_abort_transaction(trans, ret); 680 - goto out; 680 + return ret; 681 681 } 682 682 683 683 leaf = path->nodes[0]; ··· 705 705 ret = btrfs_inc_extent_ref(trans, &ref); 706 706 if (unlikely(ret)) { 707 707 btrfs_abort_transaction(trans, ret); 708 - goto out; 708 + return ret; 709 709 } 710 710 711 711 if (split == start) { ··· 714 714 if (unlikely(start != key.offset)) { 715 715 ret = -EINVAL; 716 716 
btrfs_abort_transaction(trans, ret); 717 - goto out; 717 + return ret; 718 718 } 719 719 path->slots[0]--; 720 720 extent_end = end; ··· 745 745 ret = btrfs_free_extent(trans, &ref); 746 746 if (unlikely(ret)) { 747 747 btrfs_abort_transaction(trans, ret); 748 - goto out; 748 + return ret; 749 749 } 750 750 } 751 751 other_start = 0; ··· 763 763 ret = btrfs_free_extent(trans, &ref); 764 764 if (unlikely(ret)) { 765 765 btrfs_abort_transaction(trans, ret); 766 - goto out; 766 + return ret; 767 767 } 768 768 } 769 769 if (del_nr == 0) { ··· 784 784 ret = btrfs_del_items(trans, root, path, del_slot, del_nr); 785 785 if (unlikely(ret < 0)) { 786 786 btrfs_abort_transaction(trans, ret); 787 - goto out; 787 + return ret; 788 788 } 789 789 } 790 - out: 791 - return ret; 790 + 791 + return 0; 792 792 } 793 793 794 794 /* ··· 860 860 fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | 861 861 fgf_set_order(write_bytes); 862 862 struct folio *folio; 863 - int ret = 0; 863 + int ret; 864 864 865 865 again: 866 866 folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); ··· 877 877 if (ret) { 878 878 /* The folio is already unlocked. 
*/ 879 879 folio_put(folio); 880 - if (!nowait && ret == -EAGAIN) { 881 - ret = 0; 880 + if (!nowait && ret == -EAGAIN) 882 881 goto again; 883 - } 884 882 return ret; 885 883 } 886 884 *folio_ret = folio; ··· 1273 1275 btrfs_delalloc_release_extents(inode, reserved_len); 1274 1276 release_space(inode, *data_reserved, reserved_start, reserved_len, 1275 1277 only_release_metadata); 1276 - ret = extents_locked; 1277 - return ret; 1278 + return extents_locked; 1278 1279 } 1279 1280 1280 1281 copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), ··· 1438 1441 struct btrfs_inode *inode = BTRFS_I(file_inode(file)); 1439 1442 ssize_t num_written, num_sync; 1440 1443 1441 - if (unlikely(btrfs_is_shutdown(inode->root->fs_info))) 1444 + if (btrfs_is_shutdown(inode->root->fs_info)) 1442 1445 return -EIO; 1443 1446 /* 1444 1447 * If the fs flips readonly due to some impossible error, although we ··· 2043 2046 struct file *filp = desc->file; 2044 2047 struct address_space *mapping = filp->f_mapping; 2045 2048 2046 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(filp))))) 2049 + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(filp)))) 2047 2050 return -EIO; 2048 2051 if (!mapping->a_ops->read_folio) 2049 2052 return -ENOEXEC; ··· 2196 2199 2197 2200 /* Hole or vacuum extent(only exists in no-hole mode) */ 2198 2201 if (em->disk_bytenr == EXTENT_MAP_HOLE) { 2202 + const u64 em_end = btrfs_extent_map_end(em); 2203 + 2199 2204 ret = 1; 2200 - *len = em->start + em->len > *start + *len ? 2201 - 0 : *start + *len - em->start - em->len; 2202 - *start = em->start + em->len; 2205 + *len = (em_end > *start + *len) ? 0 : (*start + *len - em_end); 2206 + *start = em_end; 2203 2207 } 2204 2208 btrfs_free_extent_map(em); 2205 2209 return ret; ··· 2949 2951 * new prealloc extent, so that we get a larger contiguous disk extent. 
2950 2952 */ 2951 2953 if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { 2952 - const u64 em_end = em->start + em->len; 2954 + const u64 em_end = btrfs_extent_map_end(em); 2953 2955 2954 2956 if (em_end >= offset + len) { 2955 2957 /* ··· 3115 3117 int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; 3116 3118 int ret; 3117 3119 3118 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) 3120 + if (btrfs_is_shutdown(inode_to_fs_info(inode))) 3119 3121 return -EIO; 3120 3122 3121 3123 /* Do not allow fallocate in ZONED mode */ ··· 3809 3811 { 3810 3812 int ret; 3811 3813 3812 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(inode)))) 3814 + if (btrfs_is_shutdown(inode_to_fs_info(inode))) 3813 3815 return -EIO; 3814 3816 3815 3817 filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ··· 3824 3826 { 3825 3827 ssize_t ret = 0; 3826 3828 3827 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp))))) 3829 + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(iocb->ki_filp)))) 3828 3830 return -EIO; 3829 3831 3830 3832 if (iocb->ki_flags & IOCB_DIRECT) { ··· 3841 3843 struct pipe_inode_info *pipe, 3842 3844 size_t len, unsigned int flags) 3843 3845 { 3844 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(in))))) 3846 + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(in)))) 3845 3847 return -EIO; 3846 3848 3847 3849 return filemap_splice_read(in, ppos, pipe, len, flags);

+77 -31

fs/btrfs/free-space-cache.c

··· 29 29 #include "file-item.h" 30 30 #include "file.h" 31 31 #include "super.h" 32 + #include "relocation.h" 32 33 33 34 #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) 34 35 #define MAX_CACHE_BYTES_PER_GIG SZ_64K ··· 1080 1079 struct btrfs_trim_range *trim_entry; 1081 1080 1082 1081 /* Get the cluster for this block_group if it exists */ 1083 - if (block_group && !list_empty(&block_group->cluster_list)) { 1082 + if (!list_empty(&block_group->cluster_list)) { 1084 1083 cluster = list_first_entry(&block_group->cluster_list, 1085 1084 struct btrfs_free_cluster, block_group_list); 1086 1085 } ··· 1162 1161 if (ret < 0) { 1163 1162 btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, 1164 1163 EXTENT_DELALLOC, NULL); 1165 - goto fail; 1164 + return ret; 1166 1165 } 1167 1166 leaf = path->nodes[0]; 1168 1167 if (ret > 0) { ··· 1176 1175 inode->i_size - 1, EXTENT_DELALLOC, 1177 1176 NULL); 1178 1177 btrfs_release_path(path); 1179 - goto fail; 1178 + return -ENOENT; 1180 1179 } 1181 1180 } 1182 1181 ··· 1189 1188 btrfs_release_path(path); 1190 1189 1191 1190 return 0; 1192 - 1193 - fail: 1194 - return -1; 1195 1191 } 1196 1192 1197 1193 static noinline_for_stack int write_pinned_extent_entries( ··· 1198 1200 int *entries) 1199 1201 { 1200 1202 u64 start, extent_start, extent_end, len; 1203 + const u64 block_group_end = btrfs_block_group_end(block_group); 1201 1204 struct extent_io_tree *unpin = NULL; 1202 1205 int ret; 1203 - 1204 - if (!block_group) 1205 - return 0; 1206 1206 1207 1207 /* 1208 1208 * We want to add any pinned extents to our free space cache ··· 1213 1217 1214 1218 start = block_group->start; 1215 1219 1216 - while (start < block_group->start + block_group->length) { 1220 + while (start < block_group_end) { 1217 1221 if (!btrfs_find_first_extent_bit(unpin, start, 1218 1222 &extent_start, &extent_end, 1219 1223 EXTENT_DIRTY, NULL)) 1220 1224 return 0; 1221 1225 1222 1226 /* This pinned extent is out of our range */ 1223 - if (extent_start >= 
block_group->start + block_group->length) 1227 + if (extent_start >= block_group_end) 1224 1228 return 0; 1225 1229 1226 1230 extent_start = max(extent_start, start); 1227 - extent_end = min(block_group->start + block_group->length, 1228 - extent_end + 1); 1231 + extent_end = min(block_group_end, extent_end + 1); 1229 1232 len = extent_end - extent_start; 1230 1233 1231 1234 *entries += 1; ··· 1369 1374 static int __btrfs_write_out_cache(struct inode *inode, 1370 1375 struct btrfs_free_space_ctl *ctl, 1371 1376 struct btrfs_block_group *block_group, 1372 - struct btrfs_io_ctl *io_ctl, 1373 1377 struct btrfs_trans_handle *trans) 1374 1378 { 1379 + struct btrfs_io_ctl *io_ctl = &block_group->io_ctl; 1375 1380 struct extent_state *cached_state = NULL; 1376 1381 LIST_HEAD(bitmap_list); 1377 1382 int entries = 0; ··· 1388 1393 if (ret) 1389 1394 return ret; 1390 1395 1391 - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { 1396 + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { 1392 1397 down_write(&block_group->data_rwsem); 1393 1398 spin_lock(&block_group->lock); 1394 1399 if (block_group->delalloc_bytes) { ··· 1460 1465 goto out_nospc; 1461 1466 } 1462 1467 1463 - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1468 + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1464 1469 up_write(&block_group->data_rwsem); 1465 1470 /* 1466 1471 * Release the pages and unlock the extent, we will flush ··· 1495 1500 cleanup_write_cache_enospc(inode, io_ctl, &cached_state); 1496 1501 1497 1502 out_unlock: 1498 - if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 1503 + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 1499 1504 up_write(&block_group->data_rwsem); 1500 1505 1501 1506 out: ··· 1531 1536 if (IS_ERR(inode)) 1532 1537 return 0; 1533 1538 1534 - ret = __btrfs_write_out_cache(inode, ctl, block_group, 1535 - &block_group->io_ctl, trans); 1539 + ret = __btrfs_write_out_cache(inode, ctl, block_group, trans); 1536 1540 if 
(ret) { 1537 1541 btrfs_debug(fs_info, 1538 1542 "failed to write free space cache for block group %llu error %d", ··· 2014 2020 int ret; 2015 2021 2016 2022 if (!ctl->free_space_offset.rb_node) 2017 - goto out; 2023 + return NULL; 2018 2024 again: 2019 2025 if (use_bytes_index) { 2020 2026 node = rb_first_cached(&ctl->free_space_bytes); ··· 2022 2028 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 2023 2029 0, 1); 2024 2030 if (!entry) 2025 - goto out; 2031 + return NULL; 2026 2032 node = &entry->offset_index; 2027 2033 } 2028 2034 ··· 2106 2112 *bytes = entry->bytes - align_off; 2107 2113 return entry; 2108 2114 } 2109 - out: 2115 + 2110 2116 return NULL; 2111 2117 } 2112 2118 ··· 2750 2756 { 2751 2757 enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; 2752 2758 2759 + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) 2760 + return 0; 2761 + 2753 2762 if (btrfs_is_zoned(block_group->fs_info)) 2754 2763 return __btrfs_add_free_space_zoned(block_group, bytenr, size, 2755 2764 true); ··· 2891 2894 old_end - (offset + bytes), 2892 2895 info->trim_state); 2893 2896 WARN_ON(ret); 2894 - goto out; 2897 + return ret; 2895 2898 } 2896 2899 } 2897 2900 ··· 2903 2906 out_lock: 2904 2907 btrfs_discard_update_discardable(block_group); 2905 2908 spin_unlock(&ctl->tree_lock); 2906 - out: 2909 + 2907 2910 return ret; 2908 2911 } 2909 2912 ··· 3059 3062 struct btrfs_free_space *info; 3060 3063 struct rb_node *node; 3061 3064 bool ret = true; 3065 + 3066 + if (block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED && 3067 + !test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &block_group->runtime_flags) && 3068 + block_group->identity_remap_count == 0) { 3069 + return true; 3070 + } 3062 3071 3063 3072 spin_lock(&ctl->tree_lock); 3064 3073 node = rb_first(&ctl->free_space_offset); ··· 3677 3674 } 3678 3675 spin_unlock(&space_info->lock); 3679 3676 3680 - ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); 3677 + ret = btrfs_discard_extent(fs_info, 
start, bytes, &trimmed, false); 3681 3678 if (!ret) { 3682 3679 *total_trimmed += trimmed; 3683 3680 trim_state = BTRFS_TRIM_STATE_TRIMMED; ··· 3834 3831 return ret; 3835 3832 } 3836 3833 3834 + void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg) 3835 + { 3836 + struct btrfs_fs_info *fs_info = bg->fs_info; 3837 + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; 3838 + int ret = 0; 3839 + u64 bytes, trimmed; 3840 + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); 3841 + u64 end = btrfs_block_group_end(bg); 3842 + 3843 + if (!test_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags)) { 3844 + bg->discard_cursor = end; 3845 + 3846 + if (bg->used == 0) { 3847 + spin_lock(&fs_info->unused_bgs_lock); 3848 + if (!list_empty(&bg->bg_list)) { 3849 + list_del_init(&bg->bg_list); 3850 + btrfs_put_block_group(bg); 3851 + } 3852 + spin_unlock(&fs_info->unused_bgs_lock); 3853 + 3854 + btrfs_mark_bg_unused(bg); 3855 + } 3856 + 3857 + return; 3858 + } 3859 + 3860 + bytes = end - bg->discard_cursor; 3861 + 3862 + if (max_discard_size && 3863 + bytes >= (max_discard_size + BTRFS_ASYNC_DISCARD_MIN_FILTER)) 3864 + bytes = max_discard_size; 3865 + 3866 + ret = btrfs_discard_extent(fs_info, bg->discard_cursor, bytes, &trimmed, false); 3867 + if (ret) 3868 + return; 3869 + 3870 + bg->discard_cursor += trimmed; 3871 + 3872 + if (bg->discard_cursor < end) 3873 + return; 3874 + 3875 + btrfs_complete_bg_remapping(bg); 3876 + } 3877 + 3837 3878 /* 3838 3879 * If we break out of trimming a bitmap prematurely, we should reset the 3839 3880 * trimming bit. 
In a rather contrived case, it's possible to race here so ··· 4003 3956 if (async && *total_trimmed) { 4004 3957 spin_unlock(&ctl->tree_lock); 4005 3958 mutex_unlock(&ctl->cache_writeout_mutex); 4006 - goto out; 3959 + return ret; 4007 3960 } 4008 3961 4009 3962 bytes = min(bytes, end - start); ··· 4064 4017 if (offset >= end) 4065 4018 block_group->discard_cursor = end; 4066 4019 4067 - out: 4068 4020 return ret; 4069 4021 } 4070 4022 ··· 4156 4110 { 4157 4111 struct btrfs_block_group *block_group; 4158 4112 struct rb_node *node; 4159 - int ret = 0; 4160 4113 4161 4114 btrfs_info(fs_info, "cleaning free space cache v1"); 4162 4115 4163 4116 node = rb_first_cached(&fs_info->block_group_cache_tree); 4164 4117 while (node) { 4118 + int ret; 4119 + 4165 4120 block_group = rb_entry(node, struct btrfs_block_group, cache_node); 4166 4121 ret = btrfs_remove_free_space_inode(trans, NULL, block_group); 4167 4122 if (ret) 4168 - goto out; 4123 + return ret; 4169 4124 node = rb_next(node); 4170 4125 } 4171 - out: 4172 - return ret; 4126 + return 0; 4173 4127 } 4174 4128 4175 4129 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active)

+1

fs/btrfs/free-space-cache.h

··· 166 166 int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, 167 167 u64 *trimmed, u64 start, u64 end, u64 minlen, 168 168 u64 maxlen, bool async); 169 + void btrfs_trim_fully_remapped_block_group(struct btrfs_block_group *bg); 169 170 170 171 bool btrfs_free_space_cache_v1_active(struct btrfs_fs_info *fs_info); 171 172 int btrfs_set_free_space_cache_v1_active(struct btrfs_fs_info *fs_info, bool active);

+121 -29

fs/btrfs/free-space-tree.c

··· 21 21 struct btrfs_block_group *block_group, 22 22 struct btrfs_path *path); 23 23 24 - static struct btrfs_root *btrfs_free_space_root( 25 - struct btrfs_block_group *block_group) 24 + struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group) 26 25 { 27 26 struct btrfs_key key = { 28 27 .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, ··· 92 93 return 0; 93 94 } 94 95 95 - EXPORT_FOR_TESTS 96 96 struct btrfs_free_space_info *btrfs_search_free_space_info( 97 97 struct btrfs_trans_handle *trans, 98 98 struct btrfs_block_group *block_group, ··· 218 220 return 0; 219 221 220 222 start = block_group->start; 221 - end = block_group->start + block_group->length; 223 + end = btrfs_block_group_end(block_group); 222 224 223 225 key.objectid = end - 1; 224 226 key.type = (u8)-1; ··· 358 360 return 0; 359 361 360 362 start = block_group->start; 361 - end = block_group->start + block_group->length; 363 + end = btrfs_block_group_end(block_group); 362 364 363 365 key.objectid = end - 1; 364 366 key.type = (u8)-1; ··· 665 667 * Read the bit for the block immediately after the extent of space if 666 668 * that block is within the block group. 667 669 */ 668 - if (end < block_group->start + block_group->length) { 670 + if (end < btrfs_block_group_end(block_group)) { 669 671 /* The next block may be in the next bitmap. */ 670 672 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 671 673 if (end >= key.objectid + key.offset) { ··· 938 940 939 941 right: 940 942 /* Search for a neighbor on the right. */ 941 - if (end == block_group->start + block_group->length) 943 + if (end == btrfs_block_group_end(block_group)) 942 944 goto insert; 943 945 key.objectid = end; 944 946 key.type = (u8)-1; ··· 1104 1106 * highest, block group). 
1105 1107 */ 1106 1108 start = block_group->start; 1107 - end = block_group->start + block_group->length; 1109 + end = btrfs_block_group_end(block_group); 1108 1110 while (ret == 0) { 1109 1111 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 1110 1112 ··· 1394 1396 * can use multiple transactions, every time btrfs_end_transaction() is 1395 1397 * called at btrfs_rebuild_free_space_tree() we finish the creation of 1396 1398 * new block groups by calling btrfs_create_pending_block_groups(), and 1397 - * that in turn calls us, through add_block_group_free_space(), to add 1398 - * a free space info item and a free space extent item for the block 1399 - * group. 1399 + * that in turn calls us, through btrfs_add_block_group_free_space(), 1400 + * to add a free space info item and a free space extent item for the 1401 + * block group. 1400 1402 * 1401 1403 * Then later btrfs_rebuild_free_space_tree() may find such new block 1402 1404 * groups and processes them with populate_free_space_tree(), which can ··· 1477 1479 } 1478 1480 1479 1481 start = block_group->start; 1480 - end = block_group->start + block_group->length; 1482 + end = btrfs_block_group_end(block_group); 1481 1483 1482 1484 key.objectid = end - 1; 1483 1485 key.type = (u8)-1; ··· 1523 1525 btrfs_release_path(path); 1524 1526 } 1525 1527 1526 - ret = 0; 1527 - 1528 - return ret; 1528 + return 0; 1529 1529 } 1530 1530 1531 1531 static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, 1532 1532 struct btrfs_path *path, 1533 1533 u32 expected_extent_count) 1534 1534 { 1535 - struct btrfs_block_group *block_group; 1536 - struct btrfs_fs_info *fs_info; 1535 + struct btrfs_block_group *block_group = caching_ctl->block_group; 1536 + struct btrfs_fs_info *fs_info = block_group->fs_info; 1537 1537 struct btrfs_root *root; 1538 1538 struct btrfs_key key; 1539 1539 bool prev_bit_set = false; 1540 1540 /* Initialize to silence GCC. 
*/ 1541 1541 u64 extent_start = 0; 1542 - u64 end, offset; 1542 + const u64 end = btrfs_block_group_end(block_group); 1543 + u64 offset; 1543 1544 u64 total_found = 0; 1544 1545 u32 extent_count = 0; 1545 1546 int ret; 1546 1547 1547 - block_group = caching_ctl->block_group; 1548 - fs_info = block_group->fs_info; 1549 1548 root = btrfs_free_space_root(block_group); 1550 - 1551 - end = block_group->start + block_group->length; 1552 1549 1553 1550 while (1) { 1554 1551 ret = btrfs_next_item(root, path); ··· 1610 1617 struct btrfs_path *path, 1611 1618 u32 expected_extent_count) 1612 1619 { 1613 - struct btrfs_block_group *block_group; 1614 - struct btrfs_fs_info *fs_info; 1620 + struct btrfs_block_group *block_group = caching_ctl->block_group; 1621 + struct btrfs_fs_info *fs_info = block_group->fs_info; 1615 1622 struct btrfs_root *root; 1616 1623 struct btrfs_key key; 1617 - u64 end; 1624 + const u64 end = btrfs_block_group_end(block_group); 1618 1625 u64 total_found = 0; 1619 1626 u32 extent_count = 0; 1620 1627 int ret; 1621 1628 1622 - block_group = caching_ctl->block_group; 1623 - fs_info = block_group->fs_info; 1624 1629 root = btrfs_free_space_root(block_group); 1625 - 1626 - end = block_group->start + block_group->length; 1627 1630 1628 1631 while (1) { 1629 1632 u64 space_added; ··· 1700 1711 return load_free_space_bitmaps(caching_ctl, path, extent_count); 1701 1712 else 1702 1713 return load_free_space_extents(caching_ctl, path, extent_count); 1714 + } 1715 + 1716 + static int delete_orphan_free_space_entries(struct btrfs_root *fst_root, 1717 + struct btrfs_path *path, 1718 + u64 first_bg_bytenr) 1719 + { 1720 + struct btrfs_trans_handle *trans; 1721 + int ret; 1722 + 1723 + trans = btrfs_start_transaction(fst_root, 1); 1724 + if (IS_ERR(trans)) 1725 + return PTR_ERR(trans); 1726 + 1727 + while (true) { 1728 + struct btrfs_key key = { 0 }; 1729 + int i; 1730 + 1731 + ret = btrfs_search_slot(trans, fst_root, &key, path, -1, 1); 1732 + if (ret < 0) 1733 + 
break; 1734 + ASSERT(ret > 0); 1735 + ret = 0; 1736 + for (i = 0; i < btrfs_header_nritems(path->nodes[0]); i++) { 1737 + btrfs_item_key_to_cpu(path->nodes[0], &key, i); 1738 + if (key.objectid >= first_bg_bytenr) { 1739 + /* 1740 + * Only break the for() loop and continue to 1741 + * delete items. 1742 + */ 1743 + break; 1744 + } 1745 + } 1746 + /* No items to delete, finished. */ 1747 + if (i == 0) 1748 + break; 1749 + 1750 + ret = btrfs_del_items(trans, fst_root, path, 0, i); 1751 + if (ret < 0) 1752 + break; 1753 + btrfs_release_path(path); 1754 + } 1755 + btrfs_release_path(path); 1756 + btrfs_end_transaction(trans); 1757 + if (ret == 0) 1758 + btrfs_info(fst_root->fs_info, "deleted orphan free space tree entries"); 1759 + return ret; 1760 + } 1761 + 1762 + /* Remove any free space entry before the first block group. */ 1763 + int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info) 1764 + { 1765 + BTRFS_PATH_AUTO_RELEASE(path); 1766 + struct btrfs_key key = { 1767 + .objectid = BTRFS_FREE_SPACE_TREE_OBJECTID, 1768 + .type = BTRFS_ROOT_ITEM_KEY, 1769 + .offset = 0, 1770 + }; 1771 + struct btrfs_root *root; 1772 + struct btrfs_block_group *bg; 1773 + u64 first_bg_bytenr; 1774 + int ret; 1775 + 1776 + /* 1777 + * Extent tree v2 has multiple global roots based on the block group. 1778 + * This means we cannot easily grab the global free space tree and locate 1779 + * orphan items. Furthermore this is still experimental, all users 1780 + * should use the latest btrfs-progs anyway. 
1781 + */ 1782 + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) 1783 + return 0; 1784 + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) 1785 + return 0; 1786 + root = btrfs_global_root(fs_info, &key); 1787 + if (!root) 1788 + return 0; 1789 + 1790 + key.objectid = 0; 1791 + key.type = 0; 1792 + key.offset = 0; 1793 + 1794 + bg = btrfs_lookup_first_block_group(fs_info, 0); 1795 + if (unlikely(!bg)) { 1796 + btrfs_err(fs_info, "no block group found"); 1797 + return -EUCLEAN; 1798 + } 1799 + first_bg_bytenr = bg->start; 1800 + btrfs_put_block_group(bg); 1801 + 1802 + ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); 1803 + if (ret < 0) 1804 + return ret; 1805 + /* There should not be an all-zero key in fst. */ 1806 + ASSERT(ret > 0); 1807 + 1808 + /* Empty free space tree. */ 1809 + if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) 1810 + return 0; 1811 + 1812 + btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); 1813 + if (key.objectid >= first_bg_bytenr) 1814 + return 0; 1815 + btrfs_release_path(&path); 1816 + return delete_orphan_free_space_entries(root, &path, first_bg_bytenr); 1703 1817 }

+4 -2

fs/btrfs/free-space-tree.h

··· 35 35 u64 start, u64 size); 36 36 int btrfs_remove_from_free_space_tree(struct btrfs_trans_handle *trans, 37 37 u64 start, u64 size); 38 - 39 - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 38 + int btrfs_delete_orphan_free_space_entries(struct btrfs_fs_info *fs_info); 40 39 struct btrfs_free_space_info * 41 40 btrfs_search_free_space_info(struct btrfs_trans_handle *trans, 42 41 struct btrfs_block_group *block_group, 43 42 struct btrfs_path *path, int cow); 43 + struct btrfs_root *btrfs_free_space_root(struct btrfs_block_group *block_group); 44 + 45 + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 44 46 int __btrfs_add_to_free_space_tree(struct btrfs_trans_handle *trans, 45 47 struct btrfs_block_group *block_group, 46 48 struct btrfs_path *path, u64 start, u64 size);

+87 -15

fs/btrfs/fs.c

··· 1 1 // SPDX-License-Identifier: GPL-2.0 2 2 3 + #include <linux/crc32.h> 3 4 #include "messages.h" 4 5 #include "fs.h" 5 6 #include "accessors.h" ··· 9 8 static const struct btrfs_csums { 10 9 u16 size; 11 10 const char name[10]; 12 - const char driver[12]; 13 11 } btrfs_csums[] = { 14 12 [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" }, 15 13 [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" }, 16 14 [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" }, 17 - [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b", 18 - .driver = "blake2b-256" }, 15 + [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b" }, 19 16 }; 20 17 21 18 /* This exists for btrfs-progs usages. */ ··· 36 37 return btrfs_csums[csum_type].name; 37 38 } 38 39 39 - /* 40 - * Return driver name if defined, otherwise the name that's also a valid driver 41 - * name. 42 - */ 43 - const char *btrfs_super_csum_driver(u16 csum_type) 44 - { 45 - /* csum type is validated at mount time */ 46 - return btrfs_csums[csum_type].driver[0] ? 47 - btrfs_csums[csum_type].driver : 48 - btrfs_csums[csum_type].name; 49 - } 50 - 51 40 size_t __attribute_const__ btrfs_get_num_csums(void) 52 41 { 53 42 return ARRAY_SIZE(btrfs_csums); 43 + } 44 + 45 + void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out) 46 + { 47 + switch (csum_type) { 48 + case BTRFS_CSUM_TYPE_CRC32: 49 + put_unaligned_le32(~crc32c(~0, data, len), out); 50 + break; 51 + case BTRFS_CSUM_TYPE_XXHASH: 52 + put_unaligned_le64(xxh64(data, len, 0), out); 53 + break; 54 + case BTRFS_CSUM_TYPE_SHA256: 55 + sha256(data, len, out); 56 + break; 57 + case BTRFS_CSUM_TYPE_BLAKE2: 58 + blake2b(NULL, 0, data, len, out, 32); 59 + break; 60 + default: 61 + /* Checksum type is validated at mount time. 
*/ 62 + BUG(); 63 + } 64 + } 65 + 66 + void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type) 67 + { 68 + ctx->csum_type = csum_type; 69 + switch (ctx->csum_type) { 70 + case BTRFS_CSUM_TYPE_CRC32: 71 + ctx->crc32 = ~0; 72 + break; 73 + case BTRFS_CSUM_TYPE_XXHASH: 74 + xxh64_reset(&ctx->xxh64, 0); 75 + break; 76 + case BTRFS_CSUM_TYPE_SHA256: 77 + sha256_init(&ctx->sha256); 78 + break; 79 + case BTRFS_CSUM_TYPE_BLAKE2: 80 + blake2b_init(&ctx->blake2b, 32); 81 + break; 82 + default: 83 + /* Checksum type is validated at mount time. */ 84 + BUG(); 85 + } 86 + } 87 + 88 + void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len) 89 + { 90 + switch (ctx->csum_type) { 91 + case BTRFS_CSUM_TYPE_CRC32: 92 + ctx->crc32 = crc32c(ctx->crc32, data, len); 93 + break; 94 + case BTRFS_CSUM_TYPE_XXHASH: 95 + xxh64_update(&ctx->xxh64, data, len); 96 + break; 97 + case BTRFS_CSUM_TYPE_SHA256: 98 + sha256_update(&ctx->sha256, data, len); 99 + break; 100 + case BTRFS_CSUM_TYPE_BLAKE2: 101 + blake2b_update(&ctx->blake2b, data, len); 102 + break; 103 + default: 104 + /* Checksum type is validated at mount time. */ 105 + BUG(); 106 + } 107 + } 108 + 109 + void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out) 110 + { 111 + switch (ctx->csum_type) { 112 + case BTRFS_CSUM_TYPE_CRC32: 113 + put_unaligned_le32(~ctx->crc32, out); 114 + break; 115 + case BTRFS_CSUM_TYPE_XXHASH: 116 + put_unaligned_le64(xxh64_digest(&ctx->xxh64), out); 117 + break; 118 + case BTRFS_CSUM_TYPE_SHA256: 119 + sha256_final(&ctx->sha256, out); 120 + break; 121 + case BTRFS_CSUM_TYPE_BLAKE2: 122 + blake2b_final(&ctx->blake2b, out); 123 + break; 124 + default: 125 + /* Checksum type is validated at mount time. */ 126 + BUG(); 127 + } 54 128 } 55 129 56 130 /*

+60 -19

fs/btrfs/fs.h

··· 3 3 #ifndef BTRFS_FS_H 4 4 #define BTRFS_FS_H 5 5 6 + #include <crypto/blake2b.h> 7 + #include <crypto/sha2.h> 6 8 #include <linux/blkdev.h> 7 9 #include <linux/sizes.h> 8 10 #include <linux/time64.h> ··· 26 24 #include <linux/wait_bit.h> 27 25 #include <linux/sched.h> 28 26 #include <linux/rbtree.h> 27 + #include <linux/xxhash.h> 29 28 #include <uapi/linux/btrfs.h> 30 29 #include <uapi/linux/btrfs_tree.h> 31 30 #include "extent-io-tree.h" ··· 38 35 struct super_block; 39 36 struct kobject; 40 37 struct reloc_control; 41 - struct crypto_shash; 42 38 struct ulist; 43 39 struct btrfs_device; 44 40 struct btrfs_block_group; 45 41 struct btrfs_root; 46 42 struct btrfs_fs_devices; 47 43 struct btrfs_transaction; 48 - struct btrfs_delayed_root; 49 44 struct btrfs_balance_control; 50 45 struct btrfs_subpage_info; 51 46 struct btrfs_stripe_hash_table; ··· 64 63 #define BTRFS_MAX_BLOCKSIZE (SZ_64K) 65 64 66 65 #define BTRFS_MAX_EXTENT_SIZE SZ_128M 66 + 67 + /* 68 + * Maximum length to trim in a single iteration to avoid holding device list 69 + * mutex for too long. 70 + */ 71 + #define BTRFS_MAX_TRIM_LENGTH SZ_2G 67 72 68 73 #define BTRFS_OLDEST_GENERATION 0ULL 69 74 ··· 320 313 #define BTRFS_FEATURE_INCOMPAT_SUPP \ 321 314 (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ 322 315 BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ 323 - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) 316 + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 | \ 317 + BTRFS_FEATURE_INCOMPAT_REMAP_TREE) 324 318 325 319 #else 326 320 ··· 469 461 u64 critical_section_start_time; 470 462 }; 471 463 464 + struct btrfs_delayed_root { 465 + spinlock_t lock; 466 + int nodes; /* for delayed nodes */ 467 + struct list_head node_list; 468 + /* 469 + * Used for delayed nodes which is waiting to be dealt with by the 470 + * worker. If the delayed node is inserted into the work queue, we 471 + * drop it from this list. 
472 + */ 473 + struct list_head prepare_list; 474 + atomic_t items; /* for delayed items */ 475 + atomic_t items_seq; /* for delayed items */ 476 + wait_queue_head_t wait; 477 + }; 478 + 472 479 struct btrfs_fs_info { 473 480 u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; 474 481 unsigned long flags; ··· 496 473 struct btrfs_root *data_reloc_root; 497 474 struct btrfs_root *block_group_root; 498 475 struct btrfs_root *stripe_root; 476 + struct btrfs_root *remap_root; 499 477 500 478 /* The log root tree is a directory of all the other log roots */ 501 479 struct btrfs_root *log_root_tree; ··· 531 507 struct btrfs_block_rsv trans_block_rsv; 532 508 /* Block reservation for chunk tree */ 533 509 struct btrfs_block_rsv chunk_block_rsv; 510 + /* Block reservation for remap tree. */ 511 + struct btrfs_block_rsv remap_block_rsv; 534 512 /* Block reservation for delayed operations */ 535 513 struct btrfs_block_rsv delayed_block_rsv; 536 514 /* Block reservation for delayed refs */ ··· 607 581 struct mutex transaction_kthread_mutex; 608 582 struct mutex cleaner_mutex; 609 583 struct mutex chunk_mutex; 584 + struct mutex remap_mutex; 610 585 611 586 /* 612 587 * This is taken to make sure we don't set block groups ro after the ··· 837 810 /* Filesystem state */ 838 811 unsigned long fs_state; 839 812 840 - struct btrfs_delayed_root *delayed_root; 813 + struct btrfs_delayed_root delayed_root; 841 814 842 815 /* Entries are eb->start >> nodesize_bits */ 843 816 struct xarray buffer_tree; ··· 861 834 struct list_head reclaim_bgs; 862 835 int bg_reclaim_threshold; 863 836 864 - /* Protects the lists unused_bgs and reclaim_bgs. */ 837 + /* Protects the lists unused_bgs, reclaim_bgs, and fully_remapped_bgs. */ 865 838 spinlock_t unused_bgs_lock; 866 839 /* Protected by unused_bgs_lock. 
*/ 867 840 struct list_head unused_bgs; 841 + struct list_head fully_remapped_bgs; 868 842 struct mutex unused_bg_unpin_mutex; 869 843 /* Protect block groups that are going to be deleted */ 870 844 struct mutex reclaim_bgs_lock; ··· 878 850 u32 sectorsize_bits; 879 851 u32 block_min_order; 880 852 u32 block_max_order; 853 + u32 stripesize; 881 854 u32 csum_size; 882 855 u32 csums_per_leaf; 883 - u32 stripesize; 856 + u32 csum_type; 884 857 885 858 /* 886 859 * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular ··· 892 863 /* Block groups and devices containing active swapfiles. */ 893 864 spinlock_t swapfile_pins_lock; 894 865 struct rb_root swapfile_pins; 895 - 896 - struct crypto_shash *csum_shash; 897 866 898 867 /* Type of exclusive operation running, protected by super_lock */ 899 868 enum btrfs_exclusive_operation exclusive_operation; ··· 1084 1057 u16 btrfs_csum_type_size(u16 type); 1085 1058 int btrfs_super_csum_size(const struct btrfs_super_block *s); 1086 1059 const char *btrfs_super_csum_name(u16 csum_type); 1087 - const char *btrfs_super_csum_driver(u16 csum_type); 1088 1060 size_t __attribute_const__ btrfs_get_num_csums(void); 1061 + struct btrfs_csum_ctx { 1062 + u16 csum_type; 1063 + union { 1064 + u32 crc32; 1065 + struct xxh64_state xxh64; 1066 + struct sha256_ctx sha256; 1067 + struct blake2b_ctx blake2b; 1068 + }; 1069 + }; 1070 + void btrfs_csum(u16 csum_type, const u8 *data, size_t len, u8 *out); 1071 + void btrfs_csum_init(struct btrfs_csum_ctx *ctx, u16 csum_type); 1072 + void btrfs_csum_update(struct btrfs_csum_ctx *ctx, const u8 *data, size_t len); 1073 + void btrfs_csum_final(struct btrfs_csum_ctx *ctx, u8 *out); 1089 1074 1090 1075 static inline bool btrfs_is_empty_uuid(const u8 *uuid) 1091 1076 { ··· 1144 1105 #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ 1145 1106 BTRFS_MOUNT_##opt) 1146 1107 1147 - static inline int btrfs_fs_closing(const struct btrfs_fs_info *fs_info) 1108 + static inline bool 
btrfs_fs_closing(const struct btrfs_fs_info *fs_info) 1148 1109 { 1149 - /* Do it this way so we only ever do one test_bit in the normal case. */ 1150 - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { 1151 - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) 1152 - return 2; 1153 - return 1; 1154 - } 1155 - return 0; 1110 + return unlikely(test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)); 1111 + } 1112 + 1113 + static inline bool btrfs_fs_closing_done(const struct btrfs_fs_info *fs_info) 1114 + { 1115 + if (btrfs_fs_closing(fs_info) && test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) 1116 + return true; 1117 + 1118 + return false; 1156 1119 } 1157 1120 1158 1121 /* ··· 1182 1141 (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ 1183 1142 &(fs_info)->fs_state))) 1184 1143 1185 - static inline bool btrfs_is_shutdown(struct btrfs_fs_info *fs_info) 1144 + static inline bool btrfs_is_shutdown(const struct btrfs_fs_info *fs_info) 1186 1145 { 1187 - return test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state); 1146 + return unlikely(test_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)); 1188 1147 } 1189 1148 1190 1149 static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info)

+3 -4

fs/btrfs/inode-item.c

··· 371 371 struct btrfs_path *path, u64 objectid) 372 372 { 373 373 struct btrfs_key key; 374 - int ret; 374 + 375 375 key.objectid = objectid; 376 376 key.type = BTRFS_INODE_ITEM_KEY; 377 377 key.offset = 0; 378 378 379 - ret = btrfs_insert_empty_item(trans, root, path, &key, 380 - sizeof(struct btrfs_inode_item)); 381 - return ret; 379 + return btrfs_insert_empty_item(trans, root, path, &key, 380 + sizeof(struct btrfs_inode_item)); 382 381 } 383 382 384 383 int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root

+324 -275

fs/btrfs/inode.c

··· 3 3 * Copyright (C) 2007 Oracle. All rights reserved. 4 4 */ 5 5 6 - #include <crypto/hash.h> 7 6 #include <linux/kernel.h> 8 7 #include <linux/bio.h> 9 8 #include <linux/blk-cgroup.h> ··· 218 219 int mirror_num) 219 220 { 220 221 struct btrfs_fs_info *fs_info = inode->root->fs_info; 221 - struct btrfs_path path = { 0 }; 222 + BTRFS_PATH_AUTO_RELEASE(path); 222 223 struct btrfs_key found_key = { 0 }; 223 224 struct extent_buffer *eb; 224 225 struct btrfs_extent_item *ei; ··· 256 257 if (ret < 0) { 257 258 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", 258 259 logical, ret); 259 - btrfs_release_path(&path); 260 260 return; 261 261 } 262 262 eb = path.nodes[0]; ··· 285 287 (ref_level ? "node" : "leaf"), 286 288 ref_level, ref_root); 287 289 } 288 - btrfs_release_path(&path); 289 290 } else { 290 291 struct btrfs_backref_walk_ctx ctx = { 0 }; 291 292 struct data_reloc_warn reloc_warn = { 0 }; 292 293 294 + /* 295 + * Do not hold the path as later iterate_extent_inodes() call 296 + * can be time consuming. 297 + */ 293 298 btrfs_release_path(&path); 294 299 295 300 ctx.bytenr = found_key.objectid; ··· 508 507 ret = btrfs_insert_empty_item(trans, root, path, &key, 509 508 datasize); 510 509 if (ret) 511 - goto fail; 510 + return ret; 512 511 } 513 512 leaf = path->nodes[0]; 514 513 ei = btrfs_item_ptr(leaf, path->slots[0], ··· 547 546 ret = btrfs_inode_set_file_extent_range(inode, 0, 548 547 ALIGN(size, root->fs_info->sectorsize)); 549 548 if (ret) 550 - goto fail; 549 + return ret; 551 550 552 551 /* 553 552 * We're an inline extent, so nobody can extend the file past i_size ··· 563 562 } 564 563 inode->disk_i_size = i_size; 565 564 566 - fail: 567 - return ret; 565 + return 0; 568 566 } 569 567 570 568 static bool can_cow_file_range_inline(struct btrfs_inode *inode, ··· 690 690 /* 691 691 * Don't forget to free the reserved space, as for inlined extent 692 692 * it won't count as data extent, free them directly here. 
693 - * And at reserve time, it's always aligned to page size, so 694 - * just free one page here. 693 + * And at reserve time, it's always aligned to sector size, so 694 + * just free one sector here. 695 695 * 696 696 * If we fallback to non-inline (ret == 1) due to -ENOSPC, then we need 697 697 * to keep the data reservation. ··· 756 756 struct async_extent { 757 757 u64 start; 758 758 u64 ram_size; 759 - u64 compressed_size; 760 - struct folio **folios; 761 - unsigned long nr_folios; 762 - int compress_type; 759 + struct compressed_bio *cb; 763 760 struct list_head list; 764 761 }; 765 762 ··· 777 780 struct async_chunk chunks[]; 778 781 }; 779 782 780 - static noinline int add_async_extent(struct async_chunk *cow, 781 - u64 start, u64 ram_size, 782 - u64 compressed_size, 783 - struct folio **folios, 784 - unsigned long nr_folios, 785 - int compress_type) 783 + static int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, 784 + struct compressed_bio *cb) 786 785 { 787 786 struct async_extent *async_extent; 788 787 789 788 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 790 789 if (!async_extent) 791 790 return -ENOMEM; 791 + ASSERT(ram_size < U32_MAX); 792 792 async_extent->start = start; 793 793 async_extent->ram_size = ram_size; 794 - async_extent->compressed_size = compressed_size; 795 - async_extent->folios = folios; 796 - async_extent->nr_folios = nr_folios; 797 - async_extent->compress_type = compress_type; 794 + async_extent->cb = cb; 798 795 list_add_tail(&async_extent->list, &cow->extents); 799 796 return 0; 800 797 } ··· 807 816 return 0; 808 817 } 809 818 819 + /* 820 + * If the delalloc range is only one fs block and can not be inlined, 821 + * do not even bother try compression, as there will be no space saving 822 + * and will always fallback to regular write later. 
823 + */ 824 + if (start != 0 && end + 1 - start <= fs_info->sectorsize) 825 + return 0; 810 826 /* Defrag ioctl takes precedence over mount options and properties. */ 811 827 if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) 812 828 return 0; ··· 862 864 return ret; 863 865 } 864 866 867 + static struct folio *compressed_bio_last_folio(struct compressed_bio *cb) 868 + { 869 + struct bio *bio = &cb->bbio.bio; 870 + struct bio_vec *bvec; 871 + phys_addr_t paddr; 872 + 873 + /* 874 + * Make sure all folios have the same min_folio_size. 875 + * 876 + * Otherwise we cannot simply use offset_in_folio(folio, bi_size) to 877 + * calculate the end of the last folio. 878 + */ 879 + if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { 880 + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 881 + const u32 min_folio_size = btrfs_min_folio_size(fs_info); 882 + struct folio_iter fi; 883 + 884 + bio_for_each_folio_all(fi, bio) 885 + ASSERT(folio_size(fi.folio) == min_folio_size); 886 + } 887 + 888 + /* The bio must not be empty. 
*/ 889 + ASSERT(bio->bi_vcnt); 890 + 891 + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; 892 + paddr = page_to_phys(bvec->bv_page) + bvec->bv_offset + bvec->bv_len - 1; 893 + return page_folio(phys_to_page(paddr)); 894 + } 895 + 896 + static void zero_last_folio(struct compressed_bio *cb) 897 + { 898 + struct bio *bio = &cb->bbio.bio; 899 + struct folio *last_folio = compressed_bio_last_folio(cb); 900 + const u32 bio_size = bio->bi_iter.bi_size; 901 + const u32 foffset = offset_in_folio(last_folio, bio_size); 902 + 903 + folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset); 904 + } 905 + 906 + static void round_up_last_block(struct compressed_bio *cb, u32 blocksize) 907 + { 908 + struct bio *bio = &cb->bbio.bio; 909 + struct folio *last_folio = compressed_bio_last_folio(cb); 910 + const u32 bio_size = bio->bi_iter.bi_size; 911 + const u32 foffset = offset_in_folio(last_folio, bio_size); 912 + bool ret; 913 + 914 + if (IS_ALIGNED(bio_size, blocksize)) 915 + return; 916 + 917 + ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset); 918 + /* The remaining part should be merged thus never fail. */ 919 + ASSERT(ret); 920 + } 921 + 865 922 /* 866 923 * Work queue call back to started compression on a file and pages. 
867 924 * ··· 937 884 struct btrfs_inode *inode = async_chunk->inode; 938 885 struct btrfs_fs_info *fs_info = inode->root->fs_info; 939 886 struct address_space *mapping = inode->vfs_inode.i_mapping; 940 - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 887 + struct compressed_bio *cb = NULL; 941 888 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 942 889 u64 blocksize = fs_info->sectorsize; 943 890 u64 start = async_chunk->start; 944 891 u64 end = async_chunk->end; 945 892 u64 actual_end; 946 893 u64 i_size; 894 + u32 cur_len; 947 895 int ret = 0; 948 - struct folio **folios = NULL; 949 - unsigned long nr_folios; 950 896 unsigned long total_compressed = 0; 951 897 unsigned long total_in = 0; 952 898 unsigned int loff; 953 - int i; 954 899 int compress_type = fs_info->compress_type; 955 900 int compress_level = fs_info->compress_level; 956 901 957 - if (unlikely(btrfs_is_shutdown(fs_info))) 902 + if (btrfs_is_shutdown(fs_info)) 958 903 goto cleanup_and_bail_uncompressed; 959 904 960 905 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); ··· 967 916 /* 968 917 * All the folios should have been locked thus no failure. 969 918 * 970 - * And even if some folios are missing, btrfs_compress_folios() 919 + * And even if some folios are missing, btrfs_compress_bio() 971 920 * would handle them correctly, so here just do an ASSERT() check for 972 921 * early logic errors. 
973 922 */ ··· 987 936 barrier(); 988 937 actual_end = min_t(u64, i_size, end + 1); 989 938 again: 990 - folios = NULL; 991 - nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; 992 - nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); 939 + total_in = 0; 940 + cur_len = min(end + 1 - start, BTRFS_MAX_UNCOMPRESSED); 941 + ret = 0; 942 + cb = NULL; 993 943 994 944 /* 995 945 * we don't want to send crud past the end of i_size through ··· 1005 953 if (actual_end <= start) 1006 954 goto cleanup_and_bail_uncompressed; 1007 955 1008 - total_compressed = actual_end - start; 1009 - 1010 - /* 1011 - * Skip compression for a small file range(<=blocksize) that 1012 - * isn't an inline extent, since it doesn't save disk space at all. 1013 - */ 1014 - if (total_compressed <= blocksize && 1015 - (start > 0 || end + 1 < inode->disk_i_size)) 1016 - goto cleanup_and_bail_uncompressed; 1017 - 1018 - total_compressed = min_t(unsigned long, total_compressed, 1019 - BTRFS_MAX_UNCOMPRESSED); 1020 - total_in = 0; 1021 - ret = 0; 1022 - 1023 956 /* 1024 957 * We do compression for mount -o compress and when the inode has not 1025 958 * been flagged as NOCOMPRESS. This flag can change at any time if we ··· 1012 975 */ 1013 976 if (!inode_need_compress(inode, start, end)) 1014 977 goto cleanup_and_bail_uncompressed; 1015 - 1016 - folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); 1017 - if (!folios) { 1018 - /* 1019 - * Memory allocation failure is not a fatal error, we can fall 1020 - * back to uncompressed code. 1021 - */ 1022 - goto cleanup_and_bail_uncompressed; 1023 - } 1024 978 1025 979 if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { 1026 980 compress_type = inode->defrag_compress; ··· 1021 993 } 1022 994 1023 995 /* Compression level is applied here. 
*/ 1024 - ret = btrfs_compress_folios(compress_type, compress_level, 1025 - inode, start, folios, &nr_folios, &total_in, 1026 - &total_compressed); 1027 - if (ret) 996 + cb = btrfs_compress_bio(inode, start, cur_len, compress_type, 997 + compress_level, async_chunk->write_flags); 998 + if (IS_ERR(cb)) { 999 + cb = NULL; 1028 1000 goto mark_incompressible; 1001 + } 1002 + 1003 + total_compressed = cb->bbio.bio.bi_iter.bi_size; 1004 + total_in = cur_len; 1029 1005 1030 1006 /* 1031 1007 * Zero the tail end of the last folio, as we might be sending it down ··· 1037 1005 */ 1038 1006 loff = (total_compressed & (min_folio_size - 1)); 1039 1007 if (loff) 1040 - folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); 1008 + zero_last_folio(cb); 1041 1009 1042 1010 /* 1043 1011 * Try to create an inline extent. ··· 1053 1021 BTRFS_COMPRESS_NONE, NULL, false); 1054 1022 else 1055 1023 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, 1056 - compress_type, folios[0], false); 1024 + compress_type, 1025 + bio_first_folio_all(&cb->bbio.bio), false); 1057 1026 if (ret <= 0) { 1027 + cleanup_compressed_bio(cb); 1058 1028 if (ret < 0) 1059 1029 mapping_set_error(mapping, -EIO); 1060 - goto free_pages; 1030 + return; 1061 1031 } 1062 1032 1063 1033 /* ··· 1067 1033 * block size boundary so the allocator does sane things. 1068 1034 */ 1069 1035 total_compressed = ALIGN(total_compressed, blocksize); 1036 + round_up_last_block(cb, blocksize); 1070 1037 1071 1038 /* 1072 1039 * One last check to make sure the compression is really a win, compare ··· 1078 1043 if (total_compressed + blocksize > total_in) 1079 1044 goto mark_incompressible; 1080 1045 1046 + 1081 1047 /* 1082 1048 * The async work queues will take care of doing actual allocation on 1083 1049 * disk for these compressed pages, and will submit the bios. 
1084 1050 */ 1085 - ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, 1086 - nr_folios, compress_type); 1051 + ret = add_async_extent(async_chunk, start, total_in, cb); 1087 1052 BUG_ON(ret); 1088 1053 if (start + total_in < end) { 1089 1054 start += total_in; ··· 1096 1061 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) 1097 1062 inode->flags |= BTRFS_INODE_NOCOMPRESS; 1098 1063 cleanup_and_bail_uncompressed: 1099 - ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, 1100 - BTRFS_COMPRESS_NONE); 1064 + ret = add_async_extent(async_chunk, start, end - start + 1, NULL); 1101 1065 BUG_ON(ret); 1102 - free_pages: 1103 - if (folios) { 1104 - for (i = 0; i < nr_folios; i++) { 1105 - WARN_ON(folios[i]->mapping); 1106 - btrfs_free_compr_folio(folios[i]); 1107 - } 1108 - kfree(folios); 1109 - } 1110 - } 1111 - 1112 - static void free_async_extent_pages(struct async_extent *async_extent) 1113 - { 1114 - int i; 1115 - 1116 - if (!async_extent->folios) 1117 - return; 1118 - 1119 - for (i = 0; i < async_extent->nr_folios; i++) { 1120 - WARN_ON(async_extent->folios[i]->mapping); 1121 - btrfs_free_compr_folio(async_extent->folios[i]); 1122 - } 1123 - kfree(async_extent->folios); 1124 - async_extent->nr_folios = 0; 1125 - async_extent->folios = NULL; 1066 + if (cb) 1067 + cleanup_compressed_bio(cb); 1126 1068 } 1127 1069 1128 1070 static void submit_uncompressed_range(struct btrfs_inode *inode, ··· 1146 1134 struct extent_state *cached = NULL; 1147 1135 struct extent_map *em; 1148 1136 int ret = 0; 1149 - bool free_pages = false; 1137 + u32 compressed_size; 1150 1138 u64 start = async_extent->start; 1151 1139 u64 end = async_extent->start + async_extent->ram_size - 1; 1152 1140 ··· 1166 1154 locked_folio = async_chunk->locked_folio; 1167 1155 } 1168 1156 1169 - if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { 1170 - ASSERT(!async_extent->folios); 1171 - ASSERT(async_extent->nr_folios == 0); 1157 + if 
(!async_extent->cb) { 1172 1158 submit_uncompressed_range(inode, async_extent, locked_folio); 1173 - free_pages = true; 1174 1159 goto done; 1175 1160 } 1176 1161 1162 + compressed_size = async_extent->cb->bbio.bio.bi_iter.bi_size; 1177 1163 ret = btrfs_reserve_extent(root, async_extent->ram_size, 1178 - async_extent->compressed_size, 1179 - async_extent->compressed_size, 1164 + compressed_size, compressed_size, 1180 1165 0, *alloc_hint, &ins, true, true); 1181 1166 if (ret) { 1182 1167 /* ··· 1183 1174 * fall back to uncompressed. 1184 1175 */ 1185 1176 submit_uncompressed_range(inode, async_extent, locked_folio); 1186 - free_pages = true; 1177 + cleanup_compressed_bio(async_extent->cb); 1178 + async_extent->cb = NULL; 1187 1179 goto done; 1188 1180 } 1189 1181 ··· 1196 1186 file_extent.ram_bytes = async_extent->ram_size; 1197 1187 file_extent.num_bytes = async_extent->ram_size; 1198 1188 file_extent.offset = 0; 1199 - file_extent.compression = async_extent->compress_type; 1189 + file_extent.compression = async_extent->cb->compress_type; 1190 + 1191 + async_extent->cb->bbio.bio.bi_iter.bi_sector = ins.objectid >> SECTOR_SHIFT; 1200 1192 1201 1193 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); 1202 1194 if (IS_ERR(em)) { ··· 1214 1202 ret = PTR_ERR(ordered); 1215 1203 goto out_free_reserve; 1216 1204 } 1205 + async_extent->cb->bbio.ordered = ordered; 1217 1206 btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1218 1207 1219 1208 /* Clear dirty, set writeback and unlock the pages. 
*/ 1209 extent_clear_unlock_delalloc(inode, start, end, 1210 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, 1211 PAGE_UNLOCK | PAGE_START_WRITEBACK); 1223 - btrfs_submit_compressed_write(ordered, 1224 - async_extent->folios, /* compressed_folios */ 1225 - async_extent->nr_folios, 1226 - async_chunk->write_flags, true); 1212 + btrfs_submit_bbio(&async_extent->cb->bbio, 0); 1213 + async_extent->cb = NULL; 1214 + 1227 1215 *alloc_hint = ins.objectid + ins.offset; 1228 1216 done: 1229 1217 if (async_chunk->blkcg_css) 1230 1218 kthread_associate_blkcg(NULL); 1231 - if (free_pages) 1232 - free_async_extent_pages(async_extent); 1233 1219 kfree(async_extent); 1234 1220 return; 1235 1221 ··· 1242 1232 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 1243 1233 PAGE_UNLOCK | PAGE_START_WRITEBACK | 1244 1234 PAGE_END_WRITEBACK); 1245 - free_async_extent_pages(async_extent); 1235 + if (async_extent->cb) 1236 + cleanup_compressed_bio(async_extent->cb); 1246 1237 if (async_chunk->blkcg_css) 1247 1238 kthread_associate_blkcg(NULL); 1248 1239 btrfs_debug(fs_info, ··· 1286 1275 } 1287 1276 1288 1277 /* 1278 + * Handle COW for one range. 1279 + * 1280 + * @ins: The key representing the allocated range. 1281 + * @file_offset: The file offset of the COW range 1282 + * @num_bytes: The expected length of the COW range 1283 + * The actually allocated length can be smaller than it. 1284 + * @min_alloc_size: The minimal extent size. 1285 + * @alloc_hint: The hint for the extent allocator. 1286 + * @ret_alloc_size: The COW range handled by this function. 1287 + * 1288 + * Return 0 if everything is fine and @ret_alloc_size is updated. The 1289 + * range is still locked, and caller should unlock the range after everything 1290 + * is done or for error handling. 1291 + * 1292 + * Return <0 for error and @ret_alloc_size is updated for where the extra cleanup should 1293 + * happen. The range [file_offset, file_offset + ret_alloc_size) will be 1294 + * cleaned up by this function. 
1295 + */ 1296 + static int cow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, 1297 + struct btrfs_key *ins, struct extent_state **cached, 1298 + u64 file_offset, u32 num_bytes, u32 min_alloc_size, 1299 + u64 alloc_hint, u32 *ret_alloc_size) 1300 + { 1301 + struct btrfs_root *root = inode->root; 1302 + struct btrfs_fs_info *fs_info = root->fs_info; 1303 + struct btrfs_ordered_extent *ordered; 1304 + struct btrfs_file_extent file_extent; 1305 + struct extent_map *em; 1306 + u32 cur_len = 0; 1307 + u64 cur_end; 1308 + int ret; 1309 + 1310 + ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 1311 + 0, alloc_hint, ins, true, true); 1312 + if (ret < 0) { 1313 + *ret_alloc_size = cur_len; 1314 + return ret; 1315 + } 1316 + 1317 + cur_len = ins->offset; 1318 + cur_end = file_offset + cur_len - 1; 1319 + 1320 + file_extent.disk_bytenr = ins->objectid; 1321 + file_extent.disk_num_bytes = ins->offset; 1322 + file_extent.num_bytes = ins->offset; 1323 + file_extent.ram_bytes = ins->offset; 1324 + file_extent.offset = 0; 1325 + file_extent.compression = BTRFS_COMPRESS_NONE; 1326 + 1327 + /* 1328 + * Locked range will be released either during error clean up (inside 1329 + * this function or by the caller for previously successful ranges) or 1330 + * after the whole range is finished. 
1331 + */ 1332 + btrfs_lock_extent(&inode->io_tree, file_offset, cur_end, cached); 1333 + em = btrfs_create_io_em(inode, file_offset, &file_extent, BTRFS_ORDERED_REGULAR); 1334 + if (IS_ERR(em)) { 1335 + ret = PTR_ERR(em); 1336 + goto free_reserved; 1337 + } 1338 + btrfs_free_extent_map(em); 1339 + 1340 + ordered = btrfs_alloc_ordered_extent(inode, file_offset, &file_extent, 1341 + 1U << BTRFS_ORDERED_REGULAR); 1342 + if (IS_ERR(ordered)) { 1343 + btrfs_drop_extent_map_range(inode, file_offset, cur_end, false); 1344 + ret = PTR_ERR(ordered); 1345 + goto free_reserved; 1346 + } 1347 + 1348 + if (btrfs_is_data_reloc_root(root)) { 1349 + ret = btrfs_reloc_clone_csums(ordered); 1350 + 1351 + /* 1352 + * Only drop cache here, and process as normal. 1353 + * 1354 + * We must not allow extent_clear_unlock_delalloc() at 1355 + * free_reserved label to free meta of this ordered extent, as 1356 + * its meta should be freed by btrfs_finish_ordered_io(). 1357 + * 1358 + * So we must continue until @start is increased to 1359 + * skip current ordered extent. 1360 + */ 1361 + if (ret) 1362 + btrfs_drop_extent_map_range(inode, file_offset, 1363 + cur_end, false); 1364 + } 1365 + btrfs_put_ordered_extent(ordered); 1366 + btrfs_dec_block_group_reservations(fs_info, ins->objectid); 1367 + /* 1368 + * Error handling for btrfs_reloc_clone_csums(). 1369 + * 1370 + * Treat the range as finished, thus only clear EXTENT_LOCKED | EXTENT_DELALLOC. 1371 + * The accounting will be done by ordered extents. 
1372 + */ 1373 + if (unlikely(ret < 0)) { 1374 + btrfs_cleanup_ordered_extents(inode, file_offset, cur_len); 1375 + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, 1376 + EXTENT_LOCKED | EXTENT_DELALLOC, 1377 + PAGE_UNLOCK | PAGE_START_WRITEBACK | 1378 + PAGE_END_WRITEBACK); 1379 + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); 1380 + } 1381 + *ret_alloc_size = cur_len; 1382 + return ret; 1383 + 1384 + free_reserved: 1385 + extent_clear_unlock_delalloc(inode, file_offset, cur_end, locked_folio, cached, 1386 + EXTENT_LOCKED | EXTENT_DELALLOC | 1387 + EXTENT_DELALLOC_NEW | 1388 + EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, 1389 + PAGE_UNLOCK | PAGE_START_WRITEBACK | 1390 + PAGE_END_WRITEBACK); 1391 + btrfs_qgroup_free_data(inode, NULL, file_offset, cur_len, NULL); 1392 + btrfs_dec_block_group_reservations(fs_info, ins->objectid); 1393 + btrfs_free_reserved_extent(fs_info, ins->objectid, ins->offset, true); 1394 + mapping_set_error(inode->vfs_inode.i_mapping, -EIO); 1395 + *ret_alloc_size = cur_len; 1396 + /* 1397 + * We should not return -EAGAIN where it's a special return code for 1398 + * zoned to catch btrfs_reserve_extent(). 1399 + */ 1400 + ASSERT(ret != -EAGAIN); 1401 + return ret; 1402 + } 1403 + 1404 + /* 1289 1405 * when extent_io.c finds a delayed allocation range in the file, 1290 1406 * the call backs end up in this code. 
The basic idea is to 1291 1407 * allocate extents on disk for the range, and create ordered data structs ··· 1448 1310 u64 alloc_hint = 0; 1449 1311 u64 orig_start = start; 1450 1312 u64 num_bytes; 1451 - u64 cur_alloc_size = 0; 1452 - u64 min_alloc_size; 1453 - u64 blocksize = fs_info->sectorsize; 1313 + u32 min_alloc_size; 1314 + u32 blocksize = fs_info->sectorsize; 1315 + u32 cur_alloc_size = 0; 1454 1316 struct btrfs_key ins; 1455 - struct extent_map *em; 1456 1317 unsigned clear_bits; 1457 1318 unsigned long page_ops; 1458 1319 int ret = 0; 1459 1320 1460 - if (unlikely(btrfs_is_shutdown(fs_info))) { 1321 + if (btrfs_is_shutdown(fs_info)) { 1461 1322 ret = -EIO; 1462 1323 goto out_unlock; 1463 1324 } ··· 1520 1383 min_alloc_size = fs_info->sectorsize; 1521 1384 1522 1385 while (num_bytes > 0) { 1523 - struct btrfs_ordered_extent *ordered; 1524 - struct btrfs_file_extent file_extent; 1386 + ret = cow_one_range(inode, locked_folio, &ins, &cached, start, 1387 + num_bytes, min_alloc_size, alloc_hint, &cur_alloc_size); 1525 1388 1526 - ret = btrfs_reserve_extent(root, num_bytes, num_bytes, 1527 - min_alloc_size, 0, alloc_hint, 1528 - &ins, true, true); 1529 1389 if (ret == -EAGAIN) { 1530 1390 /* 1531 - * btrfs_reserve_extent only returns -EAGAIN for zoned 1532 - * file systems, which is an indication that there are 1391 + * cow_one_range() only returns -EAGAIN for zoned 1392 + * file systems (from btrfs_reserve_extent()), which 1393 + * is an indication that there are 1533 1394 * no active zones to allocate from at the moment. 
1534 1395 * 1535 1396 * If this is the first loop iteration, wait for at ··· 1556 1421 } 1557 1422 if (ret < 0) 1558 1423 goto out_unlock; 1559 - cur_alloc_size = ins.offset; 1560 1424 1561 - file_extent.disk_bytenr = ins.objectid; 1562 - file_extent.disk_num_bytes = ins.offset; 1563 - file_extent.num_bytes = ins.offset; 1564 - file_extent.ram_bytes = ins.offset; 1565 - file_extent.offset = 0; 1566 - file_extent.compression = BTRFS_COMPRESS_NONE; 1425 + /* We should not allocate an extent larger than requested.*/ 1426 + ASSERT(cur_alloc_size <= num_bytes); 1567 1427 1568 - /* 1569 - * Locked range will be released either during error clean up or 1570 - * after the whole range is finished. 1571 - */ 1572 - btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, 1573 - &cached); 1574 - 1575 - em = btrfs_create_io_em(inode, start, &file_extent, 1576 - BTRFS_ORDERED_REGULAR); 1577 - if (IS_ERR(em)) { 1578 - btrfs_unlock_extent(&inode->io_tree, start, 1579 - start + cur_alloc_size - 1, &cached); 1580 - ret = PTR_ERR(em); 1581 - goto out_reserve; 1582 - } 1583 - btrfs_free_extent_map(em); 1584 - 1585 - ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1586 - 1U << BTRFS_ORDERED_REGULAR); 1587 - if (IS_ERR(ordered)) { 1588 - btrfs_unlock_extent(&inode->io_tree, start, 1589 - start + cur_alloc_size - 1, &cached); 1590 - ret = PTR_ERR(ordered); 1591 - goto out_drop_extent_cache; 1592 - } 1593 - 1594 - if (btrfs_is_data_reloc_root(root)) { 1595 - ret = btrfs_reloc_clone_csums(ordered); 1596 - 1597 - /* 1598 - * Only drop cache here, and process as normal. 1599 - * 1600 - * We must not allow extent_clear_unlock_delalloc() 1601 - * at out_unlock label to free meta of this ordered 1602 - * extent, as its meta should be freed by 1603 - * btrfs_finish_ordered_io(). 1604 - * 1605 - * So we must continue until @start is increased to 1606 - * skip current ordered extent. 
1607 - */ 1608 - if (ret) 1609 - btrfs_drop_extent_map_range(inode, start, 1610 - start + cur_alloc_size - 1, 1611 - false); 1612 - } 1613 - btrfs_put_ordered_extent(ordered); 1614 - 1615 - btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1616 - 1617 - if (num_bytes < cur_alloc_size) 1618 - num_bytes = 0; 1619 - else 1620 - num_bytes -= cur_alloc_size; 1428 + num_bytes -= cur_alloc_size; 1621 1429 alloc_hint = ins.objectid + ins.offset; 1622 1430 start += cur_alloc_size; 1623 1431 cur_alloc_size = 0; 1624 - 1625 - /* 1626 - * btrfs_reloc_clone_csums() error, since start is increased 1627 - * extent_clear_unlock_delalloc() at out_unlock label won't 1628 - * free metadata of current ordered extent, we're OK to exit. 1629 - */ 1630 - if (ret) 1631 - goto out_unlock; 1632 1432 } 1633 1433 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, 1634 1434 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); ··· 1572 1502 *done_offset = end; 1573 1503 return ret; 1574 1504 1575 - out_drop_extent_cache: 1576 - btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); 1577 - out_reserve: 1578 - btrfs_dec_block_group_reservations(fs_info, ins.objectid); 1579 - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); 1580 1505 out_unlock: 1581 1506 /* 1582 1507 * Now, we have three regions to clean up: ··· 1608 1543 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; 1609 1544 1610 1545 /* 1611 - * For the range (2). If we reserved an extent for our delalloc range 1612 - * (or a subrange) and failed to create the respective ordered extent, 1613 - * then it means that when we reserved the extent we decremented the 1614 - * extent's size from the data space_info's bytes_may_use counter and 1615 - * incremented the space_info's bytes_reserved counter by the same 1616 - * amount. 
We must make sure extent_clear_unlock_delalloc() does not try 1617 - * to decrement again the data space_info's bytes_may_use counter, 1618 - * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. 1619 - */ 1620 - if (cur_alloc_size) { 1621 - extent_clear_unlock_delalloc(inode, start, 1622 - start + cur_alloc_size - 1, 1623 - locked_folio, &cached, clear_bits, 1624 - page_ops); 1625 - btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); 1626 - } 1627 - 1628 - /* 1546 + * For the range (2) the error handling is done by cow_one_range() itself. 1547 + * Nothing needs to be done. 1548 + * 1629 1549 * For the range (3). We never touched the region. In addition to the 1630 1550 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data 1631 1551 * space_info's bytes_may_use counter, reserved in ··· 1625 1575 end - start - cur_alloc_size + 1, NULL); 1626 1576 } 1627 1577 btrfs_err(fs_info, 1628 - "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", 1578 + "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%u: %d", 1629 1579 __func__, btrfs_root_id(inode->root), 1630 1580 btrfs_ino(inode), orig_start, end + 1 - orig_start, 1631 1581 start, cur_alloc_size, ret); ··· 2122 2072 */ 2123 2073 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); 2124 2074 2125 - if (unlikely(btrfs_is_shutdown(fs_info))) { 2075 + if (btrfs_is_shutdown(fs_info)) { 2126 2076 ret = -EIO; 2127 2077 goto error; 2128 2078 } ··· 2422 2372 u64 start, u64 end, struct writeback_control *wbc) 2423 2373 { 2424 2374 const bool zoned = btrfs_is_zoned(inode->root->fs_info); 2425 - int ret; 2426 2375 2427 2376 /* 2428 2377 * The range must cover part of the @locked_folio, or a return of 1 ··· 2430 2381 ASSERT(!(end <= folio_pos(locked_folio) || 2431 2382 start >= folio_next_pos(locked_folio))); 2432 2383 2433 - if (should_nocow(inode, start, end)) { 2434 - ret = run_delalloc_nocow(inode, 
locked_folio, start, end); 2435 - return ret; 2436 - } 2384 + if (should_nocow(inode, start, end)) 2385 + return run_delalloc_nocow(inode, locked_folio, start, end); 2437 2386 2438 2387 if (btrfs_inode_can_compress(inode) && 2439 2388 inode_need_compress(inode, start, end) && ··· 2439 2392 return 1; 2440 2393 2441 2394 if (zoned) 2442 - ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, 2443 - true); 2395 + return run_delalloc_cow(inode, locked_folio, start, end, wbc, true); 2444 2396 else 2445 - ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); 2446 - return ret; 2397 + return cow_file_range(inode, locked_folio, start, end, NULL, 0); 2447 2398 } 2448 2399 2449 2400 void btrfs_split_delalloc_extent(struct btrfs_inode *inode, ··· 3052 3007 drop_args.extent_item_size = sizeof(*stack_fi); 3053 3008 ret = btrfs_drop_extents(trans, root, inode, &drop_args); 3054 3009 if (ret) 3055 - goto out; 3010 + return ret; 3056 3011 3057 3012 if (!drop_args.extent_inserted) { 3058 3013 ins.objectid = btrfs_ino(inode); ··· 3062 3017 ret = btrfs_insert_empty_item(trans, root, path, &ins, 3063 3018 sizeof(*stack_fi)); 3064 3019 if (ret) 3065 - goto out; 3020 + return ret; 3066 3021 } 3067 3022 leaf = path->nodes[0]; 3068 3023 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); ··· 3097 3052 3098 3053 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); 3099 3054 if (ret) 3100 - goto out; 3055 + return ret; 3101 3056 3102 - ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 3103 - file_pos - offset, 3104 - qgroup_reserved, &ins); 3105 - out: 3106 - return ret; 3057 + return btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), 3058 + file_pos - offset, 3059 + qgroup_reserved, &ins); 3107 3060 } 3108 3061 3109 3062 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, ··· 3269 3226 logical_len); 3270 3227 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, 3271 3228 
ordered_extent->disk_num_bytes); 3229 + if (unlikely(ret < 0)) { 3230 + btrfs_abort_transaction(trans, ret); 3231 + goto out; 3232 + } 3272 3233 } else { 3273 3234 BUG_ON(root == fs_info->tree_root); 3274 3235 ret = insert_ordered_extent_file_extent(trans, ordered_extent); 3275 - if (!ret) { 3276 - clear_reserved_extent = false; 3277 - btrfs_release_delalloc_bytes(fs_info, 3278 - ordered_extent->disk_bytenr, 3279 - ordered_extent->disk_num_bytes); 3236 + if (unlikely(ret < 0)) { 3237 + btrfs_abort_transaction(trans, ret); 3238 + goto out; 3280 3239 } 3281 - } 3282 - if (unlikely(ret < 0)) { 3283 - btrfs_abort_transaction(trans, ret); 3284 - goto out; 3240 + clear_reserved_extent = false; 3241 + btrfs_release_delalloc_bytes(fs_info, 3242 + ordered_extent->disk_bytenr, 3243 + ordered_extent->disk_num_bytes); 3285 3244 } 3286 3245 3287 3246 ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ··· 3381 3336 btrfs_discard_extent(fs_info, 3382 3337 ordered_extent->disk_bytenr, 3383 3338 ordered_extent->disk_num_bytes, 3384 - NULL); 3339 + NULL, true); 3385 3340 btrfs_free_reserved_extent(fs_info, 3386 3341 ordered_extent->disk_bytenr, 3387 3342 ordered_extent->disk_num_bytes, true); ··· 3463 3418 const u32 blocksize = fs_info->sectorsize; 3464 3419 const u32 step = min(blocksize, PAGE_SIZE); 3465 3420 const u32 nr_steps = blocksize / step; 3466 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 3421 + struct btrfs_csum_ctx csum; 3467 3422 3468 - shash->tfm = fs_info->csum_shash; 3469 - crypto_shash_init(shash); 3423 + btrfs_csum_init(&csum, fs_info->csum_type); 3470 3424 for (int i = 0; i < nr_steps; i++) { 3471 3425 const phys_addr_t paddr = paddrs[i]; 3472 3426 void *kaddr; 3473 3427 3474 3428 ASSERT(offset_in_page(paddr) + step <= PAGE_SIZE); 3475 3429 kaddr = kmap_local_page(phys_to_page(paddr)) + offset_in_page(paddr); 3476 - crypto_shash_update(shash, kaddr, step); 3430 + btrfs_csum_update(&csum, kaddr, step); 3477 3431 kunmap_local(kaddr); 3478 3432 
} 3479 - crypto_shash_final(shash, dest); 3433 + btrfs_csum_final(&csum, dest); 3480 3434 } 3481 3435 3482 3436 /* ··· 7181 7137 read_unlock(&em_tree->lock); 7182 7138 7183 7139 if (em) { 7184 - if (em->start > start || em->start + em->len <= start) 7140 + if (em->start > start || btrfs_extent_map_end(em) <= start) 7185 7141 btrfs_free_extent_map(em); 7186 7142 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) 7187 7143 btrfs_free_extent_map(em); ··· 9834 9790 struct extent_state *cached_state = NULL; 9835 9791 struct btrfs_ordered_extent *ordered; 9836 9792 struct btrfs_file_extent file_extent; 9793 + struct compressed_bio *cb = NULL; 9837 9794 int compression; 9838 9795 size_t orig_count; 9796 + const u32 min_folio_size = btrfs_min_folio_size(fs_info); 9839 9797 u64 start, end; 9840 9798 u64 num_bytes, ram_bytes, disk_num_bytes; 9841 - unsigned long nr_folios, i; 9842 - struct folio **folios; 9843 9799 struct btrfs_key ins; 9844 9800 bool extent_reserved = false; 9845 9801 struct extent_map *em; ··· 9928 9884 * isn't. 
9929 9885 */ 9930 9886 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); 9931 - nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); 9932 - folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT); 9933 - if (!folios) 9934 - return -ENOMEM; 9935 - for (i = 0; i < nr_folios; i++) { 9936 - size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); 9887 + 9888 + cb = btrfs_alloc_compressed_write(inode, start, num_bytes); 9889 + for (int i = 0; i * min_folio_size < disk_num_bytes; i++) { 9890 + struct folio *folio; 9891 + size_t bytes = min(min_folio_size, iov_iter_count(from)); 9937 9892 char *kaddr; 9938 9893 9939 - folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0); 9940 - if (!folios[i]) { 9894 + folio = btrfs_alloc_compr_folio(fs_info); 9895 + if (!folio) { 9941 9896 ret = -ENOMEM; 9942 - goto out_folios; 9897 + goto out_cb; 9943 9898 } 9944 - kaddr = kmap_local_folio(folios[i], 0); 9945 - if (copy_from_iter(kaddr, bytes, from) != bytes) { 9946 - kunmap_local(kaddr); 9947 - ret = -EFAULT; 9948 - goto out_folios; 9949 - } 9950 - if (bytes < PAGE_SIZE) 9951 - memset(kaddr + bytes, 0, PAGE_SIZE - bytes); 9899 + kaddr = kmap_local_folio(folio, 0); 9900 + ret = copy_from_iter(kaddr, bytes, from); 9952 9901 kunmap_local(kaddr); 9902 + if (ret != bytes) { 9903 + folio_put(folio); 9904 + ret = -EFAULT; 9905 + goto out_cb; 9906 + } 9907 + if (bytes < min_folio_size) 9908 + folio_zero_range(folio, bytes, min_folio_size - bytes); 9909 + ret = bio_add_folio(&cb->bbio.bio, folio, folio_size(folio), 0); 9910 + if (unlikely(!ret)) { 9911 + folio_put(folio); 9912 + ret = -EINVAL; 9913 + goto out_cb; 9914 + } 9953 9915 } 9916 + ASSERT(cb->bbio.bio.bi_iter.bi_size == disk_num_bytes); 9954 9917 9955 9918 for (;;) { 9956 9919 ret = btrfs_wait_ordered_range(inode, start, num_bytes); 9957 9920 if (ret) 9958 - goto out_folios; 9921 + goto out_cb; 9959 9922 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, 9960 9923 start >> PAGE_SHIFT, 9961 9924 end 
>> PAGE_SHIFT); 9962 9925 if (ret) 9963 - goto out_folios; 9926 + goto out_cb; 9964 9927 btrfs_lock_extent(io_tree, start, end, &cached_state); 9965 9928 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); 9966 9929 if (!ordered && ··· 9999 9948 encoded->unencoded_offset == 0 && 10000 9949 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) { 10001 9950 ret = __cow_file_range_inline(inode, encoded->len, 10002 - orig_count, compression, folios[0], 9951 + orig_count, compression, 9952 + bio_first_folio_all(&cb->bbio.bio), 10003 9953 true); 10004 9954 if (ret <= 0) { 10005 9955 if (ret == 0) ··· 10045 9993 10046 9994 btrfs_delalloc_release_extents(inode, num_bytes); 10047 9995 10048 - btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false); 9996 + btrfs_submit_compressed_write(ordered, cb); 10049 9997 ret = orig_count; 10050 9998 goto out; 10051 9999 ··· 10067 10015 btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); 10068 10016 out_unlock: 10069 10017 btrfs_unlock_extent(io_tree, start, end, &cached_state); 10070 - out_folios: 10071 - for (i = 0; i < nr_folios; i++) { 10072 - if (folios[i]) 10073 - folio_put(folios[i]); 10074 - } 10075 - kvfree(folios); 10018 + out_cb: 10019 + if (cb) 10020 + cleanup_compressed_bio(cb); 10076 10021 out: 10077 10022 if (ret >= 0) 10078 10023 iocb->ki_pos += encoded->len;

+18 -28

fs/btrfs/ioctl.c

··· 1107 1107 bool readonly, 1108 1108 struct btrfs_qgroup_inherit *inherit) 1109 1109 { 1110 - int ret = 0; 1110 + int ret; 1111 1111 struct qstr qname = QSTR_INIT(name, strlen(name)); 1112 1112 1113 1113 if (!S_ISDIR(file_inode(file)->i_mode)) ··· 1115 1115 1116 1116 ret = mnt_want_write_file(file); 1117 1117 if (ret) 1118 - goto out; 1118 + return ret; 1119 1119 1120 1120 if (strchr(name, '/')) { 1121 1121 ret = -EINVAL; ··· 1167 1167 } 1168 1168 out_drop_write: 1169 1169 mnt_drop_write_file(file); 1170 - out: 1171 1170 return ret; 1172 1171 } 1173 1172 ··· 1282 1283 struct btrfs_trans_handle *trans; 1283 1284 u64 root_flags; 1284 1285 u64 flags; 1285 - int ret = 0; 1286 + int ret; 1286 1287 1287 1288 if (!inode_owner_or_capable(file_mnt_idmap(file), inode)) 1288 1289 return -EPERM; 1289 1290 1290 1291 ret = mnt_want_write_file(file); 1291 1292 if (ret) 1292 - goto out; 1293 + return ret; 1293 1294 1294 1295 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) { 1295 1296 ret = -EINVAL; ··· 1358 1359 up_write(&fs_info->subvol_sem); 1359 1360 out_drop_write: 1360 1361 mnt_drop_write_file(file); 1361 - out: 1362 1362 return ret; 1363 1363 } 1364 1364 ··· 1423 1425 continue; 1424 1426 1425 1427 if (sizeof(sh) + item_len > *buf_size) { 1426 - if (*num_found) { 1427 - ret = 1; 1428 - goto out; 1429 - } 1428 + if (*num_found) 1429 + return 1; 1430 1430 1431 1431 /* 1432 1432 * return one empty item back for v1, which does not ··· 1436 1440 ret = -EOVERFLOW; 1437 1441 } 1438 1442 1439 - if (sizeof(sh) + item_len + *sk_offset > *buf_size) { 1440 - ret = 1; 1441 - goto out; 1442 - } 1443 + if (sizeof(sh) + item_len + *sk_offset > *buf_size) 1444 + return 1; 1443 1445 1444 1446 sh.objectid = key->objectid; 1445 1447 sh.type = key->type; ··· 1451 1457 * problem. 
Otherwise we'll fault and then copy the buffer in 1452 1458 * properly this next time through 1453 1459 */ 1454 - if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { 1455 - ret = 0; 1456 - goto out; 1457 - } 1460 + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) 1461 + return 0; 1458 1462 1459 1463 *sk_offset += sizeof(sh); 1460 1464 ··· 1464 1472 */ 1465 1473 if (read_extent_buffer_to_user_nofault(leaf, up, 1466 1474 item_off, item_len)) { 1467 - ret = 0; 1468 1475 *sk_offset -= sizeof(sh); 1469 - goto out; 1476 + return 0; 1470 1477 } 1471 1478 1472 1479 *sk_offset += item_len; 1473 1480 } 1474 1481 (*num_found)++; 1475 1482 1476 - if (ret) /* -EOVERFLOW from above */ 1477 - goto out; 1483 + /* -EOVERFLOW from above. */ 1484 + if (ret) 1485 + return ret; 1478 1486 1479 - if (*num_found >= sk->nr_items) { 1480 - ret = 1; 1481 - goto out; 1482 - } 1487 + if (*num_found >= sk->nr_items) 1488 + return 1; 1483 1489 } 1484 1490 advance_key: 1485 1491 ret = 0; ··· 1497 1507 key->objectid++; 1498 1508 } else 1499 1509 ret = 1; 1500 - out: 1510 + 1501 1511 /* 1502 1512 * 0: all items from this leaf copied, continue with next 1503 1513 * 1: * more items can be copied, but unused buffer is too small ··· 4921 4931 4922 4932 int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) 4923 4933 { 4924 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file))))) 4934 + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(cmd->file)))) 4925 4935 return -EIO; 4926 4936 4927 4937 switch (cmd->cmd_op) {

+1

fs/btrfs/locking.c

··· 73 73 { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, 74 74 { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, 75 75 { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, 76 + { .id = BTRFS_REMAP_TREE_OBJECTID, DEFINE_NAME("remap") }, 76 77 { .id = 0, DEFINE_NAME("tree") }, 77 78 }; 78 79

+195 -100

fs/btrfs/lzo.c

··· 123 123 } 124 124 125 125 /* 126 + * Write data into @out_folio and queue it into @out_bio. 127 + * 128 + * Return 0 if everything is fine and @total_out will be increased. 129 + * Return <0 for error. 130 + * 131 + * The @out_folio can be NULL after a full folio is queued. 132 + * Thus the caller should check and allocate a new folio when needed. 133 + */ 134 + static int write_and_queue_folio(struct bio *out_bio, struct folio **out_folio, 135 + u32 *total_out, u32 write_len) 136 + { 137 + const u32 fsize = folio_size(*out_folio); 138 + const u32 foffset = offset_in_folio(*out_folio, *total_out); 139 + 140 + ASSERT(out_folio && *out_folio); 141 + /* Should not cross folio boundary. */ 142 + ASSERT(foffset + write_len <= fsize); 143 + 144 + /* We can not use bio_add_folio_nofail() which doesn't do any merge. */ 145 + if (!bio_add_folio(out_bio, *out_folio, write_len, foffset)) { 146 + /* 147 + * We have allocated a bio that has BTRFS_MAX_COMPRESSED_PAGES 148 + * vecs, and all ranges inside the same folio should have been 149 + * merged. If bio_add_folio() still failed, that means we have 150 + * reached the bvec limits. 151 + * 152 + * This should only happen at the beginning of a folio, and 153 + * caller is responsible for releasing the folio, since it's 154 + * not yet queued into the bio. 155 + */ 156 + ASSERT(IS_ALIGNED(*total_out, fsize)); 157 + return -E2BIG; 158 + } 159 + 160 + *total_out += write_len; 161 + /* 162 + * The full folio has been filled and queued, reset @out_folio to NULL, 163 + * so that error handling is fully handled by the bio. 164 + */ 165 + if (IS_ALIGNED(*total_out, fsize)) 166 + *out_folio = NULL; 167 + return 0; 168 + } 169 + 170 + /* 171 + * Copy compressed data to bio. 172 + * 173 + * @out_bio: The bio that will contain all the compressed data. 174 + * @compressed_data: The compressed data of this segment. 175 + * @compressed_size: The size of the compressed data. 
176 + * @out_folio: The current output folio, will be updated if a new 177 + * folio is allocated. 178 + * @total_out: The total bytes of current output. 179 + * @max_out: The maximum size of the compressed data. 180 + * 126 181 * Will do: 127 182 * 128 183 * - Write a segment header into the destination 129 184 * - Copy the compressed buffer into the destination 130 185 * - Make sure we have enough space in the last sector to fit a segment header 131 186 * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. 187 + * - If a full folio is filled, it will be queued into @out_bio, and @out_folio 188 + * will be updated. 132 189 * 133 190 * Will allocate new pages when needed. 134 191 */ 135 - static int copy_compressed_data_to_page(struct btrfs_fs_info *fs_info, 136 - char *compressed_data, 137 - size_t compressed_size, 138 - struct folio **out_folios, 139 - unsigned long max_nr_folio, 140 - u32 *cur_out) 192 + static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, 193 + struct bio *out_bio, 194 + const char *compressed_data, 195 + size_t compressed_size, 196 + struct folio **out_folio, 197 + u32 *total_out, u32 max_out) 141 198 { 142 199 const u32 sectorsize = fs_info->sectorsize; 143 - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 200 + const u32 sectorsize_bits = fs_info->sectorsize_bits; 201 + const u32 fsize = btrfs_min_folio_size(fs_info); 202 + const u32 old_size = out_bio->bi_iter.bi_size; 203 + u32 copy_start; 144 204 u32 sector_bytes_left; 145 - u32 orig_out; 146 - struct folio *cur_folio; 147 205 char *kaddr; 206 + int ret; 148 207 149 - if ((*cur_out >> min_folio_shift) >= max_nr_folio) 150 - return -E2BIG; 208 + ASSERT(out_folio); 209 + 210 + /* There should be at least a lzo header queued. 
*/ 211 + ASSERT(old_size); 212 + ASSERT(old_size == *total_out); 151 213 152 214 /* 153 215 * We never allow a segment header crossing sector boundary, previous 154 216 * run should ensure we have enough space left inside the sector. 155 217 */ 156 - ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); 218 + ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); 157 219 158 - cur_folio = out_folios[*cur_out >> min_folio_shift]; 159 - /* Allocate a new page */ 160 - if (!cur_folio) { 161 - cur_folio = btrfs_alloc_compr_folio(fs_info); 162 - if (!cur_folio) 220 + if (!*out_folio) { 221 + *out_folio = btrfs_alloc_compr_folio(fs_info); 222 + if (!*out_folio) 163 223 return -ENOMEM; 164 - out_folios[*cur_out >> min_folio_shift] = cur_folio; 165 224 } 166 225 167 - kaddr = kmap_local_folio(cur_folio, offset_in_folio(cur_folio, *cur_out)); 226 + /* Write the segment header first. */ 227 + kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); 168 228 write_compress_length(kaddr, compressed_size); 169 - *cur_out += LZO_LEN; 229 + kunmap_local(kaddr); 230 + ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); 231 + if (ret < 0) 232 + return ret; 170 233 171 - orig_out = *cur_out; 234 + copy_start = *total_out; 172 235 173 - /* Copy compressed data */ 174 - while (*cur_out - orig_out < compressed_size) { 175 - u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, 176 - orig_out + compressed_size - *cur_out); 236 + /* Copy compressed data. */ 237 + while (*total_out - copy_start < compressed_size) { 238 + u32 copy_len = min_t(u32, sectorsize - *total_out % sectorsize, 239 + copy_start + compressed_size - *total_out); 240 + u32 foffset = *total_out & (fsize - 1); 177 241 178 - kunmap_local(kaddr); 179 - 180 - if ((*cur_out >> min_folio_shift) >= max_nr_folio) 242 + /* With the range copied, we're larger than the original range. 
*/ 243 + if (((*total_out + copy_len) >> sectorsize_bits) >= 244 + max_out >> sectorsize_bits) 181 245 return -E2BIG; 182 246 183 - cur_folio = out_folios[*cur_out >> min_folio_shift]; 184 - /* Allocate a new page */ 185 - if (!cur_folio) { 186 - cur_folio = btrfs_alloc_compr_folio(fs_info); 187 - if (!cur_folio) 247 + if (!*out_folio) { 248 + *out_folio = btrfs_alloc_compr_folio(fs_info); 249 + if (!*out_folio) 188 250 return -ENOMEM; 189 - out_folios[*cur_out >> min_folio_shift] = cur_folio; 190 251 } 191 - kaddr = kmap_local_folio(cur_folio, 0); 192 252 193 - memcpy(kaddr + offset_in_folio(cur_folio, *cur_out), 194 - compressed_data + *cur_out - orig_out, copy_len); 195 - 196 - *cur_out += copy_len; 253 + kaddr = kmap_local_folio(*out_folio, foffset); 254 + memcpy(kaddr, compressed_data + *total_out - copy_start, copy_len); 255 + kunmap_local(kaddr); 256 + ret = write_and_queue_folio(out_bio, out_folio, total_out, copy_len); 257 + if (ret < 0) 258 + return ret; 197 259 } 198 260 199 261 /* 200 262 * Check if we can fit the next segment header into the remaining space 201 263 * of the sector. 
202 264 */ 203 - sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; 265 + sector_bytes_left = round_up(*total_out, sectorsize) - *total_out; 204 266 if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) 205 - goto out; 267 + return 0; 268 + 269 + ASSERT(*out_folio); 206 270 207 271 /* The remaining size is not enough, pad it with zeros */ 208 - memset(kaddr + offset_in_page(*cur_out), 0, 209 - sector_bytes_left); 210 - *cur_out += sector_bytes_left; 211 - 212 - out: 213 - kunmap_local(kaddr); 214 - return 0; 272 + folio_zero_range(*out_folio, offset_in_folio(*out_folio, *total_out), sector_bytes_left); 273 + return write_and_queue_folio(out_bio, out_folio, total_out, sector_bytes_left); 215 274 } 216 275 217 - int lzo_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 218 - u64 start, struct folio **folios, unsigned long *out_folios, 219 - unsigned long *total_in, unsigned long *total_out) 276 + int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) 220 277 { 278 + struct btrfs_inode *inode = cb->bbio.inode; 221 279 struct btrfs_fs_info *fs_info = inode->root->fs_info; 222 280 struct workspace *workspace = list_entry(ws, struct workspace, list); 281 + struct bio *bio = &cb->bbio.bio; 282 + const u64 start = cb->start; 283 + const u32 len = cb->len; 223 284 const u32 sectorsize = fs_info->sectorsize; 224 285 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 225 286 struct address_space *mapping = inode->vfs_inode.i_mapping; 226 287 struct folio *folio_in = NULL; 288 + struct folio *folio_out = NULL; 227 289 char *sizes_ptr; 228 - const unsigned long max_nr_folio = *out_folios; 229 290 int ret = 0; 230 - /* Points to the file offset of input data */ 291 + /* Points to the file offset of input data. */ 231 292 u64 cur_in = start; 232 - /* Points to the current output byte */ 233 - u32 cur_out = 0; 234 - u32 len = *total_out; 293 + /* Points to the current output byte. 
*/ 294 + u32 total_out = 0; 235 295 236 - ASSERT(max_nr_folio > 0); 237 - *out_folios = 0; 238 - *total_out = 0; 239 - *total_in = 0; 296 + ASSERT(bio->bi_iter.bi_size == 0); 297 + ASSERT(len); 240 298 241 - /* 242 - * Skip the header for now, we will later come back and write the total 243 - * compressed size 244 - */ 245 - cur_out += LZO_LEN; 299 + folio_out = btrfs_alloc_compr_folio(fs_info); 300 + if (!folio_out) 301 + return -ENOMEM; 302 + 303 + /* Queue a segment header first. */ 304 + ret = write_and_queue_folio(bio, &folio_out, &total_out, LZO_LEN); 305 + /* The first header should not fail. */ 306 + ASSERT(ret == 0); 307 + 246 308 while (cur_in < start + len) { 247 309 char *data_in; 248 310 const u32 sectorsize_mask = sectorsize - 1; ··· 312 250 u32 in_len; 313 251 size_t out_len; 314 252 315 - /* Get the input page first */ 253 + /* Get the input page first. */ 316 254 if (!folio_in) { 317 255 ret = btrfs_compress_filemap_get_folio(mapping, cur_in, &folio_in); 318 256 if (ret < 0) 319 257 goto out; 320 258 } 321 259 322 - /* Compress at most one sector of data each time */ 260 + /* Compress at most one sector of data each time. */ 323 261 in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); 324 262 ASSERT(in_len); 325 263 data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); 326 - ret = lzo1x_1_compress(data_in, in_len, 327 - workspace->cbuf, &out_len, 264 + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, 328 265 workspace->mem); 329 266 kunmap_local(data_in); 330 267 if (unlikely(ret < 0)) { ··· 332 271 goto out; 333 272 } 334 273 335 - ret = copy_compressed_data_to_page(fs_info, workspace->cbuf, out_len, 336 - folios, max_nr_folio, 337 - &cur_out); 274 + ret = copy_compressed_data_to_bio(fs_info, bio, workspace->cbuf, out_len, 275 + &folio_out, &total_out, len); 338 276 if (ret < 0) 339 277 goto out; 340 278 ··· 343 283 * Check if we're making it bigger after two sectors. 
And if 344 284 * it is so, give up. 345 285 */ 346 - if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { 286 + if (cur_in - start > sectorsize * 2 && cur_in - start < total_out) { 347 287 ret = -E2BIG; 348 288 goto out; 349 289 } 350 290 351 - /* Check if we have reached folio boundary. */ 291 + /* Check if we have reached input folio boundary. */ 352 292 if (IS_ALIGNED(cur_in, min_folio_size)) { 353 293 folio_put(folio_in); 354 294 folio_in = NULL; 355 295 } 356 296 } 297 + /* 298 + * The last folio is already queued. Bio is responsible for freeing 299 + * those folios now. 300 + */ 301 + folio_out = NULL; 357 302 358 303 /* Store the size of all chunks of compressed data */ 359 - sizes_ptr = kmap_local_folio(folios[0], 0); 360 - write_compress_length(sizes_ptr, cur_out); 304 + sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); 305 + write_compress_length(sizes_ptr, total_out); 361 306 kunmap_local(sizes_ptr); 362 - 363 - ret = 0; 364 - *total_out = cur_out; 365 - *total_in = cur_in - start; 366 307 out: 308 + /* 309 + * We can only free the folio that has no part queued into the bio. 310 + * 311 + * As any folio that is already queued into bio will be released by 312 + * the endio function of bio. 313 + */ 314 + if (folio_out && IS_ALIGNED(total_out, min_folio_size)) { 315 + btrfs_free_compr_folio(folio_out); 316 + folio_out = NULL; 317 + } 367 318 if (folio_in) 368 319 folio_put(folio_in); 369 - *out_folios = DIV_ROUND_UP(cur_out, min_folio_size); 370 320 return ret; 321 + } 322 + 323 + static struct folio *get_current_folio(struct compressed_bio *cb, struct folio_iter *fi, 324 + u32 *cur_folio_index, u32 cur_in) 325 + { 326 + struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 327 + const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 328 + 329 + ASSERT(cur_folio_index); 330 + 331 + /* Need to switch to the next folio. 
*/ 332 + if (cur_in >> min_folio_shift != *cur_folio_index) { 333 + /* We can only do the switch one folio a time. */ 334 + ASSERT(cur_in >> min_folio_shift == *cur_folio_index + 1); 335 + 336 + bio_next_folio(fi, &cb->bbio.bio); 337 + (*cur_folio_index)++; 338 + } 339 + return fi->folio; 371 340 } 372 341 373 342 /* ··· 405 316 * For the payload there will be no padding, just need to do page switching. 406 317 */ 407 318 static void copy_compressed_segment(struct compressed_bio *cb, 319 + struct folio_iter *fi, u32 *cur_folio_index, 408 320 char *dest, u32 len, u32 *cur_in) 409 321 { 410 - struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 411 - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 412 322 u32 orig_in = *cur_in; 413 323 414 324 while (*cur_in < orig_in + len) { 415 - struct folio *cur_folio = cb->compressed_folios[*cur_in >> min_folio_shift]; 416 - u32 copy_len = min_t(u32, orig_in + len - *cur_in, 417 - folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); 325 + struct folio *cur_folio = get_current_folio(cb, fi, cur_folio_index, *cur_in); 326 + u32 copy_len; 418 327 328 + ASSERT(cur_folio); 329 + copy_len = min_t(u32, orig_in + len - *cur_in, 330 + folio_size(cur_folio) - offset_in_folio(cur_folio, *cur_in)); 419 331 ASSERT(copy_len); 420 332 421 333 memcpy_from_folio(dest + *cur_in - orig_in, cur_folio, ··· 431 341 struct workspace *workspace = list_entry(ws, struct workspace, list); 432 342 const struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; 433 343 const u32 sectorsize = fs_info->sectorsize; 434 - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 344 + struct folio_iter fi; 435 345 char *kaddr; 436 346 int ret; 437 347 /* Compressed data length, can be unaligned */ ··· 440 350 u32 cur_in = 0; 441 351 /* Bytes decompressed so far */ 442 352 u32 cur_out = 0; 353 + /* The current folio index number inside the bio. 
*/ 354 + u32 cur_folio_index = 0; 443 355 444 - kaddr = kmap_local_folio(cb->compressed_folios[0], 0); 356 + bio_first_folio(&fi, &cb->bbio.bio, 0); 357 + /* There must be a compressed folio and matches the sectorsize. */ 358 + if (unlikely(!fi.folio)) 359 + return -EINVAL; 360 + ASSERT(folio_size(fi.folio) == sectorsize); 361 + kaddr = kmap_local_folio(fi.folio, 0); 445 362 len_in = read_compress_length(kaddr); 446 363 kunmap_local(kaddr); 447 364 cur_in += LZO_LEN; ··· 485 388 */ 486 389 ASSERT(cur_in / sectorsize == 487 390 (cur_in + LZO_LEN - 1) / sectorsize); 488 - cur_folio = cb->compressed_folios[cur_in >> min_folio_shift]; 391 + cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); 489 392 ASSERT(cur_folio); 490 393 kaddr = kmap_local_folio(cur_folio, 0); 491 394 seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); ··· 507 410 } 508 411 509 412 /* Copy the compressed segment payload into workspace */ 510 - copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); 413 + copy_compressed_segment(cb, &fi, &cur_folio_index, workspace->cbuf, 414 + seg_len, &cur_in); 511 415 512 416 /* Decompress the data */ 513 417 ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, ··· 554 456 size_t in_len; 555 457 size_t out_len; 556 458 size_t max_segment_len = workspace_buf_length(fs_info); 557 - int ret = 0; 459 + int ret; 558 460 559 461 if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) 560 462 return -EUCLEAN; ··· 565 467 data_in += LZO_LEN; 566 468 567 469 in_len = read_compress_length(data_in); 568 - if (unlikely(in_len != srclen - LZO_LEN * 2)) { 569 - ret = -EUCLEAN; 570 - goto out; 571 - } 470 + if (unlikely(in_len != srclen - LZO_LEN * 2)) 471 + return -EUCLEAN; 572 472 data_in += LZO_LEN; 573 473 574 474 out_len = sectorsize; ··· 578 482 "lzo decompression failed, error %d root %llu inode %llu offset %llu", 579 483 ret, btrfs_root_id(inode->root), btrfs_ino(inode), 580 484 folio_pos(dest_folio)); 
581 - ret = -EIO; 582 - goto out; 485 + return -EIO; 583 486 } 584 487 585 488 ASSERT(out_len <= sectorsize); 586 489 memcpy_to_folio(dest_folio, dest_pgoff, workspace->buf, out_len); 587 490 /* Early end, considered as an error. */ 588 491 if (unlikely(out_len < destlen)) { 589 - ret = -EIO; 590 492 folio_zero_range(dest_folio, dest_pgoff + out_len, destlen - out_len); 493 + return -EIO; 591 494 } 592 - out: 593 - return ret; 495 + 496 + return 0; 594 497 } 595 498 596 499 const struct btrfs_compress_levels btrfs_lzo_compress = {

+6 -20

fs/btrfs/messages.c

··· 211 211 RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), 212 212 }; 213 213 214 - void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 214 + __printf(3, 4) __cold 215 + void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...) 215 216 { 216 - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; 217 217 struct va_format vaf; 218 218 va_list args; 219 - int kern_level; 220 - const char *type = logtypes[4]; 221 - struct ratelimit_state *ratelimit = &printk_limits[4]; 219 + const char *type = logtypes[level]; 220 + struct ratelimit_state *ratelimit = &printk_limits[level]; 222 221 223 222 #ifdef CONFIG_PRINTK_INDEX 224 223 printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); 225 224 #endif 226 225 227 226 va_start(args, fmt); 228 - 229 - while ((kern_level = printk_get_level(fmt)) != 0) { 230 - size_t size = printk_skip_level(fmt) - fmt; 231 - 232 - if (kern_level >= '0' && kern_level <= '7') { 233 - memcpy(lvl, fmt, size); 234 - lvl[size] = '\0'; 235 - type = logtypes[kern_level - '0']; 236 - ratelimit = &printk_limits[kern_level - '0']; 237 - } 238 - fmt += size; 239 - } 240 - 241 227 vaf.fmt = fmt; 242 228 vaf.va = &args; 243 229 ··· 233 247 char statestr[STATE_STRING_BUF_LEN]; 234 248 235 249 btrfs_state_to_string(fs_info, statestr); 236 - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, 250 + _printk(KERN_SOH "%dBTRFS %s (device %s%s): %pV\n", level, type, 237 251 fs_info->sb->s_id, statestr, &vaf); 238 252 } else { 239 - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); 253 + _printk(KERN_SOH "%dBTRFS %s: %pV\n", level, type, &vaf); 240 254 } 241 255 } 242 256

+33 -43

fs/btrfs/messages.h

··· 23 23 24 24 #ifdef CONFIG_PRINTK 25 25 26 - #define btrfs_printk(fs_info, fmt, args...) \ 27 - _btrfs_printk(fs_info, fmt, ##args) 28 - 29 - __printf(2, 3) 30 - __cold 31 - void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); 26 + __printf(3, 4) __cold 27 + void _btrfs_printk(const struct btrfs_fs_info *fs_info, unsigned int level, const char *fmt, ...); 32 28 33 29 #else 34 30 35 - #define btrfs_printk(fs_info, fmt, args...) \ 31 + #define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ 36 32 btrfs_no_printk(fs_info, fmt, ##args) 33 + 34 + #define btrfs_printk_in_rcu(fs_info, level, fmt, args...) \ 35 + btrfs_no_printk(fs_info, fmt, ##args) 36 + 37 + #define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \ 38 + btrfs_no_printk(fs_info, fmt, ##args) 39 + 37 40 #endif 38 41 39 42 /* 40 43 * Print a message with filesystem info, enclosed in RCU protection. 41 44 */ 42 45 #define btrfs_crit(fs_info, fmt, args...) \ 43 - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) 46 + btrfs_printk_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) 44 47 #define btrfs_err(fs_info, fmt, args...) \ 45 - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) 48 + btrfs_printk_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) 46 49 #define btrfs_warn(fs_info, fmt, args...) \ 47 - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) 50 + btrfs_printk_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) 48 51 #define btrfs_info(fs_info, fmt, args...) \ 49 - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) 52 + btrfs_printk_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) 50 53 51 54 /* 52 55 * Wrappers that use a ratelimited printk 53 56 */ 54 57 #define btrfs_crit_rl(fs_info, fmt, args...) \ 55 - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) 58 + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_CRIT, fmt, ##args) 56 59 #define btrfs_err_rl(fs_info, fmt, args...) 
\ 57 - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) 60 + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_ERR, fmt, ##args) 58 61 #define btrfs_warn_rl(fs_info, fmt, args...) \ 59 - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) 62 + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_WARNING, fmt, ##args) 60 63 #define btrfs_info_rl(fs_info, fmt, args...) \ 61 - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) 64 + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_INFO, fmt, ##args) 62 65 63 66 #if defined(CONFIG_DYNAMIC_DEBUG) 64 67 #define btrfs_debug(fs_info, fmt, args...) \ 65 68 _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ 66 - fs_info, KERN_DEBUG fmt, ##args) 69 + fs_info, LOGLEVEL_DEBUG, fmt, ##args) 67 70 #define btrfs_debug_rl(fs_info, fmt, args...) \ 68 71 _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ 69 - fs_info, KERN_DEBUG fmt, ##args) 72 + fs_info, LOGLEVEL_DEBUG, fmt, ##args) 70 73 #elif defined(DEBUG) 71 74 #define btrfs_debug(fs_info, fmt, args...) \ 72 - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) 75 + btrfs_printk_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) 73 76 #define btrfs_debug_rl(fs_info, fmt, args...) \ 74 - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) 77 + btrfs_printk_rl_in_rcu(fs_info, LOGLEVEL_DEBUG, fmt, ##args) 75 78 #else 76 79 /* When printk() is no_printk(), expand to no-op. */ 77 80 #define btrfs_debug(fs_info, fmt, args...) do { (void)(fs_info); } while(0) 78 81 #define btrfs_debug_rl(fs_info, fmt, args...) do { (void)(fs_info); } while(0) 79 82 #endif 80 83 81 - #define btrfs_printk_in_rcu(fs_info, fmt, args...) \ 82 - do { \ 83 - rcu_read_lock(); \ 84 - btrfs_printk(fs_info, fmt, ##args); \ 85 - rcu_read_unlock(); \ 84 + #ifdef CONFIG_PRINTK 85 + 86 + #define btrfs_printk_in_rcu(fs_info, level, fmt, args...) 
\ 87 + do { \ 88 + rcu_read_lock(); \ 89 + _btrfs_printk(fs_info, level, fmt, ##args); \ 90 + rcu_read_unlock(); \ 86 91 } while (0) 87 92 88 - #define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ 93 + #define btrfs_printk_rl_in_rcu(fs_info, level, fmt, args...) \ 89 94 do { \ 90 95 static DEFINE_RATELIMIT_STATE(_rs, \ 91 96 DEFAULT_RATELIMIT_INTERVAL, \ ··· 98 93 \ 99 94 rcu_read_lock(); \ 100 95 if (__ratelimit(&_rs)) \ 101 - btrfs_printk(fs_info, fmt, ##args); \ 96 + _btrfs_printk(fs_info, level, fmt, ##args); \ 102 97 rcu_read_unlock(); \ 103 98 } while (0) 99 + 100 + #endif 104 101 105 102 #ifdef CONFIG_BTRFS_ASSERT 106 103 ··· 120 113 */ 121 114 #define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ 122 115 123 - #if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 124 116 /* 125 117 * Assertion with optional printk() format. 126 118 * ··· 156 150 BUG(); \ 157 151 } \ 158 152 } while(0) 159 - 160 - #else 161 - 162 - /* For GCC < 8.x only the simple output. */ 163 - 164 - #define ASSERT(cond, args...) \ 165 - do { \ 166 - verify_assert_printk_format("check the format string" args); \ 167 - if (!likely(cond)) { \ 168 - pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ 169 - #cond, (long)(cond), __FILE__, __LINE__); \ 170 - BUG(); \ 171 - } \ 172 - } while(0) 173 - 174 - #endif 175 153 176 154 #else 177 155 /* Compile check the @cond expression but don't generate any code. */

+87 -38

fs/btrfs/qgroup.c

··· 346 346 } 347 347 #endif 348 348 349 + static bool squota_check_parent_usage(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *parent) 350 + { 351 + u64 excl_sum = 0; 352 + u64 rfer_sum = 0; 353 + u64 excl_cmpr_sum = 0; 354 + u64 rfer_cmpr_sum = 0; 355 + struct btrfs_qgroup_list *glist; 356 + int nr_members = 0; 357 + bool mismatch; 358 + 359 + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) 360 + return false; 361 + if (btrfs_qgroup_level(parent->qgroupid) == 0) 362 + return false; 363 + 364 + /* Eligible parent qgroup. Squota; level > 0; empty members list. */ 365 + list_for_each_entry(glist, &parent->members, next_member) { 366 + excl_sum += glist->member->excl; 367 + rfer_sum += glist->member->rfer; 368 + excl_cmpr_sum += glist->member->excl_cmpr; 369 + rfer_cmpr_sum += glist->member->rfer_cmpr; 370 + nr_members++; 371 + } 372 + mismatch = (parent->excl != excl_sum || parent->rfer != rfer_sum || 373 + parent->excl_cmpr != excl_cmpr_sum || parent->rfer_cmpr != rfer_cmpr_sum); 374 + 375 + WARN(mismatch, 376 + "parent squota qgroup %hu/%llu has mismatched usage from its %d members. " 377 + "%llu %llu %llu %llu vs %llu %llu %llu %llu\n", 378 + btrfs_qgroup_level(parent->qgroupid), 379 + btrfs_qgroup_subvolid(parent->qgroupid), nr_members, parent->excl, 380 + parent->rfer, parent->excl_cmpr, parent->rfer_cmpr, excl_sum, 381 + rfer_sum, excl_cmpr_sum, rfer_cmpr_sum); 382 + return mismatch; 383 + } 384 + 349 385 __printf(2, 3) 350 386 static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info, const char *fmt, ...) 
351 387 { ··· 694 658 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, 695 659 u64 dst) 696 660 { 697 - int ret; 698 661 struct btrfs_root *quota_root = trans->fs_info->quota_root; 699 662 BTRFS_PATH_AUTO_FREE(path); 700 663 struct btrfs_key key; ··· 706 671 key.type = BTRFS_QGROUP_RELATION_KEY; 707 672 key.offset = dst; 708 673 709 - ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 710 - return ret; 674 + return btrfs_insert_empty_item(trans, quota_root, path, &key, 0); 711 675 } 712 676 713 677 static int del_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ··· 831 797 if (ret > 0) 832 798 return -ENOENT; 833 799 834 - ret = btrfs_del_item(trans, quota_root, path); 835 - 836 - return ret; 800 + return btrfs_del_item(trans, quota_root, path); 837 801 } 838 802 839 803 static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, ··· 1594 1562 goto out; 1595 1563 } 1596 1564 ret = quick_update_accounting(fs_info, src, dst, 1); 1565 + squota_check_parent_usage(fs_info, parent); 1597 1566 spin_unlock(&fs_info->qgroup_lock); 1598 1567 out: 1599 1568 kfree(prealloc); ··· 1613 1580 int ret = 0; 1614 1581 int ret2; 1615 1582 1616 - if (!fs_info->quota_root) { 1617 - ret = -ENOTCONN; 1618 - goto out; 1619 - } 1583 + if (!fs_info->quota_root) 1584 + return -ENOTCONN; 1620 1585 1621 1586 member = find_qgroup_rb(fs_info, src); 1622 1587 parent = find_qgroup_rb(fs_info, dst); ··· 1636 1605 delete_item: 1637 1606 ret = del_qgroup_relation_item(trans, src, dst); 1638 1607 if (ret < 0 && ret != -ENOENT) 1639 - goto out; 1608 + return ret; 1640 1609 ret2 = del_qgroup_relation_item(trans, dst, src); 1641 1610 if (ret2 < 0 && ret2 != -ENOENT) 1642 - goto out; 1611 + return ret2; 1643 1612 1644 1613 /* At least one deletion succeeded, return 0 */ 1645 1614 if (!ret || !ret2) ··· 1649 1618 spin_lock(&fs_info->qgroup_lock); 1650 1619 del_relation_rb(fs_info, src, dst); 1651 1620 ret = quick_update_accounting(fs_info, 
src, dst, -1); 1621 + ASSERT(parent); 1622 + squota_check_parent_usage(fs_info, parent); 1652 1623 spin_unlock(&fs_info->qgroup_lock); 1653 1624 } 1654 - out: 1625 + 1655 1626 return ret; 1656 1627 } 1657 1628 ··· 1712 1679 return ret; 1713 1680 } 1714 1681 1682 + static bool can_delete_parent_qgroup(struct btrfs_qgroup *qgroup) 1683 + 1684 + { 1685 + ASSERT(btrfs_qgroup_level(qgroup->qgroupid)); 1686 + return list_empty(&qgroup->members); 1687 + } 1688 + 1689 + /* 1690 + * Return true if we can delete the squota qgroup and false otherwise. 1691 + * 1692 + * Rules for whether we can delete: 1693 + * 1694 + * A subvolume qgroup can be removed iff the subvolume is fully deleted, which 1695 + * is iff there is 0 usage in the qgroup. 1696 + * 1697 + * A higher level qgroup can be removed iff it has no members. 1698 + * Note: We audit its usage to warn on inconsistencies without blocking deletion. 1699 + */ 1700 + static bool can_delete_squota_qgroup(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup) 1701 + { 1702 + ASSERT(btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); 1703 + 1704 + if (btrfs_qgroup_level(qgroup->qgroupid) > 0) { 1705 + squota_check_parent_usage(fs_info, qgroup); 1706 + return can_delete_parent_qgroup(qgroup); 1707 + } 1708 + 1709 + return !(qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr); 1710 + } 1711 + 1715 1712 /* 1716 1713 * Return 0 if we can not delete the qgroup (not empty or has children etc). 1717 1714 * Return >0 if we can delete the qgroup. ··· 1752 1689 struct btrfs_key key; 1753 1690 BTRFS_PATH_AUTO_FREE(path); 1754 1691 1755 - /* 1756 - * Squota would never be inconsistent, but there can still be case 1757 - * where a dropped subvolume still has qgroup numbers, and squota 1758 - * relies on such qgroup for future accounting. 1759 - * 1760 - * So for squota, do not allow dropping any non-zero qgroup. 
1761 - */ 1762 - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && 1763 - (qgroup->rfer || qgroup->excl || qgroup->excl_cmpr || qgroup->rfer_cmpr)) 1764 - return 0; 1692 + /* Since squotas cannot be inconsistent, they have special rules for deletion. */ 1693 + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 1694 + return can_delete_squota_qgroup(fs_info, qgroup); 1765 1695 1766 1696 /* For higher level qgroup, we can only delete it if it has no child. */ 1767 - if (btrfs_qgroup_level(qgroup->qgroupid)) { 1768 - if (!list_empty(&qgroup->members)) 1769 - return 0; 1770 - return 1; 1771 - } 1697 + if (btrfs_qgroup_level(qgroup->qgroupid)) 1698 + return can_delete_parent_qgroup(qgroup); 1772 1699 1773 1700 /* 1774 1701 * For level-0 qgroups, we can only delete it if it has no subvolume ··· 2486 2433 2487 2434 /* This node is old, no need to trace */ 2488 2435 if (child_gen < last_snapshot) 2489 - goto out; 2436 + return ret; 2490 2437 2491 2438 eb = btrfs_read_node_slot(eb, parent_slot); 2492 - if (IS_ERR(eb)) { 2493 - ret = PTR_ERR(eb); 2494 - goto out; 2495 - } 2439 + if (IS_ERR(eb)) 2440 + return PTR_ERR(eb); 2496 2441 2497 2442 dst_path->nodes[cur_level] = eb; 2498 2443 dst_path->slots[cur_level] = 0; ··· 2535 2484 dst_path->slots[cur_level] = 0; 2536 2485 dst_path->locks[cur_level] = 0; 2537 2486 } 2538 - out: 2487 + 2539 2488 return ret; 2540 2489 } 2541 2490 ··· 2647 2596 return ret; 2648 2597 } 2649 2598 2650 - if (root_level == 0) { 2651 - ret = btrfs_qgroup_trace_leaf_items(trans, root_eb); 2652 - return ret; 2653 - } 2599 + if (root_level == 0) 2600 + return btrfs_qgroup_trace_leaf_items(trans, root_eb); 2654 2601 2655 2602 path = btrfs_alloc_path(); 2656 2603 if (!path)

+8 -3

fs/btrfs/reflink.c

··· 754 754 755 755 /* 756 756 * We may have copied an inline extent into a page of the destination 757 - * range, so wait for writeback to complete before invalidating pages 758 - * from the page cache. This is a rare case. 757 + * range. So flush delalloc and wait for ordered extent completion. 758 + * This is to ensure the invalidation below does not fail, as if for 759 + * example it finds a dirty folio, our folio release callback 760 + * (btrfs_release_folio()) returns false, which makes the invalidation 761 + * return an -EBUSY error. We can't ignore such failures since they 762 + * could come from some range other than the copied inline extent's 763 + * destination range and we have no way to know that. 759 764 */ 760 765 ret = btrfs_wait_ordered_range(BTRFS_I(inode), destoff, len); 761 766 if (ret < 0) ··· 878 873 bool same_inode = dst_inode == src_inode; 879 874 int ret; 880 875 881 - if (unlikely(btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file))))) 876 + if (btrfs_is_shutdown(inode_to_fs_info(file_inode(src_file)))) 882 877 return -EIO; 883 878 884 879 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))

+1714 -51

fs/btrfs/relocation.c

··· 37 37 #include "super.h" 38 38 #include "tree-checker.h" 39 39 #include "raid-stripe-tree.h" 40 + #include "free-space-tree.h" 40 41 41 42 /* 42 43 * Relocation overview ··· 3255 3254 struct btrfs_key key; 3256 3255 bool found = false; 3257 3256 int i; 3258 - int ret; 3259 3257 3260 3258 if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID) 3261 3259 return 0; ··· 3278 3278 } 3279 3279 if (!found) 3280 3280 return -ENOENT; 3281 - ret = delete_block_group_cache(block_group, NULL, space_cache_ino); 3282 - return ret; 3281 + 3282 + return delete_block_group_cache(block_group, NULL, space_cache_ino); 3283 3283 } 3284 3284 3285 3285 /* ··· 3616 3616 btrfs_btree_balance_dirty(fs_info); 3617 3617 } 3618 3618 3619 - if (!err) { 3619 + if (!err && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { 3620 3620 ret = relocate_file_extent_cluster(rc); 3621 3621 if (ret < 0) 3622 3622 err = ret; ··· 3860 3860 return "unknown"; 3861 3861 } 3862 3862 3863 + static int add_remap_tree_entries(struct btrfs_trans_handle *trans, struct btrfs_path *path, 3864 + struct btrfs_key *entries, unsigned int num_entries) 3865 + { 3866 + int ret; 3867 + struct btrfs_fs_info *fs_info = trans->fs_info; 3868 + struct btrfs_item_batch batch; 3869 + u32 *data_sizes; 3870 + u32 max_items; 3871 + 3872 + max_items = BTRFS_LEAF_DATA_SIZE(trans->fs_info) / sizeof(struct btrfs_item); 3873 + 3874 + data_sizes = kzalloc(sizeof(u32) * min_t(u32, num_entries, max_items), GFP_NOFS); 3875 + if (!data_sizes) 3876 + return -ENOMEM; 3877 + 3878 + while (true) { 3879 + batch.keys = entries; 3880 + batch.data_sizes = data_sizes; 3881 + batch.total_data_size = 0; 3882 + batch.nr = min_t(u32, num_entries, max_items); 3883 + 3884 + ret = btrfs_insert_empty_items(trans, fs_info->remap_root, path, &batch); 3885 + btrfs_release_path(path); 3886 + 3887 + if (num_entries <= max_items) 3888 + break; 3889 + 3890 + num_entries -= max_items; 3891 + entries += max_items; 3892 + } 3893 + 3894 + kfree(data_sizes); 3895 + 3896 + 
return ret; 3897 + } 3898 + 3899 + struct space_run { 3900 + u64 start; 3901 + u64 end; 3902 + }; 3903 + 3904 + static void parse_bitmap(u64 block_size, const unsigned long *bitmap, 3905 + unsigned long size, u64 address, struct space_run *space_runs, 3906 + unsigned int *num_space_runs) 3907 + { 3908 + unsigned long pos, end; 3909 + u64 run_start, run_length; 3910 + 3911 + pos = find_first_bit(bitmap, size); 3912 + if (pos == size) 3913 + return; 3914 + 3915 + while (true) { 3916 + end = find_next_zero_bit(bitmap, size, pos); 3917 + 3918 + run_start = address + (pos * block_size); 3919 + run_length = (end - pos) * block_size; 3920 + 3921 + if (*num_space_runs != 0 && 3922 + space_runs[*num_space_runs - 1].end == run_start) { 3923 + space_runs[*num_space_runs - 1].end += run_length; 3924 + } else { 3925 + space_runs[*num_space_runs].start = run_start; 3926 + space_runs[*num_space_runs].end = run_start + run_length; 3927 + 3928 + (*num_space_runs)++; 3929 + } 3930 + 3931 + if (end == size) 3932 + break; 3933 + 3934 + pos = find_next_bit(bitmap, size, end + 1); 3935 + if (pos == size) 3936 + break; 3937 + } 3938 + } 3939 + 3940 + static void adjust_block_group_remap_bytes(struct btrfs_trans_handle *trans, 3941 + struct btrfs_block_group *bg, s64 diff) 3942 + { 3943 + struct btrfs_fs_info *fs_info = trans->fs_info; 3944 + bool bg_already_dirty = true; 3945 + bool mark_unused = false; 3946 + 3947 + spin_lock(&bg->lock); 3948 + bg->remap_bytes += diff; 3949 + if (bg->used == 0 && bg->remap_bytes == 0) 3950 + mark_unused = true; 3951 + spin_unlock(&bg->lock); 3952 + 3953 + if (mark_unused) 3954 + btrfs_mark_bg_unused(bg); 3955 + 3956 + spin_lock(&trans->transaction->dirty_bgs_lock); 3957 + if (list_empty(&bg->dirty_list)) { 3958 + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); 3959 + bg_already_dirty = false; 3960 + btrfs_get_block_group(bg); 3961 + } 3962 + spin_unlock(&trans->transaction->dirty_bgs_lock); 3963 + 3964 + /* Modified block groups are 
accounted for in the delayed_refs_rsv. */ 3965 + if (!bg_already_dirty) 3966 + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); 3967 + } 3968 + 3969 + /* Private structure for I/O from copy_remapped_data(). */ 3970 + struct reloc_io_private { 3971 + struct completion done; 3972 + refcount_t pending_refs; 3973 + blk_status_t status; 3974 + }; 3975 + 3976 + static void reloc_endio(struct btrfs_bio *bbio) 3977 + { 3978 + struct reloc_io_private *priv = bbio->private; 3979 + 3980 + if (bbio->bio.bi_status) 3981 + WRITE_ONCE(priv->status, bbio->bio.bi_status); 3982 + 3983 + if (refcount_dec_and_test(&priv->pending_refs)) 3984 + complete(&priv->done); 3985 + 3986 + bio_put(&bbio->bio); 3987 + } 3988 + 3989 + static int copy_remapped_data_io(struct btrfs_fs_info *fs_info, 3990 + struct reloc_io_private *priv, 3991 + struct page **pages, u64 addr, u64 length, 3992 + blk_opf_t op) 3993 + { 3994 + struct btrfs_bio *bbio; 3995 + int i; 3996 + 3997 + init_completion(&priv->done); 3998 + refcount_set(&priv->pending_refs, 1); 3999 + priv->status = 0; 4000 + 4001 + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, BTRFS_I(fs_info->btree_inode), 4002 + addr, reloc_endio, priv); 4003 + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); 4004 + bbio->is_remap = true; 4005 + 4006 + i = 0; 4007 + do { 4008 + size_t bytes = min_t(u64, length, PAGE_SIZE); 4009 + 4010 + if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { 4011 + refcount_inc(&priv->pending_refs); 4012 + btrfs_submit_bbio(bbio, 0); 4013 + 4014 + bbio = btrfs_bio_alloc(BIO_MAX_VECS, op, 4015 + BTRFS_I(fs_info->btree_inode), 4016 + addr, reloc_endio, priv); 4017 + bbio->bio.bi_iter.bi_sector = (addr >> SECTOR_SHIFT); 4018 + bbio->is_remap = true; 4019 + continue; 4020 + } 4021 + 4022 + i++; 4023 + addr += bytes; 4024 + length -= bytes; 4025 + } while (length); 4026 + 4027 + refcount_inc(&priv->pending_refs); 4028 + btrfs_submit_bbio(bbio, 0); 4029 + 4030 + if (!refcount_dec_and_test(&priv->pending_refs)) 4031 + 
wait_for_completion_io(&priv->done); 4032 + 4033 + return blk_status_to_errno(READ_ONCE(priv->status)); 4034 + } 4035 + 4036 + static int copy_remapped_data(struct btrfs_fs_info *fs_info, u64 old_addr, 4037 + u64 new_addr, u64 length) 4038 + { 4039 + int ret; 4040 + u64 copy_len = min_t(u64, length, SZ_1M); 4041 + struct page **pages; 4042 + struct reloc_io_private priv; 4043 + unsigned int nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); 4044 + 4045 + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); 4046 + if (!pages) 4047 + return -ENOMEM; 4048 + 4049 + ret = btrfs_alloc_page_array(nr_pages, pages, 0); 4050 + if (ret) { 4051 + ret = -ENOMEM; 4052 + goto end; 4053 + } 4054 + 4055 + /* Copy 1MB at a time, to avoid using too much memory. */ 4056 + do { 4057 + u64 to_copy = min_t(u64, length, copy_len); 4058 + 4059 + /* Limit to one bio. */ 4060 + to_copy = min_t(u64, to_copy, BIO_MAX_VECS << PAGE_SHIFT); 4061 + 4062 + ret = copy_remapped_data_io(fs_info, &priv, pages, old_addr, 4063 + to_copy, REQ_OP_READ); 4064 + if (ret) 4065 + goto end; 4066 + 4067 + ret = copy_remapped_data_io(fs_info, &priv, pages, new_addr, 4068 + to_copy, REQ_OP_WRITE); 4069 + if (ret) 4070 + goto end; 4071 + 4072 + if (to_copy == length) 4073 + break; 4074 + 4075 + old_addr += to_copy; 4076 + new_addr += to_copy; 4077 + length -= to_copy; 4078 + } while (true); 4079 + 4080 + ret = 0; 4081 + end: 4082 + for (int i = 0; i < nr_pages; i++) { 4083 + if (pages[i]) 4084 + __free_page(pages[i]); 4085 + } 4086 + kfree(pages); 4087 + 4088 + return ret; 4089 + } 4090 + 4091 + static int add_remap_item(struct btrfs_trans_handle *trans, 4092 + struct btrfs_path *path, u64 new_addr, u64 length, 4093 + u64 old_addr) 4094 + { 4095 + struct btrfs_fs_info *fs_info = trans->fs_info; 4096 + struct btrfs_remap_item remap = { 0 }; 4097 + struct btrfs_key key; 4098 + struct extent_buffer *leaf; 4099 + int ret; 4100 + 4101 + key.objectid = old_addr; 4102 + key.type = BTRFS_REMAP_KEY; 4103 + key.offset = 
length; 4104 + 4105 + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, 4106 + &key, sizeof(struct btrfs_remap_item)); 4107 + if (ret) 4108 + return ret; 4109 + 4110 + leaf = path->nodes[0]; 4111 + btrfs_set_stack_remap_address(&remap, new_addr); 4112 + write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]), 4113 + sizeof(struct btrfs_remap_item)); 4114 + 4115 + btrfs_release_path(path); 4116 + 4117 + return 0; 4118 + } 4119 + 

/*
 * Insert a backref item into the remap tree.
 *
 * The item is keyed (@new_addr, BTRFS_REMAP_BACKREF_KEY, @length) and its
 * payload is a struct btrfs_remap_item whose address is @old_addr.  @path is
 * released before a successful return.
 *
 * Returns 0 on success or the error from btrfs_insert_empty_item().
 */
static int add_remap_backref_item(struct btrfs_trans_handle *trans,
				  struct btrfs_path *path, u64 new_addr,
				  u64 length, u64 old_addr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_remap_item remap = { 0 };
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int ret;

	key.objectid = new_addr;
	key.type = BTRFS_REMAP_BACKREF_KEY;
	key.offset = length;

	ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, &key,
				      sizeof(struct btrfs_remap_item));
	if (ret)
		return ret;

	leaf = path->nodes[0];
	btrfs_set_stack_remap_address(&remap, old_addr);
	write_extent_buffer(leaf, &remap, btrfs_item_ptr_offset(leaf, path->slots[0]),
			    sizeof(struct btrfs_remap_item));

	btrfs_release_path(path);

	return 0;
}

/*
 * Move the destination of one existing remap out of @bg.
 *
 * The remap being moved maps @old_addr to @new_addr for @length bytes, with
 * @new_addr lying in @bg.  A new destination range is reserved with
 * btrfs_reserve_extent(); the allocation may be shorter than @length, in
 * which case only the first dest_length bytes are moved and the remap and
 * backref items are split so that the remainder still points into @bg.
 *
 * The function joins a transaction on the remap root itself and commits it
 * on success.  On failure the reservation is returned and, if a transaction
 * was joined, it is aborted and ended.
 *
 * Returns 0 on success (including the case where the remap item no longer
 * exists) or a negative errno.
 */
static int move_existing_remap(struct btrfs_fs_info *fs_info,
			       struct btrfs_path *path,
			       struct btrfs_block_group *bg, u64 new_addr,
			       u64 length, u64 old_addr)
{
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_remap_item *remap_ptr;
	struct btrfs_remap_item remap = { 0 };
	struct btrfs_key key, ins;
	u64 dest_addr, dest_length, min_size;
	struct btrfs_block_group *dest_bg;
	int ret;
	const bool is_data = (bg->flags & BTRFS_BLOCK_GROUP_DATA);
	struct btrfs_space_info *sinfo = bg->space_info;
	bool mutex_taken = false;
	bool bg_needs_free_space;

	spin_lock(&sinfo->lock);
	btrfs_space_info_update_bytes_may_use(sinfo, length);
	spin_unlock(&sinfo->lock);

	if (is_data)
		min_size = fs_info->sectorsize;
	else
		min_size = fs_info->nodesize;

	ret = btrfs_reserve_extent(fs_info->fs_root, length, length, min_size,
				   0, 0, &ins, is_data, false);
	if (unlikely(ret)) {
		spin_lock(&sinfo->lock);
		btrfs_space_info_update_bytes_may_use(sinfo, -length);
		spin_unlock(&sinfo->lock);
		return ret;
	}

	dest_addr = ins.objectid;
	dest_length = ins.offset;

	/* Metadata must be moved in whole nodes: give back any unaligned tail. */
	if (!is_data && !IS_ALIGNED(dest_length, fs_info->nodesize)) {
		u64 new_length = ALIGN_DOWN(dest_length, fs_info->nodesize);

		btrfs_free_reserved_extent(fs_info, dest_addr + new_length,
					   dest_length - new_length, 0);

		dest_length = new_length;
	}

	trans = btrfs_join_transaction(fs_info->remap_root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		trans = NULL;
		goto end;
	}

	mutex_lock(&fs_info->remap_mutex);
	mutex_taken = true;

	/* Find old remap entry. */
	key.objectid = old_addr;
	key.type = BTRFS_REMAP_KEY;
	key.offset = length;

	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1);
	if (ret == 1) {
		/*
		 * Not a problem if the remap entry wasn't found: that means
		 * that another transaction has deallocated the data.
		 * move_existing_remaps() loops until the BG contains no
		 * remaps, so we can just return 0 in this case.
		 */
		btrfs_release_path(path);
		ret = 0;
		goto end;
	} else if (unlikely(ret)) {
		goto end;
	}

	ret = copy_remapped_data(fs_info, new_addr, dest_addr, dest_length);
	if (unlikely(ret))
		goto end;

	/* Change data of old remap entry. */
	leaf = path->nodes[0];
	remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
	btrfs_set_remap_address(leaf, remap_ptr, dest_addr);
	btrfs_mark_buffer_dirty(trans, leaf);

	if (dest_length != length) {
		key.offset = dest_length;
		btrfs_set_item_key_safe(trans, path, &key);
	}

	btrfs_release_path(path);

	if (dest_length != length) {
		/* Add remap item for remainder. */
		ret = add_remap_item(trans, path, new_addr + dest_length,
				     length - dest_length, old_addr + dest_length);
		if (unlikely(ret))
			goto end;
	}

	/* Change or remove old backref. */
	key.objectid = new_addr;
	key.type = BTRFS_REMAP_BACKREF_KEY;
	key.offset = length;

	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
	if (unlikely(ret)) {
		if (ret == 1) {
			btrfs_release_path(path);
			ret = -ENOENT;
		}
		goto end;
	}

	leaf = path->nodes[0];

	if (dest_length == length) {
		ret = btrfs_del_item(trans, fs_info->remap_root, path);
		if (unlikely(ret)) {
			btrfs_release_path(path);
			goto end;
		}
	} else {
		/* Shrink the old backref so it covers only the remainder. */
		key.objectid += dest_length;
		key.offset -= dest_length;
		btrfs_set_item_key_safe(trans, path, &key);
		btrfs_set_stack_remap_address(&remap, old_addr + dest_length);

		write_extent_buffer(leaf, &remap,
				    btrfs_item_ptr_offset(leaf, path->slots[0]),
				    sizeof(struct btrfs_remap_item));
	}

	btrfs_release_path(path);

	/* Add new backref. */
	ret = add_remap_backref_item(trans, path, dest_addr, dest_length, old_addr);
	if (unlikely(ret))
		goto end;

	adjust_block_group_remap_bytes(trans, bg, -dest_length);

	ret = btrfs_add_to_free_space_tree(trans, new_addr, dest_length);
	if (unlikely(ret))
		goto end;

	dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);

	adjust_block_group_remap_bytes(trans, dest_bg, dest_length);

	mutex_lock(&dest_bg->free_space_lock);
	bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
				       &dest_bg->runtime_flags);
	mutex_unlock(&dest_bg->free_space_lock);

	/*
	 * Keep our reference on dest_bg until we are completely done with it:
	 * it must not be dropped before btrfs_add_block_group_free_space().
	 * ret is 0 at this point, so it is only set if the call below fails.
	 */
	if (bg_needs_free_space)
		ret = btrfs_add_block_group_free_space(trans, dest_bg);

	btrfs_put_block_group(dest_bg);

	if (unlikely(ret))
		goto end;

	ret = btrfs_remove_from_free_space_tree(trans, dest_addr, dest_length);
	if (unlikely(ret)) {
		btrfs_remove_from_free_space_tree(trans, new_addr, dest_length);
		goto end;
	}

	ret = 0;

end:
	if (mutex_taken)
		mutex_unlock(&fs_info->remap_mutex);

	btrfs_dec_block_group_reservations(fs_info, dest_addr);

	if (unlikely(ret)) {
		btrfs_free_reserved_extent(fs_info, dest_addr, dest_length, 0);

		if (trans) {
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
		}
	} else {
		dest_bg = btrfs_lookup_block_group(fs_info, dest_addr);
		btrfs_free_reserved_bytes(dest_bg, dest_length, 0);
		btrfs_put_block_group(dest_bg);

		ret = btrfs_commit_transaction(trans);
	}

	return ret;
}

/*
 * Move every remap whose destination lies in @bg somewhere else.
 *
 * Repeatedly searches the remap tree for the first BTRFS_REMAP_BACKREF_KEY
 * item at or after bg->start and hands it to move_existing_remap(), until
 * bg->remap_bytes drops to zero or the end of the tree is reached.
 *
 * Returns 0 on success or a negative errno.
 */
static int move_existing_remaps(struct btrfs_fs_info *fs_info,
				struct btrfs_block_group *bg,
				struct btrfs_path *path)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_remap_item *remap;
	u64 old_addr;

	/* Look for backrefs in remap tree. */
	while (bg->remap_bytes > 0) {
		bool found = false;

		key.objectid = bg->start;
		key.type = BTRFS_REMAP_BACKREF_KEY;
		key.offset = 0;

		ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0);
		if (ret < 0)
			return ret;

		/*
		 * Walk forward to the next backref item.  The key must be
		 * re-read after every step so that the objectid/offset passed
		 * to move_existing_remap() belong to the same item as the
		 * payload read below.
		 */
		while (true) {
			leaf = path->nodes[0];

			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(fs_info->remap_root, path);
				if (ret < 0) {
					btrfs_release_path(path);
					return ret;
				}

				/* Ran off the end of the tree. */
				if (ret)
					break;

				leaf = path->nodes[0];
			}

			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

			if (key.type == BTRFS_REMAP_BACKREF_KEY) {
				found = true;
				break;
			}

			path->slots[0]++;
		}

		if (!found) {
			btrfs_release_path(path);
			break;
		}

		remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item);
		old_addr = btrfs_remap_address(leaf, remap);

		btrfs_release_path(path);

		ret = move_existing_remap(fs_info, path, bg, key.objectid,
					  key.offset, old_addr);
		if (ret)
			return ret;
	}

	ASSERT(bg->remap_bytes == 0);

	return 0;
}

/*
 * Create the initial identity remap items for @bg.
 *
 * Reads the free space tree entries (extents and bitmaps) of @bg into a list
 * of free runs, then creates one BTRFS_IDENTITY_REMAP_KEY entry for every gap
 * between those runs, i.e. for every range of @bg that is not free.  The
 * number of entries is stored in bg->identity_remap_count and the entries are
 * inserted with add_remap_tree_entries().
 *
 * Returns 0 on success or a negative errno.
 */
static int create_remap_tree_entries(struct btrfs_trans_handle *trans,
				     struct btrfs_path *path,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_free_space_info *fsi;
	struct btrfs_key key, found_key;
	struct extent_buffer *leaf;
	struct btrfs_root *space_root;
	u32 extent_count;
	struct space_run *space_runs = NULL;
	unsigned int num_space_runs = 0;
	struct btrfs_key *entries = NULL;
	unsigned int max_entries, num_entries;
	int ret;

	mutex_lock(&bg->free_space_lock);

	if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &bg->runtime_flags)) {
		mutex_unlock(&bg->free_space_lock);

		ret = btrfs_add_block_group_free_space(trans, bg);
		if (ret)
			return ret;

		mutex_lock(&bg->free_space_lock);
	}

	fsi = btrfs_search_free_space_info(trans, bg, path, 0);
	if (IS_ERR(fsi)) {
		mutex_unlock(&bg->free_space_lock);
		return PTR_ERR(fsi);
	}

	extent_count = btrfs_free_space_extent_count(path->nodes[0], fsi);

	btrfs_release_path(path);

	space_runs = kmalloc_array(extent_count, sizeof(*space_runs), GFP_NOFS);
	if (!space_runs) {
		mutex_unlock(&bg->free_space_lock);
		return -ENOMEM;
	}

	key.objectid = bg->start;
	key.type = 0;
	key.offset = 0;

	space_root = btrfs_free_space_root(bg);

	ret = btrfs_search_slot(trans, space_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_unlock;

	ret = 0;

	while (true) {
		leaf = path->nodes[0];

		/* The search may leave us one past the last item of a leaf. */
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(space_root, path);
			if (ret != 0) {
				/* 1 means no more leaves, which is not an error. */
				if (ret == 1)
					ret = 0;
				break;
			}
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid >= bg->start + bg->length)
			break;

		if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
			if (num_space_runs != 0 &&
			    space_runs[num_space_runs - 1].end == found_key.objectid) {
				/* Contiguous with the previous run: extend it. */
				space_runs[num_space_runs - 1].end =
					found_key.objectid + found_key.offset;
			} else {
				ASSERT(num_space_runs < extent_count);

				space_runs[num_space_runs].start = found_key.objectid;
				space_runs[num_space_runs].end =
					found_key.objectid + found_key.offset;

				num_space_runs++;
			}
		} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
			void *bitmap;
			unsigned long offset;
			u32 data_size;

			offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
			data_size = btrfs_item_size(leaf, path->slots[0]);

			if (data_size != 0) {
				bitmap = kmalloc(data_size, GFP_NOFS);
				if (!bitmap) {
					ret = -ENOMEM;
					goto out_unlock;
				}

				read_extent_buffer(leaf, bitmap, offset, data_size);

				parse_bitmap(fs_info->sectorsize, bitmap,
					     data_size * BITS_PER_BYTE,
					     found_key.objectid, space_runs,
					     &num_space_runs);

				ASSERT(num_space_runs <= extent_count);

				kfree(bitmap);
			}
		}

		path->slots[0]++;
	}

out_unlock:
	btrfs_release_path(path);

	mutex_unlock(&bg->free_space_lock);

	/* Don't let a tree walk error be overwritten by the insertion below. */
	if (ret < 0)
		goto out;

	max_entries = extent_count + 2;
	entries = kmalloc_array(max_entries, sizeof(*entries), GFP_NOFS);
	if (!entries) {
		ret = -ENOMEM;
		goto out;
	}

	num_entries = 0;

	if (num_space_runs == 0) {
		/* No free space at all: one identity remap for the whole BG. */
		entries[num_entries].objectid = bg->start;
		entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
		entries[num_entries].offset = bg->length;
		num_entries++;
	} else {
		/* Used range before the first free run. */
		if (space_runs[0].start > bg->start) {
			entries[num_entries].objectid = bg->start;
			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
			entries[num_entries].offset = space_runs[0].start - bg->start;
			num_entries++;
		}

		/* Used ranges between consecutive free runs. */
		for (unsigned int i = 1; i < num_space_runs; i++) {
			entries[num_entries].objectid = space_runs[i - 1].end;
			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
			entries[num_entries].offset =
				space_runs[i].start - space_runs[i - 1].end;
			num_entries++;
		}

		/* Used range after the last free run. */
		if (space_runs[num_space_runs - 1].end < bg->start + bg->length) {
			entries[num_entries].objectid =
				space_runs[num_space_runs - 1].end;
			entries[num_entries].type = BTRFS_IDENTITY_REMAP_KEY;
			entries[num_entries].offset =
				bg->start + bg->length - space_runs[num_space_runs - 1].end;
			num_entries++;
		}

		if (num_entries == 0)
			goto out;
	}

	bg->identity_remap_count = num_entries;

	ret = add_remap_tree_entries(trans, path, entries, num_entries);

out:
	kfree(entries);
	kfree(space_runs);

	return ret;
}

/*
 * Find the first identity remap item at or after @last_start.
 *
 * Searches the remap tree from (@last_start, BTRFS_IDENTITY_REMAP_KEY, 0) and
 * walks forward until an identity remap item is found.  On success *@start
 * and *@length are set from the item's key.  @path is always released before
 * returning.
 *
 * Returns 0 if an item was found, -ENOENT if the end of the tree or an item
 * with objectid >= @bg_end was reached first, or another negative errno.
 */
static int find_next_identity_remap(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path, u64 bg_end,
				    u64 last_start, u64 *start, u64 *length)
{
	int ret;
	struct btrfs_key key, found_key;
	struct btrfs_root *remap_root = trans->fs_info->remap_root;
	struct extent_buffer *leaf;

	key.objectid = last_start;
	key.type = BTRFS_IDENTITY_REMAP_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(trans, remap_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	while (true) {
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(remap_root, path);

			if (ret != 0) {
				if (ret == 1)
					ret = -ENOENT;
				goto out;
			}

			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid >= bg_end) {
			ret = -ENOENT;
			goto out;
		}

		if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) {
			*start = found_key.objectid;
			*length = found_key.offset;
			ret = 0;
			goto out;
		}

		path->slots[0]++;
	}

out:
	btrfs_release_path(path);

	return ret;
}

/*
 * Strip the stripes from the chunk item of @chunk_map.
 *
 * Sets num_stripes and sub_stripes of the chunk item to 0 and truncates the
 * item to offsetof(struct btrfs_chunk, stripe).  Chunk metadata space is
 * reserved for the duration of the update.
 *
 * Returns 0 on success, -ENOENT if the chunk item does not exist, or another
 * negative errno from the tree search.
 */
static int remove_chunk_stripes(struct btrfs_trans_handle *trans,
				struct btrfs_chunk_map *chunk_map,
				struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	int ret;

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = chunk_map->start;

	btrfs_reserve_chunk_metadata(trans, false);

	ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
	if (ret) {
		if (ret == 1) {
			btrfs_release_path(path);
			ret = -ENOENT;
		}
		btrfs_trans_release_chunk_metadata(trans);
		return ret;
	}

	leaf = path->nodes[0];

	chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
	btrfs_set_chunk_num_stripes(leaf, chunk, 0);
	btrfs_set_chunk_sub_stripes(leaf, chunk, 0);

	btrfs_truncate_item(trans, path, offsetof(struct btrfs_chunk, stripe), 1);

	btrfs_mark_buffer_dirty(trans, leaf);

	btrfs_release_path(path);
	btrfs_trans_release_chunk_metadata(trans);

	return 0;
}

/*
 * Release the device space of a block group whose last identity remap has
 * gone.
 *
 * In a single transaction: removes the dev extents of @chunk_map, updates
 * every device that held a stripe, clears CHUNK_ALLOCATED for the chunk map,
 * removes @bg from its space_info, clears
 * BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING and strips the stripes from the
 * chunk item.
 *
 * On any failure the transaction is aborted and the handle is ended before
 * returning.
 *
 * Returns 0 on success or a negative errno.
 */
int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map,
				   struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	struct btrfs_trans_handle *trans;
	int ret;
	unsigned int num_items;
	BTRFS_PATH_AUTO_FREE(path);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * One item for each entry we're removing in the dev extents tree, and
	 * another for each device. DUP chunks are all on one device,
	 * everything else has one device per stripe.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DUP)
		num_items = chunk_map->num_stripes + 1;
	else
		num_items = 2 * chunk_map->num_stripes;

	trans = btrfs_start_transaction_fallback_global_rsv(fs_info->tree_root, num_items);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	ret = btrfs_remove_dev_extents(trans, chunk_map);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		/* Aborting does not release our handle, we still have to end it. */
		btrfs_end_transaction(trans);
		return ret;
	}

	mutex_lock(&trans->fs_info->chunk_mutex);
	for (unsigned int i = 0; i < chunk_map->num_stripes; i++) {
		ret = btrfs_update_device(trans, chunk_map->stripes[i].dev);
		if (unlikely(ret)) {
			mutex_unlock(&trans->fs_info->chunk_mutex);
			btrfs_abort_transaction(trans, ret);
			btrfs_end_transaction(trans);
			return ret;
		}
	}
	mutex_unlock(&trans->fs_info->chunk_mutex);

	write_lock(&trans->fs_info->mapping_tree_lock);
	btrfs_chunk_map_device_clear_bits(chunk_map, CHUNK_ALLOCATED);
	write_unlock(&trans->fs_info->mapping_tree_lock);

	btrfs_remove_bg_from_sinfo(bg);

	spin_lock(&bg->lock);
	clear_bit(BLOCK_GROUP_FLAG_STRIPE_REMOVAL_PENDING, &bg->runtime_flags);
	spin_unlock(&bg->lock);

	ret = remove_chunk_stripes(trans, chunk_map, path);
	if (unlikely(ret)) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}

	return btrfs_commit_transaction(trans);
}

/*
 * Add @delta to bg->identity_remap_count and mark @bg dirty.
 *
 * If the count reaches zero and BLOCK_GROUP_FLAG_FULLY_REMAPPED was not yet
 * set, the flag is set and btrfs_mark_bg_fully_remapped() is called once all
 * locks have been dropped.  If @bg was not already on the transaction's
 * dirty_bgs list it is added, with an extra reference taken.
 */
static void adjust_identity_remap_count(struct btrfs_trans_handle *trans,
					struct btrfs_block_group *bg, int delta)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	bool bg_already_dirty = true;
	bool mark_fully_remapped = false;

	WARN_ON(delta < 0 && -delta > bg->identity_remap_count);

	spin_lock(&bg->lock);

	bg->identity_remap_count += delta;

	if (bg->identity_remap_count == 0 &&
	    !test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags)) {
		set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &bg->runtime_flags);
		mark_fully_remapped = true;
	}

	spin_unlock(&bg->lock);

	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (list_empty(&bg->dirty_list)) {
		list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs);
		bg_already_dirty = false;
		btrfs_get_block_group(bg);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);

	/* Modified block groups are accounted for in the delayed_refs_rsv. */
	if (!bg_already_dirty)
		btrfs_inc_delayed_refs_rsv_bg_updates(fs_info);

	if (mark_fully_remapped)
		btrfs_mark_bg_fully_remapped(bg, trans);
}

/*
 * Replace part of an identity remap with a real remap.
 *
 * Looks up the identity remap item that contains @old_addr, removes or
 * shortens it so that it ends at @old_addr, inserts a remap item and a
 * backref item for the range, and, if the identity remap extended beyond
 * @old_addr + @length, inserts a new identity remap for the remainder.
 * The identity remap count of @src_bg is adjusted by the net number of
 * identity items added or removed.
 *
 * @path is released before returning.
 *
 * Returns 0 on success, -ENOENT if no identity remap covers @old_addr, or
 * another negative errno.
 */
static int add_remap_entry(struct btrfs_trans_handle *trans,
			   struct btrfs_path *path,
			   struct btrfs_block_group *src_bg, u64 old_addr,
			   u64 new_addr, u64 length)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key, new_key;
	int ret;
	int identity_count_delta = 0;

	/*
	 * Search for the largest possible key with this objectid, then step
	 * back one slot to get the last item at or before @old_addr.
	 */
	key.objectid = old_addr;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1);
	if (ret < 0)
		goto end;

	if (path->slots[0] == 0) {
		ret = -ENOENT;
		goto end;
	}

	path->slots[0]--;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

	if (key.type != BTRFS_IDENTITY_REMAP_KEY ||
	    key.objectid > old_addr ||
	    key.objectid + key.offset <= old_addr) {
		ret = -ENOENT;
		goto end;
	}

	/* Shorten or delete identity mapping entry. */
	if (key.objectid == old_addr) {
		ret = btrfs_del_item(trans, fs_info->remap_root, path);
		if (ret)
			goto end;

		identity_count_delta--;
	} else {
		new_key.objectid = key.objectid;
		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
		new_key.offset = old_addr - key.objectid;

		btrfs_set_item_key_safe(trans, path, &new_key);
	}

	btrfs_release_path(path);

	/* Create new remap entry. */
	ret = add_remap_item(trans, path, new_addr, length, old_addr);
	if (ret)
		goto end;

	/* Add entry for remainder of identity mapping, if necessary. */
	if (key.objectid + key.offset != old_addr + length) {
		new_key.objectid = old_addr + length;
		new_key.type = BTRFS_IDENTITY_REMAP_KEY;
		new_key.offset = key.objectid + key.offset - old_addr - length;

		ret = btrfs_insert_empty_item(trans, fs_info->remap_root,
					      path, &new_key, 0);
		if (ret)
			goto end;

		btrfs_release_path(path);

		identity_count_delta++;
	}

	/* Add backref. */
	ret = add_remap_backref_item(trans, path, new_addr, length, old_addr);
	if (ret)
		goto end;

	if (identity_count_delta != 0)
		adjust_identity_remap_count(trans, src_bg, identity_count_delta);

end:
	btrfs_release_path(path);

	return ret;
}

/*
 * Set BTRFS_BLOCK_GROUP_REMAPPED on the chunk starting at @start.
 *
 * The flag is set on the in-memory chunk map first and the resulting type is
 * then written to the chunk item in the chunk tree.
 *
 * Returns 0 on success, -ENOENT if either the chunk map or the chunk item
 * cannot be found, or another negative errno from the tree search.
 */
static int mark_chunk_remapped(struct btrfs_trans_handle *trans,
			       struct btrfs_path *path, u64 start)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_chunk_map *chunk_map;
	struct btrfs_key key;
	u64 type;
	int ret;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;

	read_lock(&fs_info->mapping_tree_lock);

	chunk_map = btrfs_find_chunk_map_nolock(fs_info, start, 1);
	if (!chunk_map) {
		read_unlock(&fs_info->mapping_tree_lock);
		return -ENOENT;
	}

	/*
	 * NOTE(review): chunk_map->type is modified while mapping_tree_lock is
	 * only held for reading - confirm that this is sufficient.
	 */
	chunk_map->type |= BTRFS_BLOCK_GROUP_REMAPPED;
	type = chunk_map->type;

	read_unlock(&fs_info->mapping_tree_lock);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = start;

	ret = btrfs_search_slot(trans, fs_info->chunk_root, &key, path, 0, 1);
	if (ret == 1) {
		ret = -ENOENT;
		goto end;
	} else if (ret < 0) {
		goto end;
	}

	leaf = path->nodes[0];

	chunk = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_chunk);
	btrfs_set_chunk_type(leaf, chunk, type);
	btrfs_mark_buffer_dirty(trans, leaf);

	ret = 0;
end:
	btrfs_free_chunk_map(chunk_map);
	btrfs_release_path(path);

	return ret;
}

4935 + static int do_remap_reloc_trans(struct btrfs_fs_info *fs_info, 4936 + struct btrfs_block_group *src_bg, 4937 + struct btrfs_path *path, u64 *last_start) 4938 + { 4939 + struct btrfs_trans_handle *trans; 4940 + struct btrfs_root *extent_root; 4941 + 
struct btrfs_key ins; 4942 + struct btrfs_block_group *dest_bg = NULL; 4943 + u64 start = 0, remap_length = 0; 4944 + u64 length, new_addr, min_size; 4945 + int ret; 4946 + const bool is_data = (src_bg->flags & BTRFS_BLOCK_GROUP_DATA); 4947 + bool no_more = false; 4948 + bool made_reservation = false, bg_needs_free_space; 4949 + struct btrfs_space_info *sinfo = src_bg->space_info; 4950 + 4951 + extent_root = btrfs_extent_root(fs_info, src_bg->start); 4952 + 4953 + trans = btrfs_start_transaction(extent_root, 0); 4954 + if (IS_ERR(trans)) 4955 + return PTR_ERR(trans); 4956 + 4957 + mutex_lock(&fs_info->remap_mutex); 4958 + 4959 + ret = find_next_identity_remap(trans, path, src_bg->start + src_bg->length, 4960 + *last_start, &start, &remap_length); 4961 + if (ret == -ENOENT) { 4962 + no_more = true; 4963 + goto next; 4964 + } else if (ret) { 4965 + mutex_unlock(&fs_info->remap_mutex); 4966 + btrfs_end_transaction(trans); 4967 + return ret; 4968 + } 4969 + 4970 + /* Try to reserve enough space for block. */ 4971 + spin_lock(&sinfo->lock); 4972 + btrfs_space_info_update_bytes_may_use(sinfo, remap_length); 4973 + spin_unlock(&sinfo->lock); 4974 + 4975 + if (is_data) 4976 + min_size = fs_info->sectorsize; 4977 + else 4978 + min_size = fs_info->nodesize; 4979 + 4980 + /* 4981 + * We're using btrfs_reserve_extent() to allocate a contiguous 4982 + * logical address range, but this will become a remap item rather than 4983 + * an extent in the extent tree. 4984 + * 4985 + * Short allocations are fine: it means that we chop off the beginning 4986 + * of the identity remap that we're processing, and will tackle the 4987 + * rest of it the next time round. 
4988 + */ 4989 + ret = btrfs_reserve_extent(fs_info->fs_root, remap_length, remap_length, 4990 + min_size, 0, 0, &ins, is_data, false); 4991 + if (ret) { 4992 + spin_lock(&sinfo->lock); 4993 + btrfs_space_info_update_bytes_may_use(sinfo, -remap_length); 4994 + spin_unlock(&sinfo->lock); 4995 + 4996 + mutex_unlock(&fs_info->remap_mutex); 4997 + btrfs_end_transaction(trans); 4998 + return ret; 4999 + } 5000 + 5001 + made_reservation = true; 5002 + 5003 + new_addr = ins.objectid; 5004 + length = ins.offset; 5005 + 5006 + if (!is_data && !IS_ALIGNED(length, fs_info->nodesize)) { 5007 + u64 new_length = ALIGN_DOWN(length, fs_info->nodesize); 5008 + 5009 + btrfs_free_reserved_extent(fs_info, new_addr + new_length, 5010 + length - new_length, 0); 5011 + 5012 + length = new_length; 5013 + } 5014 + 5015 + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); 5016 + 5017 + mutex_lock(&dest_bg->free_space_lock); 5018 + bg_needs_free_space = test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, 5019 + &dest_bg->runtime_flags); 5020 + mutex_unlock(&dest_bg->free_space_lock); 5021 + 5022 + if (bg_needs_free_space) { 5023 + ret = btrfs_add_block_group_free_space(trans, dest_bg); 5024 + if (ret) 5025 + goto fail; 5026 + } 5027 + 5028 + ret = copy_remapped_data(fs_info, start, new_addr, length); 5029 + if (ret) 5030 + goto fail; 5031 + 5032 + ret = btrfs_remove_from_free_space_tree(trans, new_addr, length); 5033 + if (ret) 5034 + goto fail; 5035 + 5036 + ret = add_remap_entry(trans, path, src_bg, start, new_addr, length); 5037 + if (ret) { 5038 + btrfs_add_to_free_space_tree(trans, new_addr, length); 5039 + goto fail; 5040 + } 5041 + 5042 + adjust_block_group_remap_bytes(trans, dest_bg, length); 5043 + btrfs_free_reserved_bytes(dest_bg, length, 0); 5044 + 5045 + spin_lock(&sinfo->lock); 5046 + sinfo->bytes_readonly += length; 5047 + spin_unlock(&sinfo->lock); 5048 + 5049 + next: 5050 + if (dest_bg) 5051 + btrfs_put_block_group(dest_bg); 5052 + 5053 + if (made_reservation) 5054 + 
btrfs_dec_block_group_reservations(fs_info, new_addr); 5055 + 5056 + mutex_unlock(&fs_info->remap_mutex); 5057 + 5058 + if (src_bg->identity_remap_count == 0) { 5059 + bool mark_fully_remapped = false; 5060 + 5061 + spin_lock(&src_bg->lock); 5062 + if (!test_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags)) { 5063 + mark_fully_remapped = true; 5064 + set_bit(BLOCK_GROUP_FLAG_FULLY_REMAPPED, &src_bg->runtime_flags); 5065 + } 5066 + spin_unlock(&src_bg->lock); 5067 + 5068 + if (mark_fully_remapped) 5069 + btrfs_mark_bg_fully_remapped(src_bg, trans); 5070 + } 5071 + 5072 + ret = btrfs_end_transaction(trans); 5073 + if (ret) 5074 + return ret; 5075 + 5076 + if (no_more) 5077 + return 1; 5078 + 5079 + *last_start = start; 5080 + 5081 + return 0; 5082 + 5083 + fail: 5084 + if (dest_bg) 5085 + btrfs_put_block_group(dest_bg); 5086 + 5087 + btrfs_free_reserved_extent(fs_info, new_addr, length, 0); 5088 + 5089 + mutex_unlock(&fs_info->remap_mutex); 5090 + btrfs_end_transaction(trans); 5091 + 5092 + return ret; 5093 + } 5094 + 5095 + static int do_remap_reloc(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 5096 + struct btrfs_block_group *bg) 5097 + { 5098 + u64 last_start = bg->start; 5099 + int ret; 5100 + 5101 + while (true) { 5102 + ret = do_remap_reloc_trans(fs_info, bg, path, &last_start); 5103 + if (ret) { 5104 + if (ret == 1) 5105 + ret = 0; 5106 + break; 5107 + } 5108 + } 5109 + 5110 + return ret; 5111 + } 5112 + 5113 + int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length) 5114 + { 5115 + int ret; 5116 + struct btrfs_key key, found_key; 5117 + struct extent_buffer *leaf; 5118 + struct btrfs_remap_item *remap; 5119 + BTRFS_PATH_AUTO_FREE(path); 5120 + 5121 + path = btrfs_alloc_path(); 5122 + if (!path) 5123 + return -ENOMEM; 5124 + 5125 + key.objectid = *logical; 5126 + key.type = (u8)-1; 5127 + key.offset = (u64)-1; 5128 + 5129 + ret = btrfs_search_slot(NULL, fs_info->remap_root, &key, path, 0, 0); 5130 + if (ret < 0) 
5131 + return ret; 5132 + 5133 + leaf = path->nodes[0]; 5134 + if (path->slots[0] == 0) 5135 + return -ENOENT; 5136 + 5137 + path->slots[0]--; 5138 + 5139 + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 5140 + 5141 + if (found_key.type != BTRFS_REMAP_KEY && 5142 + found_key.type != BTRFS_IDENTITY_REMAP_KEY) { 5143 + return -ENOENT; 5144 + } 5145 + 5146 + if (found_key.objectid > *logical || 5147 + found_key.objectid + found_key.offset <= *logical) { 5148 + return -ENOENT; 5149 + } 5150 + 5151 + if (*logical + *length > found_key.objectid + found_key.offset) 5152 + *length = found_key.objectid + found_key.offset - *logical; 5153 + 5154 + if (found_key.type == BTRFS_IDENTITY_REMAP_KEY) 5155 + return 0; 5156 + 5157 + remap = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); 5158 + *logical += btrfs_remap_address(leaf, remap) - found_key.objectid; 5159 + 5160 + return 0; 5161 + } 5162 + 5163 + static int start_block_group_remapping(struct btrfs_fs_info *fs_info, 5164 + struct btrfs_path *path, 5165 + struct btrfs_block_group *bg) 5166 + { 5167 + struct btrfs_trans_handle *trans; 5168 + bool bg_already_dirty = true; 5169 + int ret, ret2; 5170 + 5171 + ret = btrfs_cache_block_group(bg, true); 5172 + if (ret) 5173 + return ret; 5174 + 5175 + trans = btrfs_start_transaction(fs_info->remap_root, 0); 5176 + if (IS_ERR(trans)) 5177 + return PTR_ERR(trans); 5178 + 5179 + /* We need to run delayed refs, to make sure FST is up to date. 
*/ 5180 + ret = btrfs_run_delayed_refs(trans, U64_MAX); 5181 + if (ret) { 5182 + btrfs_end_transaction(trans); 5183 + return ret; 5184 + } 5185 + 5186 + mutex_lock(&fs_info->remap_mutex); 5187 + 5188 + if (bg->flags & BTRFS_BLOCK_GROUP_REMAPPED) { 5189 + ret = 0; 5190 + goto end; 5191 + } 5192 + 5193 + ret = create_remap_tree_entries(trans, path, bg); 5194 + if (unlikely(ret)) { 5195 + btrfs_abort_transaction(trans, ret); 5196 + goto end; 5197 + } 5198 + 5199 + spin_lock(&bg->lock); 5200 + bg->flags |= BTRFS_BLOCK_GROUP_REMAPPED; 5201 + spin_unlock(&bg->lock); 5202 + 5203 + spin_lock(&trans->transaction->dirty_bgs_lock); 5204 + if (list_empty(&bg->dirty_list)) { 5205 + list_add_tail(&bg->dirty_list, &trans->transaction->dirty_bgs); 5206 + bg_already_dirty = false; 5207 + btrfs_get_block_group(bg); 5208 + } 5209 + spin_unlock(&trans->transaction->dirty_bgs_lock); 5210 + 5211 + /* Modified block groups are accounted for in the delayed_refs_rsv. */ 5212 + if (!bg_already_dirty) 5213 + btrfs_inc_delayed_refs_rsv_bg_updates(fs_info); 5214 + 5215 + ret = mark_chunk_remapped(trans, path, bg->start); 5216 + if (unlikely(ret)) { 5217 + btrfs_abort_transaction(trans, ret); 5218 + goto end; 5219 + } 5220 + 5221 + ret = btrfs_remove_block_group_free_space(trans, bg); 5222 + if (unlikely(ret)) { 5223 + btrfs_abort_transaction(trans, ret); 5224 + goto end; 5225 + } 5226 + 5227 + btrfs_remove_free_space_cache(bg); 5228 + 5229 + end: 5230 + mutex_unlock(&fs_info->remap_mutex); 5231 + 5232 + ret2 = btrfs_end_transaction(trans); 5233 + if (!ret) 5234 + ret = ret2; 5235 + 5236 + return ret; 5237 + } 5238 + 5239 + static int do_nonremap_reloc(struct btrfs_fs_info *fs_info, bool verbose, 5240 + struct reloc_control *rc) 5241 + { 5242 + int ret; 5243 + 5244 + while (1) { 5245 + enum reloc_stage finishes_stage; 5246 + 5247 + mutex_lock(&fs_info->cleaner_mutex); 5248 + ret = relocate_block_group(rc); 5249 + mutex_unlock(&fs_info->cleaner_mutex); 5250 + 5251 + finishes_stage = rc->stage; 
5252 + /* 5253 + * We may have gotten ENOSPC after we already dirtied some 5254 + * extents. If writeout happens while we're relocating a 5255 + * different block group we could end up hitting the 5256 + * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 5257 + * btrfs_reloc_cow_block. Make sure we write everything out 5258 + * properly so we don't trip over this problem, and then break 5259 + * out of the loop if we hit an error. 5260 + */ 5261 + if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 5262 + int wb_ret; 5263 + 5264 + wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 5265 + 0, (u64)-1); 5266 + if (wb_ret && ret == 0) 5267 + ret = wb_ret; 5268 + invalidate_mapping_pages(rc->data_inode->i_mapping, 0, -1); 5269 + rc->stage = UPDATE_DATA_PTRS; 5270 + } 5271 + 5272 + if (ret < 0) 5273 + return ret; 5274 + 5275 + if (rc->extents_found == 0) 5276 + break; 5277 + 5278 + if (verbose) 5279 + btrfs_info(fs_info, "found %llu extents, stage: %s", 5280 + rc->extents_found, stage_to_string(finishes_stage)); 5281 + } 5282 + 5283 + WARN_ON(rc->block_group->pinned > 0); 5284 + WARN_ON(rc->block_group->reserved > 0); 5285 + WARN_ON(rc->block_group->used > 0); 5286 + 5287 + return 0; 5288 + } 5289 + 3863 5290 /* 3864 5291 * function to relocate all extents in a block group. 
3865 5292 */ ··· 5297 3870 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start); 5298 3871 struct reloc_control *rc; 5299 3872 struct inode *inode; 5300 - struct btrfs_path *path; 3873 + struct btrfs_path *path = NULL; 5301 3874 int ret; 5302 3875 bool bg_is_ro = false; 5303 3876 ··· 5359 3932 } 5360 3933 5361 3934 inode = lookup_free_space_inode(rc->block_group, path); 5362 - btrfs_free_path(path); 3935 + btrfs_release_path(path); 5363 3936 5364 3937 if (!IS_ERR(inode)) 5365 3938 ret = delete_block_group_cache(rc->block_group, inode, 0); ··· 5369 3942 if (ret && ret != -ENOENT) 5370 3943 goto out; 5371 3944 5372 - rc->data_inode = create_reloc_inode(rc->block_group); 5373 - if (IS_ERR(rc->data_inode)) { 5374 - ret = PTR_ERR(rc->data_inode); 5375 - rc->data_inode = NULL; 5376 - goto out; 3945 + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) { 3946 + rc->data_inode = create_reloc_inode(rc->block_group); 3947 + if (IS_ERR(rc->data_inode)) { 3948 + ret = PTR_ERR(rc->data_inode); 3949 + rc->data_inode = NULL; 3950 + goto out; 3951 + } 5377 3952 } 5378 3953 5379 3954 if (verbose) ··· 5388 3959 ret = btrfs_zone_finish(rc->block_group); 5389 3960 WARN_ON(ret && ret != -EAGAIN); 5390 3961 5391 - while (1) { 5392 - enum reloc_stage finishes_stage; 5393 - 5394 - mutex_lock(&fs_info->cleaner_mutex); 5395 - ret = relocate_block_group(rc); 5396 - mutex_unlock(&fs_info->cleaner_mutex); 5397 - 5398 - finishes_stage = rc->stage; 5399 - /* 5400 - * We may have gotten ENOSPC after we already dirtied some 5401 - * extents. If writeout happens while we're relocating a 5402 - * different block group we could end up hitting the 5403 - * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in 5404 - * btrfs_reloc_cow_block. Make sure we write everything out 5405 - * properly so we don't trip over this problem, and then break 5406 - * out of the loop if we hit an error. 
5407 - */ 5408 - if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { 5409 - int wb_ret; 5410 - 5411 - wb_ret = btrfs_wait_ordered_range(BTRFS_I(rc->data_inode), 0, 5412 - (u64)-1); 5413 - if (wb_ret && ret == 0) 5414 - ret = wb_ret; 5415 - invalidate_mapping_pages(rc->data_inode->i_mapping, 5416 - 0, -1); 5417 - rc->stage = UPDATE_DATA_PTRS; 3962 + if (should_relocate_using_remap_tree(bg)) { 3963 + if (bg->remap_bytes != 0) { 3964 + ret = move_existing_remaps(fs_info, bg, path); 3965 + if (ret) 3966 + goto out; 5418 3967 } 5419 - 5420 - if (ret < 0) 3968 + ret = start_block_group_remapping(fs_info, path, bg); 3969 + if (ret) 5421 3970 goto out; 5422 3971 5423 - if (rc->extents_found == 0) 5424 - break; 3972 + ret = do_remap_reloc(fs_info, path, rc->block_group); 3973 + if (ret) 3974 + goto out; 5425 3975 5426 - if (verbose) 5427 - btrfs_info(fs_info, "found %llu extents, stage: %s", 5428 - rc->extents_found, 5429 - stage_to_string(finishes_stage)); 3976 + btrfs_delete_unused_bgs(fs_info); 3977 + } else { 3978 + ret = do_nonremap_reloc(fs_info, verbose, rc); 5430 3979 } 5431 3980 5432 - WARN_ON(rc->block_group->pinned > 0); 5433 - WARN_ON(rc->block_group->reserved > 0); 5434 - WARN_ON(rc->block_group->used > 0); 5435 3981 out: 5436 3982 if (ret && bg_is_ro) 5437 3983 btrfs_dec_block_group_ro(rc->block_group); 5438 - iput(rc->data_inode); 3984 + if (!btrfs_fs_incompat(fs_info, REMAP_TREE)) 3985 + iput(rc->data_inode); 3986 + btrfs_free_path(path); 5439 3987 reloc_chunk_end(fs_info); 5440 3988 out_put_bg: 5441 3989 btrfs_put_block_group(bg); ··· 5606 4200 5607 4201 btrfs_free_path(path); 5608 4202 5609 - if (ret == 0) { 4203 + if (ret == 0 && !btrfs_fs_incompat(fs_info, REMAP_TREE)) { 5610 4204 /* cleanup orphan inode in data relocation tree */ 5611 4205 fs_root = btrfs_grab_root(fs_info->data_reloc_root); 5612 4206 ASSERT(fs_root); ··· 5819 4413 if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group) 5820 4414 logical = 
fs_info->reloc_ctl->block_group->start; 5821 4415 return logical; 4416 + } 4417 + 4418 + static int insert_remap_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, 4419 + u64 old_addr, u64 length, u64 new_addr) 4420 + { 4421 + int ret; 4422 + struct btrfs_fs_info *fs_info = trans->fs_info; 4423 + struct btrfs_key key; 4424 + struct btrfs_remap_item remap = { 0 }; 4425 + 4426 + if (old_addr == new_addr) { 4427 + /* Add new identity remap item. */ 4428 + key.objectid = old_addr; 4429 + key.type = BTRFS_IDENTITY_REMAP_KEY; 4430 + key.offset = length; 4431 + 4432 + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, path, 4433 + &key, 0); 4434 + if (ret) 4435 + return ret; 4436 + } else { 4437 + /* Add new remap item. */ 4438 + key.objectid = old_addr; 4439 + key.type = BTRFS_REMAP_KEY; 4440 + key.offset = length; 4441 + 4442 + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, 4443 + path, &key, sizeof(struct btrfs_remap_item)); 4444 + if (ret) 4445 + return ret; 4446 + 4447 + btrfs_set_stack_remap_address(&remap, new_addr); 4448 + 4449 + write_extent_buffer(path->nodes[0], &remap, 4450 + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), 4451 + sizeof(struct btrfs_remap_item)); 4452 + 4453 + btrfs_release_path(path); 4454 + 4455 + /* Add new backref item. 
*/ 4456 + key.objectid = new_addr; 4457 + key.type = BTRFS_REMAP_BACKREF_KEY; 4458 + key.offset = length; 4459 + 4460 + ret = btrfs_insert_empty_item(trans, fs_info->remap_root, 4461 + path, &key, 4462 + sizeof(struct btrfs_remap_item)); 4463 + if (ret) 4464 + return ret; 4465 + 4466 + btrfs_set_stack_remap_address(&remap, old_addr); 4467 + 4468 + write_extent_buffer(path->nodes[0], &remap, 4469 + btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), 4470 + sizeof(struct btrfs_remap_item)); 4471 + } 4472 + 4473 + btrfs_release_path(path); 4474 + 4475 + return 0; 4476 + } 4477 + 4478 + /* 4479 + * Punch a hole in the remap item or identity remap item pointed to by path, 4480 + * for the range [hole_start, hole_start + hole_length). 4481 + */ 4482 + static int remove_range_from_remap_tree(struct btrfs_trans_handle *trans, 4483 + struct btrfs_path *path, 4484 + struct btrfs_block_group *bg, 4485 + u64 hole_start, u64 hole_length) 4486 + { 4487 + int ret; 4488 + struct btrfs_fs_info *fs_info = trans->fs_info; 4489 + struct extent_buffer *leaf = path->nodes[0]; 4490 + struct btrfs_key key; 4491 + u64 hole_end, new_addr, remap_start, remap_length, remap_end; 4492 + u64 overlap_length; 4493 + bool is_identity_remap; 4494 + int identity_count_delta = 0; 4495 + 4496 + hole_end = hole_start + hole_length; 4497 + 4498 + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4499 + 4500 + is_identity_remap = (key.type == BTRFS_IDENTITY_REMAP_KEY); 4501 + 4502 + remap_start = key.objectid; 4503 + remap_length = key.offset; 4504 + remap_end = remap_start + remap_length; 4505 + 4506 + if (is_identity_remap) { 4507 + new_addr = remap_start; 4508 + } else { 4509 + struct btrfs_remap_item *remap_ptr; 4510 + 4511 + remap_ptr = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_remap_item); 4512 + new_addr = btrfs_remap_address(leaf, remap_ptr); 4513 + } 4514 + 4515 + /* Delete old item. 
*/ 4516 + ret = btrfs_del_item(trans, fs_info->remap_root, path); 4517 + btrfs_release_path(path); 4518 + if (ret) 4519 + return ret; 4520 + 4521 + if (is_identity_remap) { 4522 + identity_count_delta = -1; 4523 + } else { 4524 + /* Remove backref. */ 4525 + key.objectid = new_addr; 4526 + key.type = BTRFS_REMAP_BACKREF_KEY; 4527 + key.offset = remap_length; 4528 + 4529 + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); 4530 + if (ret) { 4531 + if (ret == 1) { 4532 + btrfs_release_path(path); 4533 + ret = -ENOENT; 4534 + } 4535 + return ret; 4536 + } 4537 + 4538 + ret = btrfs_del_item(trans, fs_info->remap_root, path); 4539 + 4540 + btrfs_release_path(path); 4541 + 4542 + if (ret) 4543 + return ret; 4544 + } 4545 + 4546 + /* If hole_start > remap_start, re-add the start of the remap item. */ 4547 + if (hole_start > remap_start) { 4548 + ret = insert_remap_item(trans, path, remap_start, 4549 + hole_start - remap_start, new_addr); 4550 + if (ret) 4551 + return ret; 4552 + 4553 + if (is_identity_remap) 4554 + identity_count_delta++; 4555 + } 4556 + 4557 + /* If hole_end < remap_end, re-add the end of the remap item. 
*/ 4558 + if (hole_end < remap_end) { 4559 + ret = insert_remap_item(trans, path, hole_end, 4560 + remap_end - hole_end, 4561 + hole_end - remap_start + new_addr); 4562 + if (ret) 4563 + return ret; 4564 + 4565 + if (is_identity_remap) 4566 + identity_count_delta++; 4567 + } 4568 + 4569 + if (identity_count_delta != 0) 4570 + adjust_identity_remap_count(trans, bg, identity_count_delta); 4571 + 4572 + overlap_length = min_t(u64, hole_end, remap_end) - 4573 + max_t(u64, hole_start, remap_start); 4574 + 4575 + if (!is_identity_remap) { 4576 + struct btrfs_block_group *dest_bg; 4577 + 4578 + dest_bg = btrfs_lookup_block_group(fs_info, new_addr); 4579 + adjust_block_group_remap_bytes(trans, dest_bg, -overlap_length); 4580 + btrfs_put_block_group(dest_bg); 4581 + ret = btrfs_add_to_free_space_tree(trans, 4582 + hole_start - remap_start + new_addr, 4583 + overlap_length); 4584 + if (ret) 4585 + return ret; 4586 + } 4587 + 4588 + ret = overlap_length; 4589 + 4590 + return ret; 4591 + } 4592 + 4593 + /* 4594 + * Return 1 if remove_range_from_remap_tree() has been called successfully, 4595 + * 0 if block group wasn't remapped, and a negative number on error. 
4596 + */ 4597 + int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, 4598 + struct btrfs_path *path, 4599 + u64 bytenr, u64 num_bytes) 4600 + { 4601 + struct btrfs_fs_info *fs_info = trans->fs_info; 4602 + struct btrfs_key key, found_key; 4603 + struct extent_buffer *leaf; 4604 + struct btrfs_block_group *bg; 4605 + int ret, length; 4606 + 4607 + if (!(btrfs_super_incompat_flags(fs_info->super_copy) & 4608 + BTRFS_FEATURE_INCOMPAT_REMAP_TREE)) 4609 + return 0; 4610 + 4611 + bg = btrfs_lookup_block_group(fs_info, bytenr); 4612 + if (!bg) 4613 + return 0; 4614 + 4615 + mutex_lock(&fs_info->remap_mutex); 4616 + 4617 + if (!(bg->flags & BTRFS_BLOCK_GROUP_REMAPPED)) { 4618 + mutex_unlock(&fs_info->remap_mutex); 4619 + btrfs_put_block_group(bg); 4620 + return 0; 4621 + } 4622 + 4623 + do { 4624 + key.objectid = bytenr; 4625 + key.type = (u8)-1; 4626 + key.offset = (u64)-1; 4627 + 4628 + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, -1, 1); 4629 + if (ret < 0) 4630 + goto end; 4631 + 4632 + leaf = path->nodes[0]; 4633 + if (path->slots[0] == 0) { 4634 + ret = -ENOENT; 4635 + goto end; 4636 + } 4637 + 4638 + path->slots[0]--; 4639 + 4640 + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4641 + 4642 + if (found_key.type != BTRFS_IDENTITY_REMAP_KEY && 4643 + found_key.type != BTRFS_REMAP_KEY) { 4644 + ret = -ENOENT; 4645 + goto end; 4646 + } 4647 + 4648 + if (bytenr < found_key.objectid || 4649 + bytenr >= found_key.objectid + found_key.offset) { 4650 + ret = -ENOENT; 4651 + goto end; 4652 + } 4653 + 4654 + length = remove_range_from_remap_tree(trans, path, bg, bytenr, num_bytes); 4655 + if (length < 0) { 4656 + ret = length; 4657 + goto end; 4658 + } 4659 + 4660 + bytenr += length; 4661 + num_bytes -= length; 4662 + } while (num_bytes > 0); 4663 + 4664 + ret = 1; 4665 + 4666 + end: 4667 + mutex_unlock(&fs_info->remap_mutex); 4668 + 4669 + btrfs_put_block_group(bg); 4670 + btrfs_release_path(path); 4671 + 4672 + return ret; 
5822 4673 }

+17

fs/btrfs/relocation.h

··· 12 12 struct btrfs_ordered_extent; 13 13 struct btrfs_pending_snapshot; 14 14 15 + static inline bool should_relocate_using_remap_tree(const struct btrfs_block_group *bg) 16 + { 17 + if (!btrfs_fs_incompat(bg->fs_info, REMAP_TREE)) 18 + return false; 19 + 20 + if (bg->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) 21 + return false; 22 + 23 + return true; 24 + } 25 + 15 26 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start, 16 27 bool verbose); 17 28 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); ··· 42 31 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); 43 32 bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); 44 33 u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); 34 + int btrfs_translate_remap(struct btrfs_fs_info *fs_info, u64 *logical, u64 *length); 35 + int btrfs_remove_extent_from_remap_tree(struct btrfs_trans_handle *trans, 36 + struct btrfs_path *path, 37 + u64 bytenr, u64 num_bytes); 38 + int btrfs_last_identity_remap_gone(struct btrfs_chunk_map *chunk_map, 39 + struct btrfs_block_group *bg); 45 40 46 41 #endif

+21 -26

fs/btrfs/root-tree.c

··· 217 217 BTRFS_PATH_AUTO_FREE(path); 218 218 struct btrfs_key key; 219 219 struct btrfs_root *root; 220 - int err = 0; 221 - int ret; 222 220 223 221 path = btrfs_alloc_path(); 224 222 if (!path) ··· 228 230 229 231 while (1) { 230 232 u64 root_objectid; 233 + int ret; 231 234 232 235 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 233 - if (ret < 0) { 234 - err = ret; 235 - break; 236 - } 236 + if (ret < 0) 237 + return ret; 237 238 238 239 leaf = path->nodes[0]; 239 240 if (path->slots[0] >= btrfs_header_nritems(leaf)) { 240 241 ret = btrfs_next_leaf(tree_root, path); 241 242 if (ret < 0) 242 - err = ret; 243 - if (ret != 0) 244 - break; 243 + return ret; 244 + else if (ret > 0) 245 + return 0; 245 246 leaf = path->nodes[0]; 246 247 } 247 248 ··· 249 252 250 253 if (key.objectid != BTRFS_ORPHAN_OBJECTID || 251 254 key.type != BTRFS_ORPHAN_ITEM_KEY) 252 - break; 255 + return 0; 253 256 254 257 root_objectid = key.offset; 255 258 key.offset++; 256 259 257 260 root = btrfs_get_fs_root(fs_info, root_objectid, false); 258 - err = PTR_ERR_OR_ZERO(root); 259 - if (err && err != -ENOENT) { 261 + ret = PTR_ERR_OR_ZERO(root); 262 + if (ret && ret != -ENOENT) { 260 263 break; 261 - } else if (err == -ENOENT) { 264 + } else if (ret == -ENOENT) { 262 265 struct btrfs_trans_handle *trans; 263 - 264 - btrfs_release_path(path); 265 266 266 267 trans = btrfs_join_transaction(tree_root); 267 268 if (IS_ERR(trans)) { 268 - err = PTR_ERR(trans); 269 - btrfs_handle_fs_error(fs_info, err, 270 - "Failed to start trans to delete orphan item"); 271 - break; 269 + ret = PTR_ERR(trans); 270 + btrfs_err(fs_info, 271 + "failed to join transaction to delete orphan item: %d", 272 + ret); 273 + return ret; 272 274 } 273 - err = btrfs_del_orphan_item(trans, tree_root, 274 - root_objectid); 275 + ret = btrfs_del_orphan_item(trans, tree_root, root_objectid); 275 276 btrfs_end_transaction(trans); 276 - if (err) { 277 - btrfs_handle_fs_error(fs_info, err, 278 - "Failed to delete root 
orphan item"); 279 - break; 277 + if (ret) { 278 + btrfs_err(fs_info, 279 + "failed to delete root orphan item: %d", ret); 280 + return ret; 280 281 } 281 282 continue; 282 283 } ··· 302 307 btrfs_put_root(root); 303 308 } 304 309 305 - return err; 310 + return 0; 306 311 } 307 312 308 313 /* drop the root item for 'key' from the tree root */

+24 -32

fs/btrfs/scrub.c

··· 6 6 #include <linux/blkdev.h> 7 7 #include <linux/ratelimit.h> 8 8 #include <linux/sched/mm.h> 9 - #include <crypto/hash.h> 10 9 #include "ctree.h" 11 10 #include "discard.h" 12 11 #include "volumes.h" ··· 717 718 const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); 718 719 void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); 719 720 struct btrfs_header *header = first_kaddr; 720 - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); 721 + struct btrfs_csum_ctx csum; 721 722 u8 on_disk_csum[BTRFS_CSUM_SIZE]; 722 723 u8 calculated_csum[BTRFS_CSUM_SIZE]; 723 724 ··· 759 760 } 760 761 761 762 /* Now check tree block csum. */ 762 - shash->tfm = fs_info->csum_shash; 763 - crypto_shash_init(shash); 764 - crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, 765 - fs_info->sectorsize - BTRFS_CSUM_SIZE); 763 + btrfs_csum_init(&csum, fs_info->csum_type); 764 + btrfs_csum_update(&csum, first_kaddr + BTRFS_CSUM_SIZE, 765 + fs_info->sectorsize - BTRFS_CSUM_SIZE); 766 766 767 767 for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { 768 - crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), 769 - fs_info->sectorsize); 768 + btrfs_csum_update(&csum, scrub_stripe_get_kaddr(stripe, i), 769 + fs_info->sectorsize); 770 770 } 771 771 772 - crypto_shash_final(shash, calculated_csum); 772 + btrfs_csum_final(&csum, calculated_csum); 773 773 if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { 774 774 scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); 775 775 scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); ··· 1688 1690 scrub_stripe_reset_bitmaps(stripe); 1689 1691 1690 1692 /* The range must be inside the bg. 
*/ 1691 - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length, 1693 + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg), 1692 1694 "bg->start=%llu logical_start=%llu logical_end=%llu end=%llu", 1693 - bg->start, logical_start, logical_end, bg->start + bg->length); 1695 + bg->start, logical_start, logical_end, btrfs_block_group_end(bg)); 1694 1696 1695 1697 ret = find_first_extent_item(extent_root, extent_path, logical_start, 1696 1698 logical_len); 1697 1699 /* Either error or not found. */ 1698 1700 if (ret) 1699 - goto out; 1701 + return ret; 1700 1702 get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, 1701 1703 &extent_gen); 1702 1704 if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ··· 1729 1731 ret = find_first_extent_item(extent_root, extent_path, cur_logical, 1730 1732 stripe_end - cur_logical + 1); 1731 1733 if (ret < 0) 1732 - goto out; 1734 + return ret; 1733 1735 if (ret > 0) { 1734 1736 ret = 0; 1735 1737 break; ··· 1763 1765 stripe->logical, stripe_end, 1764 1766 stripe->csums, &csum_bitmap); 1765 1767 if (ret < 0) 1766 - goto out; 1768 + return ret; 1767 1769 if (ret > 0) 1768 1770 ret = 0; 1769 1771 ··· 1773 1775 } 1774 1776 } 1775 1777 set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); 1776 - out: 1778 + 1777 1779 return ret; 1778 1780 } 1779 1781 ··· 2171 2173 u64 full_stripe_start) 2172 2174 { 2173 2175 struct btrfs_fs_info *fs_info = sctx->fs_info; 2174 - struct btrfs_path extent_path = { 0 }; 2175 - struct btrfs_path csum_path = { 0 }; 2176 + BTRFS_PATH_AUTO_RELEASE(extent_path); 2177 + BTRFS_PATH_AUTO_RELEASE(csum_path); 2176 2178 struct scrub_stripe *stripe; 2177 2179 bool all_empty = true; 2178 2180 const int data_stripes = nr_data_stripes(map); ··· 2224 2226 full_stripe_start + btrfs_stripe_nr_to_offset(i), 2225 2227 BTRFS_STRIPE_LEN, stripe); 2226 2228 if (ret < 0) 2227 - goto out; 2229 + return ret; 2228 2230 /* 2229 2231 * No extent in this data stripe, need to 
manually mark them 2230 2232 * initialized to make later read submission happy. ··· 2246 2248 break; 2247 2249 } 2248 2250 } 2249 - if (all_empty) { 2250 - ret = 0; 2251 - goto out; 2252 - } 2251 + if (all_empty) 2252 + return 0; 2253 2253 2254 2254 for (int i = 0; i < data_stripes; i++) { 2255 2255 stripe = &sctx->raid56_data_stripes[i]; ··· 2288 2292 "scrub: unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", 2289 2293 full_stripe_start, i, stripe->nr_sectors, 2290 2294 &error); 2291 - ret = -EIO; 2292 - goto out; 2295 + return ret; 2293 2296 } 2294 2297 bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, 2295 2298 stripe->nr_sectors); 2296 2299 } 2297 2300 2298 2301 /* Now we can check and regenerate the P/Q stripe. */ 2299 - ret = scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2300 - &extent_bitmap); 2301 - out: 2302 - btrfs_release_path(&extent_path); 2303 - btrfs_release_path(&csum_path); 2304 - return ret; 2302 + return scrub_raid56_cached_parity(sctx, scrub_dev, map, full_stripe_start, 2303 + &extent_bitmap); 2305 2304 } 2306 2305 2307 2306 /* ··· 2319 2328 int ret = 0; 2320 2329 2321 2330 /* The range must be inside the bg */ 2322 - ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); 2331 + ASSERT(logical_start >= bg->start && logical_end <= btrfs_block_group_end(bg)); 2323 2332 2324 2333 /* Go through each extent items inside the logical range */ 2325 2334 while (cur_logical < logical_end) { ··· 2411 2420 const u64 logical_increment = simple_stripe_full_stripe_len(map); 2412 2421 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); 2413 2422 const u64 orig_physical = map->stripes[stripe_index].physical; 2423 + const u64 end = btrfs_block_group_end(bg); 2414 2424 const int mirror_num = simple_stripe_mirror_num(map, stripe_index); 2415 2425 u64 cur_logical = orig_logical; 2416 2426 u64 cur_physical = orig_physical; 2417 2427 int ret = 0; 2418 2428 2419 - while 
(cur_logical < bg->start + bg->length) { 2429 + while (cur_logical < end) { 2420 2430 /* 2421 2431 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is 2422 2432 * just RAID1, so we can reuse scrub_simple_mirror() to scrub

+33 -43

fs/btrfs/send.c

··· 6449 6449 if (sctx->parent_root && !sctx->cur_inode_new) { 6450 6450 ret = is_extent_unchanged(sctx, path, key); 6451 6451 if (ret < 0) 6452 - goto out; 6453 - if (ret) { 6454 - ret = 0; 6452 + return ret; 6453 + if (ret) 6455 6454 goto out_hole; 6456 - } 6457 6455 } else { 6458 6456 struct btrfs_file_extent_item *ei; 6459 6457 u8 type; ··· 6467 6469 * we have enough commands queued up to justify rev'ing 6468 6470 * the send spec. 6469 6471 */ 6470 - if (type == BTRFS_FILE_EXTENT_PREALLOC) { 6471 - ret = 0; 6472 - goto out; 6473 - } 6472 + if (type == BTRFS_FILE_EXTENT_PREALLOC) 6473 + return 0; 6474 6474 6475 6475 /* Have a hole, just skip it. */ 6476 - if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) { 6477 - ret = 0; 6478 - goto out; 6479 - } 6476 + if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) 6477 + return 0; 6480 6478 } 6481 6479 } 6482 6480 6483 6481 ret = find_extent_clone(sctx, path, key->objectid, key->offset, 6484 6482 sctx->cur_inode_size, &found_clone); 6485 6483 if (ret != -ENOENT && ret < 0) 6486 - goto out; 6484 + return ret; 6487 6485 6488 6486 ret = send_write_or_clone(sctx, path, key, found_clone); 6489 6487 if (ret) 6490 - goto out; 6488 + return ret; 6491 6489 out_hole: 6492 - ret = maybe_send_hole(sctx, path, key); 6493 - out: 6494 - return ret; 6490 + return maybe_send_hole(sctx, path, key); 6495 6491 } 6496 6492 6497 6493 static int process_all_extents(struct send_ctx *sctx) ··· 6527 6535 int *pending_move, 6528 6536 int *refs_processed) 6529 6537 { 6530 - int ret = 0; 6538 + int ret; 6531 6539 6532 6540 if (sctx->cur_ino == 0) 6533 - goto out; 6541 + return 0; 6542 + 6534 6543 if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid && 6535 6544 sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY) 6536 - goto out; 6545 + return 0; 6546 + 6537 6547 if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs)) 6538 - goto out; 6548 + return 0; 6539 6549 6540 6550 ret = process_recorded_refs(sctx, pending_move); 
6541 6551 if (ret < 0) 6542 - goto out; 6552 + return ret; 6543 6553 6544 6554 *refs_processed = 1; 6545 - out: 6546 - return ret; 6555 + return 0; 6547 6556 } 6548 6557 6549 6558 static int finish_inode_if_needed(struct send_ctx *sctx, bool at_end) ··· 6761 6768 static int changed_inode(struct send_ctx *sctx, 6762 6769 enum btrfs_compare_tree_result result) 6763 6770 { 6764 - int ret = 0; 6771 + int ret; 6765 6772 struct btrfs_key *key = sctx->cmp_key; 6766 6773 struct btrfs_inode_item *left_ii = NULL; 6767 6774 struct btrfs_inode_item *right_ii = NULL; ··· 6853 6860 if (result == BTRFS_COMPARE_TREE_NEW) { 6854 6861 if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) { 6855 6862 sctx->ignore_cur_inode = true; 6856 - goto out; 6863 + return 0; 6857 6864 } 6858 6865 sctx->cur_inode_gen = left_gen; 6859 6866 sctx->cur_inode_new = true; ··· 6881 6888 old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii); 6882 6889 if (new_nlinks == 0 && old_nlinks == 0) { 6883 6890 sctx->ignore_cur_inode = true; 6884 - goto out; 6891 + return 0; 6885 6892 } else if (new_nlinks == 0 || old_nlinks == 0) { 6886 6893 sctx->cur_inode_new_gen = 1; 6887 6894 } ··· 6907 6914 ret = process_all_refs(sctx, 6908 6915 BTRFS_COMPARE_TREE_DELETED); 6909 6916 if (ret < 0) 6910 - goto out; 6917 + return ret; 6911 6918 } 6912 6919 6913 6920 /* ··· 6928 6935 left_ii); 6929 6936 ret = send_create_inode_if_needed(sctx); 6930 6937 if (ret < 0) 6931 - goto out; 6938 + return ret; 6932 6939 6933 6940 ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW); 6934 6941 if (ret < 0) 6935 - goto out; 6942 + return ret; 6936 6943 /* 6937 6944 * Advance send_progress now as we did not get 6938 6945 * into process_recorded_refs_if_needed in the ··· 6946 6953 */ 6947 6954 ret = process_all_extents(sctx); 6948 6955 if (ret < 0) 6949 - goto out; 6956 + return ret; 6950 6957 ret = process_all_new_xattrs(sctx); 6951 6958 if (ret < 0) 6952 - goto out; 6959 + return ret; 6953 6960 } 6954 6961 } else 
{ 6955 6962 sctx->cur_inode_gen = left_gen; ··· 6963 6970 } 6964 6971 } 6965 6972 6966 - out: 6967 - return ret; 6973 + return 0; 6968 6974 } 6969 6975 6970 6976 /* ··· 7096 7104 u32 item_size; 7097 7105 u32 cur_offset = 0; 7098 7106 int ref_name_len; 7099 - int ret = 0; 7100 7107 7101 7108 /* Easy case, just check this one dirid */ 7102 7109 if (key->type == BTRFS_INODE_REF_KEY) { 7103 7110 dirid = key->offset; 7104 7111 7105 - ret = dir_changed(sctx, dirid); 7106 - goto out; 7112 + return dir_changed(sctx, dirid); 7107 7113 } 7108 7114 7109 7115 leaf = path->nodes[0]; 7110 7116 item_size = btrfs_item_size(leaf, path->slots[0]); 7111 7117 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 7112 7118 while (cur_offset < item_size) { 7119 + int ret; 7120 + 7113 7121 extref = (struct btrfs_inode_extref *)(ptr + 7114 7122 cur_offset); 7115 7123 dirid = btrfs_inode_extref_parent(leaf, extref); ··· 7119 7127 continue; 7120 7128 ret = dir_changed(sctx, dirid); 7121 7129 if (ret) 7122 - break; 7130 + return ret; 7123 7131 last_dirid = dirid; 7124 7132 } 7125 - out: 7126 - return ret; 7133 + return 0; 7127 7134 } 7128 7135 7129 7136 /* ··· 7203 7212 7204 7213 ret = finish_inode_if_needed(sctx, 0); 7205 7214 if (ret < 0) 7206 - goto out; 7215 + return ret; 7207 7216 7208 7217 /* Ignore non-FS objects */ 7209 7218 if (key->objectid == BTRFS_FREE_INO_OBJECTID || 7210 7219 key->objectid == BTRFS_FREE_SPACE_OBJECTID) 7211 - goto out; 7220 + return 0; 7212 7221 7213 7222 if (key->type == BTRFS_INODE_ITEM_KEY) { 7214 7223 ret = changed_inode(sctx, result); ··· 7225 7234 ret = changed_verity(sctx, result); 7226 7235 } 7227 7236 7228 - out: 7229 7237 return ret; 7230 7238 } 7231 7239

+37 -36

fs/btrfs/space-info.c

··· 215 215 216 216 if (flags & BTRFS_BLOCK_GROUP_DATA) 217 217 return BTRFS_MAX_DATA_CHUNK_SIZE; 218 - else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) 218 + else if (flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA_REMAP)) 219 219 return SZ_32M; 220 220 221 221 /* Handle BTRFS_BLOCK_GROUP_METADATA */ ··· 329 329 struct btrfs_super_block *disk_super; 330 330 u64 features; 331 331 u64 flags; 332 - int mixed = 0; 332 + bool mixed = false; 333 333 int ret; 334 334 335 335 disk_super = fs_info->super_copy; ··· 338 338 339 339 features = btrfs_super_incompat_flags(disk_super); 340 340 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) 341 - mixed = 1; 341 + mixed = true; 342 342 343 343 flags = BTRFS_BLOCK_GROUP_SYSTEM; 344 344 ret = create_space_info(fs_info, flags); 345 345 if (ret) 346 - goto out; 346 + return ret; 347 347 348 348 if (mixed) { 349 349 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA; 350 350 ret = create_space_info(fs_info, flags); 351 + if (ret) 352 + return ret; 351 353 } else { 352 354 flags = BTRFS_BLOCK_GROUP_METADATA; 353 355 ret = create_space_info(fs_info, flags); 354 356 if (ret) 355 - goto out; 357 + return ret; 356 358 357 359 flags = BTRFS_BLOCK_GROUP_DATA; 358 360 ret = create_space_info(fs_info, flags); 361 + if (ret) 362 + return ret; 359 363 } 360 - out: 364 + 365 + if (features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE) { 366 + flags = BTRFS_BLOCK_GROUP_METADATA_REMAP; 367 + ret = create_space_info(fs_info, flags); 368 + } 369 + 361 370 return ret; 362 371 } 363 372 ··· 379 370 factor = btrfs_bg_type_to_factor(block_group->flags); 380 371 381 372 spin_lock(&space_info->lock); 382 - space_info->total_bytes += block_group->length; 383 - space_info->disk_total += block_group->length * factor; 373 + 374 + if (!(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED) || 375 + block_group->identity_remap_count != 0) { 376 + space_info->total_bytes += block_group->length; 377 + space_info->disk_total += block_group->length * factor; 
378 + } 379 + 384 380 space_info->bytes_used += block_group->used; 385 381 space_info->disk_used += block_group->used * factor; 386 382 space_info->bytes_readonly += block_group->bytes_super; ··· 620 606 spin_unlock(&__rsv->lock); \ 621 607 } while (0) 622 608 623 - static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info) 624 - { 625 - switch (space_info->flags) { 626 - case BTRFS_BLOCK_GROUP_SYSTEM: 627 - return "SYSTEM"; 628 - case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: 629 - return "DATA+METADATA"; 630 - case BTRFS_BLOCK_GROUP_DATA: 631 - return "DATA"; 632 - case BTRFS_BLOCK_GROUP_METADATA: 633 - return "METADATA"; 634 - default: 635 - return "UNKNOWN"; 636 - } 637 - } 638 - 639 609 static void dump_global_block_rsv(struct btrfs_fs_info *fs_info) 640 610 { 641 611 DUMP_BLOCK_RSV(fs_info, global_block_rsv); 642 612 DUMP_BLOCK_RSV(fs_info, trans_block_rsv); 643 613 DUMP_BLOCK_RSV(fs_info, chunk_block_rsv); 614 + DUMP_BLOCK_RSV(fs_info, remap_block_rsv); 644 615 DUMP_BLOCK_RSV(fs_info, delayed_block_rsv); 645 616 DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv); 646 617 } ··· 633 634 static void __btrfs_dump_space_info(const struct btrfs_space_info *info) 634 635 { 635 636 const struct btrfs_fs_info *fs_info = info->fs_info; 636 - const char *flag_str = space_info_flag_to_str(info); 637 + const char *flag_str = btrfs_space_info_type_str(info); 637 638 lockdep_assert_held(&info->lock); 638 639 639 640 /* The free space could be negative in case of overcommit */ ··· 671 672 u64 avail; 672 673 673 674 spin_lock(&cache->lock); 674 - avail = cache->length - cache->used - cache->pinned - 675 - cache->reserved - cache->bytes_super - cache->zone_unusable; 675 + avail = btrfs_block_group_available_space(cache); 676 676 btrfs_info(fs_info, 677 677 "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", 678 678 cache->start, cache->length, cache->used, 
cache->pinned, ··· 2097 2099 return unalloc < data_chunk_size; 2098 2100 } 2099 2101 2100 - static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) 2102 + static bool do_reclaim_sweep(struct btrfs_space_info *space_info, int raid) 2101 2103 { 2102 2104 struct btrfs_block_group *bg; 2103 2105 int thresh_pct; 2104 - bool try_again = true; 2106 + bool will_reclaim = false; 2105 2107 bool urgent; 2106 2108 2107 2109 spin_lock(&space_info->lock); ··· 2119 2121 spin_lock(&bg->lock); 2120 2122 thresh = mult_perc(bg->length, thresh_pct); 2121 2123 if (bg->used < thresh && bg->reclaim_mark) { 2122 - try_again = false; 2124 + will_reclaim = true; 2123 2125 reclaim = true; 2124 2126 } 2125 2127 bg->reclaim_mark++; ··· 2136 2138 * If we have any staler groups, we don't touch the fresher ones, but if we 2137 2139 * really need a block group, do take a fresh one. 2138 2140 */ 2139 - if (try_again && urgent) { 2140 - try_again = false; 2141 + if (!will_reclaim && urgent) { 2142 + urgent = false; 2141 2143 goto again; 2142 2144 } 2143 2145 2144 2146 up_read(&space_info->groups_sem); 2147 + return will_reclaim; 2145 2148 } 2146 2149 2147 2150 void btrfs_space_info_update_reclaimable(struct btrfs_space_info *space_info, s64 bytes) ··· 2152 2153 lockdep_assert_held(&space_info->lock); 2153 2154 space_info->reclaimable_bytes += bytes; 2154 2155 2155 - if (space_info->reclaimable_bytes >= chunk_sz) 2156 + if (space_info->reclaimable_bytes > 0 && 2157 + space_info->reclaimable_bytes >= chunk_sz) 2156 2158 btrfs_set_periodic_reclaim_ready(space_info, true); 2157 2159 } 2158 2160 ··· 2180 2180 2181 2181 spin_lock(&space_info->lock); 2182 2182 ret = space_info->periodic_reclaim_ready; 2183 - btrfs_set_periodic_reclaim_ready(space_info, false); 2184 2183 spin_unlock(&space_info->lock); 2185 2184 2186 2185 return ret; ··· 2193 2194 list_for_each_entry(space_info, &fs_info->space_info, list) { 2194 2195 if (!btrfs_should_periodic_reclaim(space_info)) 2195 2196 continue; 
2196 - for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) 2197 - do_reclaim_sweep(space_info, raid); 2197 + for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++) { 2198 + if (do_reclaim_sweep(space_info, raid)) 2199 + btrfs_set_periodic_reclaim_ready(space_info, false); 2200 + } 2198 2201 } 2199 2202 } 2200 2203

+16

fs/btrfs/space-info.h

··· 307 307 void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info); 308 308 void btrfs_return_free_space(struct btrfs_space_info *space_info, u64 len); 309 309 310 + static inline const char *btrfs_space_info_type_str(const struct btrfs_space_info *space_info) 311 + { 312 + switch (space_info->flags) { 313 + case BTRFS_BLOCK_GROUP_SYSTEM: 314 + return "SYSTEM"; 315 + case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: 316 + return "DATA+METADATA"; 317 + case BTRFS_BLOCK_GROUP_DATA: 318 + return "DATA"; 319 + case BTRFS_BLOCK_GROUP_METADATA: 320 + return "METADATA"; 321 + default: 322 + return "UNKNOWN"; 323 + } 324 + } 325 + 310 326 #endif /* BTRFS_SPACE_INFO_H */

+13 -4

fs/btrfs/super.c

··· 2483 2483 } 2484 2484 #endif 2485 2485 2486 + static int btrfs_show_stats(struct seq_file *seq, struct dentry *root) 2487 + { 2488 + struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 2489 + 2490 + if (btrfs_is_zoned(fs_info)) { 2491 + btrfs_show_zoned_stats(fs_info, seq); 2492 + return 0; 2493 + } 2494 + 2495 + return 0; 2496 + } 2497 + 2486 2498 static const struct super_operations btrfs_super_ops = { 2487 2499 .drop_inode = btrfs_drop_inode, 2488 2500 .evict_inode = btrfs_evict_inode, ··· 2510 2498 .unfreeze_fs = btrfs_unfreeze, 2511 2499 .nr_cached_objects = btrfs_nr_cached_objects, 2512 2500 .free_cached_objects = btrfs_free_cached_objects, 2501 + .show_stats = btrfs_show_stats, 2513 2502 #ifdef CONFIG_BTRFS_EXPERIMENTAL 2514 2503 .remove_bdev = btrfs_remove_bdev, 2515 2504 .shutdown = btrfs_shutdown, ··· 2713 2700 2714 2701 MODULE_DESCRIPTION("B-Tree File System (BTRFS)"); 2715 2702 MODULE_LICENSE("GPL"); 2716 - MODULE_SOFTDEP("pre: crc32c"); 2717 - MODULE_SOFTDEP("pre: xxhash64"); 2718 - MODULE_SOFTDEP("pre: sha256"); 2719 - MODULE_SOFTDEP("pre: blake2b-256");

+7 -48

fs/btrfs/sysfs.c

··· 11 11 #include <linux/bug.h> 12 12 #include <linux/list.h> 13 13 #include <linux/string_choices.h> 14 - #include <crypto/hash.h> 15 14 #include "messages.h" 16 15 #include "ctree.h" 17 16 #include "discard.h" ··· 299 300 BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); 300 301 /* Remove once support for raid stripe tree is feature complete. */ 301 302 BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); 303 + /* Remove once support for remap tree is feature complete. */ 304 + BTRFS_FEAT_ATTR_INCOMPAT(remap_tree, REMAP_TREE); 302 305 #endif 303 306 #ifdef CONFIG_FS_VERITY 304 307 BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); ··· 333 332 #ifdef CONFIG_BTRFS_EXPERIMENTAL 334 333 BTRFS_FEAT_ATTR_PTR(extent_tree_v2), 335 334 BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), 335 + BTRFS_FEAT_ATTR_PTR(remap_tree), 336 336 #endif 337 337 #ifdef CONFIG_FS_VERITY 338 338 BTRFS_FEAT_ATTR_PTR(verity), ··· 1255 1253 { 1256 1254 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 1257 1255 u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); 1256 + const char *csum_name = btrfs_super_csum_name(csum_type); 1258 1257 1259 - return sysfs_emit(buf, "%s (%s)\n", 1260 - btrfs_super_csum_name(csum_type), 1261 - crypto_shash_driver_name(fs_info->csum_shash)); 1258 + return sysfs_emit(buf, "%s (%s-lib)\n", csum_name, csum_name); 1262 1259 } 1263 1260 1264 1261 BTRFS_ATTR(, checksum, btrfs_checksum_show); ··· 1541 1540 BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, 1542 1541 btrfs_bg_reclaim_threshold_store); 1543 1542 1544 - #ifdef CONFIG_BTRFS_EXPERIMENTAL 1545 - static ssize_t btrfs_offload_csum_show(struct kobject *kobj, 1546 - struct kobj_attribute *a, char *buf) 1547 - { 1548 - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); 1549 - 1550 - switch (READ_ONCE(fs_devices->offload_csum_mode)) { 1551 - case BTRFS_OFFLOAD_CSUM_AUTO: 1552 - return sysfs_emit(buf, "auto\n"); 1553 - case BTRFS_OFFLOAD_CSUM_FORCE_ON: 1554 - return sysfs_emit(buf, 
"1\n"); 1555 - case BTRFS_OFFLOAD_CSUM_FORCE_OFF: 1556 - return sysfs_emit(buf, "0\n"); 1557 - default: 1558 - WARN_ON(1); 1559 - return -EINVAL; 1560 - } 1561 - } 1562 - 1563 - static ssize_t btrfs_offload_csum_store(struct kobject *kobj, 1564 - struct kobj_attribute *a, const char *buf, 1565 - size_t len) 1566 - { 1567 - struct btrfs_fs_devices *fs_devices = to_fs_devs(kobj); 1568 - int ret; 1569 - bool val; 1570 - 1571 - ret = kstrtobool(buf, &val); 1572 - if (ret == 0) 1573 - WRITE_ONCE(fs_devices->offload_csum_mode, 1574 - val ? BTRFS_OFFLOAD_CSUM_FORCE_ON : BTRFS_OFFLOAD_CSUM_FORCE_OFF); 1575 - else if (ret == -EINVAL && sysfs_streq(buf, "auto")) 1576 - WRITE_ONCE(fs_devices->offload_csum_mode, BTRFS_OFFLOAD_CSUM_AUTO); 1577 - else 1578 - return -EINVAL; 1579 - 1580 - return len; 1581 - } 1582 - BTRFS_ATTR_RW(, offload_csum, btrfs_offload_csum_show, btrfs_offload_csum_store); 1583 - #endif 1584 - 1585 1543 /* 1586 1544 * Per-filesystem information and stats. 1587 1545 * ··· 1560 1600 BTRFS_ATTR_PTR(, bg_reclaim_threshold), 1561 1601 BTRFS_ATTR_PTR(, commit_stats), 1562 1602 BTRFS_ATTR_PTR(, temp_fsid), 1563 - #ifdef CONFIG_BTRFS_EXPERIMENTAL 1564 - BTRFS_ATTR_PTR(, offload_csum), 1565 - #endif 1566 1603 NULL, 1567 1604 }; 1568 1605 ··· 1929 1972 case BTRFS_BLOCK_GROUP_SYSTEM: 1930 1973 ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); 1931 1974 return "system"; 1975 + case BTRFS_BLOCK_GROUP_METADATA_REMAP: 1976 + return "metadata-remap"; 1932 1977 default: 1933 1978 WARN_ON(1); 1934 1979 return "invalid-combination";

+3

fs/btrfs/tests/btrfs-tests.c

··· 301 301 ret = btrfs_test_delayed_refs(sectorsize, nodesize); 302 302 if (ret) 303 303 goto out; 304 + ret = btrfs_test_chunk_allocation(sectorsize, nodesize); 305 + if (ret) 306 + goto out; 304 307 } 305 308 } 306 309 ret = btrfs_test_extent_map();

+7

fs/btrfs/tests/btrfs-tests.h

··· 7 7 #define BTRFS_TESTS_H 8 8 9 9 #include <linux/types.h> 10 + #include <linux/cleanup.h> 10 11 11 12 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 13 + 12 14 int btrfs_run_sanity_tests(void); 13 15 14 16 #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt "\n", ##__VA_ARGS__) ··· 47 45 int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize); 48 46 int btrfs_test_extent_map(void); 49 47 int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize); 48 + int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize); 50 49 struct inode *btrfs_new_test_inode(void); 51 50 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); 52 51 void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); 52 + DEFINE_FREE(btrfs_free_dummy_fs_info, struct btrfs_fs_info *, 53 + btrfs_free_dummy_fs_info(_T)) 53 54 void btrfs_free_dummy_root(struct btrfs_root *root); 54 55 struct btrfs_block_group * 55 56 btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); 56 57 void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); 58 + DEFINE_FREE(btrfs_free_dummy_block_group, struct btrfs_block_group *, 59 + btrfs_free_dummy_block_group(_T)); 57 60 void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, 58 61 struct btrfs_fs_info *fs_info); 59 62 void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info);

+476

fs/btrfs/tests/chunk-allocation-tests.c

··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2026 Meta. All rights reserved. 4 + */ 5 + 6 + #include <linux/sizes.h> 7 + #include "btrfs-tests.h" 8 + #include "../volumes.h" 9 + #include "../disk-io.h" 10 + #include "../extent-io-tree.h" 11 + 12 + /* 13 + * Tests for chunk allocator pending extent internals. 14 + * These two functions form the core of searching the chunk allocation pending 15 + * extent bitmap and have relatively easily definable semantics, so unit 16 + * testing them can help ensure the correctness of chunk allocation. 17 + */ 18 + 19 + /* 20 + * Describes the inputs to the system and expected results 21 + * when testing btrfs_find_hole_in_pending_extents(). 22 + */ 23 + struct pending_extent_test_case { 24 + const char *name; 25 + /* Input range to search. */ 26 + u64 hole_start; 27 + u64 hole_len; 28 + /* The size of hole we are searching for. */ 29 + u64 min_hole_size; 30 + /* 31 + * Pending extents to set up (up to 2 for up to 3 holes) 32 + * If len == 0, then it is skipped. 33 + */ 34 + struct { 35 + u64 start; 36 + u64 len; 37 + } pending_extents[2]; 38 + /* Expected outputs. 
*/ 39 + bool expected_found; 40 + u64 expected_start; 41 + u64 expected_len; 42 + }; 43 + 44 + static const struct pending_extent_test_case find_hole_tests[] = { 45 + { 46 + .name = "no pending extents", 47 + .hole_start = 0, 48 + .hole_len = 10ULL * SZ_1G, 49 + .min_hole_size = SZ_1G, 50 + .pending_extents = { }, 51 + .expected_found = true, 52 + .expected_start = 0, 53 + .expected_len = 10ULL * SZ_1G, 54 + }, 55 + { 56 + .name = "pending extent at start of range", 57 + .hole_start = 0, 58 + .hole_len = 10ULL * SZ_1G, 59 + .min_hole_size = SZ_1G, 60 + .pending_extents = { 61 + { .start = 0, .len = SZ_1G }, 62 + }, 63 + .expected_found = true, 64 + .expected_start = SZ_1G, 65 + .expected_len = 9ULL * SZ_1G, 66 + }, 67 + { 68 + .name = "pending extent overlapping start of range", 69 + .hole_start = SZ_1G, 70 + .hole_len = 9ULL * SZ_1G, 71 + .min_hole_size = SZ_1G, 72 + .pending_extents = { 73 + { .start = 0, .len = SZ_2G }, 74 + }, 75 + .expected_found = true, 76 + .expected_start = SZ_2G, 77 + .expected_len = 8ULL * SZ_1G, 78 + }, 79 + { 80 + .name = "two holes; first hole is exactly big enough", 81 + .hole_start = 0, 82 + .hole_len = 10ULL * SZ_1G, 83 + .min_hole_size = SZ_1G, 84 + .pending_extents = { 85 + { .start = SZ_1G, .len = SZ_1G }, 86 + }, 87 + .expected_found = true, 88 + .expected_start = 0, 89 + .expected_len = SZ_1G, 90 + }, 91 + { 92 + .name = "two holes; first hole is big enough", 93 + .hole_start = 0, 94 + .hole_len = 10ULL * SZ_1G, 95 + .min_hole_size = SZ_1G, 96 + .pending_extents = { 97 + { .start = SZ_2G, .len = SZ_1G }, 98 + }, 99 + .expected_found = true, 100 + .expected_start = 0, 101 + .expected_len = SZ_2G, 102 + }, 103 + { 104 + .name = "two holes; second hole is big enough", 105 + .hole_start = 0, 106 + .hole_len = 10ULL * SZ_1G, 107 + .min_hole_size = SZ_2G, 108 + .pending_extents = { 109 + { .start = SZ_1G, .len = SZ_1G }, 110 + }, 111 + .expected_found = true, 112 + .expected_start = SZ_2G, 113 + .expected_len = 8ULL * SZ_1G, 114 + }, 
115 + { 116 + .name = "three holes; first hole big enough", 117 + .hole_start = 0, 118 + .hole_len = 10ULL * SZ_1G, 119 + .min_hole_size = SZ_2G, 120 + .pending_extents = { 121 + { .start = SZ_2G, .len = SZ_1G }, 122 + { .start = 4ULL * SZ_1G, .len = SZ_1G }, 123 + }, 124 + .expected_found = true, 125 + .expected_start = 0, 126 + .expected_len = SZ_2G, 127 + }, 128 + { 129 + .name = "three holes; second hole big enough", 130 + .hole_start = 0, 131 + .hole_len = 10ULL * SZ_1G, 132 + .min_hole_size = SZ_2G, 133 + .pending_extents = { 134 + { .start = SZ_1G, .len = SZ_1G }, 135 + { .start = 5ULL * SZ_1G, .len = SZ_1G }, 136 + }, 137 + .expected_found = true, 138 + .expected_start = SZ_2G, 139 + .expected_len = 3ULL * SZ_1G, 140 + }, 141 + { 142 + .name = "three holes; third hole big enough", 143 + .hole_start = 0, 144 + .hole_len = 10ULL * SZ_1G, 145 + .min_hole_size = SZ_2G, 146 + .pending_extents = { 147 + { .start = SZ_1G, .len = SZ_1G }, 148 + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, 149 + }, 150 + .expected_found = true, 151 + .expected_start = 8ULL * SZ_1G, 152 + .expected_len = SZ_2G, 153 + }, 154 + { 155 + .name = "three holes; all holes too small", 156 + .hole_start = 0, 157 + .hole_len = 10ULL * SZ_1G, 158 + .min_hole_size = SZ_2G, 159 + .pending_extents = { 160 + { .start = SZ_1G, .len = SZ_1G }, 161 + { .start = 3ULL * SZ_1G, .len = 6ULL * SZ_1G }, 162 + }, 163 + .expected_found = false, 164 + .expected_start = 0, 165 + .expected_len = SZ_1G, 166 + }, 167 + { 168 + .name = "three holes; all holes too small; first biggest", 169 + .hole_start = 0, 170 + .hole_len = 10ULL * SZ_1G, 171 + .min_hole_size = 3ULL * SZ_1G, 172 + .pending_extents = { 173 + { .start = SZ_2G, .len = SZ_1G }, 174 + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, 175 + }, 176 + .expected_found = false, 177 + .expected_start = 0, 178 + .expected_len = SZ_2G, 179 + }, 180 + { 181 + .name = "three holes; all holes too small; second biggest", 182 + .hole_start = 0, 183 + .hole_len = 
10ULL * SZ_1G, 184 + .min_hole_size = 3ULL * SZ_1G, 185 + .pending_extents = { 186 + { .start = SZ_1G, .len = SZ_1G }, 187 + { .start = 4ULL * SZ_1G, .len = 5ULL * SZ_1G }, 188 + }, 189 + .expected_found = false, 190 + .expected_start = SZ_2G, 191 + .expected_len = SZ_2G, 192 + }, 193 + { 194 + .name = "three holes; all holes too small; third biggest", 195 + .hole_start = 0, 196 + .hole_len = 10ULL * SZ_1G, 197 + .min_hole_size = 3ULL * SZ_1G, 198 + .pending_extents = { 199 + { .start = SZ_1G, .len = SZ_1G }, 200 + { .start = 3ULL * SZ_1G, .len = 5ULL * SZ_1G }, 201 + }, 202 + .expected_found = false, 203 + .expected_start = 8ULL * SZ_1G, 204 + .expected_len = SZ_2G, 205 + }, 206 + { 207 + .name = "hole entirely allocated by pending", 208 + .hole_start = 0, 209 + .hole_len = 10ULL * SZ_1G, 210 + .min_hole_size = SZ_1G, 211 + .pending_extents = { 212 + { .start = 0, .len = 10ULL * SZ_1G }, 213 + }, 214 + .expected_found = false, 215 + .expected_start = 10ULL * SZ_1G, 216 + .expected_len = 0, 217 + }, 218 + { 219 + .name = "pending extent at end of range", 220 + .hole_start = 0, 221 + .hole_len = 10ULL * SZ_1G, 222 + .min_hole_size = SZ_1G, 223 + .pending_extents = { 224 + { .start = 9ULL * SZ_1G, .len = SZ_2G }, 225 + }, 226 + .expected_found = true, 227 + .expected_start = 0, 228 + .expected_len = 9ULL * SZ_1G, 229 + }, 230 + { 231 + .name = "zero length input", 232 + .hole_start = SZ_1G, 233 + .hole_len = 0, 234 + .min_hole_size = SZ_1G, 235 + .pending_extents = { }, 236 + .expected_found = false, 237 + .expected_start = SZ_1G, 238 + .expected_len = 0, 239 + }, 240 + }; 241 + 242 + static int test_find_hole_in_pending(u32 sectorsize, u32 nodesize) 243 + { 244 + struct btrfs_fs_info *fs_info; 245 + struct btrfs_device *device; 246 + int ret = 0; 247 + 248 + test_msg("running find_hole_in_pending_extents tests"); 249 + 250 + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); 251 + if (!fs_info) { 252 + test_std_err(TEST_ALLOC_FS_INFO); 253 + return -ENOMEM; 
254 + } 255 + 256 + device = btrfs_alloc_dummy_device(fs_info); 257 + if (IS_ERR(device)) { 258 + test_err("failed to allocate dummy device"); 259 + ret = PTR_ERR(device); 260 + goto out_free_fs_info; 261 + } 262 + device->fs_info = fs_info; 263 + 264 + for (int i = 0; i < ARRAY_SIZE(find_hole_tests); i++) { 265 + const struct pending_extent_test_case *test_case = &find_hole_tests[i]; 266 + u64 hole_start = test_case->hole_start; 267 + u64 hole_len = test_case->hole_len; 268 + bool found; 269 + 270 + for (int j = 0; j < ARRAY_SIZE(test_case->pending_extents); j++) { 271 + u64 start = test_case->pending_extents[j].start; 272 + u64 len = test_case->pending_extents[j].len; 273 + 274 + if (!len) 275 + continue; 276 + btrfs_set_extent_bit(&device->alloc_state, 277 + start, start + len - 1, 278 + CHUNK_ALLOCATED, NULL); 279 + } 280 + 281 + mutex_lock(&fs_info->chunk_mutex); 282 + found = btrfs_find_hole_in_pending_extents(device, &hole_start, &hole_len, 283 + test_case->min_hole_size); 284 + mutex_unlock(&fs_info->chunk_mutex); 285 + 286 + if (found != test_case->expected_found) { 287 + test_err("%s: expected found=%d, got found=%d", 288 + test_case->name, test_case->expected_found, found); 289 + ret = -EINVAL; 290 + goto out_clear_pending_extents; 291 + } 292 + if (hole_start != test_case->expected_start || 293 + hole_len != test_case->expected_len) { 294 + test_err("%s: expected [%llu, %llu), got [%llu, %llu)", 295 + test_case->name, test_case->expected_start, 296 + test_case->expected_start + 297 + test_case->expected_len, 298 + hole_start, hole_start + hole_len); 299 + ret = -EINVAL; 300 + goto out_clear_pending_extents; 301 + } 302 + out_clear_pending_extents: 303 + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, 304 + CHUNK_ALLOCATED, NULL); 305 + if (ret) 306 + break; 307 + } 308 + 309 + out_free_fs_info: 310 + btrfs_free_dummy_fs_info(fs_info); 311 + return ret; 312 + } 313 + 314 + /* 315 + * Describes the inputs to the system and expected results 316 + 
* when testing btrfs_first_pending_extent(). 317 + */ 318 + struct first_pending_test_case { 319 + const char *name; 320 + /* The range to look for a pending extent in. */ 321 + u64 hole_start; 322 + u64 hole_len; 323 + /* The pending extent to look for. */ 324 + struct { 325 + u64 start; 326 + u64 len; 327 + } pending_extent; 328 + /* Expected outputs. */ 329 + bool expected_found; 330 + u64 expected_pending_start; 331 + u64 expected_pending_end; 332 + }; 333 + 334 + static const struct first_pending_test_case first_pending_tests[] = { 335 + { 336 + .name = "no pending extent", 337 + .hole_start = 0, 338 + .hole_len = 10ULL * SZ_1G, 339 + .pending_extent = { 0, 0 }, 340 + .expected_found = false, 341 + }, 342 + { 343 + .name = "pending extent at search start", 344 + .hole_start = SZ_1G, 345 + .hole_len = 9ULL * SZ_1G, 346 + .pending_extent = { SZ_1G, SZ_1G }, 347 + .expected_found = true, 348 + .expected_pending_start = SZ_1G, 349 + .expected_pending_end = SZ_2G - 1, 350 + }, 351 + { 352 + .name = "pending extent overlapping search start", 353 + .hole_start = SZ_1G, 354 + .hole_len = 9ULL * SZ_1G, 355 + .pending_extent = { 0, SZ_2G }, 356 + .expected_found = true, 357 + .expected_pending_start = 0, 358 + .expected_pending_end = SZ_2G - 1, 359 + }, 360 + { 361 + .name = "pending extent inside search range", 362 + .hole_start = 0, 363 + .hole_len = 10ULL * SZ_1G, 364 + .pending_extent = { SZ_2G, SZ_1G }, 365 + .expected_found = true, 366 + .expected_pending_start = SZ_2G, 367 + .expected_pending_end = 3ULL * SZ_1G - 1, 368 + }, 369 + { 370 + .name = "pending extent outside search range", 371 + .hole_start = 0, 372 + .hole_len = SZ_1G, 373 + .pending_extent = { SZ_2G, SZ_1G }, 374 + .expected_found = false, 375 + }, 376 + { 377 + .name = "pending extent overlapping end of search range", 378 + .hole_start = 0, 379 + .hole_len = SZ_2G, 380 + .pending_extent = { SZ_1G, SZ_2G }, 381 + .expected_found = true, 382 + .expected_pending_start = SZ_1G, 383 + 
.expected_pending_end = 3ULL * SZ_1G - 1, 384 + }, 385 + }; 386 + 387 + static int test_first_pending_extent(u32 sectorsize, u32 nodesize) 388 + { 389 + struct btrfs_fs_info *fs_info; 390 + struct btrfs_device *device; 391 + int ret = 0; 392 + 393 + test_msg("running first_pending_extent tests"); 394 + 395 + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); 396 + if (!fs_info) { 397 + test_std_err(TEST_ALLOC_FS_INFO); 398 + return -ENOMEM; 399 + } 400 + 401 + device = btrfs_alloc_dummy_device(fs_info); 402 + if (IS_ERR(device)) { 403 + test_err("failed to allocate dummy device"); 404 + ret = PTR_ERR(device); 405 + goto out_free_fs_info; 406 + } 407 + 408 + device->fs_info = fs_info; 409 + 410 + for (int i = 0; i < ARRAY_SIZE(first_pending_tests); i++) { 411 + const struct first_pending_test_case *test_case = &first_pending_tests[i]; 412 + u64 start = test_case->pending_extent.start; 413 + u64 len = test_case->pending_extent.len; 414 + u64 pending_start, pending_end; 415 + bool found; 416 + 417 + if (len) { 418 + btrfs_set_extent_bit(&device->alloc_state, 419 + start, start + len - 1, 420 + CHUNK_ALLOCATED, NULL); 421 + } 422 + 423 + mutex_lock(&fs_info->chunk_mutex); 424 + found = btrfs_first_pending_extent(device, test_case->hole_start, 425 + test_case->hole_len, 426 + &pending_start, &pending_end); 427 + mutex_unlock(&fs_info->chunk_mutex); 428 + 429 + if (found != test_case->expected_found) { 430 + test_err("%s: expected found=%d, got found=%d", 431 + test_case->name, test_case->expected_found, found); 432 + ret = -EINVAL; 433 + goto out_clear_pending_extents; 434 + } 435 + if (!found) 436 + goto out_clear_pending_extents; 437 + 438 + if (pending_start != test_case->expected_pending_start || 439 + pending_end != test_case->expected_pending_end) { 440 + test_err("%s: expected pending [%llu, %llu], got [%llu, %llu]", 441 + test_case->name, 442 + test_case->expected_pending_start, 443 + test_case->expected_pending_end, 444 + pending_start, pending_end); 
445 + ret = -EINVAL; 446 + goto out_clear_pending_extents; 447 + } 448 + 449 + out_clear_pending_extents: 450 + btrfs_clear_extent_bit(&device->alloc_state, 0, (u64)-1, 451 + CHUNK_ALLOCATED, NULL); 452 + if (ret) 453 + break; 454 + } 455 + 456 + out_free_fs_info: 457 + btrfs_free_dummy_fs_info(fs_info); 458 + return ret; 459 + } 460 + 461 + int btrfs_test_chunk_allocation(u32 sectorsize, u32 nodesize) 462 + { 463 + int ret; 464 + 465 + test_msg("running chunk allocation tests"); 466 + 467 + ret = test_first_pending_extent(sectorsize, nodesize); 468 + if (ret) 469 + return ret; 470 + 471 + ret = test_find_hole_in_pending(sectorsize, nodesize); 472 + if (ret) 473 + return ret; 474 + 475 + return 0; 476 + }

+11 -5

fs/btrfs/tests/extent-map-tests.c

··· 173 173 return -ENOMEM; 174 174 } 175 175 176 - /* Add [0, 1K) */ 176 + /* 177 + * Add [0, 1K) which is inlined. And the extent map length must 178 + * be one block. 179 + */ 177 180 em->start = 0; 178 - em->len = SZ_1K; 181 + em->len = SZ_4K; 179 182 em->disk_bytenr = EXTENT_MAP_INLINE; 180 183 em->disk_num_bytes = 0; 181 184 em->ram_bytes = SZ_1K; ··· 222 219 223 220 /* Add [0, 1K) */ 224 221 em->start = 0; 225 - em->len = SZ_1K; 222 + em->len = SZ_4K; 226 223 em->disk_bytenr = EXTENT_MAP_INLINE; 227 224 em->disk_num_bytes = 0; 228 225 em->ram_bytes = SZ_1K; ··· 238 235 ret = -ENOENT; 239 236 goto out; 240 237 } 241 - if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || 238 + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_4K || 242 239 em->disk_bytenr != EXTENT_MAP_INLINE) { 243 240 test_err( 244 241 "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", ··· 1134 1131 /* 1135 1132 * Note: the fs_info is not set up completely, we only need 1136 1133 * fs_info::fsid for the tracepoint. 1134 + * 1135 + * And all the immediate numbers are based on 4K blocksize, 1136 + * thus we have to use 4K as sectorsize no matter the page size. 1137 1137 */ 1138 - fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); 1138 + fs_info = btrfs_alloc_dummy_fs_info(SZ_4K, SZ_4K); 1139 1139 if (!fs_info) { 1140 1140 test_std_err(TEST_ALLOC_FS_INFO); 1141 1141 return -ENOMEM;

+2 -2

fs/btrfs/tests/free-space-tree-tests.c

··· 49 49 if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { 50 50 if (path->slots[0] != 0) 51 51 goto invalid; 52 - end = cache->start + cache->length; 52 + end = btrfs_block_group_end(cache); 53 53 i = 0; 54 54 while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { 55 55 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ··· 216 216 int ret; 217 217 218 218 ret = __btrfs_remove_from_free_space_tree(trans, cache, path, 219 - cache->start + cache->length - alignment, 219 + btrfs_block_group_end(cache) - alignment, 220 220 alignment); 221 221 if (ret) { 222 222 test_err("could not remove free space");

+64 -62

fs/btrfs/tests/inode-tests.c

··· 81 81 * diagram of how the extents will look though this may not be possible we still 82 82 * want to make sure everything acts normally (the last number is not inclusive) 83 83 * 84 - * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] 85 - * [inline][hole but no extent][ hole ][ regular ][regular1 split] 84 + * The numbers are using 4K fs block size as an example, the real test will scale 85 + * all the extent maps (except the inlined one) according to the block size. 86 86 * 87 - * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] 88 - * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] 87 + * [ 0 - 6 ][ 6 - 4K ][ 4K - 8K ][ 8K - 12K ] 88 + * [ inline ][ implied hole ][ regular ][ regular1 split ] 89 89 * 90 - * [36867 - 45059][45059 - 53251][53251 - 57347][57347 - 61443][61443- 69635] 91 - * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1] 90 + * [ 12K - 16K ][ 16K - 24K ][ 24K - 28K ][ 28K - 32K ][ 32K - 36K ] 91 + * [ hole ][ regular1 split ][ prealloc ][ prealloc1 ][ prealloc1 written ] 92 92 * 93 - * [69635-73731][ 73731 - 86019 ][86019-90115] 94 - * [ regular ][ hole but no extent][ regular ] 93 + * [ 36K - 44K ][ 44K - 52K ][ 52K - 56K ][ 56K - 60K ][ 60K - 68 K ] 94 + * [ prealloc1 ][ compressed ][ compressed1 ][ regular ][ compressed1 ] 95 + * 96 + * [ 68K - 72K ][ 72K - 84K ][ 84K - 88K ] 97 + * [ regular ][ hole but no extent ][ regular ] 95 98 */ 96 99 static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) 97 100 { ··· 103 100 u64 offset = 0; 104 101 105 102 /* 103 + * Start 0, length 6, inlined. 104 + * 106 105 * Tree-checker has strict limits on inline extents that they can only 107 106 * exist at file offset 0, thus we can only have one inline file extent 108 107 * at most. 
··· 114 109 slot++; 115 110 offset = sectorsize; 116 111 117 - /* Now another hole */ 118 - insert_extent(root, offset, 4, 4, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, 119 - slot); 112 + /* Start 1 * blocksize, length 1 * blocksize, regular. */ 113 + insert_extent(root, offset, sectorsize, sectorsize, 0, 114 + disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); 120 115 slot++; 121 - offset += 4; 122 116 123 - /* Now for a regular extent */ 124 - insert_extent(root, offset, sectorsize - 1, sectorsize - 1, 0, 125 - disk_bytenr, sectorsize - 1, BTRFS_FILE_EXTENT_REG, 0, slot); 126 - slot++; 127 - disk_bytenr += sectorsize; 128 - offset += sectorsize - 1; 117 + /* We don't want the regular em merged with the next one. */ 118 + disk_bytenr += 2 * sectorsize; 119 + offset += sectorsize; 129 120 130 121 /* 122 + * Start 2 * blocksize, length 1 * blocksize, regular. 123 + * 131 124 * Now for 3 extents that were split from a hole punch so we test 132 125 * offsets properly. 133 126 */ ··· 133 130 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); 134 131 slot++; 135 132 offset += sectorsize; 133 + 134 + /* Start 3 * blocksize, length 1 * blocksize, regular, explicit hole. */ 136 135 insert_extent(root, offset, sectorsize, sectorsize, 0, 0, 0, 137 136 BTRFS_FILE_EXTENT_REG, 0, slot); 138 137 slot++; 139 138 offset += sectorsize; 139 + 140 + /* Start 4 * blocksize, length 2 * blocksize, regular. */ 140 141 insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 141 142 2 * sectorsize, disk_bytenr, 4 * sectorsize, 142 143 BTRFS_FILE_EXTENT_REG, 0, slot); ··· 148 141 offset += 2 * sectorsize; 149 142 disk_bytenr += 4 * sectorsize; 150 143 151 - /* Now for a unwritten prealloc extent */ 144 + /* Start 6 * blocksize, length 1 * blocksize, preallocated. 
*/ 152 145 insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, 153 146 sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); 154 147 slot++; ··· 161 154 disk_bytenr += 2 * sectorsize; 162 155 163 156 /* 157 + * Start 7 * blocksize, length 1 * blocksize, prealloc. 158 + * 164 159 * Now for a partially written prealloc extent, basically the same as 165 160 * the hole punch example above. Ram_bytes never changes when you mark 166 161 * extents written btw. ··· 171 162 4 * sectorsize, BTRFS_FILE_EXTENT_PREALLOC, 0, slot); 172 163 slot++; 173 164 offset += sectorsize; 165 + 166 + /* Start 8 * blocksize, length 1 * blocksize, regular. */ 174 167 insert_extent(root, offset, sectorsize, 4 * sectorsize, sectorsize, 175 168 disk_bytenr, 4 * sectorsize, BTRFS_FILE_EXTENT_REG, 0, 176 169 slot); 177 170 slot++; 178 171 offset += sectorsize; 172 + 173 + /* Start 9 * blocksize, length 2 * blocksize, prealloc. */ 179 174 insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 180 175 2 * sectorsize, disk_bytenr, 4 * sectorsize, 181 176 BTRFS_FILE_EXTENT_PREALLOC, 0, slot); ··· 187 174 offset += 2 * sectorsize; 188 175 disk_bytenr += 4 * sectorsize; 189 176 190 - /* Now a normal compressed extent */ 177 + /* Start 11 * blocksize, length 2 * blocksize, regular. */ 191 178 insert_extent(root, offset, 2 * sectorsize, 2 * sectorsize, 0, 192 179 disk_bytenr, sectorsize, BTRFS_FILE_EXTENT_REG, 193 180 BTRFS_COMPRESS_ZLIB, slot); ··· 196 183 /* No merges */ 197 184 disk_bytenr += 2 * sectorsize; 198 185 199 - /* Now a split compressed extent */ 186 + /* Start 13 * blocksize, length 1 * blocksize, regular. */ 200 187 insert_extent(root, offset, sectorsize, 4 * sectorsize, 0, disk_bytenr, 201 188 sectorsize, BTRFS_FILE_EXTENT_REG, 202 189 BTRFS_COMPRESS_ZLIB, slot); 203 190 slot++; 204 191 offset += sectorsize; 192 + 193 + /* Start 14 * blocksize, length 1 * blocksize, regular. 
*/ 205 194 insert_extent(root, offset, sectorsize, sectorsize, 0, 206 195 disk_bytenr + sectorsize, sectorsize, 207 196 BTRFS_FILE_EXTENT_REG, 0, slot); 208 197 slot++; 209 198 offset += sectorsize; 199 + 200 + /* Start 15 * blocksize, length 2 * blocksize, regular. */ 210 201 insert_extent(root, offset, 2 * sectorsize, 4 * sectorsize, 211 202 2 * sectorsize, disk_bytenr, sectorsize, 212 203 BTRFS_FILE_EXTENT_REG, BTRFS_COMPRESS_ZLIB, slot); ··· 218 201 offset += 2 * sectorsize; 219 202 disk_bytenr += 2 * sectorsize; 220 203 221 - /* Now extents that have a hole but no hole extent */ 204 + /* Start 17 * blocksize, length 1 * blocksize, regular. */ 222 205 insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, 223 206 sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); 224 207 slot++; 225 208 offset += 4 * sectorsize; 226 209 disk_bytenr += sectorsize; 210 + 211 + /* 212 + * Start 18 * blocksize, length 3 * blocksize, implied hole (aka no 213 + * file extent item). 214 + * 215 + * Start 21 * blocksize, length 1 * blocksize, regular. 216 + */ 227 217 insert_extent(root, offset, sectorsize, sectorsize, 0, disk_bytenr, 228 218 sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); 229 219 } ··· 337 313 * unless we have a page for it to write into. Maybe we should change 338 314 * this? 
339 315 */ 340 - offset = em->start + em->len; 341 - btrfs_free_extent_map(em); 342 - 343 - em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); 344 - if (IS_ERR(em)) { 345 - test_err("got an error when we shouldn't have"); 346 - goto out; 347 - } 348 - if (em->disk_bytenr != EXTENT_MAP_HOLE) { 349 - test_err("expected a hole, got %llu", em->disk_bytenr); 350 - goto out; 351 - } 352 - if (em->start != offset || em->len != 4) { 353 - test_err( 354 - "unexpected extent wanted start %llu len 4, got start %llu len %llu", 355 - offset, em->start, em->len); 356 - goto out; 357 - } 358 - if (em->flags != 0) { 359 - test_err("unexpected flags set, want 0 have %u", em->flags); 360 - goto out; 361 - } 362 - offset = em->start + em->len; 316 + offset = btrfs_extent_map_end(em); 363 317 btrfs_free_extent_map(em); 364 318 365 319 /* Regular extent */ ··· 350 348 test_err("expected a real extent, got %llu", em->disk_bytenr); 351 349 goto out; 352 350 } 353 - if (em->start != offset || em->len != sectorsize - 1) { 351 + if (em->start != offset || em->len != sectorsize) { 354 352 test_err( 355 - "unexpected extent wanted start %llu len 4095, got start %llu len %llu", 356 - offset, em->start, em->len); 353 + "unexpected extent wanted start %llu len %u, got start %llu len %llu", 354 + offset, sectorsize, em->start, em->len); 357 355 goto out; 358 356 } 359 357 if (em->flags != 0) { ··· 364 362 test_err("wrong offset, want 0, have %llu", em->offset); 365 363 goto out; 366 364 } 367 - offset = em->start + em->len; 365 + offset = btrfs_extent_map_end(em); 368 366 btrfs_free_extent_map(em); 369 367 370 368 /* The next 3 are split extents */ ··· 393 391 } 394 392 disk_bytenr = btrfs_extent_map_block_start(em); 395 393 orig_start = em->start; 396 - offset = em->start + em->len; 394 + offset = btrfs_extent_map_end(em); 397 395 btrfs_free_extent_map(em); 398 396 399 397 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 415 413 test_err("unexpected flags set, 
want 0 have %u", em->flags); 416 414 goto out; 417 415 } 418 - offset = em->start + em->len; 416 + offset = btrfs_extent_map_end(em); 419 417 btrfs_free_extent_map(em); 420 418 421 419 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 448 446 disk_bytenr, btrfs_extent_map_block_start(em)); 449 447 goto out; 450 448 } 451 - offset = em->start + em->len; 449 + offset = btrfs_extent_map_end(em); 452 450 btrfs_free_extent_map(em); 453 451 454 452 /* Prealloc extent */ ··· 476 474 test_err("wrong offset, want 0, have %llu", em->offset); 477 475 goto out; 478 476 } 479 - offset = em->start + em->len; 477 + offset = btrfs_extent_map_end(em); 480 478 btrfs_free_extent_map(em); 481 479 482 480 /* The next 3 are a half written prealloc extent */ ··· 506 504 } 507 505 disk_bytenr = btrfs_extent_map_block_start(em); 508 506 orig_start = em->start; 509 - offset = em->start + em->len; 507 + offset = btrfs_extent_map_end(em); 510 508 btrfs_free_extent_map(em); 511 509 512 510 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 538 536 disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); 539 537 goto out; 540 538 } 541 - offset = em->start + em->len; 539 + offset = btrfs_extent_map_end(em); 542 540 btrfs_free_extent_map(em); 543 541 544 542 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 571 569 disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); 572 570 goto out; 573 571 } 574 - offset = em->start + em->len; 572 + offset = btrfs_extent_map_end(em); 575 573 btrfs_free_extent_map(em); 576 574 577 575 /* Now for the compressed extent */ ··· 604 602 BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); 605 603 goto out; 606 604 } 607 - offset = em->start + em->len; 605 + offset = btrfs_extent_map_end(em); 608 606 btrfs_free_extent_map(em); 609 607 610 608 /* Split compressed extent */ ··· 639 637 } 640 638 disk_bytenr = btrfs_extent_map_block_start(em); 641 639 orig_start = em->start; 642 - offset = em->start 
+ em->len; 640 + offset = btrfs_extent_map_end(em); 643 641 btrfs_free_extent_map(em); 644 642 645 643 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 665 663 test_err("wrong offset, want 0, have %llu", em->offset); 666 664 goto out; 667 665 } 668 - offset = em->start + em->len; 666 + offset = btrfs_extent_map_end(em); 669 667 btrfs_free_extent_map(em); 670 668 671 669 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); ··· 699 697 BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); 700 698 goto out; 701 699 } 702 - offset = em->start + em->len; 700 + offset = btrfs_extent_map_end(em); 703 701 btrfs_free_extent_map(em); 704 702 705 703 /* A hole between regular extents but no hole extent */ ··· 726 724 test_err("wrong offset, want 0, have %llu", em->offset); 727 725 goto out; 728 726 } 729 - offset = em->start + em->len; 727 + offset = btrfs_extent_map_end(em); 730 728 btrfs_free_extent_map(em); 731 729 732 730 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); ··· 758 756 test_err("wrong offset, want 0, have %llu", em->offset); 759 757 goto out; 760 758 } 761 - offset = em->start + em->len; 759 + offset = btrfs_extent_map_end(em); 762 760 btrfs_free_extent_map(em); 763 761 764 762 em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize);

+39 -39

fs/btrfs/transaction.c

··· 950 950 951 951 if (transid) { 952 952 if (transid <= btrfs_get_last_trans_committed(fs_info)) 953 - goto out; 953 + return 0; 954 954 955 955 /* find specified transaction */ 956 956 spin_lock(&fs_info->trans_lock); ··· 975 975 if (!cur_trans) { 976 976 if (transid > btrfs_get_last_trans_committed(fs_info)) 977 977 ret = -EINVAL; 978 - goto out; 978 + return ret; 979 979 } 980 980 } else { 981 981 /* find newest transaction that is committing | committed */ ··· 991 991 } 992 992 } 993 993 spin_unlock(&fs_info->trans_lock); 994 + /* Nothing committing or committed. */ 994 995 if (!cur_trans) 995 - goto out; /* nothing committing|committed */ 996 + return ret; 996 997 } 997 998 998 999 wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); 999 1000 ret = cur_trans->aborted; 1000 1001 btrfs_put_transaction(cur_trans); 1001 - out: 1002 + 1002 1003 return ret; 1003 1004 } 1004 1005 ··· 1516 1515 1517 1516 btrfs_free_log(trans, root); 1518 1517 ret2 = btrfs_update_reloc_root(trans, root); 1519 - if (ret2) 1518 + if (unlikely(ret2)) 1520 1519 return ret2; 1521 1520 1522 1521 /* see comments in should_cow_block() */ ··· 1533 1532 ret2 = btrfs_update_root(trans, fs_info->tree_root, 1534 1533 &root->root_key, 1535 1534 &root->root_item); 1536 - if (ret2) 1535 + if (unlikely(ret2)) 1537 1536 return ret2; 1538 1537 spin_lock(&fs_info->fs_roots_radix_lock); 1539 1538 } ··· 1622 1621 goto out; 1623 1622 switch_commit_roots(trans); 1624 1623 ret = btrfs_write_and_wait_transaction(trans); 1625 - if (ret) 1626 - btrfs_handle_fs_error(fs_info, ret, 1627 - "Error while writing out transaction for qgroup"); 1624 + if (unlikely(ret)) 1625 + btrfs_err(fs_info, 1626 + "error while writing out transaction during qgroup snapshot accounting: %d", ret); 1628 1627 1629 1628 out: 1630 1629 /* ··· 1688 1687 &pending->dentry->d_name, 0, 1689 1688 &fname); 1690 1689 memalloc_nofs_restore(nofs_flags); 1691 - if (pending->error) 1690 + if (unlikely(pending->error)) 1692 1691 goto free_pending; 
1693 1692 1694 1693 pending->error = btrfs_get_free_objectid(tree_root, &objectid); 1695 - if (pending->error) 1694 + if (unlikely(pending->error)) 1696 1695 goto free_fname; 1697 1696 1698 1697 /* ··· 1708 1707 &pending->block_rsv, 1709 1708 to_reserve, 1710 1709 BTRFS_RESERVE_NO_FLUSH); 1711 - if (pending->error) 1710 + if (unlikely(pending->error)) 1712 1711 goto clear_skip_qgroup; 1713 1712 } 1714 1713 ··· 1720 1719 trans->bytes_reserved, 1); 1721 1720 parent_root = parent_inode->root; 1722 1721 ret = record_root_in_trans(trans, parent_root, 0); 1723 - if (ret) 1722 + if (unlikely(ret)) 1724 1723 goto fail; 1725 1724 cur_time = current_time(&parent_inode->vfs_inode); 1726 1725 ··· 1737 1736 dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, 1738 1737 btrfs_ino(parent_inode), 1739 1738 &fname.disk_name, 0); 1740 - if (dir_item != NULL && !IS_ERR(dir_item)) { 1739 + if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) { 1741 1740 pending->error = -EEXIST; 1742 1741 goto dir_item_existed; 1743 1742 } else if (IS_ERR(dir_item)) { ··· 1874 1873 else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) 1875 1874 ret = btrfs_qgroup_inherit(trans, btrfs_root_id(root), objectid, 1876 1875 btrfs_root_id(parent_root), pending->inherit); 1877 - if (ret < 0) 1876 + if (unlikely(ret < 0)) 1878 1877 goto fail; 1879 1878 1880 1879 ret = btrfs_insert_dir_item(trans, &fname.disk_name, ··· 1940 1939 list_for_each_entry_safe(pending, next, head, list) { 1941 1940 list_del(&pending->list); 1942 1941 ret = create_pending_snapshot(trans, pending); 1943 - if (ret) 1942 + if (unlikely(ret)) 1944 1943 break; 1945 1944 } 1946 1945 return ret; ··· 1968 1967 super->cache_generation = 0; 1969 1968 if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) 1970 1969 super->uuid_tree_generation = root_item->generation; 1970 + 1971 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) { 1972 + root_item = &fs_info->remap_root->root_item; 1973 + super->remap_root = root_item->bytenr; 
1974 + super->remap_root_generation = root_item->generation; 1975 + super->remap_root_level = root_item->level; 1976 + } 1971 1977 } 1972 1978 1973 1979 int btrfs_transaction_blocked(struct btrfs_fs_info *info) ··· 2266 2258 2267 2259 if (run_it) { 2268 2260 ret = btrfs_start_dirty_block_groups(trans); 2269 - if (ret) 2261 + if (unlikely(ret)) 2270 2262 goto lockdep_trans_commit_start_release; 2271 2263 } 2272 2264 } ··· 2316 2308 ret = READ_ONCE(prev_trans->aborted); 2317 2309 2318 2310 btrfs_put_transaction(prev_trans); 2319 - if (ret) 2311 + if (unlikely(ret)) 2320 2312 goto lockdep_release; 2321 2313 spin_lock(&fs_info->trans_lock); 2322 2314 } ··· 2346 2338 extwriter_counter_dec(cur_trans, trans->type); 2347 2339 2348 2340 ret = btrfs_start_delalloc_flush(fs_info); 2349 - if (ret) 2341 + if (unlikely(ret)) 2350 2342 goto lockdep_release; 2351 2343 2352 2344 ret = btrfs_run_delayed_items(trans); 2353 - if (ret) 2345 + if (unlikely(ret)) 2354 2346 goto lockdep_release; 2355 2347 2356 2348 /* ··· 2365 2357 2366 2358 /* some pending stuffs might be added after the previous flush. */ 2367 2359 ret = btrfs_run_delayed_items(trans); 2368 - if (ret) { 2360 + if (unlikely(ret)) { 2369 2361 btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); 2370 2362 goto cleanup_transaction; 2371 2363 } ··· 2437 2429 * core function of the snapshot creation. 2438 2430 */ 2439 2431 ret = create_pending_snapshots(trans); 2440 - if (ret) 2432 + if (unlikely(ret)) 2441 2433 goto unlock_reloc; 2442 2434 2443 2435 /* ··· 2451 2443 * the nodes and leaves. 
2452 2444 */ 2453 2445 ret = btrfs_run_delayed_items(trans); 2454 - if (ret) 2446 + if (unlikely(ret)) 2455 2447 goto unlock_reloc; 2456 2448 2457 2449 ret = btrfs_run_delayed_refs(trans, U64_MAX); 2458 - if (ret) 2450 + if (unlikely(ret)) 2459 2451 goto unlock_reloc; 2460 2452 2461 2453 /* ··· 2467 2459 WARN_ON(cur_trans != trans->transaction); 2468 2460 2469 2461 ret = commit_fs_roots(trans); 2470 - if (ret) 2462 + if (unlikely(ret)) 2471 2463 goto unlock_reloc; 2472 2464 2473 2465 /* commit_fs_roots gets rid of all the tree log roots, it is now ··· 2480 2472 * new_roots. So let's do quota accounting. 2481 2473 */ 2482 2474 ret = btrfs_qgroup_account_extents(trans); 2483 - if (ret < 0) 2475 + if (unlikely(ret < 0)) 2484 2476 goto unlock_reloc; 2485 2477 2486 2478 ret = commit_cowonly_roots(trans); 2487 - if (ret) 2479 + if (unlikely(ret)) 2488 2480 goto unlock_reloc; 2489 2481 2490 2482 /* ··· 2507 2499 fs_info->chunk_root->node); 2508 2500 list_add_tail(&fs_info->chunk_root->dirty_list, 2509 2501 &cur_trans->switch_commits); 2510 - 2511 - if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { 2512 - btrfs_set_root_node(&fs_info->block_group_root->root_item, 2513 - fs_info->block_group_root->node); 2514 - list_add_tail(&fs_info->block_group_root->dirty_list, 2515 - &cur_trans->switch_commits); 2516 - } 2517 2502 2518 2503 switch_commit_roots(trans); 2519 2504 ··· 2551 2550 wake_up_process(fs_info->cleaner_kthread); 2552 2551 2553 2552 ret = btrfs_write_and_wait_transaction(trans); 2554 - if (ret) { 2555 - btrfs_handle_fs_error(fs_info, ret, 2556 - "Error while writing out transaction"); 2553 + if (unlikely(ret)) { 2554 + btrfs_err(fs_info, "error while writing out transaction: %d", ret); 2557 2555 mutex_unlock(&fs_info->tree_log_mutex); 2558 2556 goto scrub_continue; 2559 2557 } ··· 2563 2563 * to go about their business 2564 2564 */ 2565 2565 mutex_unlock(&fs_info->tree_log_mutex); 2566 - if (ret) 2566 + if (unlikely(ret)) 2567 2567 goto scrub_continue; 2568 2568 
2569 2569 update_commit_stats(fs_info); ··· 2576 2576 btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); 2577 2577 2578 2578 ret = btrfs_finish_extent_commit(trans); 2579 - if (ret) 2579 + if (unlikely(ret)) 2580 2580 goto scrub_continue; 2581 2581 2582 2582 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))

+56 -28

fs/btrfs/tree-checker.c

··· 688 688 u64 chunk_objectid; 689 689 u64 flags; 690 690 u64 type; 691 + size_t exp_size; 691 692 692 693 /* 693 694 * Here we don't really care about alignment since extent allocator can ··· 700 699 return -EUCLEAN; 701 700 } 702 701 703 - if (unlikely(item_size != sizeof(bgi))) { 702 + if (btrfs_fs_incompat(fs_info, REMAP_TREE)) 703 + exp_size = sizeof(struct btrfs_block_group_item_v2); 704 + else 705 + exp_size = sizeof(struct btrfs_block_group_item); 706 + 707 + if (unlikely(item_size != exp_size)) { 704 708 block_group_err(leaf, slot, 705 709 "invalid item size, have %u expect %zu", 706 - item_size, sizeof(bgi)); 710 + item_size, exp_size); 707 711 return -EUCLEAN; 708 712 } 709 713 ··· 754 748 return -EUCLEAN; 755 749 } 756 750 751 + if (unlikely(flags & BTRFS_BLOCK_GROUP_METADATA_REMAP && 752 + !btrfs_fs_incompat(fs_info, REMAP_TREE))) { 753 + block_group_err(leaf, slot, 754 + "invalid flags, have 0x%llx (METADATA_REMAP flag set) but no remap-tree incompat flag", 755 + flags); 756 + return -EUCLEAN; 757 + } 758 + 757 759 type = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; 758 760 if (unlikely(type != BTRFS_BLOCK_GROUP_DATA && 759 761 type != BTRFS_BLOCK_GROUP_METADATA && 760 762 type != BTRFS_BLOCK_GROUP_SYSTEM && 763 + type != BTRFS_BLOCK_GROUP_METADATA_REMAP && 761 764 type != (BTRFS_BLOCK_GROUP_METADATA | 762 765 BTRFS_BLOCK_GROUP_DATA))) { 763 766 block_group_err(leaf, slot, 764 - "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx or 0x%llx", 767 + "invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx, 0x%llx or 0x%llx", 765 768 type, hweight64(type), 766 769 BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA, 767 - BTRFS_BLOCK_GROUP_SYSTEM, 770 + BTRFS_BLOCK_GROUP_SYSTEM, BTRFS_BLOCK_GROUP_METADATA_REMAP, 768 771 BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); 769 772 return -EUCLEAN; 770 773 } ··· 822 807 va_end(args); 823 808 } 824 809 810 + static bool valid_stripe_count(u64 profile, u16 
num_stripes, u16 sub_stripes) 811 + { 812 + switch (profile) { 813 + case BTRFS_BLOCK_GROUP_RAID0: 814 + return true; 815 + case BTRFS_BLOCK_GROUP_RAID10: 816 + return sub_stripes == btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes; 817 + case BTRFS_BLOCK_GROUP_RAID1: 818 + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1].devs_min; 819 + case BTRFS_BLOCK_GROUP_RAID1C3: 820 + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min; 821 + case BTRFS_BLOCK_GROUP_RAID1C4: 822 + return num_stripes == btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min; 823 + case BTRFS_BLOCK_GROUP_RAID5: 824 + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID5].devs_min; 825 + case BTRFS_BLOCK_GROUP_RAID6: 826 + return num_stripes >= btrfs_raid_array[BTRFS_RAID_RAID6].devs_min; 827 + case BTRFS_BLOCK_GROUP_DUP: 828 + return num_stripes == btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes; 829 + case 0: /* SINGLE */ 830 + return num_stripes == btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes; 831 + default: 832 + BUG(); 833 + } 834 + } 835 + 825 836 /* 826 837 * The common chunk check which could also work on super block sys chunk array. 
827 838 * ··· 871 830 u64 features; 872 831 u32 chunk_sector_size; 873 832 bool mixed = false; 833 + bool remapped; 874 834 int raid_index; 875 835 int nparity; 876 836 int ncopies; ··· 894 852 raid_index = btrfs_bg_flags_to_raid_index(type); 895 853 ncopies = btrfs_raid_array[raid_index].ncopies; 896 854 nparity = btrfs_raid_array[raid_index].nparity; 855 + remapped = (type & BTRFS_BLOCK_GROUP_REMAPPED); 897 856 898 - if (unlikely(!num_stripes)) { 857 + if (unlikely(!remapped && !num_stripes)) { 899 858 chunk_err(fs_info, leaf, chunk, logical, 900 859 "invalid chunk num_stripes, have %u", num_stripes); 901 860 return -EUCLEAN; 902 861 } 903 - if (unlikely(num_stripes < ncopies)) { 862 + if (unlikely(num_stripes != 0 && num_stripes < ncopies)) { 904 863 chunk_err(fs_info, leaf, chunk, logical, 905 864 "invalid chunk num_stripes < ncopies, have %u < %d", 906 865 num_stripes, ncopies); ··· 956 913 length, btrfs_stripe_nr_to_offset(U32_MAX)); 957 914 return -EUCLEAN; 958 915 } 959 - if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 960 - BTRFS_BLOCK_GROUP_PROFILE_MASK))) { 916 + if (unlikely(type & ~BTRFS_BLOCK_GROUP_VALID)) { 961 917 chunk_err(fs_info, leaf, chunk, logical, 962 918 "unrecognized chunk type: 0x%llx", 963 - ~(BTRFS_BLOCK_GROUP_TYPE_MASK | 964 - BTRFS_BLOCK_GROUP_PROFILE_MASK) & type); 919 + type & ~BTRFS_BLOCK_GROUP_VALID); 965 920 return -EUCLEAN; 966 921 } 967 922 ··· 999 958 } 1000 959 } 1001 960 1002 - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && 1003 - sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || 1004 - (type & BTRFS_BLOCK_GROUP_RAID1 && 1005 - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || 1006 - (type & BTRFS_BLOCK_GROUP_RAID1C3 && 1007 - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || 1008 - (type & BTRFS_BLOCK_GROUP_RAID1C4 && 1009 - num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || 1010 - (type & BTRFS_BLOCK_GROUP_RAID5 && 1011 - num_stripes < 
btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || 1012 - (type & BTRFS_BLOCK_GROUP_RAID6 && 1013 - num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || 1014 - (type & BTRFS_BLOCK_GROUP_DUP && 1015 - num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || 1016 - ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && 1017 - num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { 961 + if (!remapped && 962 + !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK, 963 + num_stripes, sub_stripes)) { 1018 964 chunk_err(fs_info, leaf, chunk, logical, 1019 965 "invalid num_stripes:sub_stripes %u:%u for profile %llu", 1020 966 num_stripes, sub_stripes, ··· 1025 997 struct btrfs_fs_info *fs_info = leaf->fs_info; 1026 998 int num_stripes; 1027 999 1028 - if (unlikely(btrfs_item_size(leaf, slot) < sizeof(struct btrfs_chunk))) { 1000 + if (unlikely(btrfs_item_size(leaf, slot) < offsetof(struct btrfs_chunk, stripe))) { 1029 1001 chunk_err(fs_info, leaf, chunk, key->offset, 1030 1002 "invalid chunk item size: have %u expect [%zu, %u)", 1031 1003 btrfs_item_size(leaf, slot), 1032 - sizeof(struct btrfs_chunk), 1004 + offsetof(struct btrfs_chunk, stripe), 1033 1005 BTRFS_LEAF_DATA_SIZE(fs_info)); 1034 1006 return -EUCLEAN; 1035 1007 }

+5

fs/btrfs/tree-checker.h

··· 57 57 BTRFS_TREE_BLOCK_WRITTEN_NOT_SET, 58 58 }; 59 59 60 + 61 + #define BTRFS_BLOCK_GROUP_VALID (BTRFS_BLOCK_GROUP_TYPE_MASK | \ 62 + BTRFS_BLOCK_GROUP_PROFILE_MASK | \ 63 + BTRFS_BLOCK_GROUP_REMAPPED) 64 + 60 65 /* 61 66 * Exported simply for btrfs-progs which wants to have the 62 67 * btrfs_tree_block_status return codes.

+1 -1

fs/btrfs/tree-log.c

··· 5160 5160 if (ctx->logged_before) { 5161 5161 drop_args.path = path; 5162 5162 drop_args.start = em->start; 5163 - drop_args.end = em->start + em->len; 5163 + drop_args.end = btrfs_extent_map_end(em); 5164 5164 drop_args.replace_extent = true; 5165 5165 drop_args.extent_item_size = sizeof(fi); 5166 5166 ret = btrfs_drop_extents(trans, log, inode, &drop_args);

+6 -10

fs/btrfs/uuid-tree.c

··· 207 207 208 208 /* 1 - for the uuid item */ 209 209 trans = btrfs_start_transaction(uuid_root, 1); 210 - if (IS_ERR(trans)) { 211 - ret = PTR_ERR(trans); 212 - goto out; 213 - } 210 + if (IS_ERR(trans)) 211 + return PTR_ERR(trans); 214 212 215 213 ret = btrfs_uuid_tree_remove(trans, uuid, type, subid); 216 214 btrfs_end_transaction(trans); 217 - 218 - out: 219 215 return ret; 220 216 } 221 217 ··· 231 235 232 236 if (type != BTRFS_UUID_KEY_SUBVOL && 233 237 type != BTRFS_UUID_KEY_RECEIVED_SUBVOL) 234 - goto out; 238 + return 0; 235 239 236 240 subvol_root = btrfs_get_fs_root(fs_info, subvolid, true); 237 241 if (IS_ERR(subvol_root)) { 238 242 ret = PTR_ERR(subvol_root); 239 243 if (ret == -ENOENT) 240 - ret = 1; 241 - goto out; 244 + return 1; 245 + return ret; 242 246 } 243 247 244 248 switch (type) { ··· 253 257 break; 254 258 } 255 259 btrfs_put_root(subvol_root); 256 - out: 260 + 257 261 return ret; 258 262 } 259 263

+5 -8

fs/btrfs/verity.c

··· 525 525 ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, 526 526 (const char *)&item, sizeof(item)); 527 527 if (ret) 528 - goto out; 528 + return ret; 529 529 530 530 /* Write out the descriptor itself */ 531 531 ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, 532 532 desc, desc_size); 533 533 if (ret) 534 - goto out; 534 + return ret; 535 535 536 536 /* 537 537 * 1 for updating the inode flag 538 538 * 1 for deleting the orphan 539 539 */ 540 540 trans = btrfs_start_transaction(root, 2); 541 - if (IS_ERR(trans)) { 542 - ret = PTR_ERR(trans); 543 - goto out; 544 - } 541 + if (IS_ERR(trans)) 542 + return PTR_ERR(trans); 545 543 inode->ro_flags |= BTRFS_INODE_RO_VERITY; 546 544 btrfs_sync_inode_flags_to_i_flags(inode); 547 545 ret = btrfs_update_inode(trans, inode); ··· 552 554 btrfs_set_fs_compat_ro(root->fs_info, VERITY); 553 555 end_trans: 554 556 btrfs_end_transaction(trans); 555 - out: 556 - return ret; 557 + return 0; 557 558 558 559 } 559 560

+485 -146

fs/btrfs/volumes.c

··· 231 231 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data"); 232 232 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system"); 233 233 DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata"); 234 + /* Block groups containing the remap tree. */ 235 + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA_REMAP, "metadata-remap"); 236 + /* Block group that has been remapped. */ 237 + DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_REMAPPED, "remapped"); 234 238 235 239 DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single"); 236 240 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) ··· 1173 1169 * any transaction and set the error state, guaranteeing no commits of 1174 1170 * unsafe super blocks. 1175 1171 */ 1176 - device->last_flush_error = 0; 1172 + clear_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); 1177 1173 1178 1174 /* Verify the device is back in a pristine state */ 1179 1175 WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); ··· 1509 1505 } 1510 1506 1511 1507 /* 1512 - * Try to find a chunk that intersects [start, start + len] range and when one 1513 - * such is found, record the end of it in *start 1508 + * Find the first pending extent intersecting a range. 1509 + * 1510 + * @device: the device to search 1511 + * @start: start of the range to check 1512 + * @len: length of the range to check 1513 + * @pending_start: output pointer for the start of the found pending extent 1514 + * @pending_end: output pointer for the end of the found pending extent (inclusive) 1515 + * 1516 + * Search for a pending chunk allocation that intersects the half-open range 1517 + * [start, start + len). 1518 + * 1519 + * Return: true if a pending extent was found, false otherwise. 1520 + * If the return value is true, store the first pending extent in 1521 + * [*pending_start, *pending_end]. Otherwise, the two output variables 1522 + * may still be modified, to something outside the range and should not 1523 + * be used. 
1514 1524 */ 1515 - static bool contains_pending_extent(struct btrfs_device *device, u64 *start, 1516 - u64 len) 1525 + bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, 1526 + u64 *pending_start, u64 *pending_end) 1517 1527 { 1518 - u64 physical_start, physical_end; 1519 - 1520 1528 lockdep_assert_held(&device->fs_info->chunk_mutex); 1521 1529 1522 - if (btrfs_find_first_extent_bit(&device->alloc_state, *start, 1523 - &physical_start, &physical_end, 1530 + if (btrfs_find_first_extent_bit(&device->alloc_state, start, 1531 + pending_start, pending_end, 1524 1532 CHUNK_ALLOCATED, NULL)) { 1525 1533 1526 - if (in_range(physical_start, *start, len) || 1527 - in_range(*start, physical_start, 1528 - physical_end + 1 - physical_start)) { 1529 - *start = physical_end + 1; 1534 + if (in_range(*pending_start, start, len) || 1535 + in_range(start, *pending_start, *pending_end + 1 - *pending_start)) { 1530 1536 return true; 1531 1537 } 1532 1538 } 1533 1539 return false; 1540 + } 1541 + 1542 + /* 1543 + * Find the first real hole accounting for pending extents. 1544 + * 1545 + * @device: the device containing the candidate hole 1546 + * @start: input/output pointer for the hole start position 1547 + * @len: input/output pointer for the hole length 1548 + * @min_hole_size: the size of hole we are looking for 1549 + * 1550 + * Given a potential hole specified by [*start, *start + *len), check for pending 1551 + * chunk allocations within that range. If pending extents are found, the hole is 1552 + * adjusted to represent the first true free space that is large enough when 1553 + * accounting for pending chunks. 1554 + * 1555 + * Note that this function must handle various cases involving non consecutive 1556 + * pending extents. 1557 + * 1558 + * Returns: true if a suitable hole was found and false otherwise. 1559 + * If the return value is true, then *start and *len are set to represent the hole. 
1560 + * If the return value is false, then *start is set to the largest hole we 1561 + * found and *len is set to its length. 1562 + * If there are no holes at all, then *start is set to the end of the range and 1563 + * *len is set to 0. 1564 + */ 1565 + bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, u64 *start, 1566 + u64 *len, u64 min_hole_size) 1567 + { 1568 + u64 pending_start, pending_end; 1569 + u64 end; 1570 + u64 max_hole_start = 0; 1571 + u64 max_hole_len = 0; 1572 + 1573 + lockdep_assert_held(&device->fs_info->chunk_mutex); 1574 + 1575 + if (*len == 0) 1576 + return false; 1577 + 1578 + end = *start + *len - 1; 1579 + 1580 + /* 1581 + * Loop until we either see a large enough hole or check every pending 1582 + * extent overlapping the candidate hole. 1583 + * At every hole that we observe, record it if it is the new max. 1584 + * At the end of the iteration, set the output variables to the max hole. 1585 + */ 1586 + while (true) { 1587 + if (btrfs_first_pending_extent(device, *start, *len, &pending_start, &pending_end)) { 1588 + /* 1589 + * Case 1: the pending extent overlaps the start of 1590 + * candidate hole. That means the true hole is after the 1591 + * pending extent, but we need to find the next pending 1592 + * extent to properly size the hole. In the next loop, 1593 + * we will reduce to case 2 or 3. 1594 + * e.g., 1595 + * 1596 + * |----pending A----| real hole |----pending B----| 1597 + * | candidate hole | 1598 + * *start end 1599 + */ 1600 + if (pending_start <= *start) { 1601 + *start = pending_end + 1; 1602 + goto next; 1603 + } 1604 + /* 1605 + * Case 2: The pending extent starts after *start (and overlaps 1606 + * [*start, end), so the first hole just goes up to the start 1607 + * of the pending extent. 
1608 + * e.g., 1609 + * 1610 + * | real hole |----pending A----| 1611 + * | candidate hole | 1612 + * *start end 1613 + */ 1614 + *len = pending_start - *start; 1615 + if (*len > max_hole_len) { 1616 + max_hole_start = *start; 1617 + max_hole_len = *len; 1618 + } 1619 + if (*len >= min_hole_size) 1620 + break; 1621 + /* 1622 + * If the hole wasn't big enough, then we advance past 1623 + * the pending extent and keep looking. 1624 + */ 1625 + *start = pending_end + 1; 1626 + goto next; 1627 + } else { 1628 + /* 1629 + * Case 3: There is no pending extent overlapping the 1630 + * range [*start, *start + *len - 1], so the only remaining 1631 + * hole is the remaining range. 1632 + * e.g., 1633 + * 1634 + * | candidate hole | 1635 + * | real hole | 1636 + * *start end 1637 + */ 1638 + 1639 + if (*len > max_hole_len) { 1640 + max_hole_start = *start; 1641 + max_hole_len = *len; 1642 + } 1643 + break; 1644 + } 1645 + next: 1646 + if (*start > end) 1647 + break; 1648 + *len = end - *start + 1; 1649 + } 1650 + if (max_hole_len) { 1651 + *start = max_hole_start; 1652 + *len = max_hole_len; 1653 + } else { 1654 + *start = end + 1; 1655 + *len = 0; 1656 + } 1657 + return max_hole_len >= min_hole_size; 1534 1658 } 1535 1659 1536 1660 static u64 dev_extent_search_start(struct btrfs_device *device) ··· 1725 1593 } 1726 1594 1727 1595 /* 1728 - * Check if specified hole is suitable for allocation. 
1596 + * Validate and adjust a hole for chunk allocation 1729 1597 * 1730 - * @device: the device which we have the hole 1731 - * @hole_start: starting position of the hole 1732 - * @hole_size: the size of the hole 1733 - * @num_bytes: the size of the free space that we need 1598 + * @device: the device containing the candidate hole 1599 + * @hole_start: input/output pointer for the hole start position 1600 + * @hole_size: input/output pointer for the hole size 1601 + * @num_bytes: minimum allocation size required 1734 1602 * 1735 - * This function may modify @hole_start and @hole_size to reflect the suitable 1736 - * position for allocation. Returns 1 if hole position is updated, 0 otherwise. 1603 + * Check if the specified hole is suitable for allocation and adjust it if 1604 + * necessary. The hole may be modified to skip over pending chunk allocations 1605 + * and to satisfy stricter zoned requirements on zoned filesystems. 1606 + * 1607 + * For regular (non-zoned) allocation, if the hole after adjustment is smaller 1608 + * than @num_bytes, the search continues past additional pending extents until 1609 + * either a sufficiently large hole is found or no more pending extents exist. 1610 + * 1611 + * Return: true if a suitable hole was found and false otherwise. 1612 + * If the return value is true, then *hole_start and *hole_size are set to 1613 + * represent the hole we found. 1614 + * If the return value is false, then *hole_start is set to the largest 1615 + * hole we found and *hole_size is set to its length. 1616 + * If there are no holes at all, then *hole_start is set to the end of the range 1617 + * and *hole_size is set to 0. 
1737 1618 */ 1738 1619 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, 1739 1620 u64 *hole_size, u64 num_bytes) 1740 1621 { 1741 - bool changed = false; 1742 - u64 hole_end = *hole_start + *hole_size; 1622 + bool found = false; 1623 + const u64 hole_end = *hole_start + *hole_size - 1; 1743 1624 1744 - for (;;) { 1745 - /* 1746 - * Check before we set max_hole_start, otherwise we could end up 1747 - * sending back this offset anyway. 1748 - */ 1749 - if (contains_pending_extent(device, hole_start, *hole_size)) { 1750 - if (hole_end >= *hole_start) 1751 - *hole_size = hole_end - *hole_start; 1752 - else 1753 - *hole_size = 0; 1754 - changed = true; 1755 - } 1625 + ASSERT(*hole_size > 0); 1756 1626 1757 - switch (device->fs_devices->chunk_alloc_policy) { 1758 - default: 1759 - btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); 1760 - fallthrough; 1761 - case BTRFS_CHUNK_ALLOC_REGULAR: 1762 - /* No extra check */ 1763 - break; 1764 - case BTRFS_CHUNK_ALLOC_ZONED: 1765 - if (dev_extent_hole_check_zoned(device, hole_start, 1766 - hole_size, num_bytes)) { 1767 - changed = true; 1768 - /* 1769 - * The changed hole can contain pending extent. 1770 - * Loop again to check that. 
1771 - */ 1772 - continue; 1773 - } 1774 - break; 1775 - } 1627 + again: 1628 + *hole_size = hole_end - *hole_start + 1; 1629 + found = btrfs_find_hole_in_pending_extents(device, hole_start, hole_size, num_bytes); 1630 + if (!found) 1631 + return found; 1632 + ASSERT(*hole_size >= num_bytes); 1776 1633 1634 + switch (device->fs_devices->chunk_alloc_policy) { 1635 + default: 1636 + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); 1637 + fallthrough; 1638 + case BTRFS_CHUNK_ALLOC_REGULAR: 1639 + return found; 1640 + case BTRFS_CHUNK_ALLOC_ZONED: 1641 + if (dev_extent_hole_check_zoned(device, hole_start, hole_size, num_bytes)) 1642 + goto again; 1777 1643 break; 1778 1644 } 1779 1645 1780 - return changed; 1646 + return found; 1781 1647 } 1782 1648 1783 1649 /* ··· 1834 1704 ret = -ENOMEM; 1835 1705 goto out; 1836 1706 } 1837 - again: 1707 + 1838 1708 if (search_start >= search_end || 1839 1709 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { 1840 1710 ret = -ENOSPC; ··· 1921 1791 */ 1922 1792 if (search_end > search_start) { 1923 1793 hole_size = search_end - search_start; 1924 - if (dev_extent_hole_check(device, &search_start, &hole_size, 1925 - num_bytes)) { 1926 - btrfs_release_path(path); 1927 - goto again; 1928 - } 1794 + dev_extent_hole_check(device, &search_start, &hole_size, num_bytes); 1929 1795 1930 1796 if (hole_size > max_hole_size) { 1931 1797 max_hole_start = search_start; ··· 2442 2316 free_fs_devices(cur_devices); 2443 2317 } 2444 2318 2445 - ret = btrfs_commit_transaction(trans); 2446 - 2447 - return ret; 2319 + return btrfs_commit_transaction(trans); 2448 2320 2449 2321 error_undo: 2450 2322 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { ··· 3047 2923 return ret; 3048 2924 } 3049 2925 3050 - static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, 3051 - struct btrfs_device *device) 2926 + int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device) 
3052 2927 { 3053 2928 int ret; 3054 2929 BTRFS_PATH_AUTO_FREE(path); ··· 3345 3222 return btrfs_free_chunk(trans, chunk_offset); 3346 3223 } 3347 3224 3348 - int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3225 + int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map) 3349 3226 { 3350 3227 struct btrfs_fs_info *fs_info = trans->fs_info; 3351 - struct btrfs_chunk_map *map; 3228 + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3352 3229 u64 dev_extent_len = 0; 3353 3230 int i, ret = 0; 3354 - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 3355 - 3356 - map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3357 - if (IS_ERR(map)) { 3358 - /* 3359 - * This is a logic error, but we don't want to just rely on the 3360 - * user having built with ASSERT enabled, so if ASSERT doesn't 3361 - * do anything we still error out. 3362 - */ 3363 - DEBUG_WARN("errr %ld reading chunk map at offset %llu", 3364 - PTR_ERR(map), chunk_offset); 3365 - return PTR_ERR(map); 3366 - } 3367 3231 3368 3232 /* 3369 3233 * First delete the device extent items from the devices btree. 
··· 3371 3261 if (unlikely(ret)) { 3372 3262 mutex_unlock(&fs_devices->device_list_mutex); 3373 3263 btrfs_abort_transaction(trans, ret); 3374 - goto out; 3264 + return ret; 3375 3265 } 3376 3266 3377 3267 if (device->bytes_used > 0) { ··· 3390 3280 } 3391 3281 } 3392 3282 mutex_unlock(&fs_devices->device_list_mutex); 3283 + 3284 + return 0; 3285 + } 3286 + 3287 + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) 3288 + { 3289 + struct btrfs_fs_info *fs_info = trans->fs_info; 3290 + struct btrfs_chunk_map *map; 3291 + int ret; 3292 + 3293 + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); 3294 + if (IS_ERR(map)) { 3295 + DEBUG_WARN("errr %ld reading chunk map at offset %llu", 3296 + PTR_ERR(map), chunk_offset); 3297 + return PTR_ERR(map); 3298 + } 3299 + 3300 + ret = btrfs_remove_dev_extents(trans, map); 3301 + if (ret) 3302 + goto out; 3393 3303 3394 3304 /* 3395 3305 * We acquire fs_info->chunk_mutex for 2 reasons: ··· 3506 3376 */ 3507 3377 btrfs_trans_release_chunk_metadata(trans); 3508 3378 3379 + /* On error, btrfs_remove_block_group() aborts the transaction. 
*/ 3509 3380 ret = btrfs_remove_block_group(trans, map); 3510 - if (unlikely(ret)) { 3511 - btrfs_abort_transaction(trans, ret); 3512 - goto out; 3513 - } 3381 + if (unlikely(ret)) 3382 + ASSERT(BTRFS_FS_ERROR(fs_info) != 0); 3514 3383 3515 3384 out: 3516 3385 if (trans->removing_chunk) { ··· 3521 3392 return ret; 3522 3393 } 3523 3394 3524 - int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, 3525 - bool verbose) 3395 + static int btrfs_relocate_chunk_finish(struct btrfs_fs_info *fs_info, 3396 + struct btrfs_block_group *bg) 3526 3397 { 3527 3398 struct btrfs_root *root = fs_info->chunk_root; 3528 3399 struct btrfs_trans_handle *trans; 3529 - struct btrfs_block_group *block_group; 3530 3400 u64 length; 3401 + int ret; 3402 + 3403 + btrfs_discard_cancel_work(&fs_info->discard_ctl, bg); 3404 + length = bg->length; 3405 + btrfs_put_block_group(bg); 3406 + 3407 + /* 3408 + * On a zoned file system, discard the whole block group, this will 3409 + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3410 + * resetting the zone fails, don't treat it as a fatal problem from the 3411 + * filesystem's point of view. 3412 + */ 3413 + if (btrfs_is_zoned(fs_info)) { 3414 + ret = btrfs_discard_extent(fs_info, bg->start, length, NULL, true); 3415 + if (ret) 3416 + btrfs_info(fs_info, "failed to reset zone %llu after relocation", 3417 + bg->start); 3418 + } 3419 + 3420 + trans = btrfs_start_trans_remove_block_group(root->fs_info, bg->start); 3421 + if (IS_ERR(trans)) { 3422 + ret = PTR_ERR(trans); 3423 + btrfs_handle_fs_error(root->fs_info, ret, NULL); 3424 + return ret; 3425 + } 3426 + 3427 + /* Step two, delete the device extents and the chunk tree entries. 
*/ 3428 + ret = btrfs_remove_chunk(trans, bg->start); 3429 + btrfs_end_transaction(trans); 3430 + 3431 + return ret; 3432 + } 3433 + 3434 + int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool verbose) 3435 + { 3436 + struct btrfs_block_group *block_group; 3531 3437 int ret; 3532 3438 3533 3439 if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { ··· 3602 3438 block_group = btrfs_lookup_block_group(fs_info, chunk_offset); 3603 3439 if (!block_group) 3604 3440 return -ENOENT; 3605 - btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); 3606 - length = block_group->length; 3607 - btrfs_put_block_group(block_group); 3608 3441 3609 - /* 3610 - * On a zoned file system, discard the whole block group, this will 3611 - * trigger a REQ_OP_ZONE_RESET operation on the device zone. If 3612 - * resetting the zone fails, don't treat it as a fatal problem from the 3613 - * filesystem's point of view. 3614 - */ 3615 - if (btrfs_is_zoned(fs_info)) { 3616 - ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); 3617 - if (ret) 3618 - btrfs_info(fs_info, 3619 - "failed to reset zone %llu after relocation", 3620 - chunk_offset); 3442 + if (should_relocate_using_remap_tree(block_group)) { 3443 + /* If we're relocating using the remap tree we're now done. 
*/ 3444 + btrfs_put_block_group(block_group); 3445 + ret = 0; 3446 + } else { 3447 + ret = btrfs_relocate_chunk_finish(fs_info, block_group); 3621 3448 } 3622 3449 3623 - trans = btrfs_start_trans_remove_block_group(root->fs_info, 3624 - chunk_offset); 3625 - if (IS_ERR(trans)) { 3626 - ret = PTR_ERR(trans); 3627 - btrfs_handle_fs_error(root->fs_info, ret, NULL); 3628 - return ret; 3629 - } 3630 - 3631 - /* 3632 - * step two, delete the device extents and the 3633 - * chunk tree entries 3634 - */ 3635 - ret = btrfs_remove_chunk(trans, chunk_offset); 3636 - btrfs_end_transaction(trans); 3637 3450 return ret; 3638 3451 } 3639 3452 ··· 3787 3646 struct btrfs_path *path; 3788 3647 struct extent_buffer *leaf; 3789 3648 struct btrfs_key key; 3790 - int ret, err; 3649 + int ret; 3791 3650 3792 3651 path = btrfs_alloc_path(); 3793 3652 if (!path) ··· 3822 3681 btrfs_set_balance_flags(leaf, item, bctl->flags); 3823 3682 out: 3824 3683 btrfs_free_path(path); 3825 - err = btrfs_commit_transaction(trans); 3826 - if (err && !ret) 3827 - ret = err; 3684 + if (ret == 0) 3685 + ret = btrfs_commit_transaction(trans); 3686 + else 3687 + btrfs_end_transaction(trans); 3688 + 3828 3689 return ret; 3829 3690 } 3830 3691 ··· 3836 3693 struct btrfs_trans_handle *trans; 3837 3694 struct btrfs_path *path; 3838 3695 struct btrfs_key key; 3839 - int ret, err; 3696 + int ret; 3840 3697 3841 3698 path = btrfs_alloc_path(); 3842 3699 if (!path) ··· 3863 3720 ret = btrfs_del_item(trans, root, path); 3864 3721 out: 3865 3722 btrfs_free_path(path); 3866 - err = btrfs_commit_transaction(trans); 3867 - if (err && !ret) 3868 - ret = err; 3723 + if (ret == 0) 3724 + ret = btrfs_commit_transaction(trans); 3725 + else 3726 + btrfs_end_transaction(trans); 3727 + 3869 3728 return ret; 3870 3729 } 3871 3730 ··· 4111 3966 struct btrfs_balance_args *bargs = NULL; 4112 3967 u64 chunk_type = btrfs_chunk_type(leaf, chunk); 4113 3968 3969 + /* Treat METADATA_REMAP chunks as METADATA. 
*/ 3970 + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { 3971 + chunk_type &= ~BTRFS_BLOCK_GROUP_METADATA_REMAP; 3972 + chunk_type |= BTRFS_BLOCK_GROUP_METADATA; 3973 + } 3974 + 4114 3975 /* type filter */ 4115 3976 if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & 4116 3977 (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { ··· 4198 4047 return true; 4199 4048 } 4200 4049 4050 + struct remap_chunk_info { 4051 + struct list_head list; 4052 + u64 offset; 4053 + struct btrfs_block_group *bg; 4054 + bool made_ro; 4055 + }; 4056 + 4057 + static int cow_remap_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path) 4058 + { 4059 + struct btrfs_fs_info *fs_info = trans->fs_info; 4060 + struct btrfs_key key = { 0 }; 4061 + int ret; 4062 + 4063 + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); 4064 + if (ret < 0) 4065 + return ret; 4066 + 4067 + while (true) { 4068 + ret = btrfs_next_leaf(fs_info->remap_root, path); 4069 + if (ret < 0) { 4070 + return ret; 4071 + } else if (ret > 0) { 4072 + ret = 0; 4073 + break; 4074 + } 4075 + 4076 + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 4077 + 4078 + btrfs_release_path(path); 4079 + 4080 + ret = btrfs_search_slot(trans, fs_info->remap_root, &key, path, 0, 1); 4081 + if (ret < 0) 4082 + break; 4083 + } 4084 + 4085 + return ret; 4086 + } 4087 + 4088 + static int balance_remap_chunks(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 4089 + struct list_head *chunks) 4090 + { 4091 + struct remap_chunk_info *rci, *tmp; 4092 + struct btrfs_trans_handle *trans; 4093 + int ret; 4094 + 4095 + list_for_each_entry_safe(rci, tmp, chunks, list) { 4096 + rci->bg = btrfs_lookup_block_group(fs_info, rci->offset); 4097 + if (!rci->bg) { 4098 + list_del(&rci->list); 4099 + kfree(rci); 4100 + continue; 4101 + } 4102 + 4103 + ret = btrfs_inc_block_group_ro(rci->bg, false); 4104 + if (ret) 4105 + goto end; 4106 + 4107 + rci->made_ro = true; 4108 + } 4109 + 4110 + if (list_empty(chunks)) 4111 + 
return 0; 4112 + 4113 + trans = btrfs_start_transaction(fs_info->remap_root, 0); 4114 + if (IS_ERR(trans)) { 4115 + ret = PTR_ERR(trans); 4116 + goto end; 4117 + } 4118 + 4119 + mutex_lock(&fs_info->remap_mutex); 4120 + ret = cow_remap_tree(trans, path); 4121 + mutex_unlock(&fs_info->remap_mutex); 4122 + 4123 + btrfs_release_path(path); 4124 + btrfs_commit_transaction(trans); 4125 + 4126 + end: 4127 + while (!list_empty(chunks)) { 4128 + bool is_unused; 4129 + 4130 + rci = list_first_entry(chunks, struct remap_chunk_info, list); 4131 + 4132 + spin_lock(&rci->bg->lock); 4133 + is_unused = !btrfs_is_block_group_used(rci->bg); 4134 + spin_unlock(&rci->bg->lock); 4135 + 4136 + if (is_unused) 4137 + btrfs_mark_bg_unused(rci->bg); 4138 + 4139 + if (rci->made_ro) 4140 + btrfs_dec_block_group_ro(rci->bg); 4141 + 4142 + btrfs_put_block_group(rci->bg); 4143 + 4144 + list_del(&rci->list); 4145 + kfree(rci); 4146 + } 4147 + 4148 + return ret; 4149 + } 4150 + 4201 4151 static int __btrfs_balance(struct btrfs_fs_info *fs_info) 4202 4152 { 4203 4153 struct btrfs_balance_control *bctl = fs_info->balance_ctl; ··· 4321 4069 u32 count_meta = 0; 4322 4070 u32 count_sys = 0; 4323 4071 int chunk_reserved = 0; 4072 + struct remap_chunk_info *rci; 4073 + unsigned int num_remap_chunks = 0; 4074 + LIST_HEAD(remap_chunks); 4324 4075 4325 4076 path = btrfs_alloc_path(); 4326 4077 if (!path) { ··· 4390 4135 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 4391 4136 chunk_type = btrfs_chunk_type(leaf, chunk); 4392 4137 4138 + /* Check if chunk has already been fully relocated. 
*/ 4139 + if (chunk_type & BTRFS_BLOCK_GROUP_REMAPPED && 4140 + btrfs_chunk_num_stripes(leaf, chunk) == 0) { 4141 + btrfs_release_path(path); 4142 + mutex_unlock(&fs_info->reclaim_bgs_lock); 4143 + goto loop; 4144 + } 4145 + 4393 4146 if (!counting) { 4394 4147 spin_lock(&fs_info->balance_lock); 4395 4148 bctl->stat.considered++; ··· 4422 4159 count_data++; 4423 4160 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) 4424 4161 count_sys++; 4425 - else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA) 4162 + else if (chunk_type & (BTRFS_BLOCK_GROUP_METADATA | 4163 + BTRFS_BLOCK_GROUP_METADATA_REMAP)) 4426 4164 count_meta++; 4427 4165 4428 4166 goto loop; ··· 4440 4176 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && 4441 4177 count_sys < bctl->sys.limit_min)) { 4442 4178 mutex_unlock(&fs_info->reclaim_bgs_lock); 4179 + goto loop; 4180 + } 4181 + 4182 + /* 4183 + * Balancing METADATA_REMAP chunks takes place separately - add 4184 + * the details to a list so it can be processed later. 4185 + */ 4186 + if (chunk_type & BTRFS_BLOCK_GROUP_METADATA_REMAP) { 4187 + mutex_unlock(&fs_info->reclaim_bgs_lock); 4188 + 4189 + rci = kmalloc(sizeof(struct remap_chunk_info), GFP_NOFS); 4190 + if (!rci) { 4191 + ret = -ENOMEM; 4192 + goto error; 4193 + } 4194 + 4195 + rci->offset = found_key.offset; 4196 + rci->bg = NULL; 4197 + rci->made_ro = false; 4198 + list_add_tail(&rci->list, &remap_chunks); 4199 + 4200 + num_remap_chunks++; 4201 + 4443 4202 goto loop; 4444 4203 } 4445 4204 ··· 4505 4218 key.offset = found_key.offset - 1; 4506 4219 } 4507 4220 4221 + btrfs_release_path(path); 4222 + 4508 4223 if (counting) { 4509 - btrfs_release_path(path); 4510 4224 counting = false; 4511 4225 goto again; 4226 + } 4227 + 4228 + if (!list_empty(&remap_chunks)) { 4229 + ret = balance_remap_chunks(fs_info, path, &remap_chunks); 4230 + if (ret == -ENOSPC) 4231 + enospc_errors++; 4232 + 4233 + if (!ret) { 4234 + spin_lock(&fs_info->balance_lock); 4235 + bctl->stat.completed += num_remap_chunks; 4236 + 
spin_unlock(&fs_info->balance_lock); 4237 + } 4512 4238 } 4513 4239 error: 4514 4240 if (enospc_errors) { ··· 5144 4844 u64 diff; 5145 4845 u64 start; 5146 4846 u64 free_diff = 0; 4847 + u64 pending_start, pending_end; 5147 4848 5148 4849 new_size = round_down(new_size, fs_info->sectorsize); 5149 4850 start = new_size; ··· 5190 4889 * in-memory chunks are synced to disk so that the loop below sees them 5191 4890 * and relocates them accordingly. 5192 4891 */ 5193 - if (contains_pending_extent(device, &start, diff)) { 4892 + if (btrfs_first_pending_extent(device, start, diff, &pending_start, &pending_end)) { 5194 4893 mutex_unlock(&fs_info->chunk_mutex); 5195 4894 ret = btrfs_commit_transaction(trans); 5196 4895 if (ret) ··· 5711 5410 } 5712 5411 } 5713 5412 5714 - static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) 5413 + void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) 5715 5414 { 5716 5415 for (int i = 0; i < map->num_stripes; i++) { 5717 5416 struct btrfs_io_stripe *stripe = &map->stripes[i]; ··· 5728 5427 write_lock(&fs_info->mapping_tree_lock); 5729 5428 rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); 5730 5429 RB_CLEAR_NODE(&map->rb_node); 5731 - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 5430 + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 5732 5431 write_unlock(&fs_info->mapping_tree_lock); 5733 5432 5734 5433 /* Once for the tree reference. 
*/ ··· 5764 5463 return -EEXIST; 5765 5464 } 5766 5465 chunk_map_device_set_bits(map, CHUNK_ALLOCATED); 5767 - chunk_map_device_clear_bits(map, CHUNK_TRIMMED); 5466 + btrfs_chunk_map_device_clear_bits(map, CHUNK_TRIMMED); 5768 5467 write_unlock(&fs_info->mapping_tree_lock); 5769 5468 5770 5469 return 0; ··· 6120 5819 map = rb_entry(node, struct btrfs_chunk_map, rb_node); 6121 5820 rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); 6122 5821 RB_CLEAR_NODE(&map->rb_node); 6123 - chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 5822 + btrfs_chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); 6124 5823 /* Once for the tree ref. */ 6125 5824 btrfs_free_chunk_map(map); 6126 5825 cond_resched_rwlock_write(&fs_info->mapping_tree_lock); ··· 6367 6066 */ 6368 6067 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 6369 6068 u64 logical, u64 *length_ret, 6370 - u32 *num_stripes) 6069 + u32 *num_stripes, bool do_remap) 6371 6070 { 6372 6071 struct btrfs_chunk_map *map; 6373 6072 struct btrfs_discard_stripe *stripes; ··· 6390 6089 map = btrfs_get_chunk_map(fs_info, logical, length); 6391 6090 if (IS_ERR(map)) 6392 6091 return ERR_CAST(map); 6092 + 6093 + if (do_remap && (map->type & BTRFS_BLOCK_GROUP_REMAPPED)) { 6094 + u64 new_logical = logical; 6095 + 6096 + ret = btrfs_translate_remap(fs_info, &new_logical, &length); 6097 + if (ret) 6098 + goto out_free_map; 6099 + 6100 + if (new_logical != logical) { 6101 + btrfs_free_chunk_map(map); 6102 + 6103 + map = btrfs_get_chunk_map(fs_info, new_logical, length); 6104 + if (IS_ERR(map)) 6105 + return ERR_CAST(map); 6106 + 6107 + logical = new_logical; 6108 + } 6109 + } 6393 6110 6394 6111 /* we don't discard raid56 yet */ 6395 6112 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ··· 6896 6577 if (IS_ERR(map)) 6897 6578 return PTR_ERR(map); 6898 6579 6580 + if (map->type & BTRFS_BLOCK_GROUP_REMAPPED) { 6581 + u64 new_logical = logical; 6582 + 6583 + ret = btrfs_translate_remap(fs_info, &new_logical, 
length); 6584 + if (ret) 6585 + return ret; 6586 + 6587 + if (new_logical != logical) { 6588 + btrfs_free_chunk_map(map); 6589 + 6590 + map = btrfs_get_chunk_map(fs_info, new_logical, *length); 6591 + if (IS_ERR(map)) 6592 + return PTR_ERR(map); 6593 + 6594 + logical = new_logical; 6595 + } 6596 + } 6597 + 6899 6598 num_copies = btrfs_chunk_map_num_copies(map); 6900 6599 if (io_geom.mirror_num > num_copies) 6901 6600 return -EINVAL; ··· 7378 7041 */ 7379 7042 map->sub_stripes = btrfs_raid_array[index].sub_stripes; 7380 7043 map->verified_stripes = 0; 7381 - map->stripe_size = btrfs_calc_stripe_length(map); 7044 + 7045 + if (num_stripes > 0) 7046 + map->stripe_size = btrfs_calc_stripe_length(map); 7047 + else 7048 + map->stripe_size = 0; 7049 + 7382 7050 for (i = 0; i < num_stripes; i++) { 7383 7051 map->stripes[i].physical = 7384 7052 btrfs_stripe_offset_nr(leaf, chunk, i); ··· 7509 7167 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 7510 7168 struct btrfs_device *device; 7511 7169 u64 devid; 7512 - int ret; 7513 7170 u8 fs_uuid[BTRFS_FSID_SIZE]; 7514 7171 u8 dev_uuid[BTRFS_UUID_SIZE]; 7515 7172 ··· 7608 7267 atomic64_add(device->total_bytes - device->bytes_used, 7609 7268 &fs_info->free_chunk_space); 7610 7269 } 7611 - ret = 0; 7612 - return ret; 7270 + 7271 + return 0; 7613 7272 } 7614 7273 7615 7274 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) ··· 7698 7357 7699 7358 map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); 7700 7359 /* No chunk at all? 
Return false anyway */ 7701 - if (!map) { 7702 - ret = false; 7703 - goto out; 7704 - } 7360 + if (!map) 7361 + return false; 7362 + 7705 7363 while (map) { 7706 7364 int missing = 0; 7707 7365 int max_tolerated; ··· 7714 7374 7715 7375 if (!dev || !dev->bdev || 7716 7376 test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || 7717 - dev->last_flush_error) 7377 + test_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &dev->dev_state)) 7718 7378 missing++; 7719 7379 else if (failing_dev && failing_dev == dev) 7720 7380 missing++; ··· 7725 7385 "chunk %llu missing %d devices, max tolerance is %d for writable mount", 7726 7386 map->start, missing, max_tolerated); 7727 7387 btrfs_free_chunk_map(map); 7728 - ret = false; 7729 - goto out; 7388 + return false; 7730 7389 } 7731 7390 next_start = map->start + map->chunk_len; 7732 7391 btrfs_free_chunk_map(map); 7733 7392 7734 7393 map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); 7735 7394 } 7736 - out: 7395 + 7737 7396 return ret; 7738 7397 } 7739 7398 ··· 8364 8025 if (!path) 8365 8026 return -ENOMEM; 8366 8027 8367 - path->reada = READA_FORWARD; 8028 + path->reada = READA_FORWARD_ALWAYS; 8368 8029 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 8369 8030 if (ret < 0) 8370 8031 return ret;

+27 -30

fs/btrfs/volumes.h

··· 30 30 struct btrfs_trans_handle; 31 31 struct btrfs_transaction; 32 32 struct btrfs_zoned_device_info; 33 + struct btrfs_space_info; 33 34 34 35 #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) 35 36 ··· 59 58 */ 60 59 static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < 61 60 const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); 62 - static_assert(ilog2(BTRFS_BLOCK_GROUP_RAID0) > ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); 63 61 64 62 /* ilog2() can handle both constants and variables */ 65 63 #define BTRFS_BG_FLAG_TO_INDEX(profile) \ ··· 80 80 BTRFS_NR_RAID_TYPES 81 81 }; 82 82 83 + static_assert(BTRFS_RAID_RAID0 == 1); 84 + static_assert(BTRFS_RAID_RAID1 == 2); 85 + static_assert(BTRFS_RAID_DUP == 3); 86 + static_assert(BTRFS_RAID_RAID10 == 4); 87 + static_assert(BTRFS_RAID_RAID5 == 5); 88 + static_assert(BTRFS_RAID_RAID6 == 6); 89 + static_assert(BTRFS_RAID_RAID1C3 == 7); 90 + static_assert(BTRFS_RAID_RAID1C4 == 8); 91 + 83 92 /* 84 93 * Use sequence counter to get consistent device stat data on 85 94 * 32-bit processors. ··· 108 99 #define BTRFS_DEV_STATE_REPLACE_TGT (3) 109 100 #define BTRFS_DEV_STATE_FLUSH_SENT (4) 110 101 #define BTRFS_DEV_STATE_NO_READA (5) 102 + #define BTRFS_DEV_STATE_FLUSH_FAILED (6) 111 103 112 104 /* Set when the device item is found in chunk tree, used to catch unexpected registered device. */ 113 105 #define BTRFS_DEV_STATE_ITEM_FOUND (7) ··· 135 125 136 126 struct btrfs_zoned_device_info *zone_info; 137 127 138 - /* 139 - * Device's major-minor number. Must be set even if the device is not 140 - * opened (bdev == NULL), unless the device is missing. 
141 - */ 142 - dev_t devt; 143 128 unsigned long dev_state; 144 - blk_status_t last_flush_error; 145 129 146 130 #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED 147 131 seqcount_t data_seqcount; ··· 198 194 /* Counter to record the change of device stats */ 199 195 atomic_t dev_stats_ccnt; 200 196 atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; 197 + 198 + /* 199 + * Device's major-minor number. Must be set even if the device is not 200 + * opened (bdev == NULL), unless the device is missing. 201 + */ 202 + dev_t devt; 201 203 202 204 struct extent_io_tree alloc_state; 203 205 ··· 331 321 BTRFS_NR_READ_POLICY, 332 322 }; 333 323 334 - #ifdef CONFIG_BTRFS_EXPERIMENTAL 335 - /* 336 - * Checksum mode - offload it to workqueues or do it synchronously in 337 - * btrfs_submit_chunk(). 338 - */ 339 - enum btrfs_offload_csum_mode { 340 - /* 341 - * Choose offloading checksum or do it synchronously automatically. 342 - * Do it synchronously if the checksum is fast, or offload to workqueues 343 - * otherwise. 344 - */ 345 - BTRFS_OFFLOAD_CSUM_AUTO, 346 - /* Always offload checksum to workqueues. */ 347 - BTRFS_OFFLOAD_CSUM_FORCE_ON, 348 - /* Never offload checksum to workqueues. */ 349 - BTRFS_OFFLOAD_CSUM_FORCE_OFF, 350 - }; 351 - #endif 352 - 353 324 struct btrfs_fs_devices { 354 325 u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ 355 326 ··· 457 466 458 467 /* Device to be used for reading in case of RAID1. */ 459 468 u64 read_devid; 460 - 461 - /* Checksum mode - offload it or do it synchronously. 
*/ 462 - enum btrfs_offload_csum_mode offload_csum_mode; 463 469 #endif 464 470 }; 465 471 ··· 634 646 kfree(map); 635 647 } 636 648 } 649 + DEFINE_FREE(btrfs_free_chunk_map, struct btrfs_chunk_map *, btrfs_free_chunk_map(_T)) 637 650 638 651 struct btrfs_balance_control { 639 652 struct btrfs_balance_args data; ··· 716 727 u32 length, int mirror_num); 717 728 struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, 718 729 u64 logical, u64 *length_ret, 719 - u32 *num_stripes); 730 + u32 *num_stripes, bool do_remap); 720 731 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); 721 732 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); 722 733 struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, ··· 778 789 int btrfs_nr_parity_stripes(u64 type); 779 790 int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, 780 791 struct btrfs_block_group *bg); 792 + int btrfs_remove_dev_extents(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); 781 793 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); 782 794 783 795 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS ··· 891 901 892 902 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); 893 903 const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb); 904 + int btrfs_update_device(struct btrfs_trans_handle *trans, struct btrfs_device *device); 905 + void btrfs_chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits); 906 + 907 + bool btrfs_first_pending_extent(struct btrfs_device *device, u64 start, u64 len, 908 + u64 *pending_start, u64 *pending_end); 909 + bool btrfs_find_hole_in_pending_extents(struct btrfs_device *device, 910 + u64 *start, u64 *len, u64 min_hole_size); 894 911 895 912 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 896 913 struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,

+54 -43

fs/btrfs/zlib.c

··· 145 145 return 0; 146 146 } 147 147 148 - int zlib_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 149 - u64 start, struct folio **folios, unsigned long *out_folios, 150 - unsigned long *total_in, unsigned long *total_out) 148 + int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) 151 149 { 150 + struct btrfs_inode *inode = cb->bbio.inode; 152 151 struct btrfs_fs_info *fs_info = inode->root->fs_info; 153 152 struct workspace *workspace = list_entry(ws, struct workspace, list); 154 153 struct address_space *mapping = inode->vfs_inode.i_mapping; 155 - const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; 154 + struct bio *bio = &cb->bbio.bio; 155 + u64 start = cb->start; 156 + u32 len = cb->len; 156 157 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 157 158 int ret; 158 159 char *data_in = NULL; 159 160 char *cfolio_out; 160 - int nr_folios = 0; 161 161 struct folio *in_folio = NULL; 162 162 struct folio *out_folio = NULL; 163 - unsigned long len = *total_out; 164 - unsigned long nr_dest_folios = *out_folios; 165 - const unsigned long max_out = nr_dest_folios << min_folio_shift; 166 163 const u32 blocksize = fs_info->sectorsize; 167 164 const u64 orig_end = start + len; 168 - 169 - *out_folios = 0; 170 - *total_out = 0; 171 - *total_in = 0; 172 165 173 166 ret = zlib_deflateInit(&workspace->strm, workspace->level); 174 167 if (unlikely(ret != Z_OK)) { ··· 181 188 goto out; 182 189 } 183 190 cfolio_out = folio_address(out_folio); 184 - folios[0] = out_folio; 185 - nr_folios = 1; 186 191 187 192 workspace->strm.next_in = workspace->buf; 188 193 workspace->strm.avail_in = 0; ··· 189 198 190 199 while (workspace->strm.total_in < len) { 191 200 /* 192 - * Get next input pages and copy the contents to 193 - * the workspace buffer if required. 201 + * Get next input pages and copy the contents to the workspace 202 + * buffer if required. 
194 203 */ 195 204 if (workspace->strm.avail_in == 0) { 196 205 unsigned long bytes_left = len - workspace->strm.total_in; ··· 241 250 goto out; 242 251 } 243 252 244 - /* we're making it bigger, give up */ 253 + /* We're making it bigger, give up. */ 245 254 if (workspace->strm.total_in > blocksize * 2 && 246 - workspace->strm.total_in < 247 - workspace->strm.total_out) { 255 + workspace->strm.total_in < workspace->strm.total_out) { 248 256 ret = -E2BIG; 249 257 goto out; 250 258 } 251 - /* we need another page for writing out. Test this 252 - * before the total_in so we will pull in a new page for 253 - * the stream end if required 254 - */ 259 + if (workspace->strm.total_out >= len) { 260 + ret = -E2BIG; 261 + goto out; 262 + } 263 + /* Queue the full folio and allocate a new one. */ 255 264 if (workspace->strm.avail_out == 0) { 256 - if (nr_folios == nr_dest_folios) { 265 + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { 257 266 ret = -E2BIG; 258 267 goto out; 259 268 } 269 + 260 270 out_folio = btrfs_alloc_compr_folio(fs_info); 261 271 if (out_folio == NULL) { 262 272 ret = -ENOMEM; 263 273 goto out; 264 274 } 265 275 cfolio_out = folio_address(out_folio); 266 - folios[nr_folios] = out_folio; 267 - nr_folios++; 268 276 workspace->strm.avail_out = min_folio_size; 269 277 workspace->strm.next_out = cfolio_out; 270 278 } 271 - /* we're all done */ 279 + /* We're all done. */ 272 280 if (workspace->strm.total_in >= len) 273 281 break; 274 - if (workspace->strm.total_out > max_out) 275 - break; 276 282 } 283 + 277 284 workspace->strm.avail_in = 0; 285 + 278 286 /* 279 287 * Call deflate with Z_FINISH flush parameter providing more output 280 288 * space but no more input data, until it returns with Z_STREAM_END. ··· 287 297 ret = -EIO; 288 298 goto out; 289 299 } else if (workspace->strm.avail_out == 0) { 290 - /* Get another folio for the stream end. 
*/ 291 - if (nr_folios == nr_dest_folios) { 300 + if (workspace->strm.total_out >= len) { 292 301 ret = -E2BIG; 293 302 goto out; 294 303 } 304 + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { 305 + ret = -E2BIG; 306 + goto out; 307 + } 308 + /* Get another folio for the stream end. */ 295 309 out_folio = btrfs_alloc_compr_folio(fs_info); 296 310 if (out_folio == NULL) { 297 311 ret = -ENOMEM; 298 312 goto out; 299 313 } 300 314 cfolio_out = folio_address(out_folio); 301 - folios[nr_folios] = out_folio; 302 - nr_folios++; 303 315 workspace->strm.avail_out = min_folio_size; 304 316 workspace->strm.next_out = cfolio_out; 305 317 } 306 318 } 319 + /* Queue the remaining part of the folio. */ 320 + if (workspace->strm.total_out > bio->bi_iter.bi_size) { 321 + u32 cur_len = offset_in_folio(out_folio, workspace->strm.total_out); 322 + 323 + if (!bio_add_folio(bio, out_folio, cur_len, 0)) { 324 + ret = -E2BIG; 325 + goto out; 326 + } 327 + } else { 328 + /* The last folio hasn't been utilized. 
*/ 329 + btrfs_free_compr_folio(out_folio); 330 + } 331 + out_folio = NULL; 332 + ASSERT(bio->bi_iter.bi_size == workspace->strm.total_out); 307 333 zlib_deflateEnd(&workspace->strm); 308 334 309 335 if (workspace->strm.total_out >= workspace->strm.total_in) { ··· 328 322 } 329 323 330 324 ret = 0; 331 - *total_out = workspace->strm.total_out; 332 - *total_in = workspace->strm.total_in; 333 325 out: 334 - *out_folios = nr_folios; 326 + if (out_folio) 327 + btrfs_free_compr_folio(out_folio); 335 328 if (data_in) { 336 329 kunmap_local(data_in); 337 330 folio_put(in_folio); ··· 343 338 { 344 339 struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 345 340 struct workspace *workspace = list_entry(ws, struct workspace, list); 341 + struct folio_iter fi; 346 342 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 347 343 int ret = 0, ret2; 348 344 int wbits = MAX_WBITS; 349 345 char *data_in; 350 346 size_t total_out = 0; 351 - unsigned long folio_in_index = 0; 352 347 size_t srclen = cb->compressed_len; 353 - unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); 354 348 unsigned long buf_start; 355 - struct folio **folios_in = cb->compressed_folios; 356 349 357 - data_in = kmap_local_folio(folios_in[folio_in_index], 0); 350 + bio_first_folio(&fi, &cb->bbio.bio, 0); 351 + 352 + /* We must have at least one folio here, that has the correct size. 
*/ 353 + if (unlikely(!fi.folio)) 354 + return -EINVAL; 355 + ASSERT(folio_size(fi.folio) == min_folio_size); 356 + 357 + data_in = kmap_local_folio(fi.folio, 0); 358 358 workspace->strm.next_in = data_in; 359 359 workspace->strm.avail_in = min_t(size_t, srclen, min_folio_size); 360 360 workspace->strm.total_in = 0; ··· 414 404 if (workspace->strm.avail_in == 0) { 415 405 unsigned long tmp; 416 406 kunmap_local(data_in); 417 - folio_in_index++; 418 - if (folio_in_index >= total_folios_in) { 407 + bio_next_folio(&fi, &cb->bbio.bio); 408 + if (!fi.folio) { 419 409 data_in = NULL; 420 410 break; 421 411 } 422 - data_in = kmap_local_folio(folios_in[folio_in_index], 0); 412 + ASSERT(folio_size(fi.folio) == min_folio_size); 413 + data_in = kmap_local_folio(fi.folio, 0); 423 414 workspace->strm.next_in = data_in; 424 415 tmp = srclen - workspace->strm.total_in; 425 416 workspace->strm.avail_in = min(tmp, min_folio_size);

+328 -70

fs/btrfs/zoned.c

··· 1231 1231 BTRFS_PATH_AUTO_FREE(path); 1232 1232 struct btrfs_key key; 1233 1233 struct btrfs_key found_key; 1234 + const u64 bg_end = btrfs_block_group_end(cache); 1234 1235 int ret; 1235 1236 u64 length; 1236 1237 ··· 1254 1253 if (!path) 1255 1254 return -ENOMEM; 1256 1255 1257 - key.objectid = cache->start + cache->length; 1256 + key.objectid = bg_end; 1258 1257 key.type = 0; 1259 1258 key.offset = 0; 1260 1259 ··· 1283 1282 length = fs_info->nodesize; 1284 1283 1285 1284 if (unlikely(!(found_key.objectid >= cache->start && 1286 - found_key.objectid + length <= cache->start + cache->length))) { 1285 + found_key.objectid + length <= bg_end))) { 1287 1286 return -EUCLEAN; 1288 1287 } 1289 1288 *offset_ret = found_key.objectid + length - cache->start; ··· 1438 1437 bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); 1439 1438 1440 1439 if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) { 1441 - btrfs_err(bg->fs_info, 1440 + btrfs_err(fs_info, 1442 1441 "zoned: cannot recover write pointer for zone %llu", 1443 1442 zone_info[0].physical); 1444 1443 return -EIO; 1445 1444 } 1446 1445 if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) { 1447 - btrfs_err(bg->fs_info, 1446 + btrfs_err(fs_info, 1448 1447 "zoned: cannot recover write pointer for zone %llu", 1449 1448 zone_info[1].physical); 1450 1449 return -EIO; 1450 + } 1451 + 1452 + /* 1453 + * When the last extent is removed, last_alloc can be smaller than the other write 1454 + * pointer. In that case, last_alloc should be moved to the corresponding write 1455 + * pointer position. 
1456 + */ 1457 + for (int i = 0; i < map->num_stripes; i++) { 1458 + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) 1459 + continue; 1460 + if (last_alloc <= zone_info[i].alloc_offset) { 1461 + last_alloc = zone_info[i].alloc_offset; 1462 + break; 1463 + } 1451 1464 } 1452 1465 1453 1466 if (zone_info[0].alloc_offset == WP_CONVENTIONAL) ··· 1471 1456 zone_info[1].alloc_offset = last_alloc; 1472 1457 1473 1458 if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) { 1474 - btrfs_err(bg->fs_info, 1459 + btrfs_err(fs_info, 1475 1460 "zoned: write pointer offset mismatch of zones in DUP profile"); 1476 1461 return -EIO; 1477 1462 } ··· 1504 1489 1505 1490 /* In case a device is missing we have a cap of 0, so don't use it. */ 1506 1491 bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity); 1492 + 1493 + /* 1494 + * When the last extent is removed, last_alloc can be smaller than the other write 1495 + * pointer. In that case, last_alloc should be moved to the corresponding write 1496 + * pointer position. 
1497 + */ 1498 + for (i = 0; i < map->num_stripes; i++) { 1499 + if (zone_info[i].alloc_offset == WP_MISSING_DEV || 1500 + zone_info[i].alloc_offset == WP_CONVENTIONAL) 1501 + continue; 1502 + if (last_alloc <= zone_info[i].alloc_offset) { 1503 + last_alloc = zone_info[i].alloc_offset; 1504 + break; 1505 + } 1506 + } 1507 1507 1508 1508 for (i = 0; i < map->num_stripes; i++) { 1509 1509 if (zone_info[i].alloc_offset == WP_MISSING_DEV) ··· 1561 1531 { 1562 1532 struct btrfs_fs_info *fs_info = bg->fs_info; 1563 1533 u64 stripe_nr = 0, stripe_offset = 0; 1534 + u64 prev_offset = 0; 1564 1535 u32 stripe_index = 0; 1536 + bool has_partial = false, has_conventional = false; 1565 1537 1566 1538 if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { 1567 1539 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", 1568 1540 btrfs_bg_type_to_raid_name(map->type)); 1569 1541 return -EINVAL; 1570 1542 } 1543 + 1544 + /* 1545 + * When the last extent is removed, last_alloc can be smaller than the other write 1546 + * pointer. In that case, last_alloc should be moved to the corresponding write 1547 + * pointer position. 1548 + */ 1549 + for (int i = 0; i < map->num_stripes; i++) { 1550 + u64 alloc; 1551 + 1552 + if (zone_info[i].alloc_offset == WP_MISSING_DEV || 1553 + zone_info[i].alloc_offset == WP_CONVENTIONAL) 1554 + continue; 1555 + 1556 + stripe_nr = zone_info[i].alloc_offset >> BTRFS_STRIPE_LEN_SHIFT; 1557 + stripe_offset = zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK; 1558 + if (stripe_offset == 0 && stripe_nr > 0) { 1559 + stripe_nr--; 1560 + stripe_offset = BTRFS_STRIPE_LEN; 1561 + } 1562 + alloc = ((stripe_nr * map->num_stripes + i) << BTRFS_STRIPE_LEN_SHIFT) + 1563 + stripe_offset; 1564 + last_alloc = max(last_alloc, alloc); 1565 + 1566 + /* Partially written stripe found. It should be last. 
*/ 1567 + if (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) 1568 + break; 1569 + } 1570 + stripe_nr = 0; 1571 + stripe_offset = 0; 1571 1572 1572 1573 if (last_alloc) { 1573 1574 u32 factor = map->num_stripes; ··· 1613 1552 continue; 1614 1553 1615 1554 if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { 1616 - 1555 + has_conventional = true; 1617 1556 zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); 1618 1557 1619 1558 if (stripe_index > i) ··· 1621 1560 else if (stripe_index == i) 1622 1561 zone_info[i].alloc_offset += stripe_offset; 1623 1562 } 1563 + 1564 + /* Verification */ 1565 + if (i != 0) { 1566 + if (unlikely(prev_offset < zone_info[i].alloc_offset)) { 1567 + btrfs_err(fs_info, 1568 + "zoned: stripe position disorder found in block group %llu", 1569 + bg->start); 1570 + return -EIO; 1571 + } 1572 + 1573 + if (unlikely(has_partial && 1574 + (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK))) { 1575 + btrfs_err(fs_info, 1576 + "zoned: multiple partial written stripe found in block group %llu", 1577 + bg->start); 1578 + return -EIO; 1579 + } 1580 + } 1581 + prev_offset = zone_info[i].alloc_offset; 1582 + 1583 + if ((zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) != 0) 1584 + has_partial = true; 1624 1585 1625 1586 if (test_bit(0, active) != test_bit(i, active)) { 1626 1587 if (unlikely(!btrfs_zone_activate(bg))) ··· 1655 1572 bg->alloc_offset += zone_info[i].alloc_offset; 1656 1573 } 1657 1574 1575 + /* Check if all devices stay in the same stripe row. 
*/ 1576 + if (unlikely(zone_info[0].alloc_offset - 1577 + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { 1578 + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", bg->start); 1579 + return -EIO; 1580 + } 1581 + 1582 + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { 1583 + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", 1584 + bg->alloc_offset, last_alloc); 1585 + return -EIO; 1586 + } 1587 + 1658 1588 return 0; 1659 1589 } 1660 1590 ··· 1678 1582 u64 last_alloc) 1679 1583 { 1680 1584 struct btrfs_fs_info *fs_info = bg->fs_info; 1585 + u64 AUTO_KFREE(raid0_allocs); 1681 1586 u64 stripe_nr = 0, stripe_offset = 0; 1682 1587 u32 stripe_index = 0; 1588 + bool has_partial = false, has_conventional = false; 1589 + u64 prev_offset = 0; 1683 1590 1684 1591 if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { 1685 1592 btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", 1686 1593 btrfs_bg_type_to_raid_name(map->type)); 1687 1594 return -EINVAL; 1688 1595 } 1596 + 1597 + raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs), 1598 + GFP_NOFS); 1599 + if (!raid0_allocs) 1600 + return -ENOMEM; 1601 + 1602 + /* 1603 + * When the last extent is removed, last_alloc can be smaller than the other write 1604 + * pointer. In that case, last_alloc should be moved to the corresponding write 1605 + * pointer position. 
1606 + */ 1607 + for (int i = 0; i < map->num_stripes; i += map->sub_stripes) { 1608 + u64 alloc = zone_info[i].alloc_offset; 1609 + 1610 + for (int j = 1; j < map->sub_stripes; j++) { 1611 + int idx = i + j; 1612 + 1613 + if (zone_info[idx].alloc_offset == WP_MISSING_DEV || 1614 + zone_info[idx].alloc_offset == WP_CONVENTIONAL) 1615 + continue; 1616 + if (alloc == WP_MISSING_DEV || alloc == WP_CONVENTIONAL) { 1617 + alloc = zone_info[idx].alloc_offset; 1618 + } else if (unlikely(zone_info[idx].alloc_offset != alloc)) { 1619 + btrfs_err(fs_info, 1620 + "zoned: write pointer mismatch found in block group %llu", 1621 + bg->start); 1622 + return -EIO; 1623 + } 1624 + } 1625 + 1626 + raid0_allocs[i / map->sub_stripes] = alloc; 1627 + if (alloc == WP_CONVENTIONAL) 1628 + continue; 1629 + if (unlikely(alloc == WP_MISSING_DEV)) { 1630 + btrfs_err(fs_info, 1631 + "zoned: cannot recover write pointer of block group %llu due to missing device", 1632 + bg->start); 1633 + return -EIO; 1634 + } 1635 + 1636 + stripe_nr = alloc >> BTRFS_STRIPE_LEN_SHIFT; 1637 + stripe_offset = alloc & BTRFS_STRIPE_LEN_MASK; 1638 + if (stripe_offset == 0 && stripe_nr > 0) { 1639 + stripe_nr--; 1640 + stripe_offset = BTRFS_STRIPE_LEN; 1641 + } 1642 + 1643 + alloc = ((stripe_nr * (map->num_stripes / map->sub_stripes) + 1644 + (i / map->sub_stripes)) << 1645 + BTRFS_STRIPE_LEN_SHIFT) + stripe_offset; 1646 + last_alloc = max(last_alloc, alloc); 1647 + } 1648 + stripe_nr = 0; 1649 + stripe_offset = 0; 1689 1650 1690 1651 if (last_alloc) { 1691 1652 u32 factor = map->num_stripes / map->sub_stripes; ··· 1753 1600 } 1754 1601 1755 1602 for (int i = 0; i < map->num_stripes; i++) { 1756 - if (zone_info[i].alloc_offset == WP_MISSING_DEV) 1757 - continue; 1603 + int idx = i / map->sub_stripes; 1604 + 1605 + if (raid0_allocs[idx] == WP_CONVENTIONAL) { 1606 + has_conventional = true; 1607 + raid0_allocs[idx] = btrfs_stripe_nr_to_offset(stripe_nr); 1608 + 1609 + if (stripe_index > idx) 1610 + raid0_allocs[idx] 
+= BTRFS_STRIPE_LEN; 1611 + else if (stripe_index == idx) 1612 + raid0_allocs[idx] += stripe_offset; 1613 + } 1614 + 1615 + if ((i % map->sub_stripes) == 0) { 1616 + /* Verification */ 1617 + if (i != 0) { 1618 + if (unlikely(prev_offset < raid0_allocs[idx])) { 1619 + btrfs_err(fs_info, 1620 + "zoned: stripe position disorder found in block group %llu", 1621 + bg->start); 1622 + return -EIO; 1623 + } 1624 + 1625 + if (unlikely(has_partial && 1626 + (raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK))) { 1627 + btrfs_err(fs_info, 1628 + "zoned: multiple partial written stripe found in block group %llu", 1629 + bg->start); 1630 + return -EIO; 1631 + } 1632 + } 1633 + prev_offset = raid0_allocs[idx]; 1634 + 1635 + if ((raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK) != 0) 1636 + has_partial = true; 1637 + } 1638 + 1639 + if (zone_info[i].alloc_offset == WP_MISSING_DEV || 1640 + zone_info[i].alloc_offset == WP_CONVENTIONAL) 1641 + zone_info[i].alloc_offset = raid0_allocs[idx]; 1758 1642 1759 1643 if (test_bit(0, active) != test_bit(i, active)) { 1760 1644 if (unlikely(!btrfs_zone_activate(bg))) 1761 1645 return -EIO; 1762 - } else { 1763 - if (test_bit(0, active)) 1764 - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); 1765 - } 1766 - 1767 - if (zone_info[i].alloc_offset == WP_CONVENTIONAL) { 1768 - zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr); 1769 - 1770 - if (stripe_index > (i / map->sub_stripes)) 1771 - zone_info[i].alloc_offset += BTRFS_STRIPE_LEN; 1772 - else if (stripe_index == (i / map->sub_stripes)) 1773 - zone_info[i].alloc_offset += stripe_offset; 1646 + } else if (test_bit(0, active)) { 1647 + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); 1774 1648 } 1775 1649 1776 1650 if ((i % map->sub_stripes) == 0) { ··· 1806 1626 } 1807 1627 } 1808 1628 1629 + /* Check if all devices stay in the same stripe row. 
*/ 1630 + if (unlikely(zone_info[0].alloc_offset - 1631 + zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) { 1632 + btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", 1633 + bg->start); 1634 + return -EIO; 1635 + } 1636 + 1637 + if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) { 1638 + btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu", 1639 + bg->alloc_offset, last_alloc); 1640 + return -EIO; 1641 + } 1642 + 1809 1643 return 0; 1644 + } 1645 + 1646 + EXPORT_FOR_TESTS 1647 + int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, 1648 + struct btrfs_chunk_map *map, 1649 + struct zone_info *zone_info, 1650 + unsigned long *active, u64 last_alloc) 1651 + { 1652 + struct btrfs_fs_info *fs_info = bg->fs_info; 1653 + u64 profile; 1654 + int ret; 1655 + 1656 + profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 1657 + switch (profile) { 1658 + case 0: /* single */ 1659 + ret = btrfs_load_block_group_single(bg, &zone_info[0], active); 1660 + break; 1661 + case BTRFS_BLOCK_GROUP_DUP: 1662 + ret = btrfs_load_block_group_dup(bg, map, zone_info, active, last_alloc); 1663 + break; 1664 + case BTRFS_BLOCK_GROUP_RAID1: 1665 + case BTRFS_BLOCK_GROUP_RAID1C3: 1666 + case BTRFS_BLOCK_GROUP_RAID1C4: 1667 + ret = btrfs_load_block_group_raid1(bg, map, zone_info, active, last_alloc); 1668 + break; 1669 + case BTRFS_BLOCK_GROUP_RAID0: 1670 + ret = btrfs_load_block_group_raid0(bg, map, zone_info, active, last_alloc); 1671 + break; 1672 + case BTRFS_BLOCK_GROUP_RAID10: 1673 + ret = btrfs_load_block_group_raid10(bg, map, zone_info, active, last_alloc); 1674 + break; 1675 + case BTRFS_BLOCK_GROUP_RAID5: 1676 + case BTRFS_BLOCK_GROUP_RAID6: 1677 + default: 1678 + btrfs_err(fs_info, "zoned: profile %s not yet supported", 1679 + btrfs_bg_type_to_raid_name(map->type)); 1680 + return -EINVAL; 1681 + } 1682 + 1683 + if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && 1684 + 
profile != BTRFS_BLOCK_GROUP_RAID10) { 1685 + /* 1686 + * Detected broken write pointer. Make this block group 1687 + * unallocatable by setting the allocation pointer at the end of 1688 + * allocatable region. Relocating this block group will fix the 1689 + * mismatch. 1690 + * 1691 + * Currently, we cannot handle RAID0 or RAID10 case like this 1692 + * because we don't have a proper zone_capacity value. But, 1693 + * reading from this block group won't work anyway by a missing 1694 + * stripe. 1695 + */ 1696 + bg->alloc_offset = bg->zone_capacity; 1697 + } 1698 + 1699 + return ret; 1810 1700 } 1811 1701 1812 1702 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) ··· 1891 1641 unsigned long *active = NULL; 1892 1642 u64 last_alloc = 0; 1893 1643 u32 num_sequential = 0, num_conventional = 0; 1894 - u64 profile; 1895 1644 1896 1645 if (!btrfs_is_zoned(fs_info)) 1897 1646 return 0; ··· 1950 1701 } 1951 1702 } 1952 1703 1953 - profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; 1954 - switch (profile) { 1955 - case 0: /* single */ 1956 - ret = btrfs_load_block_group_single(cache, &zone_info[0], active); 1957 - break; 1958 - case BTRFS_BLOCK_GROUP_DUP: 1959 - ret = btrfs_load_block_group_dup(cache, map, zone_info, active, 1960 - last_alloc); 1961 - break; 1962 - case BTRFS_BLOCK_GROUP_RAID1: 1963 - case BTRFS_BLOCK_GROUP_RAID1C3: 1964 - case BTRFS_BLOCK_GROUP_RAID1C4: 1965 - ret = btrfs_load_block_group_raid1(cache, map, zone_info, 1966 - active, last_alloc); 1967 - break; 1968 - case BTRFS_BLOCK_GROUP_RAID0: 1969 - ret = btrfs_load_block_group_raid0(cache, map, zone_info, 1970 - active, last_alloc); 1971 - break; 1972 - case BTRFS_BLOCK_GROUP_RAID10: 1973 - ret = btrfs_load_block_group_raid10(cache, map, zone_info, 1974 - active, last_alloc); 1975 - break; 1976 - case BTRFS_BLOCK_GROUP_RAID5: 1977 - case BTRFS_BLOCK_GROUP_RAID6: 1978 - default: 1979 - btrfs_err(fs_info, "zoned: profile %s not yet supported", 1980 - 
btrfs_bg_type_to_raid_name(map->type)); 1981 - ret = -EINVAL; 1982 - goto out; 1983 - } 1984 - 1985 - if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 && 1986 - profile != BTRFS_BLOCK_GROUP_RAID10) { 1987 - /* 1988 - * Detected broken write pointer. Make this block group 1989 - * unallocatable by setting the allocation pointer at the end of 1990 - * allocatable region. Relocating this block group will fix the 1991 - * mismatch. 1992 - * 1993 - * Currently, we cannot handle RAID0 or RAID10 case like this 1994 - * because we don't have a proper zone_capacity value. But, 1995 - * reading from this block group won't work anyway by a missing 1996 - * stripe. 1997 - */ 1998 - cache->alloc_offset = cache->zone_capacity; 1999 - } 1704 + ret = btrfs_load_block_group_by_raid_type(cache, map, zone_info, active, last_alloc); 2000 1705 2001 1706 out: 2002 1707 /* Reject non SINGLE data profiles without RST */ ··· 2231 2028 2232 2029 if (block_group) { 2233 2030 if (block_group->start > eb->start || 2234 - block_group->start + block_group->length <= eb->start) { 2031 + btrfs_block_group_end(block_group) <= eb->start) { 2235 2032 btrfs_put_block_group(block_group); 2236 2033 block_group = NULL; 2237 2034 ctx->zoned_bg = NULL; ··· 2451 2248 static void wait_eb_writebacks(struct btrfs_block_group *block_group) 2452 2249 { 2453 2250 struct btrfs_fs_info *fs_info = block_group->fs_info; 2454 - const u64 end = block_group->start + block_group->length; 2251 + const u64 end = btrfs_block_group_end(block_group); 2455 2252 struct extent_buffer *eb; 2456 2253 unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); 2457 2254 ··· 3186 2983 } 3187 2984 3188 2985 return 0; 2986 + } 2987 + 2988 + void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) 2989 + { 2990 + struct btrfs_block_group *bg; 2991 + u64 data_reloc_bg; 2992 + u64 treelog_bg; 2993 + 2994 + seq_puts(seq, "\n zoned statistics:\n"); 2995 + 2996 + 
spin_lock(&fs_info->zone_active_bgs_lock); 2997 + seq_printf(seq, "\tactive block-groups: %zu\n", 2998 + list_count_nodes(&fs_info->zone_active_bgs)); 2999 + spin_unlock(&fs_info->zone_active_bgs_lock); 3000 + 3001 + spin_lock(&fs_info->unused_bgs_lock); 3002 + seq_printf(seq, "\t reclaimable: %zu\n", 3003 + list_count_nodes(&fs_info->reclaim_bgs)); 3004 + seq_printf(seq, "\t unused: %zu\n", list_count_nodes(&fs_info->unused_bgs)); 3005 + spin_unlock(&fs_info->unused_bgs_lock); 3006 + 3007 + seq_printf(seq,"\t need reclaim: %s\n", 3008 + str_true_false(btrfs_zoned_should_reclaim(fs_info))); 3009 + 3010 + data_reloc_bg = data_race(fs_info->data_reloc_bg); 3011 + if (data_reloc_bg) 3012 + seq_printf(seq, "\tdata relocation block-group: %llu\n", 3013 + data_reloc_bg); 3014 + treelog_bg = data_race(fs_info->treelog_bg); 3015 + if (treelog_bg) 3016 + seq_printf(seq, "\ttree-log block-group: %llu\n", treelog_bg); 3017 + 3018 + spin_lock(&fs_info->zone_active_bgs_lock); 3019 + seq_puts(seq, "\tactive zones:\n"); 3020 + list_for_each_entry(bg, &fs_info->zone_active_bgs, active_bg_list) { 3021 + u64 start; 3022 + u64 alloc_offset; 3023 + u64 used; 3024 + u64 reserved; 3025 + u64 zone_unusable; 3026 + const char *typestr = btrfs_space_info_type_str(bg->space_info); 3027 + 3028 + spin_lock(&bg->lock); 3029 + start = bg->start; 3030 + alloc_offset = bg->alloc_offset; 3031 + used = bg->used; 3032 + reserved = bg->reserved; 3033 + zone_unusable = bg->zone_unusable; 3034 + spin_unlock(&bg->lock); 3035 + 3036 + seq_printf(seq, 3037 + "\t start: %llu, wp: %llu used: %llu, reserved: %llu, unusable: %llu (%s)\n", 3038 + start, alloc_offset, used, reserved, zone_unusable, typestr); 3039 + } 3040 + spin_unlock(&fs_info->zone_active_bgs_lock); 3189 3041 }

+17

fs/btrfs/zoned.h

··· 10 10 #include <linux/errno.h> 11 11 #include <linux/spinlock.h> 12 12 #include <linux/mutex.h> 13 + #include <linux/seq_file.h> 13 14 #include "messages.h" 14 15 #include "volumes.h" 15 16 #include "disk-io.h" ··· 97 96 int btrfs_zoned_activate_one_bg(struct btrfs_space_info *space_info, bool do_finish); 98 97 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); 99 98 int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes); 99 + void btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq); 100 + 101 + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 102 + struct zone_info; 103 + 104 + int btrfs_load_block_group_by_raid_type(struct btrfs_block_group *bg, 105 + struct btrfs_chunk_map *map, 106 + struct zone_info *zone_info, 107 + unsigned long *active, u64 last_alloc); 108 + #endif 109 + 100 110 #else /* CONFIG_BLK_DEV_ZONED */ 101 111 102 112 static inline int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) ··· 283 271 284 272 static inline int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, 285 273 u64 num_bytes) 274 + { 275 + return 0; 276 + } 277 + 278 + static inline int btrfs_show_zoned_stats(struct btrfs_fs_info *fs_info, struct seq_file *seq) 286 279 { 287 280 return 0; 288 281 }

+71 -68

fs/btrfs/zstd.c

··· 396 396 return ERR_PTR(-ENOMEM); 397 397 } 398 398 399 - int zstd_compress_folios(struct list_head *ws, struct btrfs_inode *inode, 400 - u64 start, struct folio **folios, unsigned long *out_folios, 401 - unsigned long *total_in, unsigned long *total_out) 399 + int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) 402 400 { 401 + struct btrfs_inode *inode = cb->bbio.inode; 403 402 struct btrfs_fs_info *fs_info = inode->root->fs_info; 404 403 struct workspace *workspace = list_entry(ws, struct workspace, list); 405 404 struct address_space *mapping = inode->vfs_inode.i_mapping; 405 + struct bio *bio = &cb->bbio.bio; 406 406 zstd_cstream *stream; 407 407 int ret = 0; 408 - int nr_folios = 0; 409 - struct folio *in_folio = NULL; /* The current folio to read. */ 410 - struct folio *out_folio = NULL; /* The current folio to write to. */ 408 + /* The current folio to read. */ 409 + struct folio *in_folio = NULL; 410 + /* The current folio to write to. */ 411 + struct folio *out_folio = NULL; 411 412 unsigned long tot_in = 0; 412 413 unsigned long tot_out = 0; 413 - unsigned long len = *total_out; 414 - const unsigned long nr_dest_folios = *out_folios; 415 - const u64 orig_end = start + len; 414 + const u64 start = cb->start; 415 + const u32 len = cb->len; 416 + const u64 end = start + len; 416 417 const u32 blocksize = fs_info->sectorsize; 417 418 const u32 min_folio_size = btrfs_min_folio_size(fs_info); 418 - unsigned long max_out = nr_dest_folios * min_folio_size; 419 - unsigned int cur_len; 420 419 421 420 workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); 422 - *out_folios = 0; 423 - *total_out = 0; 424 - *total_in = 0; 425 421 426 - /* Initialize the stream */ 427 - stream = zstd_init_cstream(&workspace->params, len, workspace->mem, 428 - workspace->size); 422 + /* Initialize the stream. 
*/ 423 + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); 429 424 if (unlikely(!stream)) { 430 425 btrfs_err(fs_info, 431 426 "zstd compression init level %d failed, root %llu inode %llu offset %llu", ··· 430 435 goto out; 431 436 } 432 437 433 - /* map in the first page of input data */ 438 + /* Map in the first page of input data. */ 434 439 ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); 435 440 if (ret < 0) 436 441 goto out; 437 - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); 438 442 workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); 439 443 workspace->in_buf.pos = 0; 440 - workspace->in_buf.size = cur_len; 444 + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); 441 445 442 - /* Allocate and map in the output buffer */ 446 + /* Allocate and map in the output buffer. */ 443 447 out_folio = btrfs_alloc_compr_folio(fs_info); 444 448 if (out_folio == NULL) { 445 449 ret = -ENOMEM; 446 450 goto out; 447 451 } 448 - folios[nr_folios++] = out_folio; 449 452 workspace->out_buf.dst = folio_address(out_folio); 450 453 workspace->out_buf.pos = 0; 451 - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); 454 + workspace->out_buf.size = min_folio_size; 452 455 453 456 while (1) { 454 457 size_t ret2; 455 458 456 - ret2 = zstd_compress_stream(stream, &workspace->out_buf, 457 - &workspace->in_buf); 459 + ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); 458 460 if (unlikely(zstd_is_error(ret2))) { 459 461 btrfs_warn(fs_info, 460 462 "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", 461 463 workspace->req_level, zstd_get_error_code(ret2), 462 464 btrfs_root_id(inode->root), btrfs_ino(inode), 463 - start); 465 + start + tot_in); 464 466 ret = -EIO; 465 467 goto out; 466 468 } 467 469 468 - /* Check to see if we are making it bigger */ 470 + /* Check to see if we are making it 
bigger. */ 469 471 if (tot_in + workspace->in_buf.pos > blocksize * 2 && 470 - tot_in + workspace->in_buf.pos < 471 - tot_out + workspace->out_buf.pos) { 472 + tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { 472 473 ret = -E2BIG; 473 474 goto out; 474 475 } 475 476 476 - /* We've reached the end of our output range */ 477 - if (workspace->out_buf.pos >= max_out) { 478 - tot_out += workspace->out_buf.pos; 479 - ret = -E2BIG; 480 - goto out; 481 - } 482 - 483 - /* Check if we need more output space */ 484 - if (workspace->out_buf.pos == workspace->out_buf.size) { 477 + /* Check if we need more output space. */ 478 + if (workspace->out_buf.pos >= workspace->out_buf.size) { 485 479 tot_out += min_folio_size; 486 - max_out -= min_folio_size; 487 - if (nr_folios == nr_dest_folios) { 480 + if (tot_out >= len) { 488 481 ret = -E2BIG; 489 482 goto out; 490 483 } 484 + /* Queue the current folio into the bio. */ 485 + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { 486 + ret = -E2BIG; 487 + goto out; 488 + } 489 + 491 490 out_folio = btrfs_alloc_compr_folio(fs_info); 492 491 if (out_folio == NULL) { 493 492 ret = -ENOMEM; 494 493 goto out; 495 494 } 496 - folios[nr_folios++] = out_folio; 497 495 workspace->out_buf.dst = folio_address(out_folio); 498 496 workspace->out_buf.pos = 0; 499 - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); 497 + workspace->out_buf.size = min_folio_size; 500 498 } 501 499 502 - /* We've reached the end of the input */ 503 - if (workspace->in_buf.pos >= len) { 500 + /* We've reached the end of the input. */ 501 + if (tot_in + workspace->in_buf.pos >= len) { 504 502 tot_in += workspace->in_buf.pos; 505 503 break; 506 504 } 507 505 508 - /* Check if we need more input */ 509 - if (workspace->in_buf.pos == workspace->in_buf.size) { 506 + /* Check if we need more input. 
*/ 507 + if (workspace->in_buf.pos >= workspace->in_buf.size) { 508 + u64 cur; 509 + 510 510 tot_in += workspace->in_buf.size; 511 + cur = start + tot_in; 512 + 511 513 kunmap_local(workspace->in_buf.src); 512 514 workspace->in_buf.src = NULL; 513 515 folio_put(in_folio); 514 - start += cur_len; 515 - len -= cur_len; 516 - ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); 516 + 517 + ret = btrfs_compress_filemap_get_folio(mapping, cur, &in_folio); 517 518 if (ret < 0) 518 519 goto out; 519 - cur_len = btrfs_calc_input_length(in_folio, orig_end, start); 520 520 workspace->in_buf.src = kmap_local_folio(in_folio, 521 - offset_in_folio(in_folio, start)); 521 + offset_in_folio(in_folio, cur)); 522 522 workspace->in_buf.pos = 0; 523 - workspace->in_buf.size = cur_len; 523 + workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, cur); 524 524 } 525 525 } 526 + 526 527 while (1) { 527 528 size_t ret2; 528 529 ··· 528 537 "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", 529 538 workspace->req_level, zstd_get_error_code(ret2), 530 539 btrfs_root_id(inode->root), btrfs_ino(inode), 531 - start); 540 + start + tot_in); 532 541 ret = -EIO; 533 542 goto out; 534 543 } 544 + /* Queue the remaining part of the output folio into bio. 
*/ 535 545 if (ret2 == 0) { 536 546 tot_out += workspace->out_buf.pos; 547 + if (tot_out >= len) { 548 + ret = -E2BIG; 549 + goto out; 550 + } 551 + if (!bio_add_folio(bio, out_folio, workspace->out_buf.pos, 0)) { 552 + ret = -E2BIG; 553 + goto out; 554 + } 555 + out_folio = NULL; 537 556 break; 538 557 } 539 - if (workspace->out_buf.pos >= max_out) { 540 - tot_out += workspace->out_buf.pos; 558 + tot_out += min_folio_size; 559 + if (tot_out >= len) { 541 560 ret = -E2BIG; 542 561 goto out; 543 562 } 544 - 545 - tot_out += min_folio_size; 546 - max_out -= min_folio_size; 547 - if (nr_folios == nr_dest_folios) { 563 + if (!bio_add_folio(bio, out_folio, folio_size(out_folio), 0)) { 548 564 ret = -E2BIG; 549 565 goto out; 550 566 } ··· 560 562 ret = -ENOMEM; 561 563 goto out; 562 564 } 563 - folios[nr_folios++] = out_folio; 564 565 workspace->out_buf.dst = folio_address(out_folio); 565 566 workspace->out_buf.pos = 0; 566 - workspace->out_buf.size = min_t(size_t, max_out, min_folio_size); 567 + workspace->out_buf.size = min_folio_size; 567 568 } 568 569 569 570 if (tot_out >= tot_in) { ··· 571 574 } 572 575 573 576 ret = 0; 574 - *total_in = tot_in; 575 - *total_out = tot_out; 577 + ASSERT(tot_out == bio->bi_iter.bi_size); 576 578 out: 577 - *out_folios = nr_folios; 579 + if (out_folio) 580 + btrfs_free_compr_folio(out_folio); 578 581 if (workspace->in_buf.src) { 579 582 kunmap_local(workspace->in_buf.src); 580 583 folio_put(in_folio); ··· 586 589 { 587 590 struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); 588 591 struct workspace *workspace = list_entry(ws, struct workspace, list); 589 - struct folio **folios_in = cb->compressed_folios; 592 + struct folio_iter fi; 590 593 size_t srclen = cb->compressed_len; 591 594 zstd_dstream *stream; 592 595 int ret = 0; ··· 596 599 unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); 597 600 unsigned long buf_start; 598 601 unsigned long total_out = 0; 602 + 603 + bio_first_folio(&fi, &cb->bbio.bio, 0); 604 + if 
(unlikely(!fi.folio)) 605 + return -EINVAL; 606 + ASSERT(folio_size(fi.folio) == blocksize); 599 607 600 608 stream = zstd_init_dstream( 601 609 ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); ··· 614 612 goto done; 615 613 } 616 614 617 - workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); 615 + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); 618 616 workspace->in_buf.pos = 0; 619 617 workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); 620 618 ··· 662 660 goto done; 663 661 } 664 662 srclen -= min_folio_size; 665 - workspace->in_buf.src = 666 - kmap_local_folio(folios_in[folio_in_index], 0); 663 + bio_next_folio(&fi, &cb->bbio.bio); 664 + ASSERT(fi.folio); 665 + workspace->in_buf.src = kmap_local_folio(fi.folio, 0); 667 666 workspace->in_buf.pos = 0; 668 667 workspace->in_buf.size = min_t(size_t, srclen, min_folio_size); 669 668 }

+1

include/uapi/linux/btrfs.h

··· 336 336 #define BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2 (1ULL << 13) 337 337 #define BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE (1ULL << 14) 338 338 #define BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA (1ULL << 16) 339 + #define BTRFS_FEATURE_INCOMPAT_REMAP_TREE (1ULL << 17) 339 340 340 341 struct btrfs_ioctl_feature_flags { 341 342 __u64 compat_flags;

+32 -2

include/uapi/linux/btrfs_tree.h

··· 76 76 /* Tracks RAID stripes in block groups. */ 77 77 #define BTRFS_RAID_STRIPE_TREE_OBJECTID 12ULL 78 78 79 + /* Holds details of remapped addresses after relocation. */ 80 + #define BTRFS_REMAP_TREE_OBJECTID 13ULL 81 + 79 82 /* device stats in the device tree */ 80 83 #define BTRFS_DEV_STATS_OBJECTID 0ULL 81 84 ··· 284 281 #define BTRFS_CHUNK_ITEM_KEY 228 285 282 286 283 #define BTRFS_RAID_STRIPE_KEY 230 284 + 285 + #define BTRFS_IDENTITY_REMAP_KEY 234 286 + #define BTRFS_REMAP_KEY 235 287 + #define BTRFS_REMAP_BACKREF_KEY 236 287 288 288 289 /* 289 290 * Records the overall state of the qgroups. ··· 721 714 __u8 metadata_uuid[BTRFS_FSID_SIZE]; 722 715 723 716 __u64 nr_global_roots; 717 + __le64 remap_root; 718 + __le64 remap_root_generation; 719 + __u8 remap_root_level; 724 720 725 721 /* Future expansion */ 726 - __le64 reserved[27]; 722 + __u8 reserved[199]; 727 723 __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 728 724 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; 729 725 ··· 1171 1161 #define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8) 1172 1162 #define BTRFS_BLOCK_GROUP_RAID1C3 (1ULL << 9) 1173 1163 #define BTRFS_BLOCK_GROUP_RAID1C4 (1ULL << 10) 1164 + #define BTRFS_BLOCK_GROUP_REMAPPED (1ULL << 11) 1165 + #define BTRFS_BLOCK_GROUP_METADATA_REMAP (1ULL << 12) 1174 1166 #define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \ 1175 1167 BTRFS_SPACE_INFO_GLOBAL_RSV) 1176 1168 1177 1169 #define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \ 1178 1170 BTRFS_BLOCK_GROUP_SYSTEM | \ 1179 - BTRFS_BLOCK_GROUP_METADATA) 1171 + BTRFS_BLOCK_GROUP_METADATA | \ 1172 + BTRFS_BLOCK_GROUP_METADATA_REMAP) 1180 1173 1181 1174 #define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ 1182 1175 BTRFS_BLOCK_GROUP_RAID1 | \ ··· 1230 1217 __le64 used; 1231 1218 __le64 chunk_objectid; 1232 1219 __le64 flags; 1220 + } __attribute__ ((__packed__)); 1221 + 1222 + struct btrfs_block_group_item_v2 { 1223 + __le64 used; 1224 + __le64 
chunk_objectid; 1225 + __le64 flags; 1226 + __le64 remap_bytes; 1227 + __le32 identity_remap_count; 1233 1228 } __attribute__ ((__packed__)); 1234 1229 1235 1230 struct btrfs_free_space_info { ··· 1342 1321 */ 1343 1322 __le64 reserved[2]; 1344 1323 __u8 encryption; 1324 + } __attribute__ ((__packed__)); 1325 + 1326 + /* 1327 + * For a range identified by a BTRFS_REMAP_KEY item in the remap tree, gives 1328 + * the address that the start of the range will get remapped to. This 1329 + * structure is also shared by BTRFS_REMAP_BACKREF_KEY. 1330 + */ 1331 + struct btrfs_remap_item { 1332 + __le64 address; 1345 1333 } __attribute__ ((__packed__)); 1346 1334 1347 1335 #endif /* _BTRFS_CTREE_H_ */

Configure Feed

Configure Feed