Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.13-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
"A few more fixes that accumulated over the last two weeks, fixing some
user-reported problems:

- swapfile fixes:
    - conditional reschedule in the activation loop
    - fix race with memory mapped file when activating
    - make activation loop interruptible
    - rework and fix extent sharing checks

- folio fixes:
    - in send, recheck folio mapping after unlock
    - in relocation, recheck folio mapping after unlock

- fix waiting for encoded read io_uring requests

- fix transaction atomicity when enabling simple quotas

- move COW block trace point before the block gets freed

- print various sizes in sysfs with correct endianness"

* tag 'for-6.13-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: sysfs: fix direct super block member reads
btrfs: fix transaction atomicity bug when enabling simple quotas
btrfs: avoid monopolizing a core when activating a swap file
btrfs: allow swap activation to be interruptible
btrfs: fix swap file activation failure due to extents that used to be shared
btrfs: fix race with memory mapped writes when activating swap file
btrfs: check folio mapping after unlock in put_file_data()
btrfs: check folio mapping after unlock in relocate_one_folio()
btrfs: fix use-after-free when COWing tree block and tracing is enabled
btrfs: fix use-after-free waiting for encoded read endios

+130 -56
+4 -7
fs/btrfs/ctree.c
··· 654 654 goto error_unlock_cow; 655 655 } 656 656 } 657 + 658 + trace_btrfs_cow_block(root, buf, cow); 657 659 if (unlock_orig) 658 660 btrfs_tree_unlock(buf); 659 661 free_extent_buffer_stale(buf); ··· 712 710 { 713 711 struct btrfs_fs_info *fs_info = root->fs_info; 714 712 u64 search_start; 715 - int ret; 716 713 717 714 if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { 718 715 btrfs_abort_transaction(trans, -EUCLEAN); ··· 752 751 * Also We don't care about the error, as it's handled internally. 753 752 */ 754 753 btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); 755 - ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, 756 - cow_ret, search_start, 0, nest); 757 - 758 - trace_btrfs_cow_block(root, buf, *cow_ret); 759 - 760 - return ret; 754 + return btrfs_force_cow_block(trans, root, buf, parent, parent_slot, 755 + cow_ret, search_start, 0, nest); 761 756 } 762 757 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); 763 758
+110 -44
fs/btrfs/inode.c
··· 9078 9078 } 9079 9079 9080 9080 struct btrfs_encoded_read_private { 9081 - wait_queue_head_t wait; 9081 + struct completion done; 9082 9082 void *uring_ctx; 9083 - atomic_t pending; 9083 + refcount_t pending_refs; 9084 9084 blk_status_t status; 9085 9085 }; 9086 9086 ··· 9099 9099 */ 9100 9100 WRITE_ONCE(priv->status, bbio->bio.bi_status); 9101 9101 } 9102 - if (atomic_dec_and_test(&priv->pending)) { 9102 + if (refcount_dec_and_test(&priv->pending_refs)) { 9103 9103 int err = blk_status_to_errno(READ_ONCE(priv->status)); 9104 9104 9105 9105 if (priv->uring_ctx) { 9106 9106 btrfs_uring_read_extent_endio(priv->uring_ctx, err); 9107 9107 kfree(priv); 9108 9108 } else { 9109 - wake_up(&priv->wait); 9109 + complete(&priv->done); 9110 9110 } 9111 9111 } 9112 9112 bio_put(&bbio->bio); ··· 9126 9126 if (!priv) 9127 9127 return -ENOMEM; 9128 9128 9129 - init_waitqueue_head(&priv->wait); 9130 - atomic_set(&priv->pending, 1); 9129 + init_completion(&priv->done); 9130 + refcount_set(&priv->pending_refs, 1); 9131 9131 priv->status = 0; 9132 9132 priv->uring_ctx = uring_ctx; 9133 9133 ··· 9140 9140 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); 9141 9141 9142 9142 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { 9143 - atomic_inc(&priv->pending); 9143 + refcount_inc(&priv->pending_refs); 9144 9144 btrfs_submit_bbio(bbio, 0); 9145 9145 9146 9146 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, ··· 9155 9155 disk_io_size -= bytes; 9156 9156 } while (disk_io_size); 9157 9157 9158 - atomic_inc(&priv->pending); 9158 + refcount_inc(&priv->pending_refs); 9159 9159 btrfs_submit_bbio(bbio, 0); 9160 9160 9161 9161 if (uring_ctx) { 9162 - if (atomic_dec_return(&priv->pending) == 0) { 9162 + if (refcount_dec_and_test(&priv->pending_refs)) { 9163 9163 ret = blk_status_to_errno(READ_ONCE(priv->status)); 9164 9164 btrfs_uring_read_extent_endio(uring_ctx, ret); 9165 9165 kfree(priv); ··· 9168 9168 9169 9169 return -EIOCBQUEUED; 9170 9170 } else { 9171 - if 
(atomic_dec_return(&priv->pending) != 0) 9172 - io_wait_event(priv->wait, !atomic_read(&priv->pending)); 9171 + if (!refcount_dec_and_test(&priv->pending_refs)) 9172 + wait_for_completion_io(&priv->done); 9173 9173 /* See btrfs_encoded_read_endio() for ordering. */ 9174 9174 ret = blk_status_to_errno(READ_ONCE(priv->status)); 9175 9175 kfree(priv); ··· 9799 9799 struct btrfs_fs_info *fs_info = root->fs_info; 9800 9800 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 9801 9801 struct extent_state *cached_state = NULL; 9802 - struct extent_map *em = NULL; 9803 9802 struct btrfs_chunk_map *map = NULL; 9804 9803 struct btrfs_device *device = NULL; 9805 9804 struct btrfs_swap_info bsi = { 9806 9805 .lowest_ppage = (sector_t)-1ULL, 9807 9806 }; 9807 + struct btrfs_backref_share_check_ctx *backref_ctx = NULL; 9808 + struct btrfs_path *path = NULL; 9808 9809 int ret = 0; 9809 9810 u64 isize; 9810 - u64 start; 9811 + u64 prev_extent_end = 0; 9812 + 9813 + /* 9814 + * Acquire the inode's mmap lock to prevent races with memory mapped 9815 + * writes, as they could happen after we flush delalloc below and before 9816 + * we lock the extent range further below. The inode was already locked 9817 + * up in the call chain. 9818 + */ 9819 + btrfs_assert_inode_locked(BTRFS_I(inode)); 9820 + down_write(&BTRFS_I(inode)->i_mmap_lock); 9811 9821 9812 9822 /* 9813 9823 * If the swap file was just created, make sure delalloc is done. If the ··· 9826 9816 */ 9827 9817 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); 9828 9818 if (ret) 9829 - return ret; 9819 + goto out_unlock_mmap; 9830 9820 9831 9821 /* 9832 9822 * The inode is locked, so these flags won't change after we check them. 
9833 9823 */ 9834 9824 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { 9835 9825 btrfs_warn(fs_info, "swapfile must not be compressed"); 9836 - return -EINVAL; 9826 + ret = -EINVAL; 9827 + goto out_unlock_mmap; 9837 9828 } 9838 9829 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { 9839 9830 btrfs_warn(fs_info, "swapfile must not be copy-on-write"); 9840 - return -EINVAL; 9831 + ret = -EINVAL; 9832 + goto out_unlock_mmap; 9841 9833 } 9842 9834 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { 9843 9835 btrfs_warn(fs_info, "swapfile must not be checksummed"); 9844 - return -EINVAL; 9836 + ret = -EINVAL; 9837 + goto out_unlock_mmap; 9838 + } 9839 + 9840 + path = btrfs_alloc_path(); 9841 + backref_ctx = btrfs_alloc_backref_share_check_ctx(); 9842 + if (!path || !backref_ctx) { 9843 + ret = -ENOMEM; 9844 + goto out_unlock_mmap; 9845 9845 } 9846 9846 9847 9847 /* ··· 9866 9846 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { 9867 9847 btrfs_warn(fs_info, 9868 9848 "cannot activate swapfile while exclusive operation is running"); 9869 - return -EBUSY; 9849 + ret = -EBUSY; 9850 + goto out_unlock_mmap; 9870 9851 } 9871 9852 9872 9853 /* ··· 9881 9860 btrfs_exclop_finish(fs_info); 9882 9861 btrfs_warn(fs_info, 9883 9862 "cannot activate swapfile because snapshot creation is in progress"); 9884 - return -EINVAL; 9863 + ret = -EINVAL; 9864 + goto out_unlock_mmap; 9885 9865 } 9886 9866 /* 9887 9867 * Snapshots can create extents which require COW even if NODATACOW is ··· 9903 9881 btrfs_warn(fs_info, 9904 9882 "cannot activate swapfile because subvolume %llu is being deleted", 9905 9883 btrfs_root_id(root)); 9906 - return -EPERM; 9884 + ret = -EPERM; 9885 + goto out_unlock_mmap; 9907 9886 } 9908 9887 atomic_inc(&root->nr_swapfiles); 9909 9888 spin_unlock(&root->root_item_lock); ··· 9912 9889 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); 9913 9890 9914 9891 lock_extent(io_tree, 0, isize - 1, &cached_state); 9915 - start = 0; 9916 - while 
(start < isize) { 9917 - u64 logical_block_start, physical_block_start; 9892 + while (prev_extent_end < isize) { 9893 + struct btrfs_key key; 9894 + struct extent_buffer *leaf; 9895 + struct btrfs_file_extent_item *ei; 9918 9896 struct btrfs_block_group *bg; 9919 - u64 len = isize - start; 9897 + u64 logical_block_start; 9898 + u64 physical_block_start; 9899 + u64 extent_gen; 9900 + u64 disk_bytenr; 9901 + u64 len; 9920 9902 9921 - em = btrfs_get_extent(BTRFS_I(inode), NULL, start, len); 9922 - if (IS_ERR(em)) { 9923 - ret = PTR_ERR(em); 9903 + key.objectid = btrfs_ino(BTRFS_I(inode)); 9904 + key.type = BTRFS_EXTENT_DATA_KEY; 9905 + key.offset = prev_extent_end; 9906 + 9907 + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 9908 + if (ret < 0) 9924 9909 goto out; 9925 - } 9926 9910 9927 - if (em->disk_bytenr == EXTENT_MAP_HOLE) { 9911 + /* 9912 + * If key not found it means we have an implicit hole (NO_HOLES 9913 + * is enabled). 9914 + */ 9915 + if (ret > 0) { 9928 9916 btrfs_warn(fs_info, "swapfile must not have holes"); 9929 9917 ret = -EINVAL; 9930 9918 goto out; 9931 9919 } 9932 - if (em->disk_bytenr == EXTENT_MAP_INLINE) { 9920 + 9921 + leaf = path->nodes[0]; 9922 + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); 9923 + 9924 + if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) { 9933 9925 /* 9934 9926 * It's unlikely we'll ever actually find ourselves 9935 9927 * here, as a file small enough to fit inline won't be ··· 9956 9918 ret = -EINVAL; 9957 9919 goto out; 9958 9920 } 9959 - if (extent_map_is_compressed(em)) { 9921 + 9922 + if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { 9960 9923 btrfs_warn(fs_info, "swapfile must not be compressed"); 9961 9924 ret = -EINVAL; 9962 9925 goto out; 9963 9926 } 9964 9927 9965 - logical_block_start = extent_map_block_start(em) + (start - em->start); 9966 - len = min(len, em->len - (start - em->start)); 9967 - free_extent_map(em); 9968 - em = NULL; 9928 + 
disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); 9929 + if (disk_bytenr == 0) { 9930 + btrfs_warn(fs_info, "swapfile must not have holes"); 9931 + ret = -EINVAL; 9932 + goto out; 9933 + } 9969 9934 9970 - ret = can_nocow_extent(inode, start, &len, NULL, false, true); 9935 + logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei); 9936 + extent_gen = btrfs_file_extent_generation(leaf, ei); 9937 + prev_extent_end = btrfs_file_extent_end(path); 9938 + 9939 + if (prev_extent_end > isize) 9940 + len = isize - key.offset; 9941 + else 9942 + len = btrfs_file_extent_num_bytes(leaf, ei); 9943 + 9944 + backref_ctx->curr_leaf_bytenr = leaf->start; 9945 + 9946 + /* 9947 + * Don't need the path anymore, release to avoid deadlocks when 9948 + * calling btrfs_is_data_extent_shared() because when joining a 9949 + * transaction it can block waiting for the current one's commit 9950 + * which in turn may be trying to lock the same leaf to flush 9951 + * delayed items for example. 9952 + */ 9953 + btrfs_release_path(path); 9954 + 9955 + ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr, 9956 + extent_gen, backref_ctx); 9971 9957 if (ret < 0) { 9972 9958 goto out; 9973 - } else if (ret) { 9974 - ret = 0; 9975 - } else { 9959 + } else if (ret > 0) { 9976 9960 btrfs_warn(fs_info, 9977 9961 "swapfile must not be copy-on-write"); 9978 9962 ret = -EINVAL; ··· 10029 9969 10030 9970 physical_block_start = (map->stripes[0].physical + 10031 9971 (logical_block_start - map->start)); 10032 - len = min(len, map->chunk_len - (logical_block_start - map->start)); 10033 9972 btrfs_free_chunk_map(map); 10034 9973 map = NULL; 10035 9974 ··· 10069 10010 if (ret) 10070 10011 goto out; 10071 10012 } 10072 - bsi.start = start; 10013 + bsi.start = key.offset; 10073 10014 bsi.block_start = physical_block_start; 10074 10015 bsi.block_len = len; 10075 10016 } 10076 10017 10077 - start += len; 10018 + if (fatal_signal_pending(current)) { 10019 + ret = -EINTR; 10020 + goto out; 
10021 + } 10022 + 10023 + cond_resched(); 10078 10024 } 10079 10025 10080 10026 if (bsi.block_len) 10081 10027 ret = btrfs_add_swap_extent(sis, &bsi); 10082 10028 10083 10029 out: 10084 - if (!IS_ERR_OR_NULL(em)) 10085 - free_extent_map(em); 10086 10030 if (!IS_ERR_OR_NULL(map)) 10087 10031 btrfs_free_chunk_map(map); 10088 10032 ··· 10098 10036 10099 10037 btrfs_exclop_finish(fs_info); 10100 10038 10039 + out_unlock_mmap: 10040 + up_write(&BTRFS_I(inode)->i_mmap_lock); 10041 + btrfs_free_backref_share_ctx(backref_ctx); 10042 + btrfs_free_path(path); 10101 10043 if (ret) 10102 10044 return ret; 10103 10045
+1 -2
fs/btrfs/qgroup.c
··· 1121 1121 fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; 1122 1122 if (simple) { 1123 1123 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; 1124 + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); 1124 1125 btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); 1125 1126 } else { 1126 1127 fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; ··· 1255 1254 spin_lock(&fs_info->qgroup_lock); 1256 1255 fs_info->quota_root = quota_root; 1257 1256 set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); 1258 - if (simple) 1259 - btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); 1260 1257 spin_unlock(&fs_info->qgroup_lock); 1261 1258 1262 1259 /* Skip rescan for simple qgroups. */
+6
fs/btrfs/relocation.c
··· 2902 2902 const bool use_rst = btrfs_need_stripe_tree_update(fs_info, rc->block_group->flags); 2903 2903 2904 2904 ASSERT(index <= last_index); 2905 + again: 2905 2906 folio = filemap_lock_folio(inode->i_mapping, index); 2906 2907 if (IS_ERR(folio)) { 2907 2908 ··· 2937 2936 if (!folio_test_uptodate(folio)) { 2938 2937 ret = -EIO; 2939 2938 goto release_folio; 2939 + } 2940 + if (folio->mapping != inode->i_mapping) { 2941 + folio_unlock(folio); 2942 + folio_put(folio); 2943 + goto again; 2940 2944 } 2941 2945 } 2942 2946
+6
fs/btrfs/send.c
··· 5280 5280 unsigned cur_len = min_t(unsigned, len, 5281 5281 PAGE_SIZE - pg_offset); 5282 5282 5283 + again: 5283 5284 folio = filemap_lock_folio(mapping, index); 5284 5285 if (IS_ERR(folio)) { 5285 5286 page_cache_sync_readahead(mapping, ··· 5312 5311 folio_put(folio); 5313 5312 ret = -EIO; 5314 5313 break; 5314 + } 5315 + if (folio->mapping != mapping) { 5316 + folio_unlock(folio); 5317 + folio_put(folio); 5318 + goto again; 5315 5319 } 5316 5320 } 5317 5321
+3 -3
fs/btrfs/sysfs.c
··· 1118 1118 { 1119 1119 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 1120 1120 1121 - return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); 1121 + return sysfs_emit(buf, "%u\n", fs_info->nodesize); 1122 1122 } 1123 1123 1124 1124 BTRFS_ATTR(, nodesize, btrfs_nodesize_show); ··· 1128 1128 { 1129 1129 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 1130 1130 1131 - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); 1131 + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); 1132 1132 } 1133 1133 1134 1134 BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); ··· 1180 1180 { 1181 1181 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 1182 1182 1183 - return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); 1183 + return sysfs_emit(buf, "%u\n", fs_info->sectorsize); 1184 1184 } 1185 1185 1186 1186 BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show);