Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
"There are still regressions being found and fixed in the zoned mode
and subpage code; the rest are fixes for bugs reported by users.

Regressions:

- subpage block support:
  - readahead works on the proper block size
  - fix last page zeroing

- zoned mode:
  - linked list corruption for tree log

Fixes:

- qgroup leak after falloc failure

- tree mod log and backref resolving:
  - extent buffer cloning race when resolving backrefs
  - pin deleted leaves with active tree mod log users

- drop debugging flag from slab cache"

* tag 'for-5.12-rc3-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: always pin deleted leaves when there are active tree mod log users
btrfs: fix race when cloning extent buffer during rewind of an old root
btrfs: fix slab cache flags for free space tree bitmap
btrfs: subpage: make readahead work properly
btrfs: subpage: fix wild pointer access during metadata read failure
btrfs: zoned: fix linked list corruption after log root tree allocation failure
btrfs: fix qgroup data rsv leak caused by falloc failure
btrfs: track qgroup released data in own variable in insert_prealloc_file_extent
btrfs: fix wrong offset to zero out range beyond i_size

+103 -35
+2
fs/btrfs/ctree.c
··· 1365 1365 "failed to read tree block %llu from get_old_root", 1366 1366 logical); 1367 1367 } else { 1368 + btrfs_tree_read_lock(old); 1368 1369 eb = btrfs_clone_extent_buffer(old); 1370 + btrfs_tree_read_unlock(old); 1369 1371 free_extent_buffer(old); 1370 1372 } 1371 1373 } else if (old_root) {
+22 -1
fs/btrfs/extent-tree.c
··· 3323 3323 3324 3324 if (last_ref && btrfs_header_generation(buf) == trans->transid) { 3325 3325 struct btrfs_block_group *cache; 3326 + bool must_pin = false; 3326 3327 3327 3328 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 3328 3329 ret = check_ref_cleanup(trans, buf->start); ··· 3341 3340 goto out; 3342 3341 } 3343 3342 3344 - if (btrfs_is_zoned(fs_info)) { 3343 + /* 3344 + * If this is a leaf and there are tree mod log users, we may 3345 + * have recorded mod log operations that point to this leaf. 3346 + * So we must make sure no one reuses this leaf's extent before 3347 + * mod log operations are applied to a node, otherwise after 3348 + * rewinding a node using the mod log operations we get an 3349 + * inconsistent btree, as the leaf's extent may now be used as 3350 + * a node or leaf for another different btree. 3351 + * We are safe from races here because at this point no other 3352 + * node or root points to this extent buffer, so if after this 3353 + * check a new tree mod log user joins, it will not be able to 3354 + * find a node pointing to this leaf and record operations that 3355 + * point to this leaf. 3356 + */ 3357 + if (btrfs_header_level(buf) == 0) { 3358 + read_lock(&fs_info->tree_mod_log_lock); 3359 + must_pin = !list_empty(&fs_info->tree_mod_seq_list); 3360 + read_unlock(&fs_info->tree_mod_log_lock); 3361 + } 3362 + 3363 + if (must_pin || btrfs_is_zoned(fs_info)) { 3345 3364 btrfs_redirty_list_add(trans->transaction, buf); 3346 3365 pin_down_extent(trans, cache, buf->start, buf->len, 1); 3347 3366 btrfs_put_block_group(cache);
+31 -2
fs/btrfs/extent_io.c
··· 2886 2886 } 2887 2887 2888 2888 /* 2889 + * Find extent buffer for a given bytenr. 2890 + * 2891 + * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking 2892 + * in endio context. 2893 + */ 2894 + static struct extent_buffer *find_extent_buffer_readpage( 2895 + struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) 2896 + { 2897 + struct extent_buffer *eb; 2898 + 2899 + /* 2900 + * For regular sectorsize, we can use page->private to grab extent 2901 + * buffer 2902 + */ 2903 + if (fs_info->sectorsize == PAGE_SIZE) { 2904 + ASSERT(PagePrivate(page) && page->private); 2905 + return (struct extent_buffer *)page->private; 2906 + } 2907 + 2908 + /* For subpage case, we need to lookup buffer radix tree */ 2909 + rcu_read_lock(); 2910 + eb = radix_tree_lookup(&fs_info->buffer_radix, 2911 + bytenr >> fs_info->sectorsize_bits); 2912 + rcu_read_unlock(); 2913 + ASSERT(eb); 2914 + return eb; 2915 + } 2916 + 2917 + /* 2889 2918 * after a readpage IO is done, we need to: 2890 2919 * clear the uptodate bits on error 2891 2920 * set the uptodate bits if things worked ··· 3025 2996 } else { 3026 2997 struct extent_buffer *eb; 3027 2998 3028 - eb = (struct extent_buffer *)page->private; 2999 + eb = find_extent_buffer_readpage(fs_info, page, start); 3029 3000 set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); 3030 3001 eb->read_mirror = mirror; 3031 3002 atomic_dec(&eb->io_pages); ··· 3049 3020 */ 3050 3021 if (page->index == end_index && i_size <= end) { 3051 3022 u32 zero_start = max(offset_in_page(i_size), 3052 - offset_in_page(end)); 3023 + offset_in_page(start)); 3053 3024 3054 3025 zero_user_segment(page, zero_start, 3055 3026 offset_in_page(end) + 1);
+26 -11
fs/btrfs/inode.c
··· 9008 9008 9009 9009 btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", 9010 9010 PAGE_SIZE, PAGE_SIZE, 9011 - SLAB_RED_ZONE, NULL); 9011 + SLAB_MEM_SPREAD, NULL); 9012 9012 if (!btrfs_free_space_bitmap_cachep) 9013 9013 goto fail; 9014 9014 ··· 9877 9877 struct btrfs_path *path; 9878 9878 u64 start = ins->objectid; 9879 9879 u64 len = ins->offset; 9880 + int qgroup_released; 9880 9881 int ret; 9881 9882 9882 9883 memset(&stack_fi, 0, sizeof(stack_fi)); ··· 9890 9889 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); 9891 9890 /* Encryption and other encoding is reserved and all 0 */ 9892 9891 9893 - ret = btrfs_qgroup_release_data(inode, file_offset, len); 9894 - if (ret < 0) 9895 - return ERR_PTR(ret); 9892 + qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); 9893 + if (qgroup_released < 0) 9894 + return ERR_PTR(qgroup_released); 9896 9895 9897 9896 if (trans) { 9898 9897 ret = insert_reserved_file_extent(trans, inode, 9899 9898 file_offset, &stack_fi, 9900 - true, ret); 9899 + true, qgroup_released); 9901 9900 if (ret) 9902 - return ERR_PTR(ret); 9901 + goto free_qgroup; 9903 9902 return trans; 9904 9903 } 9905 9904 ··· 9910 9909 extent_info.file_offset = file_offset; 9911 9910 extent_info.extent_buf = (char *)&stack_fi; 9912 9911 extent_info.is_new_extent = true; 9913 - extent_info.qgroup_reserved = ret; 9912 + extent_info.qgroup_reserved = qgroup_released; 9914 9913 extent_info.insertions = 0; 9915 9914 9916 9915 path = btrfs_alloc_path(); 9917 - if (!path) 9918 - return ERR_PTR(-ENOMEM); 9916 + if (!path) { 9917 + ret = -ENOMEM; 9918 + goto free_qgroup; 9919 + } 9919 9920 9920 9921 ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, 9921 9922 file_offset + len - 1, &extent_info, 9922 9923 &trans); 9923 9924 btrfs_free_path(path); 9924 9925 if (ret) 9925 - return ERR_PTR(ret); 9926 - 9926 + goto free_qgroup; 9927 9927 return trans; 9928 + 9929 + free_qgroup: 9930 + /* 
9931 + * We have released qgroup data range at the beginning of the function, 9932 + * and normally qgroup_released bytes will be freed when committing 9933 + * transaction. 9934 + * But if we error out early, we have to free what we have released 9935 + * or we leak qgroup data reservation. 9936 + */ 9937 + btrfs_qgroup_free_refroot(inode->root->fs_info, 9938 + inode->root->root_key.objectid, qgroup_released, 9939 + BTRFS_QGROUP_RSV_DATA); 9940 + return ERR_PTR(ret); 9928 9941 } 9929 9942 9930 9943 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
+18 -17
fs/btrfs/reada.c
··· 209 209 /* find extent */ 210 210 spin_lock(&fs_info->reada_lock); 211 211 re = radix_tree_lookup(&fs_info->reada_tree, 212 - eb->start >> PAGE_SHIFT); 212 + eb->start >> fs_info->sectorsize_bits); 213 213 if (re) 214 214 re->refcnt++; 215 215 spin_unlock(&fs_info->reada_lock); ··· 240 240 zone = NULL; 241 241 spin_lock(&fs_info->reada_lock); 242 242 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 243 - logical >> PAGE_SHIFT, 1); 243 + logical >> fs_info->sectorsize_bits, 1); 244 244 if (ret == 1 && logical >= zone->start && logical <= zone->end) { 245 245 kref_get(&zone->refcnt); 246 246 spin_unlock(&fs_info->reada_lock); ··· 283 283 284 284 spin_lock(&fs_info->reada_lock); 285 285 ret = radix_tree_insert(&dev->reada_zones, 286 - (unsigned long)(zone->end >> PAGE_SHIFT), 287 - zone); 286 + (unsigned long)(zone->end >> fs_info->sectorsize_bits), 287 + zone); 288 288 289 289 if (ret == -EEXIST) { 290 290 kfree(zone); 291 291 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, 292 - logical >> PAGE_SHIFT, 1); 292 + logical >> fs_info->sectorsize_bits, 1); 293 293 if (ret == 1 && logical >= zone->start && logical <= zone->end) 294 294 kref_get(&zone->refcnt); 295 295 else ··· 315 315 u64 length; 316 316 int real_stripes; 317 317 int nzones = 0; 318 - unsigned long index = logical >> PAGE_SHIFT; 318 + unsigned long index = logical >> fs_info->sectorsize_bits; 319 319 int dev_replace_is_ongoing; 320 320 int have_zone = 0; 321 321 ··· 497 497 struct reada_extent *re) 498 498 { 499 499 int i; 500 - unsigned long index = re->logical >> PAGE_SHIFT; 500 + unsigned long index = re->logical >> fs_info->sectorsize_bits; 501 501 502 502 spin_lock(&fs_info->reada_lock); 503 503 if (--re->refcnt) { ··· 538 538 static void reada_zone_release(struct kref *kref) 539 539 { 540 540 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt); 541 + struct btrfs_fs_info *fs_info = zone->device->fs_info; 541 542 542 - 
lockdep_assert_held(&zone->device->fs_info->reada_lock); 543 + lockdep_assert_held(&fs_info->reada_lock); 543 544 544 545 radix_tree_delete(&zone->device->reada_zones, 545 - zone->end >> PAGE_SHIFT); 546 + zone->end >> fs_info->sectorsize_bits); 546 547 547 548 kfree(zone); 548 549 } ··· 594 593 static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock) 595 594 { 596 595 int i; 597 - unsigned long index = zone->end >> PAGE_SHIFT; 596 + unsigned long index = zone->end >> zone->device->fs_info->sectorsize_bits; 598 597 599 598 for (i = 0; i < zone->ndevs; ++i) { 600 599 struct reada_zone *peer; ··· 629 628 (void **)&zone, index, 1); 630 629 if (ret == 0) 631 630 break; 632 - index = (zone->end >> PAGE_SHIFT) + 1; 631 + index = (zone->end >> dev->fs_info->sectorsize_bits) + 1; 633 632 if (zone->locked) { 634 633 if (zone->elems > top_locked_elems) { 635 634 top_locked_elems = zone->elems; ··· 710 709 * plugging to speed things up 711 710 */ 712 711 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 713 - dev->reada_next >> PAGE_SHIFT, 1); 712 + dev->reada_next >> fs_info->sectorsize_bits, 1); 714 713 if (ret == 0 || re->logical > dev->reada_curr_zone->end) { 715 714 ret = reada_pick_zone(dev); 716 715 if (!ret) { ··· 719 718 } 720 719 re = NULL; 721 720 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re, 722 - dev->reada_next >> PAGE_SHIFT, 1); 721 + dev->reada_next >> fs_info->sectorsize_bits, 1); 723 722 } 724 723 if (ret == 0) { 725 724 spin_unlock(&fs_info->reada_lock); ··· 886 885 pr_cont(" curr off %llu", 887 886 device->reada_next - zone->start); 888 887 pr_cont("\n"); 889 - index = (zone->end >> PAGE_SHIFT) + 1; 888 + index = (zone->end >> fs_info->sectorsize_bits) + 1; 890 889 } 891 890 cnt = 0; 892 891 index = 0; ··· 911 910 } 912 911 } 913 912 pr_cont("\n"); 914 - index = (re->logical >> PAGE_SHIFT) + 1; 913 + index = (re->logical >> fs_info->sectorsize_bits) + 1; 915 914 if (++cnt > 15) 916 915 break; 917 916 } ··· 
927 926 if (ret == 0) 928 927 break; 929 928 if (!re->scheduled) { 930 - index = (re->logical >> PAGE_SHIFT) + 1; 929 + index = (re->logical >> fs_info->sectorsize_bits) + 1; 931 930 continue; 932 931 } 933 932 pr_debug("re: logical %llu size %u list empty %d scheduled %d", ··· 943 942 } 944 943 } 945 944 pr_cont("\n"); 946 - index = (re->logical >> PAGE_SHIFT) + 1; 945 + index = (re->logical >> fs_info->sectorsize_bits) + 1; 947 946 } 948 947 spin_unlock(&fs_info->reada_lock); 949 948 }
+4 -4
fs/btrfs/tree-log.c
··· 3169 3169 3170 3170 mutex_lock(&log_root_tree->log_mutex); 3171 3171 3172 - index2 = log_root_tree->log_transid % 2; 3173 - list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3174 - root_log_ctx.log_transid = log_root_tree->log_transid; 3175 - 3176 3172 if (btrfs_is_zoned(fs_info)) { 3177 3173 if (!log_root_tree->node) { 3178 3174 ret = btrfs_alloc_log_tree_node(trans, log_root_tree); ··· 3178 3182 } 3179 3183 } 3180 3184 } 3185 + 3186 + index2 = log_root_tree->log_transid % 2; 3187 + list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); 3188 + root_log_ctx.log_transid = log_root_tree->log_transid; 3181 3189 3182 3190 /* 3183 3191 * Now we are safe to update the log_root_tree because we're under the