Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.5-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
"Stable fixes:

- fix race between balance and cancel/pause

- various iput() fixes

- fix use-after-free of new block group that became unused

- fix warning when putting transaction with qgroups enabled after
abort

- fix crash in subpage mode when a page could be released between
being mapped and being read

- when scrubbing raid56 verify the P/Q stripes unconditionally

- fix minor memory leak in zoned mode when a block group with an
unexpected superblock is found

Regression fixes:

- fix ordered extent split error handling when submitting direct IO

- use irq-safe locking when adding delayed iputs"

* tag 'for-6.5-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: fix warning when putting transaction with qgroups enabled after abort
btrfs: fix ordered extent split error handling in btrfs_dio_submit_io
btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand
btrfs: raid56: always verify the P/Q contents for scrub
btrfs: use irq safe locking when running and adding delayed iputs
btrfs: fix iput() on error pointer after error during orphan cleanup
btrfs: fix double iput() on inode after an error during orphan cleanup
btrfs: zoned: fix memory leak after finding block group with super blocks
btrfs: fix use-after-free of new block group that became unused
btrfs: be a bit more careful when setting mirror_num_ret in btrfs_map_block
btrfs: fix race between balance and cancel/pause

+79 -46
+12 -2
fs/btrfs/block-group.c
··· 1640 1640 { 1641 1641 struct btrfs_fs_info *fs_info = bg->fs_info; 1642 1642 1643 - trace_btrfs_add_unused_block_group(bg); 1644 1643 spin_lock(&fs_info->unused_bgs_lock); 1645 1644 if (list_empty(&bg->bg_list)) { 1646 1645 btrfs_get_block_group(bg); 1646 + trace_btrfs_add_unused_block_group(bg); 1647 1647 list_add_tail(&bg->bg_list, &fs_info->unused_bgs); 1648 - } else { 1648 + } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { 1649 1649 /* Pull out the block group from the reclaim_bgs list. */ 1650 + trace_btrfs_add_unused_block_group(bg); 1650 1651 list_move_tail(&bg->bg_list, &fs_info->unused_bgs); 1651 1652 } 1652 1653 spin_unlock(&fs_info->unused_bgs_lock); ··· 2088 2087 2089 2088 /* Shouldn't have super stripes in sequential zones */ 2090 2089 if (zoned && nr) { 2090 + kfree(logical); 2091 2091 btrfs_err(fs_info, 2092 2092 "zoned: block group %llu must not contain super block", 2093 2093 cache->start); ··· 2670 2668 next: 2671 2669 btrfs_delayed_refs_rsv_release(fs_info, 1); 2672 2670 list_del_init(&block_group->bg_list); 2671 + clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); 2673 2672 } 2674 2673 btrfs_trans_release_chunk_metadata(trans); 2675 2674 } ··· 2709 2706 cache = btrfs_create_block_group_cache(fs_info, chunk_offset); 2710 2707 if (!cache) 2711 2708 return ERR_PTR(-ENOMEM); 2709 + 2710 + /* 2711 + * Mark it as new before adding it to the rbtree of block groups or any 2712 + * list, so that no other task finds it and calls btrfs_mark_bg_unused() 2713 + * before the new flag is set. 2714 + */ 2715 + set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); 2712 2716 2713 2717 cache->length = size; 2714 2718 set_free_space_tree_thresholds(cache);
+5
fs/btrfs/block-group.h
··· 70 70 BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, 71 71 /* Indicate that the block group is placed on a sequential zone */ 72 72 BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, 73 + /* 74 + * Indicate that block group is in the list of new block groups of a 75 + * transaction. 76 + */ 77 + BLOCK_GROUP_FLAG_NEW, 73 78 }; 74 79 75 80 enum btrfs_caching_type {
+52 -25
fs/btrfs/inode.c
··· 3482 3482 void btrfs_add_delayed_iput(struct btrfs_inode *inode) 3483 3483 { 3484 3484 struct btrfs_fs_info *fs_info = inode->root->fs_info; 3485 + unsigned long flags; 3485 3486 3486 3487 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) 3487 3488 return; 3488 3489 3489 3490 atomic_inc(&fs_info->nr_delayed_iputs); 3490 - spin_lock(&fs_info->delayed_iput_lock); 3491 + /* 3492 + * Need to be irq safe here because we can be called from either an irq 3493 + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq 3494 + * context. 3495 + */ 3496 + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); 3491 3497 ASSERT(list_empty(&inode->delayed_iput)); 3492 3498 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); 3493 - spin_unlock(&fs_info->delayed_iput_lock); 3499 + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); 3494 3500 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) 3495 3501 wake_up_process(fs_info->cleaner_kthread); 3496 3502 } ··· 3505 3499 struct btrfs_inode *inode) 3506 3500 { 3507 3501 list_del_init(&inode->delayed_iput); 3508 - spin_unlock(&fs_info->delayed_iput_lock); 3502 + spin_unlock_irq(&fs_info->delayed_iput_lock); 3509 3503 iput(&inode->vfs_inode); 3510 3504 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) 3511 3505 wake_up(&fs_info->delayed_iputs_wait); 3512 - spin_lock(&fs_info->delayed_iput_lock); 3506 + spin_lock_irq(&fs_info->delayed_iput_lock); 3513 3507 } 3514 3508 3515 3509 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, 3516 3510 struct btrfs_inode *inode) 3517 3511 { 3518 3512 if (!list_empty(&inode->delayed_iput)) { 3519 - spin_lock(&fs_info->delayed_iput_lock); 3513 + spin_lock_irq(&fs_info->delayed_iput_lock); 3520 3514 if (!list_empty(&inode->delayed_iput)) 3521 3515 run_delayed_iput_locked(fs_info, inode); 3522 - spin_unlock(&fs_info->delayed_iput_lock); 3516 + spin_unlock_irq(&fs_info->delayed_iput_lock); 3523 3517 } 3524 3518 } 3525 3519 3526 3520 void 
btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) 3527 3521 { 3528 - 3529 - spin_lock(&fs_info->delayed_iput_lock); 3522 + /* 3523 + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which 3524 + * calls btrfs_add_delayed_iput() and that needs to lock 3525 + * fs_info->delayed_iput_lock. So we need to disable irqs here to 3526 + * prevent a deadlock. 3527 + */ 3528 + spin_lock_irq(&fs_info->delayed_iput_lock); 3530 3529 while (!list_empty(&fs_info->delayed_iputs)) { 3531 3530 struct btrfs_inode *inode; 3532 3531 3533 3532 inode = list_first_entry(&fs_info->delayed_iputs, 3534 3533 struct btrfs_inode, delayed_iput); 3535 3534 run_delayed_iput_locked(fs_info, inode); 3536 - cond_resched_lock(&fs_info->delayed_iput_lock); 3535 + if (need_resched()) { 3536 + spin_unlock_irq(&fs_info->delayed_iput_lock); 3537 + cond_resched(); 3538 + spin_lock_irq(&fs_info->delayed_iput_lock); 3539 + } 3537 3540 } 3538 - spin_unlock(&fs_info->delayed_iput_lock); 3541 + spin_unlock_irq(&fs_info->delayed_iput_lock); 3539 3542 } 3540 3543 3541 3544 /* ··· 3674 3659 found_key.type = BTRFS_INODE_ITEM_KEY; 3675 3660 found_key.offset = 0; 3676 3661 inode = btrfs_iget(fs_info->sb, last_objectid, root); 3677 - ret = PTR_ERR_OR_ZERO(inode); 3678 - if (ret && ret != -ENOENT) 3679 - goto out; 3662 + if (IS_ERR(inode)) { 3663 + ret = PTR_ERR(inode); 3664 + inode = NULL; 3665 + if (ret != -ENOENT) 3666 + goto out; 3667 + } 3680 3668 3681 - if (ret == -ENOENT && root == fs_info->tree_root) { 3669 + if (!inode && root == fs_info->tree_root) { 3682 3670 struct btrfs_root *dead_root; 3683 3671 int is_dead_root = 0; 3684 3672 ··· 3742 3724 * deleted but wasn't. The inode number may have been reused, 3743 3725 * but either way, we can delete the orphan item. 
3744 3726 */ 3745 - if (ret == -ENOENT || inode->i_nlink) { 3746 - if (!ret) { 3727 + if (!inode || inode->i_nlink) { 3728 + if (inode) { 3747 3729 ret = btrfs_drop_verity_items(BTRFS_I(inode)); 3748 3730 iput(inode); 3731 + inode = NULL; 3749 3732 if (ret) 3750 3733 goto out; 3751 3734 } 3752 3735 trans = btrfs_start_transaction(root, 1); 3753 3736 if (IS_ERR(trans)) { 3754 3737 ret = PTR_ERR(trans); 3755 - iput(inode); 3756 3738 goto out; 3757 3739 } 3758 3740 btrfs_debug(fs_info, "auto deleting %Lu", ··· 3760 3742 ret = btrfs_del_orphan_item(trans, root, 3761 3743 found_key.objectid); 3762 3744 btrfs_end_transaction(trans); 3763 - if (ret) { 3764 - iput(inode); 3745 + if (ret) 3765 3746 goto out; 3766 - } 3767 3747 continue; 3768 3748 } 3769 3749 ··· 4863 4847 ret = -ENOMEM; 4864 4848 goto out; 4865 4849 } 4866 - ret = set_page_extent_mapped(page); 4867 - if (ret < 0) 4868 - goto out_unlock; 4869 4850 4870 4851 if (!PageUptodate(page)) { 4871 4852 ret = btrfs_read_folio(NULL, page_folio(page)); ··· 4877 4864 goto out_unlock; 4878 4865 } 4879 4866 } 4867 + 4868 + /* 4869 + * We unlock the page after the io is completed and then re-lock it 4870 + * above. release_folio() could have come in between that and cleared 4871 + * PagePrivate(), but left the page in the mapping. Set the page mapped 4872 + * here to make sure it's properly set for the subpage stuff. 
4873 + */ 4874 + ret = set_page_extent_mapped(page); 4875 + if (ret < 0) 4876 + goto out_unlock; 4877 + 4880 4878 wait_on_page_writeback(page); 4881 4879 4882 4880 lock_extent(io_tree, block_start, block_end, &cached_state); ··· 7873 7849 7874 7850 ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); 7875 7851 if (ret) { 7876 - bbio->bio.bi_status = errno_to_blk_status(ret); 7877 - btrfs_dio_end_io(bbio); 7852 + btrfs_finish_ordered_extent(dio_data->ordered, NULL, 7853 + file_offset, dip->bytes, 7854 + !ret); 7855 + bio->bi_status = errno_to_blk_status(ret); 7856 + iomap_dio_bio_end_io(bio); 7878 7857 return; 7879 7858 } 7880 7859 }
+1
fs/btrfs/qgroup.c
··· 4445 4445 ulist_free(entry->old_roots); 4446 4446 kfree(entry); 4447 4447 } 4448 + *root = RB_ROOT; 4448 4449 }
+3 -8
fs/btrfs/raid56.c
··· 71 71 static void index_rbio_pages(struct btrfs_raid_bio *rbio); 72 72 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); 73 73 74 - static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); 74 + static int finish_parity_scrub(struct btrfs_raid_bio *rbio); 75 75 static void scrub_rbio_work_locked(struct work_struct *work); 76 76 77 77 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) ··· 2404 2404 return 0; 2405 2405 } 2406 2406 2407 - static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) 2407 + static int finish_parity_scrub(struct btrfs_raid_bio *rbio) 2408 2408 { 2409 2409 struct btrfs_io_context *bioc = rbio->bioc; 2410 2410 const u32 sectorsize = bioc->fs_info->sectorsize; ··· 2444 2444 * it. 2445 2445 */ 2446 2446 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 2447 - 2448 - if (!need_check) 2449 - goto writeback; 2450 2447 2451 2448 p_sector.page = alloc_page(GFP_NOFS); 2452 2449 if (!p_sector.page) ··· 2513 2516 q_sector.page = NULL; 2514 2517 } 2515 2518 2516 - writeback: 2517 2519 /* 2518 2520 * time to start writing. Make bios for everything from the 2519 2521 * higher layers (the bio_list in our rbio) and our p/q. Ignore ··· 2695 2699 2696 2700 static void scrub_rbio(struct btrfs_raid_bio *rbio) 2697 2701 { 2698 - bool need_check = false; 2699 2702 int sector_nr; 2700 2703 int ret; 2701 2704 ··· 2717 2722 * We have every sector properly prepared. Can finish the scrub 2718 2723 * and writeback the good content. 2719 2724 */ 2720 - ret = finish_parity_scrub(rbio, need_check); 2725 + ret = finish_parity_scrub(rbio); 2721 2726 wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); 2722 2727 for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { 2723 2728 int found_errors;
+6 -11
fs/btrfs/volumes.c
··· 4078 4078 return has_single_bit_set(flags); 4079 4079 } 4080 4080 4081 - static inline int balance_need_close(struct btrfs_fs_info *fs_info) 4082 - { 4083 - /* cancel requested || normal exit path */ 4084 - return atomic_read(&fs_info->balance_cancel_req) || 4085 - (atomic_read(&fs_info->balance_pause_req) == 0 && 4086 - atomic_read(&fs_info->balance_cancel_req) == 0); 4087 - } 4088 - 4089 4081 /* 4090 4082 * Validate target profile against allowed profiles and return true if it's OK. 4091 4083 * Otherwise print the error message and return false. ··· 4267 4275 u64 num_devices; 4268 4276 unsigned seq; 4269 4277 bool reducing_redundancy; 4278 + bool paused = false; 4270 4279 int i; 4271 4280 4272 4281 if (btrfs_fs_closing(fs_info) || ··· 4398 4405 if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { 4399 4406 btrfs_info(fs_info, "balance: paused"); 4400 4407 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); 4408 + paused = true; 4401 4409 } 4402 4410 /* 4403 4411 * Balance can be canceled by: ··· 4427 4433 btrfs_update_ioctl_balance_args(fs_info, bargs); 4428 4434 } 4429 4435 4430 - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || 4431 - balance_need_close(fs_info)) { 4436 + /* We didn't pause, we can clean everything up. */ 4437 + if (!paused) { 4432 4438 reset_balance_state(fs_info); 4433 4439 btrfs_exclop_finish(fs_info); 4434 4440 } ··· 6398 6404 (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || 6399 6405 !dev_replace->tgtdev)) { 6400 6406 set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); 6401 - *mirror_num_ret = mirror_num; 6407 + if (mirror_num_ret) 6408 + *mirror_num_ret = mirror_num; 6402 6409 *bioc_ret = NULL; 6403 6410 ret = 0; 6404 6411 goto out;