Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- fix handling of folio private changes.

The private value holds a pointer to our extent buffer structure
representing a metadata range. Release and creation of the range were
not properly synchronized when updating the private bit, which ended
up in a double folio_put, leading to all sorts of breakage.

- fix a crash, reported as a duplicate key in metadata, but caused by a
race between fsync and a size-extending write. Requires a prealloc target
range + fsync and other conditions (log tree state, timing)

- fix leak of qgroup extent records after transaction abort

* tag 'for-6.10-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
btrfs: protect folio::private when attaching extent buffer folios
btrfs: fix leak of qgroup extent records after transaction abort
btrfs: fix crash on racing fsync and size-extending write into prealloc

+43 -44
+1 -9
fs/btrfs/disk-io.c
··· 4538 4538 struct btrfs_fs_info *fs_info) 4539 4539 { 4540 4540 struct rb_node *node; 4541 - struct btrfs_delayed_ref_root *delayed_refs; 4541 + struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; 4542 4542 struct btrfs_delayed_ref_node *ref; 4543 4543 4544 - delayed_refs = &trans->delayed_refs; 4545 - 4546 4544 spin_lock(&delayed_refs->lock); 4547 - if (atomic_read(&delayed_refs->num_entries) == 0) { 4548 - spin_unlock(&delayed_refs->lock); 4549 - btrfs_debug(fs_info, "delayed_refs has NO entry"); 4550 - return; 4551 - } 4552 - 4553 4545 while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { 4554 4546 struct btrfs_delayed_ref_head *head; 4555 4547 struct rb_node *n;
+31 -29
fs/btrfs/extent_io.c
··· 3689 3689 struct folio *folio = page_folio(page); 3690 3690 struct extent_buffer *exists; 3691 3691 3692 + lockdep_assert_held(&page->mapping->i_private_lock); 3693 + 3692 3694 /* 3693 3695 * For subpage case, we completely rely on radix tree to ensure we 3694 3696 * don't try to insert two ebs for the same bytenr. So here we always ··· 3758 3756 * The caller needs to free the existing folios and retry using the same order. 3759 3757 */ 3760 3758 static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, 3759 + struct btrfs_subpage *prealloc, 3761 3760 struct extent_buffer **found_eb_ret) 3762 3761 { 3763 3762 3764 3763 struct btrfs_fs_info *fs_info = eb->fs_info; 3765 3764 struct address_space *mapping = fs_info->btree_inode->i_mapping; 3766 3765 const unsigned long index = eb->start >> PAGE_SHIFT; 3767 - struct folio *existing_folio; 3766 + struct folio *existing_folio = NULL; 3768 3767 int ret; 3769 3768 3770 3769 ASSERT(found_eb_ret); ··· 3777 3774 ret = filemap_add_folio(mapping, eb->folios[i], index + i, 3778 3775 GFP_NOFS | __GFP_NOFAIL); 3779 3776 if (!ret) 3780 - return 0; 3777 + goto finish; 3781 3778 3782 3779 existing_folio = filemap_lock_folio(mapping, index + i); 3783 3780 /* The page cache only exists for a very short time, just retry. */ 3784 - if (IS_ERR(existing_folio)) 3781 + if (IS_ERR(existing_folio)) { 3782 + existing_folio = NULL; 3785 3783 goto retry; 3784 + } 3786 3785 3787 3786 /* For now, we should only have single-page folios for btree inode. */ 3788 3787 ASSERT(folio_nr_pages(existing_folio) == 1); ··· 3795 3790 return -EAGAIN; 3796 3791 } 3797 3792 3798 - if (fs_info->nodesize < PAGE_SIZE) { 3799 - /* 3800 - * We're going to reuse the existing page, can drop our page 3801 - * and subpage structure now. 3802 - */ 3793 + finish: 3794 + spin_lock(&mapping->i_private_lock); 3795 + if (existing_folio && fs_info->nodesize < PAGE_SIZE) { 3796 + /* We're going to reuse the existing page, can drop our folio now. 
*/ 3803 3797 __free_page(folio_page(eb->folios[i], 0)); 3804 3798 eb->folios[i] = existing_folio; 3805 - } else { 3799 + } else if (existing_folio) { 3806 3800 struct extent_buffer *existing_eb; 3807 3801 3808 3802 existing_eb = grab_extent_buffer(fs_info, ··· 3809 3805 if (existing_eb) { 3810 3806 /* The extent buffer still exists, we can use it directly. */ 3811 3807 *found_eb_ret = existing_eb; 3808 + spin_unlock(&mapping->i_private_lock); 3812 3809 folio_unlock(existing_folio); 3813 3810 folio_put(existing_folio); 3814 3811 return 1; ··· 3818 3813 __free_page(folio_page(eb->folios[i], 0)); 3819 3814 eb->folios[i] = existing_folio; 3820 3815 } 3816 + eb->folio_size = folio_size(eb->folios[i]); 3817 + eb->folio_shift = folio_shift(eb->folios[i]); 3818 + /* Should not fail, as we have preallocated the memory. */ 3819 + ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); 3820 + ASSERT(!ret); 3821 + /* 3822 + * To inform we have an extra eb under allocation, so that 3823 + * detach_extent_buffer_page() won't release the folio private when the 3824 + * eb hasn't been inserted into radix tree yet. 3825 + * 3826 + * The ref will be decreased when the eb releases the page, in 3827 + * detach_extent_buffer_page(). Thus needs no special handling in the 3828 + * error path. 
3829 + */ 3830 + btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); 3831 + spin_unlock(&mapping->i_private_lock); 3821 3832 return 0; 3822 3833 } 3823 3834 ··· 3845 3824 int attached = 0; 3846 3825 struct extent_buffer *eb; 3847 3826 struct extent_buffer *existing_eb = NULL; 3848 - struct address_space *mapping = fs_info->btree_inode->i_mapping; 3849 3827 struct btrfs_subpage *prealloc = NULL; 3850 3828 u64 lockdep_owner = owner_root; 3851 3829 bool page_contig = true; ··· 3910 3890 for (int i = 0; i < num_folios; i++) { 3911 3891 struct folio *folio; 3912 3892 3913 - ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); 3893 + ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); 3914 3894 if (ret > 0) { 3915 3895 ASSERT(existing_eb); 3916 3896 goto out; ··· 3947 3927 * and free the allocated page. 3948 3928 */ 3949 3929 folio = eb->folios[i]; 3950 - eb->folio_size = folio_size(folio); 3951 - eb->folio_shift = folio_shift(folio); 3952 - spin_lock(&mapping->i_private_lock); 3953 - /* Should not fail, as we have preallocated the memory */ 3954 - ret = attach_extent_buffer_folio(eb, folio, prealloc); 3955 - ASSERT(!ret); 3956 - /* 3957 - * To inform we have extra eb under allocation, so that 3958 - * detach_extent_buffer_page() won't release the folio private 3959 - * when the eb hasn't yet been inserted into radix tree. 3960 - * 3961 - * The ref will be decreased when the eb released the page, in 3962 - * detach_extent_buffer_page(). 3963 - * Thus needs no special handling in error path. 3964 - */ 3965 - btrfs_folio_inc_eb_refs(fs_info, folio); 3966 - spin_unlock(&mapping->i_private_lock); 3967 - 3968 3930 WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); 3969 3931 3970 3932 /*
+11 -6
fs/btrfs/tree-log.c
··· 4860 4860 path->slots[0]++; 4861 4861 continue; 4862 4862 } 4863 - if (!dropped_extents) { 4864 - /* 4865 - * Avoid logging extent items logged in past fsync calls 4866 - * and leading to duplicate keys in the log tree. 4867 - */ 4863 + /* 4864 + * Avoid overlapping items in the log tree. The first time we 4865 + * get here, get rid of everything from a past fsync. After 4866 + * that, if the current extent starts before the end of the last 4867 + * extent we copied, truncate the last one. This can happen if 4868 + * an ordered extent completion modifies the subvolume tree 4869 + * while btrfs_next_leaf() has the tree unlocked. 4870 + */ 4871 + if (!dropped_extents || key.offset < truncate_offset) { 4868 4872 ret = truncate_inode_items(trans, root->log_root, inode, 4869 - truncate_offset, 4873 + min(key.offset, truncate_offset), 4870 4874 BTRFS_EXTENT_DATA_KEY); 4871 4875 if (ret) 4872 4876 goto out; 4873 4877 dropped_extents = true; 4874 4878 } 4879 + truncate_offset = btrfs_file_extent_end(path); 4875 4880 if (ins_nr == 0) 4876 4881 start_slot = slot; 4877 4882 ins_nr++;