Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'for-6.16-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

- tree-log fixes:
  - fixes of log tracking of directories and subvolumes
  - fix iteration and error handling of inode references
    during log replay

- fix free space tree rebuild (reported by syzbot)

* tag 'for-6.16-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: use btrfs_record_snapshot_destroy() during rmdir
  btrfs: propagate last_unlink_trans earlier when doing a rmdir
  btrfs: record new subvolume in parent dir earlier to avoid dir logging races
  btrfs: fix inode lookup error handling during log replay
  btrfs: fix iteration of extrefs during log replay
  btrfs: fix missing error handling when searching for inode refs during log replay
  btrfs: fix failure to rebuild free space tree using multiple transactions

+131 -88
+2
fs/btrfs/block-group.h
··· 83 83 BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, 84 84 /* Does the block group need to be added to the free space tree? */ 85 85 BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, 86 + /* Set after we add a new block group to the free space tree. */ 87 + BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, 86 88 /* Indicate that the block group is placed on a sequential zone */ 87 89 BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, 88 90 /*
+40
fs/btrfs/free-space-tree.c
··· 1241 1241 { 1242 1242 BTRFS_PATH_AUTO_FREE(path); 1243 1243 struct btrfs_key key; 1244 + struct rb_node *node; 1244 1245 int nr; 1245 1246 int ret; 1246 1247 ··· 1268 1267 return ret; 1269 1268 1270 1269 btrfs_release_path(path); 1270 + } 1271 + 1272 + node = rb_first_cached(&trans->fs_info->block_group_cache_tree); 1273 + while (node) { 1274 + struct btrfs_block_group *bg; 1275 + 1276 + bg = rb_entry(node, struct btrfs_block_group, cache_node); 1277 + clear_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &bg->runtime_flags); 1278 + node = rb_next(node); 1279 + cond_resched(); 1271 1280 } 1272 1281 1273 1282 return 0; ··· 1369 1358 1370 1359 block_group = rb_entry(node, struct btrfs_block_group, 1371 1360 cache_node); 1361 + 1362 + if (test_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, 1363 + &block_group->runtime_flags)) 1364 + goto next; 1365 + 1372 1366 ret = populate_free_space_tree(trans, block_group); 1373 1367 if (ret) { 1374 1368 btrfs_abort_transaction(trans, ret); 1375 1369 btrfs_end_transaction(trans); 1376 1370 return ret; 1377 1371 } 1372 + next: 1378 1373 if (btrfs_should_end_transaction(trans)) { 1379 1374 btrfs_end_transaction(trans); 1380 1375 trans = btrfs_start_transaction(free_space_root, 1); ··· 1406 1389 int ret; 1407 1390 1408 1391 clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags); 1392 + 1393 + /* 1394 + * While rebuilding the free space tree we may allocate new metadata 1395 + * block groups while modifying the free space tree. 1396 + * 1397 + * Because during the rebuild (at btrfs_rebuild_free_space_tree()) we 1398 + * can use multiple transactions, every time btrfs_end_transaction() is 1399 + * called at btrfs_rebuild_free_space_tree() we finish the creation of 1400 + * new block groups by calling btrfs_create_pending_block_groups(), and 1401 + * that in turn calls us, through add_block_group_free_space(), to add 1402 + * a free space info item and a free space extent item for the block 1403 + * group. 
1404 + * 1405 + * Then later btrfs_rebuild_free_space_tree() may find such new block 1406 + * groups and processes them with populate_free_space_tree(), which can 1407 + * fail with EEXIST since there are already items for the block group in 1408 + * the free space tree. Notice that we say "may find" because a new 1409 + * block group may be added to the block groups rbtree in a node before 1410 + * or after the block group currently being processed by the rebuild 1411 + * process. So signal the rebuild process to skip such new block groups 1412 + * if it finds them. 1413 + */ 1414 + set_bit(BLOCK_GROUP_FLAG_FREE_SPACE_ADDED, &block_group->runtime_flags); 1409 1415 1410 1416 ret = add_new_free_space_info(trans, block_group, path); 1411 1417 if (ret)
+18 -18
fs/btrfs/inode.c
··· 4710 4710 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; 4711 4711 int ret = 0; 4712 4712 struct btrfs_trans_handle *trans; 4713 - u64 last_unlink_trans; 4714 4713 struct fscrypt_name fname; 4715 4714 4716 4715 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) ··· 4735 4736 goto out_notrans; 4736 4737 } 4737 4738 4739 + /* 4740 + * Propagate the last_unlink_trans value of the deleted dir to its 4741 + * parent directory. This is to prevent an unrecoverable log tree in the 4742 + * case we do something like this: 4743 + * 1) create dir foo 4744 + * 2) create snapshot under dir foo 4745 + * 3) delete the snapshot 4746 + * 4) rmdir foo 4747 + * 5) mkdir foo 4748 + * 6) fsync foo or some file inside foo 4749 + * 4750 + * This is because we can't unlink other roots when replaying the dir 4751 + * deletes for directory foo. 4752 + */ 4753 + if (BTRFS_I(inode)->last_unlink_trans >= trans->transid) 4754 + btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); 4755 + 4738 4756 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 4739 4757 ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); 4740 4758 goto out; ··· 4761 4745 if (ret) 4762 4746 goto out; 4763 4747 4764 - last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; 4765 - 4766 4748 /* now the directory is empty */ 4767 4749 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 4768 4750 &fname.disk_name); 4769 - if (!ret) { 4751 + if (!ret) 4770 4752 btrfs_i_size_write(BTRFS_I(inode), 0); 4771 - /* 4772 - * Propagate the last_unlink_trans value of the deleted dir to 4773 - * its parent directory. 
This is to prevent an unrecoverable 4774 - * log tree in the case we do something like this: 4775 - * 1) create dir foo 4776 - * 2) create snapshot under dir foo 4777 - * 3) delete the snapshot 4778 - * 4) rmdir foo 4779 - * 5) mkdir foo 4780 - * 6) fsync foo or some file inside foo 4781 - */ 4782 - if (last_unlink_trans >= trans->transid) 4783 - BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; 4784 - } 4785 4753 out: 4786 4754 btrfs_end_transaction(trans); 4787 4755 out_notrans:
+2 -2
fs/btrfs/ioctl.c
··· 666 666 goto out; 667 667 } 668 668 669 + btrfs_record_new_subvolume(trans, BTRFS_I(dir)); 670 + 669 671 ret = btrfs_create_new_inode(trans, &new_inode_args); 670 672 if (ret) { 671 673 btrfs_abort_transaction(trans, ret); 672 674 goto out; 673 675 } 674 - 675 - btrfs_record_new_subvolume(trans, BTRFS_I(dir)); 676 676 677 677 d_instantiate_new(dentry, new_inode_args.inode); 678 678 new_inode_args.inode = NULL;
+69 -68
fs/btrfs/tree-log.c
··· 143 143 unsigned int nofs_flag; 144 144 struct btrfs_inode *inode; 145 145 146 + /* Only meant to be called for subvolume roots and not for log roots. */ 147 + ASSERT(is_fstree(btrfs_root_id(root))); 148 + 146 149 /* 147 150 * We're holding a transaction handle whether we are logging or 148 151 * replaying a log tree, so we must make sure NOFS semantics apply ··· 607 604 return 0; 608 605 } 609 606 610 - /* 611 - * simple helper to read an inode off the disk from a given root 612 - * This can only be called for subvolume roots and not for the log 613 - */ 614 - static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, 615 - u64 objectid) 616 - { 617 - struct btrfs_inode *inode; 618 - 619 - inode = btrfs_iget_logging(objectid, root); 620 - if (IS_ERR(inode)) 621 - return NULL; 622 - return inode; 623 - } 624 - 625 607 /* replays a single extent in 'eb' at 'slot' with 'key' into the 626 608 * subvolume 'root'. path is released on entry and should be released 627 609 * on exit. 
··· 662 674 return -EUCLEAN; 663 675 } 664 676 665 - inode = read_one_inode(root, key->objectid); 666 - if (!inode) 667 - return -EIO; 677 + inode = btrfs_iget_logging(key->objectid, root); 678 + if (IS_ERR(inode)) 679 + return PTR_ERR(inode); 668 680 669 681 /* 670 682 * first check to see if we already have this extent in the ··· 936 948 937 949 btrfs_release_path(path); 938 950 939 - inode = read_one_inode(root, location.objectid); 940 - if (!inode) { 941 - ret = -EIO; 951 + inode = btrfs_iget_logging(location.objectid, root); 952 + if (IS_ERR(inode)) { 953 + ret = PTR_ERR(inode); 954 + inode = NULL; 942 955 goto out; 943 956 } 944 957 ··· 1062 1073 search_key.type = BTRFS_INODE_REF_KEY; 1063 1074 search_key.offset = parent_objectid; 1064 1075 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 1065 - if (ret == 0) { 1076 + if (ret < 0) { 1077 + return ret; 1078 + } else if (ret == 0) { 1066 1079 struct btrfs_inode_ref *victim_ref; 1067 1080 unsigned long ptr; 1068 1081 unsigned long ptr_end; ··· 1137 1146 struct fscrypt_str victim_name; 1138 1147 1139 1148 extref = (struct btrfs_inode_extref *)(base + cur_offset); 1149 + victim_name.len = btrfs_inode_extref_name_len(leaf, extref); 1140 1150 1141 1151 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) 1142 1152 goto next; 1143 1153 1144 1154 ret = read_alloc_one_name(leaf, &extref->name, 1145 - btrfs_inode_extref_name_len(leaf, extref), 1146 - &victim_name); 1155 + victim_name.len, &victim_name); 1147 1156 if (ret) 1148 1157 return ret; 1149 1158 ··· 1158 1167 kfree(victim_name.name); 1159 1168 return ret; 1160 1169 } else if (!ret) { 1161 - ret = -ENOENT; 1162 - victim_parent = read_one_inode(root, 1163 - parent_objectid); 1164 - if (victim_parent) { 1170 + victim_parent = btrfs_iget_logging(parent_objectid, root); 1171 + if (IS_ERR(victim_parent)) { 1172 + ret = PTR_ERR(victim_parent); 1173 + } else { 1165 1174 inc_nlink(&inode->vfs_inode); 1166 1175 btrfs_release_path(path); 1167 1176 
··· 1306 1315 struct btrfs_inode *dir; 1307 1316 1308 1317 btrfs_release_path(path); 1309 - dir = read_one_inode(root, parent_id); 1310 - if (!dir) { 1311 - ret = -ENOENT; 1318 + dir = btrfs_iget_logging(parent_id, root); 1319 + if (IS_ERR(dir)) { 1320 + ret = PTR_ERR(dir); 1312 1321 kfree(name.name); 1313 1322 goto out; 1314 1323 } ··· 1380 1389 * copy the back ref in. The link count fixup code will take 1381 1390 * care of the rest 1382 1391 */ 1383 - dir = read_one_inode(root, parent_objectid); 1384 - if (!dir) { 1385 - ret = -ENOENT; 1392 + dir = btrfs_iget_logging(parent_objectid, root); 1393 + if (IS_ERR(dir)) { 1394 + ret = PTR_ERR(dir); 1395 + dir = NULL; 1386 1396 goto out; 1387 1397 } 1388 1398 1389 - inode = read_one_inode(root, inode_objectid); 1390 - if (!inode) { 1391 - ret = -EIO; 1399 + inode = btrfs_iget_logging(inode_objectid, root); 1400 + if (IS_ERR(inode)) { 1401 + ret = PTR_ERR(inode); 1402 + inode = NULL; 1392 1403 goto out; 1393 1404 } 1394 1405 ··· 1402 1409 * parent object can change from one array 1403 1410 * item to another. 
1404 1411 */ 1405 - if (!dir) 1406 - dir = read_one_inode(root, parent_objectid); 1407 1412 if (!dir) { 1408 - ret = -ENOENT; 1409 - goto out; 1413 + dir = btrfs_iget_logging(parent_objectid, root); 1414 + if (IS_ERR(dir)) { 1415 + ret = PTR_ERR(dir); 1416 + dir = NULL; 1417 + goto out; 1418 + } 1410 1419 } 1411 1420 } else { 1412 1421 ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); ··· 1677 1682 break; 1678 1683 1679 1684 btrfs_release_path(path); 1680 - inode = read_one_inode(root, key.offset); 1681 - if (!inode) { 1682 - ret = -EIO; 1685 + inode = btrfs_iget_logging(key.offset, root); 1686 + if (IS_ERR(inode)) { 1687 + ret = PTR_ERR(inode); 1683 1688 break; 1684 1689 } 1685 1690 ··· 1715 1720 struct btrfs_inode *inode; 1716 1721 struct inode *vfs_inode; 1717 1722 1718 - inode = read_one_inode(root, objectid); 1719 - if (!inode) 1720 - return -EIO; 1723 + inode = btrfs_iget_logging(objectid, root); 1724 + if (IS_ERR(inode)) 1725 + return PTR_ERR(inode); 1721 1726 1722 1727 vfs_inode = &inode->vfs_inode; 1723 1728 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; ··· 1756 1761 struct btrfs_inode *dir; 1757 1762 int ret; 1758 1763 1759 - inode = read_one_inode(root, location->objectid); 1760 - if (!inode) 1761 - return -ENOENT; 1764 + inode = btrfs_iget_logging(location->objectid, root); 1765 + if (IS_ERR(inode)) 1766 + return PTR_ERR(inode); 1762 1767 1763 - dir = read_one_inode(root, dirid); 1764 - if (!dir) { 1768 + dir = btrfs_iget_logging(dirid, root); 1769 + if (IS_ERR(dir)) { 1765 1770 iput(&inode->vfs_inode); 1766 - return -EIO; 1771 + return PTR_ERR(dir); 1767 1772 } 1768 1773 1769 1774 ret = btrfs_add_link(trans, dir, inode, name, 1, index); ··· 1840 1845 bool update_size = true; 1841 1846 bool name_added = false; 1842 1847 1843 - dir = read_one_inode(root, key->objectid); 1844 - if (!dir) 1845 - return -EIO; 1848 + dir = btrfs_iget_logging(key->objectid, root); 1849 + if (IS_ERR(dir)) 1850 + return PTR_ERR(dir); 1846 1851 1847 1852 ret = 
read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); 1848 1853 if (ret) ··· 2142 2147 btrfs_dir_item_key_to_cpu(eb, di, &location); 2143 2148 btrfs_release_path(path); 2144 2149 btrfs_release_path(log_path); 2145 - inode = read_one_inode(root, location.objectid); 2146 - if (!inode) { 2147 - ret = -EIO; 2150 + inode = btrfs_iget_logging(location.objectid, root); 2151 + if (IS_ERR(inode)) { 2152 + ret = PTR_ERR(inode); 2153 + inode = NULL; 2148 2154 goto out; 2149 2155 } 2150 2156 ··· 2297 2301 if (!log_path) 2298 2302 return -ENOMEM; 2299 2303 2300 - dir = read_one_inode(root, dirid); 2301 - /* it isn't an error if the inode isn't there, that can happen 2302 - * because we replay the deletes before we copy in the inode item 2303 - * from the log 2304 + dir = btrfs_iget_logging(dirid, root); 2305 + /* 2306 + * It isn't an error if the inode isn't there, that can happen because 2307 + * we replay the deletes before we copy in the inode item from the log. 2304 2308 */ 2305 - if (!dir) { 2309 + if (IS_ERR(dir)) { 2306 2310 btrfs_free_path(log_path); 2307 - return 0; 2311 + ret = PTR_ERR(dir); 2312 + if (ret == -ENOENT) 2313 + ret = 0; 2314 + return ret; 2308 2315 } 2309 2316 2310 2317 range_start = 0; ··· 2466 2467 struct btrfs_inode *inode; 2467 2468 u64 from; 2468 2469 2469 - inode = read_one_inode(root, key.objectid); 2470 - if (!inode) { 2471 - ret = -EIO; 2470 + inode = btrfs_iget_logging(key.objectid, root); 2471 + if (IS_ERR(inode)) { 2472 + ret = PTR_ERR(inode); 2472 2473 break; 2473 2474 } 2474 2475 from = ALIGN(i_size_read(&inode->vfs_inode), ··· 7447 7448 * full log sync. 7448 7449 * Also we don't need to worry with renames, since btrfs_rename() marks the log 7449 7450 * for full commit when renaming a subvolume. 7451 + * 7452 + * Must be called before creating the subvolume entry in its parent directory. 7450 7453 */ 7451 7454 void btrfs_record_new_subvolume(const struct btrfs_trans_handle *trans, 7452 7455 struct btrfs_inode *dir)