Merge tag 'for-6.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

+19 -5

fs/btrfs/extent_io.c

··· 1512 1512 1513 1513 /* 1514 1514 * Return 0 if we have submitted or queued the sector for submission. 1515 - * Return <0 for critical errors. 1515 + * Return <0 for critical errors, and the sector will have its dirty flag cleared. 1516 1516 * 1517 1517 * Caller should make sure filepos < i_size and handle filepos >= i_size case. 1518 1518 */ ··· 1535 1535 ASSERT(filepos < i_size); 1536 1536 1537 1537 em = btrfs_get_extent(inode, NULL, filepos, sectorsize); 1538 - if (IS_ERR(em)) 1538 + if (IS_ERR(em)) { 1539 + /* 1540 + * When submission failed, we should still clear the folio dirty. 1541 + * Or the folio will be written back again but without any 1542 + * ordered extent. 1543 + */ 1544 + btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); 1545 + btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); 1546 + btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); 1539 1547 return PTR_ERR(em); 1548 + } 1540 1549 1541 1550 extent_offset = filepos - em->start; 1542 1551 em_end = btrfs_extent_map_end(em); ··· 1618 1609 folio_unlock(folio); 1619 1610 return 1; 1620 1611 } 1621 - if (ret < 0) 1612 + if (ret < 0) { 1613 + btrfs_folio_clear_dirty(fs_info, folio, start, len); 1614 + btrfs_folio_set_writeback(fs_info, folio, start, len); 1615 + btrfs_folio_clear_writeback(fs_info, folio, start, len); 1622 1616 return ret; 1617 + } 1623 1618 1624 1619 for (cur = start; cur < start + len; cur += fs_info->sectorsize) 1625 1620 set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); ··· 1679 1666 * Here we set writeback and clear for the range. If the full folio 1680 1667 * is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag. 1681 1668 * 1682 - * If we hit any error, the corresponding sector will still be dirty 1683 - * thus no need to clear PAGECACHE_TAG_DIRTY. 1669 + * If we hit any error, the corresponding sector will have its dirty 1670 + * flag cleared and writeback finished, thus no need to handle the error case. 1684 1671 */ 1685 1672 if (!submitted_io && !error) { 1686 1673 btrfs_folio_set_writeback(fs_info, folio, start, len); ··· 1826 1813 xas_load(&xas); 1827 1814 xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); 1828 1815 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); 1816 + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 1829 1817 xas_unlock_irqrestore(&xas, flags); 1830 1818 1831 1819 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);

+19 -10

fs/btrfs/inode.c

··· 4189 4189 return ret; 4190 4190 } 4191 4191 4192 + static void update_time_after_link_or_unlink(struct btrfs_inode *dir) 4193 + { 4194 + struct timespec64 now; 4195 + 4196 + /* 4197 + * If we are replaying a log tree, we do not want to update the mtime 4198 + * and ctime of the parent directory with the current time, since the 4199 + * log replay procedure is responsible for setting them to their correct 4200 + * values (the ones it had when the fsync was done). 4201 + */ 4202 + if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags)) 4203 + return; 4204 + 4205 + now = inode_set_ctime_current(&dir->vfs_inode); 4206 + inode_set_mtime_to_ts(&dir->vfs_inode, now); 4207 + } 4208 + 4192 4209 /* 4193 4210 * unlink helper that gets used here in inode.c and in the tree logging 4194 4211 * recovery code. It remove a link in a directory with a given name, and ··· 4306 4289 inode_inc_iversion(&inode->vfs_inode); 4307 4290 inode_set_ctime_current(&inode->vfs_inode); 4308 4291 inode_inc_iversion(&dir->vfs_inode); 4309 - inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); 4292 + update_time_after_link_or_unlink(dir); 4310 4293 4311 4294 return btrfs_update_inode(trans, dir); 4312 4295 } ··· 6700 6683 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + 6701 6684 name->len * 2); 6702 6685 inode_inc_iversion(&parent_inode->vfs_inode); 6703 - /* 6704 - * If we are replaying a log tree, we do not want to update the mtime 6705 - * and ctime of the parent directory with the current time, since the 6706 - * log replay procedure is responsible for setting them to their correct 6707 - * values (the ones it had when the fsync was done). 6708 - */ 6709 - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) 6710 - inode_set_mtime_to_ts(&parent_inode->vfs_inode, 6711 - inode_set_ctime_current(&parent_inode->vfs_inode)); 6686 + update_time_after_link_or_unlink(parent_inode); 6712 6687 6713 6688 ret = btrfs_update_inode(trans, parent_inode); 6714 6689 if (ret)

+18 -1

fs/btrfs/subpage.c

··· 448 448 449 449 spin_lock_irqsave(&bfs->lock, flags); 450 450 bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits); 451 + 452 + /* 453 + * Don't clear the TOWRITE tag when starting writeback on a still-dirty 454 + * folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it, 455 + * assume writeback is complete, and exit too early — violating sync 456 + * ordering guarantees. 457 + */ 451 458 if (!folio_test_writeback(folio)) 452 - folio_start_writeback(folio); 459 + __folio_start_writeback(folio, true); 460 + if (!folio_test_dirty(folio)) { 461 + struct address_space *mapping = folio_mapping(folio); 462 + XA_STATE(xas, &mapping->i_pages, folio->index); 463 + unsigned long flags; 464 + 465 + xas_lock_irqsave(&xas, flags); 466 + xas_load(&xas); 467 + xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); 468 + xas_unlock_irqrestore(&xas, flags); 469 + } 453 470 spin_unlock_irqrestore(&bfs->lock, flags); 454 471 } 455 472

+8 -5

fs/btrfs/super.c

··· 88 88 refcount_t refs; 89 89 }; 90 90 91 + static void btrfs_emit_options(struct btrfs_fs_info *info, 92 + struct btrfs_fs_context *old); 93 + 91 94 enum { 92 95 Opt_acl, 93 96 Opt_clear_cache, ··· 701 698 702 699 if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { 703 700 if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { 704 - btrfs_info(info, "disk space caching is enabled"); 705 701 btrfs_warn(info, 706 702 "space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2"); 707 703 } 708 - if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) 709 - btrfs_info(info, "using free-space-tree"); 710 704 } 711 705 712 706 return ret; ··· 979 979 btrfs_err(fs_info, "open_ctree failed: %d", ret); 980 980 return ret; 981 981 } 982 + 983 + btrfs_emit_options(fs_info, NULL); 982 984 983 985 inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); 984 986 if (IS_ERR(inode)) { ··· 1439 1437 { 1440 1438 btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); 1441 1439 btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); 1442 - btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); 1440 + btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); 1443 1441 btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); 1444 1442 btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); 1445 1443 btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); ··· 1461 1459 btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums"); 1462 1460 btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags"); 1463 1461 1462 + btrfs_info_if_unset(info, old, NODATASUM, "setting datasum"); 1464 1463 btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); 1465 1464 btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); 1466 1465 btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); 1467 - btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers"); 1466 + btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); 1468 1467 btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); 1469 1468 btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); 1470 1469 btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");

+99 -34

fs/btrfs/zoned.c

··· 17 17 #include "accessors.h" 18 18 #include "bio.h" 19 19 #include "transaction.h" 20 + #include "sysfs.h" 20 21 21 22 /* Maximum number of zones to report per blkdev_report_zones() call */ 22 23 #define BTRFS_REPORT_NR_ZONES 4096 ··· 42 41 43 42 /* Number of superblock log zones */ 44 43 #define BTRFS_NR_SB_LOG_ZONES 2 44 + 45 + /* Default number of max active zones when the device has no limits. */ 46 + #define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128 45 47 46 48 /* 47 49 * Minimum of active zones we need: ··· 420 416 if (!IS_ALIGNED(nr_sectors, zone_sectors)) 421 417 zone_info->nr_zones++; 422 418 423 - max_active_zones = bdev_max_active_zones(bdev); 419 + max_active_zones = min_not_zero(bdev_max_active_zones(bdev), 420 + bdev_max_open_zones(bdev)); 421 + if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES) 422 + max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES; 424 423 if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { 425 424 btrfs_err(fs_info, 426 425 "zoned: %s: max active zones %u is too small, need at least %u active zones", ··· 2175 2168 goto out_unlock; 2176 2169 } 2177 2170 2178 - /* No space left */ 2179 - if (btrfs_zoned_bg_is_full(block_group)) { 2180 - ret = false; 2181 - goto out_unlock; 2171 + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { 2172 + /* The caller should check if the block group is full. */ 2173 + if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) { 2174 + ret = false; 2175 + goto out_unlock; 2176 + } 2177 + } else { 2178 + /* Since it is already written, it should have been active. */ 2179 + WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start); 2182 2180 } 2183 2181 2184 2182 for (i = 0; i < map->num_stripes; i++) { ··· 2242 2230 struct btrfs_fs_info *fs_info = block_group->fs_info; 2243 2231 const u64 end = block_group->start + block_group->length; 2244 2232 struct extent_buffer *eb; 2245 - unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); 2233 + unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); 2246 2234 2247 2235 rcu_read_lock(); 2248 2236 xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { ··· 2255 2243 rcu_read_lock(); 2256 2244 } 2257 2245 rcu_read_unlock(); 2246 + } 2247 + 2248 + static int call_zone_finish(struct btrfs_block_group *block_group, 2249 + struct btrfs_io_stripe *stripe) 2250 + { 2251 + struct btrfs_device *device = stripe->dev; 2252 + const u64 physical = stripe->physical; 2253 + struct btrfs_zoned_device_info *zinfo = device->zone_info; 2254 + int ret; 2255 + 2256 + if (!device->bdev) 2257 + return 0; 2258 + 2259 + if (zinfo->max_active_zones == 0) 2260 + return 0; 2261 + 2262 + if (btrfs_dev_is_sequential(device, physical)) { 2263 + unsigned int nofs_flags; 2264 + 2265 + nofs_flags = memalloc_nofs_save(); 2266 + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 2267 + physical >> SECTOR_SHIFT, 2268 + zinfo->zone_size >> SECTOR_SHIFT); 2269 + memalloc_nofs_restore(nofs_flags); 2270 + 2271 + if (ret) 2272 + return ret; 2273 + } 2274 + 2275 + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 2276 + zinfo->reserved_active_zones++; 2277 + btrfs_dev_clear_active_zone(device, physical); 2278 + 2279 + return 0; 2258 2280 } 2259 2281 2260 2282 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) ··· 2375 2329 down_read(&dev_replace->rwsem); 2376 2330 map = block_group->physical_map; 2377 2331 for (i = 0; i < map->num_stripes; i++) { 2378 - struct btrfs_device *device = map->stripes[i].dev; 2379 - const u64 physical = map->stripes[i].physical; 2380 - struct btrfs_zoned_device_info *zinfo = device->zone_info; 2381 - unsigned int nofs_flags; 2382 2332 2383 - if (!device->bdev) 2384 - continue; 2385 - 2386 - if (zinfo->max_active_zones == 0) 2387 - continue; 2388 - 2389 - nofs_flags = memalloc_nofs_save(); 2390 - ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 2391 - physical >> SECTOR_SHIFT, 2392 - zinfo->zone_size >> SECTOR_SHIFT); 2393 - memalloc_nofs_restore(nofs_flags); 2394 - 2333 + ret = call_zone_finish(block_group, &map->stripes[i]); 2395 2334 if (ret) { 2396 2335 up_read(&dev_replace->rwsem); 2397 2336 return ret; 2398 2337 } 2399 - 2400 - if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 2401 - zinfo->reserved_active_zones++; 2402 - btrfs_dev_clear_active_zone(device, physical); 2403 2338 } 2404 2339 up_read(&dev_replace->rwsem); 2405 2340 ··· 2531 2504 void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) 2532 2505 { 2533 2506 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 2534 - struct btrfs_space_info *space_info = data_sinfo->sub_group[0]; 2507 + struct btrfs_space_info *space_info = data_sinfo; 2535 2508 struct btrfs_trans_handle *trans; 2536 2509 struct btrfs_block_group *bg; 2537 2510 struct list_head *bg_list; 2538 2511 u64 alloc_flags; 2539 - bool initial = false; 2512 + bool first = true; 2540 2513 bool did_chunk_alloc = false; 2541 2514 int index; 2542 2515 int ret; ··· 2550 2523 if (sb_rdonly(fs_info->sb)) 2551 2524 return; 2552 2525 2553 - ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); 2554 2526 alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); 2555 2527 index = btrfs_bg_flags_to_raid_index(alloc_flags); 2556 2528 2557 - bg_list = &data_sinfo->block_groups[index]; 2529 + /* Scan the data space_info to find empty block groups. Take the second one. */ 2558 2530 again: 2531 + bg_list = &space_info->block_groups[index]; 2559 2532 list_for_each_entry(bg, bg_list, list) { 2560 - if (bg->used > 0) 2533 + if (bg->alloc_offset != 0) 2561 2534 continue; 2562 2535 2563 - if (!initial) { 2564 - initial = true; 2536 + if (first) { 2537 + first = false; 2565 2538 continue; 2539 + } 2540 + 2541 + if (space_info == data_sinfo) { 2542 + /* Migrate the block group to the data relocation space_info. */ 2543 + struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0]; 2544 + int factor; 2545 + 2546 + ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); 2547 + factor = btrfs_bg_type_to_factor(bg->flags); 2548 + 2549 + down_write(&space_info->groups_sem); 2550 + list_del_init(&bg->list); 2551 + /* We can assume this as we choose the second empty one. */ 2552 + ASSERT(!list_empty(&space_info->block_groups[index])); 2553 + up_write(&space_info->groups_sem); 2554 + 2555 + spin_lock(&space_info->lock); 2556 + space_info->total_bytes -= bg->length; 2557 + space_info->disk_total -= bg->length * factor; 2558 + /* There is no allocation ever happened. */ 2559 + ASSERT(bg->used == 0); 2560 + ASSERT(bg->zone_unusable == 0); 2561 + /* No super block in a block group on the zoned setup. */ 2562 + ASSERT(bg->bytes_super == 0); 2563 + spin_unlock(&space_info->lock); 2564 + 2565 + bg->space_info = reloc_sinfo; 2566 + if (reloc_sinfo->block_group_kobjs[index] == NULL) 2567 + btrfs_sysfs_add_block_group_type(bg); 2568 + 2569 + btrfs_add_bg_to_space_info(fs_info, bg); 2566 2570 } 2567 2571 2568 2572 fs_info->data_reloc_bg = bg->start; ··· 2610 2552 if (IS_ERR(trans)) 2611 2553 return; 2612 2554 2555 + /* Allocate new BG in the data relocation space_info. */ 2556 + space_info = data_sinfo->sub_group[0]; 2557 + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); 2613 2558 ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); 2614 2559 btrfs_end_transaction(trans); 2615 2560 if (ret == 1) { 2561 + /* 2562 + * We allocated a new block group in the data relocation space_info. We 2563 + * can take that one. 2564 + */ 2565 + first = false; 2616 2566 did_chunk_alloc = true; 2617 - bg_list = &space_info->block_groups[index]; 2618 2567 goto again; 2619 2568 } 2620 2569 }

Configure Feed

Configure Feed