Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge patch series "allow file systems to increase the minimum writeback chunk size v2"

Christoph Hellwig <hch@lst.de> says:

The relatively low minimal writeback size of 4MiB means that
written back inodes on rotational media are switched a lot. Besides
introducing additional seeks, this also can lead to extreme file
fragmentation on zoned devices when a lot of files are cached relative
to the available writeback bandwidth.

Add a superblock field that allows the file system to override the
default size, and set it to the zone size for zoned XFS.

* patches from https://patch.msgid.link/20251017034611.651385-1-hch@lst.de:
xfs: set s_min_writeback_pages for zoned file systems
writeback: allow the file system to override MIN_WRITEBACK_PAGES
writeback: cleanup writeback_chunk_size

Link: https://patch.msgid.link/20251017034611.651385-1-hch@lst.de
Signed-off-by: Christian Brauner <brauner@kernel.org>

+42 -19
+9 -17
fs/fs-writeback.c
··· 33 33 #include "internal.h" 34 34 35 35 /* 36 - * 4MB minimal write chunk size 37 - */ 38 - #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) 39 - 40 - /* 41 36 * Passed into wb_writeback(), essentially a subset of writeback_control 42 37 */ 43 38 struct wb_writeback_work { ··· 1884 1889 return ret; 1885 1890 } 1886 1891 1887 - static long writeback_chunk_size(struct bdi_writeback *wb, 1888 - struct wb_writeback_work *work) 1892 + static long writeback_chunk_size(struct super_block *sb, 1893 + struct bdi_writeback *wb, struct wb_writeback_work *work) 1889 1894 { 1890 1895 long pages; 1891 1896 ··· 1903 1908 * (maybe slowly) sync all tagged pages 1904 1909 */ 1905 1910 if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) 1906 - pages = LONG_MAX; 1907 - else { 1908 - pages = min(wb->avg_write_bandwidth / 2, 1909 - global_wb_domain.dirty_limit / DIRTY_SCOPE); 1910 - pages = min(pages, work->nr_pages); 1911 - pages = round_down(pages + MIN_WRITEBACK_PAGES, 1912 - MIN_WRITEBACK_PAGES); 1913 - } 1911 + return LONG_MAX; 1914 1912 1915 - return pages; 1913 + pages = min(wb->avg_write_bandwidth / 2, 1914 + global_wb_domain.dirty_limit / DIRTY_SCOPE); 1915 + pages = min(pages, work->nr_pages); 1916 + return round_down(pages + sb->s_min_writeback_pages, 1917 + sb->s_min_writeback_pages); 1916 1918 } 1917 1919 1918 1920 /* ··· 2011 2019 inode->i_state |= I_SYNC; 2012 2020 wbc_attach_and_unlock_inode(&wbc, inode); 2013 2021 2014 - write_chunk = writeback_chunk_size(wb, work); 2022 + write_chunk = writeback_chunk_size(inode->i_sb, wb, work); 2015 2023 wbc.nr_to_write = write_chunk; 2016 2024 wbc.pages_skipped = 0; 2017 2025
+1
fs/super.c
··· 389 389 goto fail; 390 390 if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) 391 391 goto fail; 392 + s->s_min_writeback_pages = MIN_WRITEBACK_PAGES; 392 393 return s; 393 394 394 395 fail:
+26 -2
fs/xfs/xfs_zone_alloc.c
··· 1215 1215 .mp = mp, 1216 1216 }; 1217 1217 struct xfs_buftarg *bt = mp->m_rtdev_targp; 1218 + xfs_extlen_t zone_blocks = mp->m_groups[XG_TYPE_RTG].blocks; 1218 1219 int error; 1219 1220 1220 1221 if (!bt) { ··· 1246 1245 return -ENOMEM; 1247 1246 1248 1247 xfs_info(mp, "%u zones of %u blocks (%u max open zones)", 1249 - mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, 1250 - mp->m_max_open_zones); 1248 + mp->m_sb.sb_rgcount, zone_blocks, mp->m_max_open_zones); 1251 1249 trace_xfs_zones_mount(mp); 1250 + 1251 + /* 1252 + * The writeback code switches between inodes regularly to provide 1253 + * fairness. The default lower bound is 4MiB, but for zoned file 1254 + * systems we want to increase that both to reduce seeks, but also more 1255 + * importantly so that workloads that writes files in a multiple of the 1256 + * zone size do not get fragmented and require garbage collection when 1257 + * they shouldn't. Increase is to the zone size capped by the max 1258 + * extent len. 1259 + * 1260 + * Note that because s_min_writeback_pages is a superblock field, this 1261 + * value also get applied to non-zoned files on the data device if 1262 + * there are any. On typical zoned setup all data is on the RT device 1263 + * because using the more efficient sequential write required zones 1264 + * is the reason for using the zone allocator, and either the RT device 1265 + * and the (meta)data device are on the same block device, or the 1266 + * (meta)data device is on a fast SSD while the data on the RT device 1267 + * is on a SMR HDD. In any combination of the above cases enforcing 1268 + * the higher min_writeback_pages for non-RT inodes is either a noop 1269 + * or beneficial. 1270 + */ 1271 + mp->m_super->s_min_writeback_pages = 1272 + XFS_FSB_TO_B(mp, min(zone_blocks, XFS_MAX_BMBT_EXTLEN)) >> 1273 + PAGE_SHIFT; 1252 1274 1253 1275 if (bdev_is_zoned(bt->bt_bdev)) { 1254 1276 error = blkdev_report_zones(bt->bt_bdev,
+1
include/linux/fs.h
··· 1583 1583 1584 1584 spinlock_t s_inode_wblist_lock; 1585 1585 struct list_head s_inodes_wb; /* writeback inodes */ 1586 + long s_min_writeback_pages; 1586 1587 } __randomize_layout; 1587 1588 1588 1589 static inline struct user_namespace *i_user_ns(const struct inode *inode)
+5
include/linux/writeback.h
··· 374 374 void sb_mark_inode_writeback(struct inode *inode); 375 375 void sb_clear_inode_writeback(struct inode *inode); 376 376 377 + /* 378 + * 4MB minimal write chunk size 379 + */ 380 + #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) 381 + 377 382 #endif /* WRITEBACK_H */