Merge tag 'vfs-6.12.blocksize' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs

Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

kernel os linux

Pull vfs blocksize updates from Christian Brauner:
"This contains the vfs infrastructure as well as the xfs bits to enable
support for block sizes (bs) larger than page sizes (ps) plus a few
fixes to related infrastructure.

There have been efforts over the last 16 years to enable Large
Block Sizes (LBS), that is block sizes in filesystems where bs > page
size. Through these efforts we have learned that one of the main
blockers to supporting bs > ps in filesystems has been a way to
allocate pages that are at least the filesystem block size on the page
cache where bs > ps.

Thanks to various previous efforts it is possible to support bs > ps
in XFS with only a few changes in XFS itself. Most changes are to the
page cache to support minimum order folio support for the target block
size on the filesystem.

A motivation for Large Block Sizes today is to support high-capacity
(large amount of Terabytes) QLC SSDs where the internal Indirection
Unit (IU) is typically greater than 4k to help reduce DRAM and so in
turn cost and space. In practice this then allows different
architectures to use a base page size of 4k while still enabling
support for block sizes aligned to the larger IUs by relying on high
order folios on the page cache when needed.

It also allows taking advantage of the drive's support for atomics
larger than 4k with buffered IO support in Linux. As described this
year at LSFMM, supporting large atomics greater than 4k enables
databases to remove the need to rely on their own journaling, so they
can disable double buffered writes, which is a feature different cloud
providers are already enabling through custom storage solutions"

* tag 'vfs-6.12.blocksize' of gitolite.kernel.org:pub/scm/linux/kernel/git/vfs/vfs: (22 commits)
Documentation: iomap: fix a typo
iomap: remove the iomap_file_buffered_write_punch_delalloc return value
iomap: pass the iomap to the punch callback
iomap: pass flags to iomap_file_buffered_write_punch_delalloc
iomap: improve shared block detection in iomap_unshare_iter
iomap: handle a post-direct I/O invalidate race in iomap_write_delalloc_release
docs:filesystems: fix spelling and grammar mistakes in iomap design page
filemap: fix htmldoc warning for mapping_align_index()
iomap: make zero range flush conditional on unwritten mappings
iomap: fix handling of dirty folios over unwritten extents
iomap: add a private argument for iomap_file_buffered_write
iomap: remove set_memory_ro() on zero page
xfs: enable block size larger than page size support
xfs: make the calculation generic in xfs_sb_validate_fsb_count()
xfs: expose block size in stat
xfs: use kvmalloc for xattr buffers
iomap: fix iomap_dio_zero() for fs bs > system page size
filemap: cap PTE range to be created to allowed zero fill in folio_map_range()
mm: split a folio in minimum folio order chunks
readahead: allocate folios with mapping_min_order in readahead
...

Linus Torvalds 2 years ago 171754c3 baeb9a7d

+504 -194

21 changed files

expand all

Documentation

filesystems

iomap

design.rst

block

fops.c

gfs2

file.c

iomap

buffered-io.c

direct-io.c

xfs

libxfs

xfs_attr_leaf.c

xfs_ialloc.c

xfs_shared.h

xfs_file.c

xfs_icache.c

xfs_iomap.c

xfs_iops.c

xfs_mount.c

xfs_super.c

zonefs

file.c

include

linux

huge_mm.h

iomap.h

pagemap.h

filemap.c

huge_memory.c

readahead.c

+2 -2

Documentation/filesystems/iomap/design.rst

··· 165 165 u16 flags; 166 166 struct block_device *bdev; 167 167 struct dax_device *dax_dev; 168 - voidw *inline_data; 168 + void *inline_data; 169 169 void *private; 170 170 const struct iomap_folio_ops *folio_ops; 171 171 u64 validity_cookie; ··· 426 426 427 427 The exact locking requirements are specific to the filesystem; for 428 428 certain operations, some of these locks can be elided. 429 - All further mention of locking are *recommendations*, not mandates. 429 + All further mentions of locking are *recommendations*, not mandates. 430 430 Each filesystem author must figure out the locking for themself. 431 431 432 432 Bugs and Limitations

+1 -1

block/fops.c

··· 666 666 667 667 static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) 668 668 { 669 - return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); 669 + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops, NULL); 670 670 } 671 671 672 672 /*

+1 -1

fs/gfs2/file.c

··· 1057 1057 } 1058 1058 1059 1059 pagefault_disable(); 1060 - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); 1060 + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops, NULL); 1061 1061 pagefault_enable(); 1062 1062 if (ret > 0) 1063 1063 written += ret;

+123 -76

fs/iomap/buffered-io.c

··· 23 23 24 24 #define IOEND_BATCH_SIZE 4096 25 25 26 - typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); 27 26 /* 28 27 * Structure allocated for each folio to track per-block uptodate, dirty state 29 28 * and I/O completions. ··· 1021 1022 1022 1023 ssize_t 1023 1024 iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, 1024 - const struct iomap_ops *ops) 1025 + const struct iomap_ops *ops, void *private) 1025 1026 { 1026 1027 struct iomap_iter iter = { 1027 1028 .inode = iocb->ki_filp->f_mapping->host, 1028 1029 .pos = iocb->ki_pos, 1029 1030 .len = iov_iter_count(i), 1030 1031 .flags = IOMAP_WRITE, 1032 + .private = private, 1031 1033 }; 1032 1034 ssize_t ret; 1033 1035 ··· 1046 1046 } 1047 1047 EXPORT_SYMBOL_GPL(iomap_file_buffered_write); 1048 1048 1049 - static int iomap_write_delalloc_ifs_punch(struct inode *inode, 1049 + static void iomap_write_delalloc_ifs_punch(struct inode *inode, 1050 1050 struct folio *folio, loff_t start_byte, loff_t end_byte, 1051 - iomap_punch_t punch) 1051 + struct iomap *iomap, iomap_punch_t punch) 1052 1052 { 1053 1053 unsigned int first_blk, last_blk, i; 1054 1054 loff_t last_byte; 1055 1055 u8 blkbits = inode->i_blkbits; 1056 1056 struct iomap_folio_state *ifs; 1057 - int ret = 0; 1058 1057 1059 1058 /* 1060 1059 * When we have per-block dirty tracking, there can be ··· 1063 1064 */ 1064 1065 ifs = folio->private; 1065 1066 if (!ifs) 1066 - return ret; 1067 + return; 1067 1068 1068 1069 last_byte = min_t(loff_t, end_byte - 1, 1069 1070 folio_pos(folio) + folio_size(folio) - 1); 1070 1071 first_blk = offset_in_folio(folio, start_byte) >> blkbits; 1071 1072 last_blk = offset_in_folio(folio, last_byte) >> blkbits; 1072 1073 for (i = first_blk; i <= last_blk; i++) { 1073 - if (!ifs_block_is_dirty(folio, ifs, i)) { 1074 - ret = punch(inode, folio_pos(folio) + (i << blkbits), 1075 - 1 << blkbits); 1076 - if (ret) 1077 - return ret; 1078 - } 1074 + if (!ifs_block_is_dirty(folio, ifs, i)) 
1075 + punch(inode, folio_pos(folio) + (i << blkbits), 1076 + 1 << blkbits, iomap); 1079 1077 } 1080 - 1081 - return ret; 1082 1078 } 1083 1079 1084 - 1085 - static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, 1080 + static void iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, 1086 1081 loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, 1087 - iomap_punch_t punch) 1082 + struct iomap *iomap, iomap_punch_t punch) 1088 1083 { 1089 - int ret = 0; 1090 - 1091 1084 if (!folio_test_dirty(folio)) 1092 - return ret; 1085 + return; 1093 1086 1094 1087 /* if dirty, punch up to offset */ 1095 1088 if (start_byte > *punch_start_byte) { 1096 - ret = punch(inode, *punch_start_byte, 1097 - start_byte - *punch_start_byte); 1098 - if (ret) 1099 - return ret; 1089 + punch(inode, *punch_start_byte, start_byte - *punch_start_byte, 1090 + iomap); 1100 1091 } 1101 1092 1102 1093 /* Punch non-dirty blocks within folio */ 1103 - ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, 1104 - end_byte, punch); 1105 - if (ret) 1106 - return ret; 1094 + iomap_write_delalloc_ifs_punch(inode, folio, start_byte, end_byte, 1095 + iomap, punch); 1107 1096 1108 1097 /* 1109 1098 * Make sure the next punch start is correctly bound to ··· 1099 1112 */ 1100 1113 *punch_start_byte = min_t(loff_t, end_byte, 1101 1114 folio_pos(folio) + folio_size(folio)); 1102 - 1103 - return ret; 1104 1115 } 1105 1116 1106 1117 /* ··· 1118 1133 * This function uses [start_byte, end_byte) intervals (i.e. open ended) to 1119 1134 * simplify range iterations. 
1120 1135 */ 1121 - static int iomap_write_delalloc_scan(struct inode *inode, 1136 + static void iomap_write_delalloc_scan(struct inode *inode, 1122 1137 loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, 1123 - iomap_punch_t punch) 1138 + struct iomap *iomap, iomap_punch_t punch) 1124 1139 { 1125 1140 while (start_byte < end_byte) { 1126 1141 struct folio *folio; 1127 - int ret; 1128 1142 1129 1143 /* grab locked page */ 1130 1144 folio = filemap_lock_folio(inode->i_mapping, ··· 1134 1150 continue; 1135 1151 } 1136 1152 1137 - ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, 1138 - start_byte, end_byte, punch); 1139 - if (ret) { 1140 - folio_unlock(folio); 1141 - folio_put(folio); 1142 - return ret; 1143 - } 1153 + iomap_write_delalloc_punch(inode, folio, punch_start_byte, 1154 + start_byte, end_byte, iomap, punch); 1144 1155 1145 1156 /* move offset to start of next folio in range */ 1146 1157 start_byte = folio_next_index(folio) << PAGE_SHIFT; 1147 1158 folio_unlock(folio); 1148 1159 folio_put(folio); 1149 1160 } 1150 - return 0; 1151 1161 } 1152 1162 1153 1163 /* ··· 1177 1199 * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose 1178 1200 * the code to subtle off-by-one bugs.... 1179 1201 */ 1180 - static int iomap_write_delalloc_release(struct inode *inode, 1181 - loff_t start_byte, loff_t end_byte, iomap_punch_t punch) 1202 + static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, 1203 + loff_t end_byte, unsigned flags, struct iomap *iomap, 1204 + iomap_punch_t punch) 1182 1205 { 1183 1206 loff_t punch_start_byte = start_byte; 1184 1207 loff_t scan_end_byte = min(i_size_read(inode), end_byte); 1185 - int error = 0; 1186 1208 1187 1209 /* 1188 1210 * Lock the mapping to avoid races with page faults re-instantiating ··· 1199 1221 /* 1200 1222 * If there is no more data to scan, all that is left is to 1201 1223 * punch out the remaining range. 
1224 + * 1225 + * Note that mapping_seek_hole_data is only supposed to return 1226 + * either an offset or -ENXIO, so WARN on any other error as 1227 + * that would be an API change without updating the callers. 1202 1228 */ 1203 1229 if (start_byte == -ENXIO || start_byte == scan_end_byte) 1204 1230 break; 1205 - if (start_byte < 0) { 1206 - error = start_byte; 1231 + if (WARN_ON_ONCE(start_byte < 0)) 1207 1232 goto out_unlock; 1208 - } 1209 1233 WARN_ON_ONCE(start_byte < punch_start_byte); 1210 1234 WARN_ON_ONCE(start_byte > scan_end_byte); 1211 1235 ··· 1217 1237 */ 1218 1238 data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, 1219 1239 scan_end_byte, SEEK_HOLE); 1220 - if (data_end < 0) { 1221 - error = data_end; 1240 + if (WARN_ON_ONCE(data_end < 0)) 1222 1241 goto out_unlock; 1223 - } 1224 - WARN_ON_ONCE(data_end <= start_byte); 1242 + 1243 + /* 1244 + * If we race with post-direct I/O invalidation of the page cache, 1245 + * there might be no data left at start_byte. 1246 + */ 1247 + if (data_end == start_byte) 1248 + continue; 1249 + 1250 + WARN_ON_ONCE(data_end < start_byte); 1225 1251 WARN_ON_ONCE(data_end > scan_end_byte); 1226 1252 1227 - error = iomap_write_delalloc_scan(inode, &punch_start_byte, 1228 - start_byte, data_end, punch); 1229 - if (error) 1230 - goto out_unlock; 1253 + iomap_write_delalloc_scan(inode, &punch_start_byte, start_byte, 1254 + data_end, iomap, punch); 1231 1255 1232 1256 /* The next data search starts at the end of this one. 
*/ 1233 1257 start_byte = data_end; 1234 1258 } 1235 1259 1236 1260 if (punch_start_byte < end_byte) 1237 - error = punch(inode, punch_start_byte, 1238 - end_byte - punch_start_byte); 1261 + punch(inode, punch_start_byte, end_byte - punch_start_byte, 1262 + iomap); 1239 1263 out_unlock: 1240 1264 filemap_invalidate_unlock(inode->i_mapping); 1241 - return error; 1242 1265 } 1243 1266 1244 1267 /* ··· 1274 1291 * ->punch 1275 1292 * internal filesystem allocation lock 1276 1293 */ 1277 - int iomap_file_buffered_write_punch_delalloc(struct inode *inode, 1278 - struct iomap *iomap, loff_t pos, loff_t length, 1279 - ssize_t written, iomap_punch_t punch) 1294 + void iomap_file_buffered_write_punch_delalloc(struct inode *inode, 1295 + loff_t pos, loff_t length, ssize_t written, unsigned flags, 1296 + struct iomap *iomap, iomap_punch_t punch) 1280 1297 { 1281 1298 loff_t start_byte; 1282 1299 loff_t end_byte; 1283 1300 unsigned int blocksize = i_blocksize(inode); 1284 1301 1285 1302 if (iomap->type != IOMAP_DELALLOC) 1286 - return 0; 1303 + return; 1287 1304 1288 1305 /* If we didn't reserve the blocks, we're not allowed to punch them. */ 1289 1306 if (!(iomap->flags & IOMAP_F_NEW)) 1290 - return 0; 1307 + return; 1291 1308 1292 1309 /* 1293 1310 * start_byte refers to the first unused block after a short write. 
If ··· 1302 1319 1303 1320 /* Nothing to do if we've written the entire delalloc extent */ 1304 1321 if (start_byte >= end_byte) 1305 - return 0; 1322 + return; 1306 1323 1307 - return iomap_write_delalloc_release(inode, start_byte, end_byte, 1308 - punch); 1324 + iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap, 1325 + punch); 1309 1326 } 1310 1327 EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); 1311 1328 1312 1329 static loff_t iomap_unshare_iter(struct iomap_iter *iter) 1313 1330 { 1314 1331 struct iomap *iomap = &iter->iomap; 1315 - const struct iomap *srcmap = iomap_iter_srcmap(iter); 1316 1332 loff_t pos = iter->pos; 1317 1333 loff_t length = iomap_length(iter); 1318 1334 loff_t written = 0; 1319 1335 1320 - /* don't bother with blocks that are not shared to start with */ 1336 + /* Don't bother with blocks that are not shared to start with. */ 1321 1337 if (!(iomap->flags & IOMAP_F_SHARED)) 1322 1338 return length; 1323 - /* don't bother with holes or unwritten extents */ 1324 - if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) 1339 + 1340 + /* 1341 + * Don't bother with holes or unwritten extents. 1342 + * 1343 + * Note that we use srcmap directly instead of iomap_iter_srcmap as 1344 + * unsharing requires providing a separate source map, and the presence 1345 + * of one is a good indicator that unsharing is needed, unlike 1346 + * IOMAP_F_SHARED which can be set for any data that goes into the COW 1347 + * fork for XFS. 1348 + */ 1349 + if (iter->srcmap.type == IOMAP_HOLE || 1350 + iter->srcmap.type == IOMAP_UNWRITTEN) 1325 1351 return length; 1326 1352 1327 1353 do { ··· 1385 1393 } 1386 1394 EXPORT_SYMBOL_GPL(iomap_file_unshare); 1387 1395 1388 - static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) 1396 + /* 1397 + * Flush the remaining range of the iter and mark the current mapping stale. 
1398 + * This is used when zero range sees an unwritten mapping that may have had 1399 + * dirty pagecache over it. 1400 + */ 1401 + static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) 1402 + { 1403 + struct address_space *mapping = i->inode->i_mapping; 1404 + loff_t end = i->pos + i->len - 1; 1405 + 1406 + i->iomap.flags |= IOMAP_F_STALE; 1407 + return filemap_write_and_wait_range(mapping, i->pos, end); 1408 + } 1409 + 1410 + static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero, 1411 + bool *range_dirty) 1389 1412 { 1390 1413 const struct iomap *srcmap = iomap_iter_srcmap(iter); 1391 1414 loff_t pos = iter->pos; 1392 1415 loff_t length = iomap_length(iter); 1393 1416 loff_t written = 0; 1394 1417 1395 - /* already zeroed? we're done. */ 1396 - if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) 1418 + /* 1419 + * We must zero subranges of unwritten mappings that might be dirty in 1420 + * pagecache from previous writes. We only know whether the entire range 1421 + * was clean or not, however, and dirty folios may have been written 1422 + * back or reclaimed at any point after mapping lookup. 1423 + * 1424 + * The easiest way to deal with this is to flush pagecache to trigger 1425 + * any pending unwritten conversions and then grab the updated extents 1426 + * from the fs. The flush may change the current mapping, so mark it 1427 + * stale for the iterator to remap it for the next pass to handle 1428 + * properly. 1429 + * 1430 + * Note that holes are treated the same as unwritten because zero range 1431 + * is (ab)used for partial folio zeroing in some cases. Hole backed 1432 + * post-eof ranges can be dirtied via mapped write and the flush 1433 + * triggers writeback time post-eof zeroing. 
1434 + */ 1435 + if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { 1436 + if (*range_dirty) { 1437 + *range_dirty = false; 1438 + return iomap_zero_iter_flush_and_stale(iter); 1439 + } 1440 + /* range is clean and already zeroed, nothing to do */ 1397 1441 return length; 1442 + } 1398 1443 1399 1444 do { 1400 1445 struct folio *folio; ··· 1479 1450 .flags = IOMAP_ZERO, 1480 1451 }; 1481 1452 int ret; 1453 + bool range_dirty; 1454 + 1455 + /* 1456 + * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but 1457 + * pagecache must be flushed to ensure stale data from previous 1458 + * buffered writes is not exposed. A flush is only required for certain 1459 + * types of mappings, but checking pagecache after mapping lookup is 1460 + * racy with writeback and reclaim. 1461 + * 1462 + * Therefore, check the entire range first and pass along whether any 1463 + * part of it is dirty. If so and an underlying mapping warrants it, 1464 + * flush the cache at that point. This trades off the occasional false 1465 + * positive (and spurious flush, if the dirty data and mapping don't 1466 + * happen to overlap) for simplicity in handling a relatively uncommon 1467 + * situation. 
1468 + */ 1469 + range_dirty = filemap_range_needs_writeback(inode->i_mapping, 1470 + pos, pos + len - 1); 1482 1471 1483 1472 while ((ret = iomap_iter(&iter, ops)) > 0) 1484 - iter.processed = iomap_zero_iter(&iter, did_zero); 1473 + iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty); 1485 1474 return ret; 1486 1475 } 1487 1476 EXPORT_SYMBOL_GPL(iomap_zero_range); ··· 2054 2007 } 2055 2008 EXPORT_SYMBOL_GPL(iomap_writepages); 2056 2009 2057 - static int __init iomap_init(void) 2010 + static int __init iomap_buffered_init(void) 2058 2011 { 2059 2012 return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), 2060 2013 offsetof(struct iomap_ioend, io_bio), 2061 2014 BIOSET_NEED_BVECS); 2062 2015 } 2063 - fs_initcall(iomap_init); 2016 + fs_initcall(iomap_buffered_init);

+36 -6

fs/iomap/direct-io.c

··· 27 27 #define IOMAP_DIO_WRITE (1U << 30) 28 28 #define IOMAP_DIO_DIRTY (1U << 31) 29 29 30 + /* 31 + * Used for sub block zeroing in iomap_dio_zero() 32 + */ 33 + #define IOMAP_ZERO_PAGE_SIZE (SZ_64K) 34 + #define IOMAP_ZERO_PAGE_ORDER (get_order(IOMAP_ZERO_PAGE_SIZE)) 35 + static struct page *zero_page; 36 + 30 37 struct iomap_dio { 31 38 struct kiocb *iocb; 32 39 const struct iomap_dio_ops *dops; ··· 239 232 } 240 233 EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); 241 234 242 - static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, 235 + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, 243 236 loff_t pos, unsigned len) 244 237 { 245 238 struct inode *inode = file_inode(dio->iocb->ki_filp); 246 - struct page *page = ZERO_PAGE(0); 247 239 struct bio *bio; 240 + 241 + if (!len) 242 + return 0; 243 + /* 244 + * Max block size supported is 64k 245 + */ 246 + if (WARN_ON_ONCE(len > IOMAP_ZERO_PAGE_SIZE)) 247 + return -EINVAL; 248 248 249 249 bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE); 250 250 fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits, ··· 260 246 bio->bi_private = dio; 261 247 bio->bi_end_io = iomap_dio_bio_end_io; 262 248 263 - __bio_add_page(bio, page, len, 0); 249 + __bio_add_page(bio, zero_page, len, 0); 264 250 iomap_dio_submit_bio(iter, dio, bio, pos); 251 + return 0; 265 252 } 266 253 267 254 /* ··· 371 356 if (need_zeroout) { 372 357 /* zero out from the start of the block to the write offset */ 373 358 pad = pos & (fs_block_size - 1); 374 - if (pad) 375 - iomap_dio_zero(iter, dio, pos - pad, pad); 359 + 360 + ret = iomap_dio_zero(iter, dio, pos - pad, pad); 361 + if (ret) 362 + goto out; 376 363 } 377 364 378 365 /* ··· 448 431 /* zero out from the end of the write to the end of the block */ 449 432 pad = pos & (fs_block_size - 1); 450 433 if (pad) 451 - iomap_dio_zero(iter, dio, pos, fs_block_size - pad); 434 + ret = iomap_dio_zero(iter, dio, pos, 435 + 
fs_block_size - pad); 452 436 } 453 437 out: 454 438 /* Undo iter limitation to current extent */ ··· 771 753 return iomap_dio_complete(dio); 772 754 } 773 755 EXPORT_SYMBOL_GPL(iomap_dio_rw); 756 + 757 + static int __init iomap_dio_init(void) 758 + { 759 + zero_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 760 + IOMAP_ZERO_PAGE_ORDER); 761 + 762 + if (!zero_page) 763 + return -ENOMEM; 764 + 765 + return 0; 766 + } 767 + fs_initcall(iomap_dio_init);

+6 -9

fs/xfs/libxfs/xfs_attr_leaf.c

··· 1138 1138 1139 1139 trace_xfs_attr_leaf_to_sf(args); 1140 1140 1141 - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); 1142 - if (!tmpbuffer) 1143 - return -ENOMEM; 1144 - 1141 + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); 1145 1142 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1146 1143 1147 1144 leaf = (xfs_attr_leafblock_t *)tmpbuffer; ··· 1202 1205 error = 0; 1203 1206 1204 1207 out: 1205 - kfree(tmpbuffer); 1208 + kvfree(tmpbuffer); 1206 1209 return error; 1207 1210 } 1208 1211 ··· 1610 1613 1611 1614 trace_xfs_attr_leaf_compact(args); 1612 1615 1613 - tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); 1616 + tmpbuffer = kvmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL); 1614 1617 memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); 1615 1618 memset(bp->b_addr, 0, args->geo->blksize); 1616 1619 leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; ··· 1648 1651 */ 1649 1652 xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1); 1650 1653 1651 - kfree(tmpbuffer); 1654 + kvfree(tmpbuffer); 1652 1655 } 1653 1656 1654 1657 /* ··· 2327 2330 struct xfs_attr_leafblock *tmp_leaf; 2328 2331 struct xfs_attr3_icleaf_hdr tmphdr; 2329 2332 2330 - tmp_leaf = kzalloc(state->args->geo->blksize, 2333 + tmp_leaf = kvzalloc(state->args->geo->blksize, 2331 2334 GFP_KERNEL | __GFP_NOFAIL); 2332 2335 2333 2336 /* ··· 2368 2371 } 2369 2372 memcpy(save_leaf, tmp_leaf, state->args->geo->blksize); 2370 2373 savehdr = tmphdr; /* struct copy */ 2371 - kfree(tmp_leaf); 2374 + kvfree(tmp_leaf); 2372 2375 } 2373 2376 2374 2377 xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);

fs/xfs/libxfs/xfs_ialloc.c

··· 3034 3034 igeo->ialloc_align = mp->m_dalign; 3035 3035 else 3036 3036 igeo->ialloc_align = 0; 3037 + 3038 + if (mp->m_sb.sb_blocksize > PAGE_SIZE) 3039 + igeo->min_folio_order = mp->m_sb.sb_blocklog - PAGE_SHIFT; 3040 + else 3041 + igeo->min_folio_order = 0; 3037 3042 } 3038 3043 3039 3044 /* Compute the location of the root directory inode that is laid out by mkfs. */

fs/xfs/libxfs/xfs_shared.h

··· 224 224 /* precomputed value for di_flags2 */ 225 225 uint64_t new_diflags2; 226 226 227 + /* minimum folio order of a page cache allocation */ 228 + unsigned int min_folio_order; 229 + 227 230 }; 228 231 229 232 #endif /* __XFS_SHARED_H__ */

+1 -1

fs/xfs/xfs_file.c

··· 760 760 761 761 trace_xfs_file_buffered_write(iocb, from); 762 762 ret = iomap_file_buffered_write(iocb, from, 763 - &xfs_buffered_write_iomap_ops); 763 + &xfs_buffered_write_iomap_ops, NULL); 764 764 765 765 /* 766 766 * If we hit a space limit, try to free up some lingering preallocated

+4 -2

fs/xfs/xfs_icache.c

··· 100 100 101 101 /* VFS doesn't initialise i_mode! */ 102 102 VFS_I(ip)->i_mode = 0; 103 - mapping_set_large_folios(VFS_I(ip)->i_mapping); 103 + mapping_set_folio_min_order(VFS_I(ip)->i_mapping, 104 + M_IGEO(mp)->min_folio_order); 104 105 105 106 XFS_STATS_INC(mp, vn_active); 106 107 ASSERT(atomic_read(&ip->i_pincount) == 0); ··· 361 360 inode->i_uid = uid; 362 361 inode->i_gid = gid; 363 362 inode->i_state = state; 364 - mapping_set_large_folios(inode->i_mapping); 363 + mapping_set_folio_min_order(inode->i_mapping, 364 + M_IGEO(mp)->min_folio_order); 365 365 return error; 366 366 } 367 367

+5 -14

fs/xfs/xfs_iomap.c

··· 1208 1208 return error; 1209 1209 } 1210 1210 1211 - static int 1211 + static void 1212 1212 xfs_buffered_write_delalloc_punch( 1213 1213 struct inode *inode, 1214 1214 loff_t offset, 1215 - loff_t length) 1215 + loff_t length, 1216 + struct iomap *iomap) 1216 1217 { 1217 1218 xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); 1218 - return 0; 1219 1219 } 1220 1220 1221 1221 static int ··· 1227 1227 unsigned flags, 1228 1228 struct iomap *iomap) 1229 1229 { 1230 - 1231 - struct xfs_mount *mp = XFS_M(inode->i_sb); 1232 - int error; 1233 - 1234 - error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, 1235 - length, written, &xfs_buffered_write_delalloc_punch); 1236 - if (error && !xfs_is_shutdown(mp)) { 1237 - xfs_alert(mp, "%s: unable to clean up ino 0x%llx", 1238 - __func__, XFS_I(inode)->i_ino); 1239 - return error; 1240 - } 1230 + iomap_file_buffered_write_punch_delalloc(inode, offset, length, written, 1231 + flags, iomap, &xfs_buffered_write_delalloc_punch); 1241 1232 return 0; 1242 1233 } 1243 1234

+1 -11

fs/xfs/xfs_iops.c

··· 567 567 return 1U << mp->m_allocsize_log; 568 568 } 569 569 570 - return PAGE_SIZE; 570 + return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize); 571 571 } 572 572 573 573 STATIC int ··· 870 870 error = xfs_zero_range(ip, oldsize, newsize - oldsize, 871 871 &did_zeroing); 872 872 } else { 873 - /* 874 - * iomap won't detect a dirty page over an unwritten block (or a 875 - * cow block over a hole) and subsequently skips zeroing the 876 - * newly post-EOF portion of the page. Flush the new EOF to 877 - * convert the block before the pagecache truncate. 878 - */ 879 - error = filemap_write_and_wait_range(inode->i_mapping, newsize, 880 - newsize); 881 - if (error) 882 - return error; 883 873 error = xfs_truncate_page(ip, newsize, &did_zeroing); 884 874 } 885 875

+6 -2

fs/xfs/xfs_mount.c

··· 132 132 xfs_sb_t *sbp, 133 133 uint64_t nblocks) 134 134 { 135 - ASSERT(PAGE_SHIFT >= sbp->sb_blocklog); 135 + uint64_t max_bytes; 136 + 136 137 ASSERT(sbp->sb_blocklog >= BBSHIFT); 137 138 139 + if (check_shl_overflow(nblocks, sbp->sb_blocklog, &max_bytes)) 140 + return -EFBIG; 141 + 138 142 /* Limited by ULONG_MAX of page cache index */ 139 - if (nblocks >> (PAGE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 143 + if (max_bytes >> PAGE_SHIFT > ULONG_MAX) 140 144 return -EFBIG; 141 145 return 0; 142 146 }

+20 -8

fs/xfs/xfs_super.c

··· 1638 1638 goto out_free_sb; 1639 1639 } 1640 1640 1641 - /* 1642 - * Until this is fixed only page-sized or smaller data blocks work. 1643 - */ 1644 1641 if (mp->m_sb.sb_blocksize > PAGE_SIZE) { 1645 - xfs_warn(mp, 1646 - "File system with blocksize %d bytes. " 1647 - "Only pagesize (%ld) or less will currently work.", 1642 + size_t max_folio_size = mapping_max_folio_size_supported(); 1643 + 1644 + if (!xfs_has_crc(mp)) { 1645 + xfs_warn(mp, 1646 + "V4 Filesystem with blocksize %d bytes. Only pagesize (%ld) or less is supported.", 1648 1647 mp->m_sb.sb_blocksize, PAGE_SIZE); 1649 - error = -ENOSYS; 1650 - goto out_free_sb; 1648 + error = -ENOSYS; 1649 + goto out_free_sb; 1650 + } 1651 + 1652 + if (mp->m_sb.sb_blocksize > max_folio_size) { 1653 + xfs_warn(mp, 1654 + "block size (%u bytes) not supported; Only block size (%zu) or less is supported", 1655 + mp->m_sb.sb_blocksize, max_folio_size); 1656 + error = -ENOSYS; 1657 + goto out_free_sb; 1658 + } 1659 + 1660 + xfs_warn(mp, 1661 + "EXPERIMENTAL: V5 Filesystem with Large Block Size (%d bytes) enabled.", 1662 + mp->m_sb.sb_blocksize); 1651 1663 } 1652 1664 1653 1665 /* Ensure this filesystem fits in the page cache limits */

+1 -1

fs/zonefs/file.c

··· 563 563 if (ret <= 0) 564 564 goto inode_unlock; 565 565 566 - ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); 566 + ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); 567 567 if (ret == -EIO) 568 568 zonefs_io_error(inode, true); 569 569

+24 -4

include/linux/huge_mm.h

··· 96 96 #define thp_vma_allowable_order(vma, vm_flags, tva_flags, order) \ 97 97 (!!thp_vma_allowable_orders(vma, vm_flags, tva_flags, BIT(order))) 98 98 99 + #define split_folio(f) split_folio_to_list(f, NULL) 100 + 99 101 #ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES 100 102 #define HPAGE_PMD_SHIFT PMD_SHIFT 101 103 #define HPAGE_PUD_SHIFT PUD_SHIFT ··· 319 317 bool can_split_folio(struct folio *folio, int *pextra_pins); 320 318 int split_huge_page_to_list_to_order(struct page *page, struct list_head *list, 321 319 unsigned int new_order); 320 + int min_order_for_split(struct folio *folio); 321 + int split_folio_to_list(struct folio *folio, struct list_head *list); 322 322 static inline int split_huge_page(struct page *page) 323 323 { 324 - return split_huge_page_to_list_to_order(page, NULL, 0); 324 + struct folio *folio = page_folio(page); 325 + int ret = min_order_for_split(folio); 326 + 327 + if (ret < 0) 328 + return ret; 329 + 330 + /* 331 + * split_huge_page() locks the page before splitting and 332 + * expects the same page that has been split to be locked when 333 + * returned. split_folio(page_folio(page)) cannot be used here 334 + * because it converts the page to folio and passes the head 335 + * page to be split. 336 + */ 337 + return split_huge_page_to_list_to_order(page, NULL, ret); 325 338 } 326 339 void deferred_split_folio(struct folio *folio); 327 340 ··· 501 484 { 502 485 return 0; 503 486 } 487 + 488 + static inline int split_folio_to_list(struct folio *folio, struct list_head *list) 489 + { 490 + return 0; 491 + } 492 + 504 493 static inline void deferred_split_folio(struct folio *folio) {} 505 494 #define split_huge_pmd(__vma, __pmd, __address) \ 506 495 do { } while (0) ··· 620 597 { 621 598 return split_folio_to_list_to_order(folio, NULL, new_order); 622 599 } 623 - 624 - #define split_folio_to_list(f, l) split_folio_to_list_to_order(f, l, 0) 625 - #define split_folio(f) split_folio_to_order(f, 0) 626 600 627 601 #endif /* _LINUX_HUGE_MM_H */

+8 -5

include/linux/iomap.h

··· 257 257 } 258 258 259 259 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, 260 - const struct iomap_ops *ops); 261 - int iomap_file_buffered_write_punch_delalloc(struct inode *inode, 262 - struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, 263 - int (*punch)(struct inode *inode, loff_t pos, loff_t length)); 264 - 260 + const struct iomap_ops *ops, void *private); 265 261 int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); 266 262 void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); 267 263 bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); ··· 273 277 const struct iomap_ops *ops); 274 278 vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, 275 279 const struct iomap_ops *ops); 280 + 281 + typedef void (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length, 282 + struct iomap *iomap); 283 + void iomap_file_buffered_write_punch_delalloc(struct inode *inode, loff_t pos, 284 + loff_t length, ssize_t written, unsigned flag, 285 + struct iomap *iomap, iomap_punch_t punch); 286 + 276 287 int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 277 288 u64 start, u64 len, const struct iomap_ops *ops); 278 289 loff_t iomap_seek_hole(struct inode *inode, loff_t offset,

+110 -14

include/linux/pagemap.h

··· 206 206 AS_EXITING = 4, /* final truncate in progress */ 207 207 /* writeback related tags are not used */ 208 208 AS_NO_WRITEBACK_TAGS = 5, 209 - AS_LARGE_FOLIO_SUPPORT = 6, 210 - AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ 211 - AS_STABLE_WRITES, /* must wait for writeback before modifying 209 + AS_RELEASE_ALWAYS = 6, /* Call ->release_folio(), even if no private data */ 210 + AS_STABLE_WRITES = 7, /* must wait for writeback before modifying 212 211 folio contents */ 213 - AS_INACCESSIBLE, /* Do not attempt direct R/W access to the mapping, 214 - including to move the mapping */ 212 + AS_INACCESSIBLE = 8, /* Do not attempt direct R/W access to the mapping */ 213 + /* Bits 16-25 are used for FOLIO_ORDER */ 214 + AS_FOLIO_ORDER_BITS = 5, 215 + AS_FOLIO_ORDER_MIN = 16, 216 + AS_FOLIO_ORDER_MAX = AS_FOLIO_ORDER_MIN + AS_FOLIO_ORDER_BITS, 215 217 }; 218 + 219 + #define AS_FOLIO_ORDER_BITS_MASK ((1u << AS_FOLIO_ORDER_BITS) - 1) 220 + #define AS_FOLIO_ORDER_MIN_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MIN) 221 + #define AS_FOLIO_ORDER_MAX_MASK (AS_FOLIO_ORDER_BITS_MASK << AS_FOLIO_ORDER_MAX) 222 + #define AS_FOLIO_ORDER_MASK (AS_FOLIO_ORDER_MIN_MASK | AS_FOLIO_ORDER_MAX_MASK) 216 223 217 224 /** 218 225 * mapping_set_error - record a writeback error in the address_space ··· 376 369 #define MAX_XAS_ORDER (XA_CHUNK_SHIFT * 2 - 1) 377 370 #define MAX_PAGECACHE_ORDER min(MAX_XAS_ORDER, PREFERRED_MAX_PAGECACHE_ORDER) 378 371 372 + /* 373 + * mapping_max_folio_size_supported() - Check the max folio size supported 374 + * 375 + * The filesystem should call this function at mount time if there is a 376 + * requirement on the folio mapping size in the page cache. 
377 + */ 378 + static inline size_t mapping_max_folio_size_supported(void) 379 + { 380 + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 381 + return 1U << (PAGE_SHIFT + MAX_PAGECACHE_ORDER); 382 + return PAGE_SIZE; 383 + } 384 + 385 + /* 386 + * mapping_set_folio_order_range() - Set the orders supported by a file. 387 + * @mapping: The address space of the file. 388 + * @min: Minimum folio order (between 0-MAX_PAGECACHE_ORDER inclusive). 389 + * @max: Maximum folio order (between @min-MAX_PAGECACHE_ORDER inclusive). 390 + * 391 + * The filesystem should call this function in its inode constructor to 392 + * indicate which base size (min) and maximum size (max) of folio the VFS 393 + * can use to cache the contents of the file. This should only be used 394 + * if the filesystem needs special handling of folio sizes (ie there is 395 + * something the core cannot know). 396 + * Do not tune it based on, eg, i_size. 397 + * 398 + * Context: This should not be called while the inode is active as it 399 + * is non-atomic. 400 + */ 401 + static inline void mapping_set_folio_order_range(struct address_space *mapping, 402 + unsigned int min, 403 + unsigned int max) 404 + { 405 + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 406 + return; 407 + 408 + if (min > MAX_PAGECACHE_ORDER) 409 + min = MAX_PAGECACHE_ORDER; 410 + 411 + if (max > MAX_PAGECACHE_ORDER) 412 + max = MAX_PAGECACHE_ORDER; 413 + 414 + if (max < min) 415 + max = min; 416 + 417 + mapping->flags = (mapping->flags & ~AS_FOLIO_ORDER_MASK) | 418 + (min << AS_FOLIO_ORDER_MIN) | (max << AS_FOLIO_ORDER_MAX); 419 + } 420 + 421 + static inline void mapping_set_folio_min_order(struct address_space *mapping, 422 + unsigned int min) 423 + { 424 + mapping_set_folio_order_range(mapping, min, MAX_PAGECACHE_ORDER); 425 + } 426 + 379 427 /** 380 428 * mapping_set_large_folios() - Indicate the file supports large folios. 381 - * @mapping: The file. 429 + * @mapping: The address space of the file. 
382 430 * 383 431 * The filesystem should call this function in its inode constructor to 384 432 * indicate that the VFS can use large folios to cache the contents of ··· 444 382 */ 445 383 static inline void mapping_set_large_folios(struct address_space *mapping) 446 384 { 447 - __set_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); 385 + mapping_set_folio_order_range(mapping, 0, MAX_PAGECACHE_ORDER); 386 + } 387 + 388 + static inline unsigned int 389 + mapping_max_folio_order(const struct address_space *mapping) 390 + { 391 + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 392 + return 0; 393 + return (mapping->flags & AS_FOLIO_ORDER_MAX_MASK) >> AS_FOLIO_ORDER_MAX; 394 + } 395 + 396 + static inline unsigned int 397 + mapping_min_folio_order(const struct address_space *mapping) 398 + { 399 + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) 400 + return 0; 401 + return (mapping->flags & AS_FOLIO_ORDER_MIN_MASK) >> AS_FOLIO_ORDER_MIN; 402 + } 403 + 404 + static inline unsigned long 405 + mapping_min_folio_nrpages(struct address_space *mapping) 406 + { 407 + return 1UL << mapping_min_folio_order(mapping); 408 + } 409 + 410 + /** 411 + * mapping_align_index() - Align index for this mapping. 412 + * @mapping: The address_space. 413 + * @index: The page index. 414 + * 415 + * The index of a folio must be naturally aligned. If you are adding a 416 + * new folio to the page cache and need to know what index to give it, 417 + * call this function. 
418 + */ 419 + static inline pgoff_t mapping_align_index(struct address_space *mapping, 420 + pgoff_t index) 421 + { 422 + return round_down(index, mapping_min_folio_nrpages(mapping)); 448 423 } 449 424 450 425 /* ··· 490 391 */ 491 392 static inline bool mapping_large_folio_support(struct address_space *mapping) 492 393 { 493 - /* AS_LARGE_FOLIO_SUPPORT is only reasonable for pagecache folios */ 394 + /* AS_FOLIO_ORDER is only reasonable for pagecache folios */ 494 395 VM_WARN_ONCE((unsigned long)mapping & PAGE_MAPPING_ANON, 495 396 "Anonymous mapping always supports large folio"); 496 397 497 - return IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && 498 - test_bit(AS_LARGE_FOLIO_SUPPORT, &mapping->flags); 398 + return mapping_max_folio_order(mapping) > 0; 499 399 } 500 400 501 401 /* Return the maximum folio size for this pagecache mapping, in bytes. */ 502 - static inline size_t mapping_max_folio_size(struct address_space *mapping) 402 + static inline size_t mapping_max_folio_size(const struct address_space *mapping) 503 403 { 504 - if (mapping_large_folio_support(mapping)) 505 - return PAGE_SIZE << MAX_PAGECACHE_ORDER; 506 - return PAGE_SIZE; 404 + return PAGE_SIZE << mapping_max_folio_order(mapping); 507 405 } 508 406 509 407 static inline int filemap_nr_thps(struct address_space *mapping)

+23 -13

mm/filemap.c

··· 859 859 860 860 VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); 861 861 VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); 862 + VM_BUG_ON_FOLIO(folio_order(folio) < mapping_min_folio_order(mapping), 863 + folio); 862 864 mapping_set_update(&xas, mapping); 863 865 864 866 VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); ··· 1921 1919 folio_wait_stable(folio); 1922 1920 no_page: 1923 1921 if (!folio && (fgp_flags & FGP_CREAT)) { 1924 - unsigned order = FGF_GET_ORDER(fgp_flags); 1922 + unsigned int min_order = mapping_min_folio_order(mapping); 1923 + unsigned int order = max(min_order, FGF_GET_ORDER(fgp_flags)); 1925 1924 int err; 1925 + index = mapping_align_index(mapping, index); 1926 1926 1927 1927 if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping)) 1928 1928 gfp |= __GFP_WRITE; ··· 1937 1933 if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP)))) 1938 1934 fgp_flags |= FGP_LOCK; 1939 1935 1940 - if (!mapping_large_folio_support(mapping)) 1941 - order = 0; 1942 - if (order > MAX_PAGECACHE_ORDER) 1943 - order = MAX_PAGECACHE_ORDER; 1936 + if (order > mapping_max_folio_order(mapping)) 1937 + order = mapping_max_folio_order(mapping); 1944 1938 /* If we're not aligned, allocate a smaller folio */ 1945 1939 if (index & ((1UL << order) - 1)) 1946 1940 order = __ffs(index); ··· 1947 1945 gfp_t alloc_gfp = gfp; 1948 1946 1949 1947 err = -ENOMEM; 1950 - if (order > 0) 1948 + if (order > min_order) 1951 1949 alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; 1952 1950 folio = filemap_alloc_folio(alloc_gfp, order); 1953 1951 if (!folio) ··· 1962 1960 break; 1963 1961 folio_put(folio); 1964 1962 folio = NULL; 1965 - } while (order-- > 0); 1963 + } while (order-- > min_order); 1966 1964 1967 1965 if (err == -EEXIST) 1968 1966 goto repeat; ··· 2451 2449 } 2452 2450 2453 2451 static int filemap_create_folio(struct file *file, 2454 - struct address_space *mapping, pgoff_t index, 2452 + struct address_space *mapping, loff_t pos, 2455 2453 struct 
folio_batch *fbatch) 2456 2454 { 2457 2455 struct folio *folio; 2458 2456 int error; 2457 + unsigned int min_order = mapping_min_folio_order(mapping); 2458 + pgoff_t index; 2459 2459 2460 - folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0); 2460 + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); 2461 2461 if (!folio) 2462 2462 return -ENOMEM; 2463 2463 ··· 2477 2473 * well to keep locking rules simple. 2478 2474 */ 2479 2475 filemap_invalidate_lock_shared(mapping); 2476 + index = (pos >> (PAGE_SHIFT + min_order)) << min_order; 2480 2477 error = filemap_add_folio(mapping, folio, index, 2481 2478 mapping_gfp_constraint(mapping, GFP_KERNEL)); 2482 2479 if (error == -EEXIST) ··· 2538 2533 if (!folio_batch_count(fbatch)) { 2539 2534 if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) 2540 2535 return -EAGAIN; 2541 - err = filemap_create_folio(filp, mapping, 2542 - iocb->ki_pos >> PAGE_SHIFT, fbatch); 2536 + err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); 2543 2537 if (err == AOP_TRUNCATED_PAGE) 2544 2538 goto retry; 2545 2539 return err; ··· 3615 3611 struct vm_area_struct *vma = vmf->vma; 3616 3612 struct file *file = vma->vm_file; 3617 3613 struct address_space *mapping = file->f_mapping; 3618 - pgoff_t last_pgoff = start_pgoff; 3614 + pgoff_t file_end, last_pgoff = start_pgoff; 3619 3615 unsigned long addr; 3620 3616 XA_STATE(xas, &mapping->i_pages, start_pgoff); 3621 3617 struct folio *folio; ··· 3640 3636 folio_put(folio); 3641 3637 goto out; 3642 3638 } 3639 + 3640 + file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1; 3641 + if (end_pgoff > file_end) 3642 + end_pgoff = file_end; 3643 3643 3644 3644 folio_type = mm_counter_file(folio); 3645 3645 do { ··· 3765 3757 repeat: 3766 3758 folio = filemap_get_folio(mapping, index); 3767 3759 if (IS_ERR(folio)) { 3768 - folio = filemap_alloc_folio(gfp, 0); 3760 + folio = filemap_alloc_folio(gfp, 3761 + mapping_min_folio_order(mapping)); 3769 3762 if (!folio) 3770 3763 
return ERR_PTR(-ENOMEM); 3764 + index = mapping_align_index(mapping, index); 3771 3765 err = filemap_add_folio(mapping, folio, index, gfp); 3772 3766 if (unlikely(err)) { 3773 3767 folio_put(folio);

+61 -4

mm/huge_memory.c

··· 3081 3081 * released, or if some unexpected race happened (e.g., anon VMA disappeared, 3082 3082 * truncation). 3083 3083 * 3084 + * Callers should ensure that the order respects the address space mapping 3085 + * min-order if one is set for non-anonymous folios. 3086 + * 3084 3087 * Returns -EINVAL when trying to split to an order that is incompatible 3085 3088 * with the folio. Splitting to order 0 is compatible with all folios. 3086 3089 */ ··· 3165 3162 mapping = NULL; 3166 3163 anon_vma_lock_write(anon_vma); 3167 3164 } else { 3165 + unsigned int min_order; 3168 3166 gfp_t gfp; 3169 3167 3170 3168 mapping = folio->mapping; ··· 3173 3169 /* Truncated ? */ 3174 3170 if (!mapping) { 3175 3171 ret = -EBUSY; 3172 + goto out; 3173 + } 3174 + 3175 + min_order = mapping_min_folio_order(folio->mapping); 3176 + if (new_order < min_order) { 3177 + VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u", 3178 + min_order); 3179 + ret = -EINVAL; 3176 3180 goto out; 3177 3181 } 3178 3182 ··· 3294 3282 count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED); 3295 3283 count_mthp_stat(order, !ret ? 
MTHP_STAT_SPLIT : MTHP_STAT_SPLIT_FAILED); 3296 3284 return ret; 3285 + } 3286 + 3287 + int min_order_for_split(struct folio *folio) 3288 + { 3289 + if (folio_test_anon(folio)) 3290 + return 0; 3291 + 3292 + if (!folio->mapping) { 3293 + if (folio_test_pmd_mappable(folio)) 3294 + count_vm_event(THP_SPLIT_PAGE_FAILED); 3295 + return -EBUSY; 3296 + } 3297 + 3298 + return mapping_min_folio_order(folio->mapping); 3299 + } 3300 + 3301 + int split_folio_to_list(struct folio *folio, struct list_head *list) 3302 + { 3303 + int ret = min_order_for_split(folio); 3304 + 3305 + if (ret < 0) 3306 + return ret; 3307 + 3308 + return split_huge_page_to_list_to_order(&folio->page, list, ret); 3297 3309 } 3298 3310 3299 3311 void __folio_undo_large_rmappable(struct folio *folio) ··· 3550 3514 struct vm_area_struct *vma = vma_lookup(mm, addr); 3551 3515 struct page *page; 3552 3516 struct folio *folio; 3517 + struct address_space *mapping; 3518 + unsigned int target_order = new_order; 3553 3519 3554 3520 if (!vma) 3555 3521 break; ··· 3572 3534 if (!is_transparent_hugepage(folio)) 3573 3535 goto next; 3574 3536 3575 - if (new_order >= folio_order(folio)) 3537 + if (!folio_test_anon(folio)) { 3538 + mapping = folio->mapping; 3539 + target_order = max(new_order, 3540 + mapping_min_folio_order(mapping)); 3541 + } 3542 + 3543 + if (target_order >= folio_order(folio)) 3576 3544 goto next; 3577 3545 3578 3546 total++; ··· 3594 3550 if (!folio_trylock(folio)) 3595 3551 goto next; 3596 3552 3597 - if (!split_folio_to_order(folio, new_order)) 3553 + if (!folio_test_anon(folio) && folio->mapping != mapping) 3554 + goto unlock; 3555 + 3556 + if (!split_folio_to_order(folio, target_order)) 3598 3557 split++; 3558 + 3559 + unlock: 3599 3560 3600 3561 folio_unlock(folio); 3601 3562 next: ··· 3626 3577 pgoff_t index; 3627 3578 int nr_pages = 1; 3628 3579 unsigned long total = 0, split = 0; 3580 + unsigned int min_order; 3581 + unsigned int target_order; 3629 3582 3630 3583 file = 
getname_kernel(file_path); 3631 3584 if (IS_ERR(file)) ··· 3641 3590 file_path, off_start, off_end); 3642 3591 3643 3592 mapping = candidate->f_mapping; 3593 + min_order = mapping_min_folio_order(mapping); 3594 + target_order = max(new_order, min_order); 3644 3595 3645 3596 for (index = off_start; index < off_end; index += nr_pages) { 3646 3597 struct folio *folio = filemap_get_folio(mapping, index); ··· 3657 3604 total++; 3658 3605 nr_pages = folio_nr_pages(folio); 3659 3606 3660 - if (new_order >= folio_order(folio)) 3607 + if (target_order >= folio_order(folio)) 3661 3608 goto next; 3662 3609 3663 3610 if (!folio_trylock(folio)) 3664 3611 goto next; 3665 3612 3666 - if (!split_folio_to_order(folio, new_order)) 3613 + if (folio->mapping != mapping) 3614 + goto unlock; 3615 + 3616 + if (!split_folio_to_order(folio, target_order)) 3667 3617 split++; 3668 3618 3619 + unlock: 3669 3620 folio_unlock(folio); 3670 3621 next: 3671 3622 folio_put(folio);

+63 -20

mm/readahead.c

··· 206 206 unsigned long nr_to_read, unsigned long lookahead_size) 207 207 { 208 208 struct address_space *mapping = ractl->mapping; 209 - unsigned long index = readahead_index(ractl); 209 + unsigned long ra_folio_index, index = readahead_index(ractl); 210 210 gfp_t gfp_mask = readahead_gfp_mask(mapping); 211 - unsigned long i; 211 + unsigned long mark, i = 0; 212 + unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); 212 213 213 214 /* 214 215 * Partway through the readahead operation, we will have added ··· 224 223 unsigned int nofs = memalloc_nofs_save(); 225 224 226 225 filemap_invalidate_lock_shared(mapping); 226 + index = mapping_align_index(mapping, index); 227 + 228 + /* 229 + * As iterator `i` is aligned to min_nrpages, round_up the 230 + * difference between nr_to_read and lookahead_size to mark the 231 + * index that only has lookahead or "async_region" to set the 232 + * readahead flag. 233 + */ 234 + ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, 235 + min_nrpages); 236 + mark = ra_folio_index - index; 237 + nr_to_read += readahead_index(ractl) - index; 238 + ractl->_index = index; 239 + 227 240 /* 228 241 * Preallocate as many pages as we will need. 229 242 */ 230 - for (i = 0; i < nr_to_read; i++) { 243 + while (i < nr_to_read) { 231 244 struct folio *folio = xa_load(&mapping->i_pages, index + i); 232 245 int ret; 233 246 ··· 255 240 * not worth getting one just for that. 
256 241 */ 257 242 read_pages(ractl); 258 - ractl->_index++; 259 - i = ractl->_index + ractl->_nr_pages - index - 1; 243 + ractl->_index += min_nrpages; 244 + i = ractl->_index + ractl->_nr_pages - index; 260 245 continue; 261 246 } 262 247 263 - folio = filemap_alloc_folio(gfp_mask, 0); 248 + folio = filemap_alloc_folio(gfp_mask, 249 + mapping_min_folio_order(mapping)); 264 250 if (!folio) 265 251 break; 266 252 ··· 271 255 if (ret == -ENOMEM) 272 256 break; 273 257 read_pages(ractl); 274 - ractl->_index++; 275 - i = ractl->_index + ractl->_nr_pages - index - 1; 258 + ractl->_index += min_nrpages; 259 + i = ractl->_index + ractl->_nr_pages - index; 276 260 continue; 277 261 } 278 - if (i == nr_to_read - lookahead_size) 262 + if (i == mark) 279 263 folio_set_readahead(folio); 280 264 ractl->_workingset |= folio_test_workingset(folio); 281 - ractl->_nr_pages++; 265 + ractl->_nr_pages += min_nrpages; 266 + i += min_nrpages; 282 267 } 283 268 284 269 /* ··· 455 438 struct address_space *mapping = ractl->mapping; 456 439 pgoff_t start = readahead_index(ractl); 457 440 pgoff_t index = start; 441 + unsigned int min_order = mapping_min_folio_order(mapping); 458 442 pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT; 459 443 pgoff_t mark = index + ra->size - ra->async_size; 460 444 unsigned int nofs; 461 445 int err = 0; 462 446 gfp_t gfp = readahead_gfp_mask(mapping); 447 + unsigned int min_ra_size = max(4, mapping_min_folio_nrpages(mapping)); 463 448 464 - if (!mapping_large_folio_support(mapping) || ra->size < 4) 449 + /* 450 + * Fallback when size < min_nrpages as each folio should be 451 + * at least min_nrpages anyway. 
452 + */ 453 + if (!mapping_large_folio_support(mapping) || ra->size < min_ra_size) 465 454 goto fallback; 466 455 467 456 limit = min(limit, index + ra->size - 1); 468 457 469 - if (new_order < MAX_PAGECACHE_ORDER) 458 + if (new_order < mapping_max_folio_order(mapping)) 470 459 new_order += 2; 471 460 472 - new_order = min_t(unsigned int, MAX_PAGECACHE_ORDER, new_order); 461 + new_order = min(mapping_max_folio_order(mapping), new_order); 473 462 new_order = min_t(unsigned int, new_order, ilog2(ra->size)); 463 + new_order = max(new_order, min_order); 474 464 475 465 /* See comment in page_cache_ra_unbounded() */ 476 466 nofs = memalloc_nofs_save(); 477 467 filemap_invalidate_lock_shared(mapping); 468 + /* 469 + * If the new_order is greater than min_order and index is 470 + * already aligned to new_order, then this will be noop as index 471 + * aligned to new_order should also be aligned to min_order. 472 + */ 473 + ractl->_index = mapping_align_index(mapping, index); 474 + index = readahead_index(ractl); 475 + 478 476 while (index <= limit) { 479 477 unsigned int order = new_order; 480 478 ··· 497 465 if (index & ((1UL << order) - 1)) 498 466 order = __ffs(index); 499 467 /* Don't allocate pages past EOF */ 500 - while (index + (1UL << order) - 1 > limit) 468 + while (order > min_order && index + (1UL << order) - 1 > limit) 501 469 order--; 502 470 err = ra_alloc_folio(ractl, index, mark, order, gfp); 503 471 if (err) ··· 735 703 struct file_ra_state *ra = ractl->ra; 736 704 pgoff_t new_index, new_nr_pages; 737 705 gfp_t gfp_mask = readahead_gfp_mask(mapping); 706 + unsigned long min_nrpages = mapping_min_folio_nrpages(mapping); 707 + unsigned int min_order = mapping_min_folio_order(mapping); 738 708 739 709 new_index = new_start / PAGE_SIZE; 710 + /* 711 + * Readahead code should have aligned the ractl->_index to 712 + * min_nrpages before calling readahead aops. 
713 + */ 714 + VM_BUG_ON(!IS_ALIGNED(ractl->_index, min_nrpages)); 740 715 741 716 /* Expand the leading edge downwards */ 742 717 while (ractl->_index > new_index) { ··· 753 714 if (folio && !xa_is_value(folio)) 754 715 return; /* Folio apparently present */ 755 716 756 - folio = filemap_alloc_folio(gfp_mask, 0); 717 + folio = filemap_alloc_folio(gfp_mask, min_order); 757 718 if (!folio) 758 719 return; 720 + 721 + index = mapping_align_index(mapping, index); 759 722 if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { 760 723 folio_put(folio); 761 724 return; ··· 767 726 ractl->_workingset = true; 768 727 psi_memstall_enter(&ractl->_pflags); 769 728 } 770 - ractl->_nr_pages++; 729 + ractl->_nr_pages += min_nrpages; 771 730 ractl->_index = folio->index; 772 731 } 773 732 ··· 782 741 if (folio && !xa_is_value(folio)) 783 742 return; /* Folio apparently present */ 784 743 785 - folio = filemap_alloc_folio(gfp_mask, 0); 744 + folio = filemap_alloc_folio(gfp_mask, min_order); 786 745 if (!folio) 787 746 return; 747 + 748 + index = mapping_align_index(mapping, index); 788 749 if (filemap_add_folio(mapping, folio, index, gfp_mask) < 0) { 789 750 folio_put(folio); 790 751 return; ··· 796 753 ractl->_workingset = true; 797 754 psi_memstall_enter(&ractl->_pflags); 798 755 } 799 - ractl->_nr_pages++; 756 + ractl->_nr_pages += min_nrpages; 800 757 if (ra) { 801 - ra->size++; 802 - ra->async_size++; 758 + ra->size += min_nrpages; 759 + ra->async_size += min_nrpages; 803 760 } 804 761 } 805 762 }

Configure Feed

Configure Feed