Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

block: add helpers to bounce buffer an iov_iter into bios

Add helpers that bounce buffer data from an iov_iter into a bio, to
implement direct I/O for cases where direct user access is not possible
because stable in-flight data is required. These are intended to be as
easy to use as bio_iov_iter_get_pages is for the zero-copy path.
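
[Editor's sketch, not part of the patch: a write-side caller might look
roughly like this. sample_bounce_write(), its end_io handler, and the
error policy are hypothetical; only bio_alloc(), submit_bio(), and the
helpers added by this patch are real interfaces.]

	/* Hypothetical caller, for illustration only. */
	static void sample_bounce_write_endio(struct bio *bio)
	{
		/* For writes this only frees the bounce folios; no copy-back. */
		bio_iov_iter_unbounce(bio, bio->bi_status != BLK_STS_OK, false);
		bio_put(bio);
	}

	static int sample_bounce_write(struct block_device *bdev,
			struct iov_iter *iter)
	{
		struct bio *bio;
		int ret;

		bio = bio_alloc(bdev, bio_iov_bounce_nr_vecs(iter, REQ_OP_WRITE),
				REQ_OP_WRITE, GFP_KERNEL);
		if (!bio)
			return -ENOMEM;
		bio->bi_end_io = sample_bounce_write_endio;
		/* bi_sector, bi_private etc. would be set up here as usual. */

		/* Allocates the bounce folios and copies the user data in. */
		ret = bio_iov_iter_bounce(bio, iter);
		if (ret) {
			bio_put(bio);
			return ret;
		}
		/* A real caller would loop until the iter is drained. */
		submit_bio(bio);
		return 0;
	}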

The write side is trivial and just copies data into the bounce buffer.
The read side is a lot more complex because it needs to perform the copy
from the completion context, without the iov_iter being preserved through
the call chain. It steals a trick from the integrity data user interface:
the first vector in the bio carries the bounce buffer data that is fed to
the block I/O stack, while the remaining vectors record the user buffer
fragments.
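
[Editor's illustration of that layout, using the field names from the
patch below; the diagram itself is not in the original:]

	bi_io_vec[0]            bounce folio; bv_len == bi_iter.bi_size,
	                        the only vector the I/O path iterates
	bi_io_vec[1..bi_vcnt]   pinned user buffer fragments, consumed
	                        only by the copy-back on completion

	bi_vcnt counts the user fragments rather than the bounce vector,
	which is safe because bi_vcnt is only used by the submitter, not
	by the I/O path.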

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Tested-by: Anuj Gupta <anuj20.g@samsung.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Christoph Hellwig and committed by Jens Axboe (8dd5e7c7 301f5356)

 block/bio.c         | 179 ++++++++++++
 include/linux/bio.h |  26 ++
 2 files changed, 205 insertions(+)

block/bio.c
···
 	return bio_iov_iter_align_down(bio, iter, len_align_mask);
 }
 
+static struct folio *folio_alloc_greedy(gfp_t gfp, size_t *size)
+{
+	struct folio *folio;
+
+	while (*size > PAGE_SIZE) {
+		folio = folio_alloc(gfp | __GFP_NORETRY, get_order(*size));
+		if (folio)
+			return folio;
+		*size = rounddown_pow_of_two(*size - 1);
+	}
+
+	return folio_alloc(gfp, get_order(*size));
+}
+
+static void bio_free_folios(struct bio *bio)
+{
+	struct bio_vec *bv;
+	int i;
+
+	bio_for_each_bvec_all(bv, bio, i) {
+		struct folio *folio = page_folio(bv->bv_page);
+
+		if (!is_zero_folio(folio))
+			folio_put(folio);
+	}
+}
+
+static int bio_iov_iter_bounce_write(struct bio *bio, struct iov_iter *iter)
+{
+	size_t total_len = iov_iter_count(iter);
+
+	if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_iter.bi_size))
+		return -EINVAL;
+	if (WARN_ON_ONCE(bio->bi_vcnt >= bio->bi_max_vecs))
+		return -EINVAL;
+
+	do {
+		size_t this_len = min(total_len, SZ_1M);
+		struct folio *folio;
+
+		if (this_len > PAGE_SIZE * 2)
+			this_len = rounddown_pow_of_two(this_len);
+
+		if (bio->bi_iter.bi_size > BIO_MAX_SIZE - this_len)
+			break;
+
+		folio = folio_alloc_greedy(GFP_KERNEL, &this_len);
+		if (!folio)
+			break;
+		bio_add_folio_nofail(bio, folio, this_len, 0);
+
+		if (copy_from_iter(folio_address(folio), this_len, iter) !=
+				this_len) {
+			bio_free_folios(bio);
+			return -EFAULT;
+		}
+
+		total_len -= this_len;
+	} while (total_len && bio->bi_vcnt < bio->bi_max_vecs);
+
+	if (!bio->bi_iter.bi_size)
+		return -ENOMEM;
+	return 0;
+}
+
+static int bio_iov_iter_bounce_read(struct bio *bio, struct iov_iter *iter)
+{
+	size_t len = min(iov_iter_count(iter), SZ_1M);
+	struct folio *folio;
+
+	folio = folio_alloc_greedy(GFP_KERNEL, &len);
+	if (!folio)
+		return -ENOMEM;
+
+	do {
+		ssize_t ret;
+
+		ret = iov_iter_extract_bvecs(iter, bio->bi_io_vec + 1, len,
+				&bio->bi_vcnt, bio->bi_max_vecs - 1, 0);
+		if (ret <= 0) {
+			if (!bio->bi_vcnt)
+				return ret;
+			break;
+		}
+		len -= ret;
+		bio->bi_iter.bi_size += ret;
+	} while (len && bio->bi_vcnt < bio->bi_max_vecs - 1);
+
+	/*
+	 * Set the folio directly here.  The above loop has already calculated
+	 * the correct bi_size, and we use bi_vcnt for the user buffers.  That
+	 * is safe as bi_vcnt is only used by the submitter and not the actual
+	 * I/O path.
+	 */
+	bvec_set_folio(&bio->bi_io_vec[0], folio, bio->bi_iter.bi_size, 0);
+	if (iov_iter_extract_will_pin(iter))
+		bio_set_flag(bio, BIO_PAGE_PINNED);
+	return 0;
+}
+
+/**
+ * bio_iov_iter_bounce - bounce buffer data from an iter into a bio
+ * @bio: bio to send
+ * @iter: iter to read from / write into
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Allocates folios to back the bounce buffer, and for writes
+ * copies the data into it.  Needs to be paired with bio_iov_iter_unbounce()
+ * called on completion.
+ */
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter)
+{
+	if (op_is_write(bio_op(bio)))
+		return bio_iov_iter_bounce_write(bio, iter);
+	return bio_iov_iter_bounce_read(bio, iter);
+}
+
+static void bvec_unpin(struct bio_vec *bv, bool mark_dirty)
+{
+	struct folio *folio = page_folio(bv->bv_page);
+	size_t nr_pages = (bv->bv_offset + bv->bv_len - 1) / PAGE_SIZE -
+			bv->bv_offset / PAGE_SIZE + 1;
+
+	if (mark_dirty)
+		folio_mark_dirty_lock(folio);
+	unpin_user_folio(folio, nr_pages);
+}
+
+static void bio_iov_iter_unbounce_read(struct bio *bio, bool is_error,
+		bool mark_dirty)
+{
+	unsigned int len = bio->bi_io_vec[0].bv_len;
+
+	if (likely(!is_error)) {
+		void *buf = bvec_virt(&bio->bi_io_vec[0]);
+		struct iov_iter to;
+
+		iov_iter_bvec(&to, ITER_DEST, bio->bi_io_vec + 1, bio->bi_vcnt,
+				len);
+		/* copying to pinned pages should always work */
+		WARN_ON_ONCE(copy_to_iter(buf, len, &to) != len);
+	} else {
+		/* No need to mark folios dirty if never copied to them */
+		mark_dirty = false;
+	}
+
+	if (bio_flagged(bio, BIO_PAGE_PINNED)) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++)
+			bvec_unpin(&bio->bi_io_vec[1 + i], mark_dirty);
+	}
+
+	folio_put(page_folio(bio->bi_io_vec[0].bv_page));
+}
+
+/**
+ * bio_iov_iter_unbounce - finish a bounce buffer operation
+ * @bio: completed bio
+ * @is_error: %true if an I/O error occurred and data should not be copied
+ * @mark_dirty: If %true, folios will be marked dirty.
+ *
+ * Helper for direct I/O implementations that need to bounce buffer because
+ * we need to checksum the data or perform other operations that require
+ * consistency.  Called to complete a bio set up by bio_iov_iter_bounce().
+ * Copies data back for reads, and marks the original folios dirty if
+ * requested and then frees the bounce buffer.
+ */
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty)
+{
+	if (op_is_write(bio_op(bio)))
+		bio_free_folios(bio);
+	else
+		bio_iov_iter_unbounce_read(bio, is_error, mark_dirty);
+}
+
 static void submit_bio_wait_endio(struct bio *bio)
 {
 	complete(bio->bi_private);
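
[Editor's sketch of a completion handler pairing the two helpers, as the
kerneldoc requires; the function name is hypothetical, only
bio_iov_iter_unbounce() and the bio fields are from the patch. Passing
mark_dirty unconditionally is fine because the helper clears it on error:]

	/* Hypothetical bi_end_io for a bounced read, illustration only. */
	static void sample_bounce_read_endio(struct bio *bio)
	{
		/*
		 * On success this copies the bounce folio back into the
		 * pinned user fragments and marks them dirty; on error it
		 * skips both and just unpins and frees.
		 */
		bio_iov_iter_unbounce(bio, bio->bi_status != BLK_STS_OK, true);
		bio_put(bio);
	}
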
include/linux/bio.h
···
 	return iov_iter_npages(iter, max_segs);
 }
 
+/**
+ * bio_iov_bounce_nr_vecs - calculate number of bvecs for a bounce bio
+ * @iter: iter to bounce from
+ * @op: REQ_OP_* for the bio
+ *
+ * Calculates how many bvecs are needed for the next bio to bounce from/to
+ * @iter.
+ */
+static inline unsigned short
+bio_iov_bounce_nr_vecs(struct iov_iter *iter, blk_opf_t op)
+{
+	/*
+	 * We still need to bounce bvec iters, so don't special case them
+	 * here unlike in bio_iov_vecs_to_alloc.
+	 *
+	 * For reads we need to use a vector for the bounce buffer, account
+	 * for that here.
+	 */
+	if (op_is_write(op))
+		return iov_iter_npages(iter, BIO_MAX_VECS);
+	return iov_iter_npages(iter, BIO_MAX_VECS - 1) + 1;
+}
+
 struct request_queue;
 
 void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
···
 void __bio_release_pages(struct bio *bio, bool mark_dirty);
 extern void bio_set_pages_dirty(struct bio *bio);
 extern void bio_check_pages_dirty(struct bio *bio);
+
+int bio_iov_iter_bounce(struct bio *bio, struct iov_iter *iter);
+void bio_iov_iter_unbounce(struct bio *bio, bool is_error, bool mark_dirty);
 
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 		struct bio *src, struct bvec_iter *src_iter);
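
[Editor's sketch tying the header additions to the helpers above: a read
submission would reserve the extra bounce vector via
bio_iov_bounce_nr_vecs() and reuse the hypothetical completion handler
sketched earlier. All names outside the patch are invented:]

	/* Hypothetical read submission path, illustration only. */
	static int sample_bounce_read(struct block_device *bdev,
			struct iov_iter *iter)
	{
		struct bio *bio;
		int ret;

		/* For reads this is npages + 1: one vector for the bounce folio. */
		bio = bio_alloc(bdev, bio_iov_bounce_nr_vecs(iter, REQ_OP_READ),
				REQ_OP_READ, GFP_KERNEL);
		if (!bio)
			return -ENOMEM;
		bio->bi_end_io = sample_bounce_read_endio;

		/* Pins the user pages and allocates the bounce folio. */
		ret = bio_iov_iter_bounce(bio, iter);
		if (ret) {
			bio_put(bio);
			return ret;
		}
		/* A real caller would loop until the iter is drained. */
		submit_bio(bio);
		return 0;
	}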