Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

block: accumulate memory segment gaps per bio

The blk-mq DMA iterator has an optimization for requests whose segments
align to the device's IOMMU merge boundary. This boundary may be larger
than the device's virtual boundary, but the code had been depending on
that queue limit to know ahead of time whether the request is
guaranteed to meet the optimization's alignment.

Rather than rely on that queue limit, which many devices may not
report, save the lowest set bit of any boundary gap between adjacent
segments in the bio while checking the segments. The request stores the
value so it survives merging and so each IO can quickly check whether
it can use IOVA optimizations.
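
To make the trick concrete, here is a minimal userspace sketch of the accumulation idea (not kernel code; the harness, names, and address values are illustrative, and it uses flat addresses rather than the kernel's page/offset bvecs): OR together every inter-segment boundary, keep only ffs() of the result, and test the derived mask against the device's merge boundary.

	#include <stdbool.h>
	#include <stdio.h>
	#include <strings.h>	/* ffs() */

	struct seg {
		unsigned int addr;	/* start address of the segment */
		unsigned int len;	/* length in bytes */
	};

	/* OR of every inter-segment boundary, mirroring the per-bio accumulation. */
	static unsigned int gap_bits(const strustruct seg *s, int n);
	static unsigned int gap_bits(const struct seg *s, int n)
	{
		unsigned int gaps = 0;

		for (int i = 1; i < n; i++)
			gaps |= s[i].addr | (s[i - 1].addr + s[i - 1].len);
		return gaps;
	}

	/* True if no recorded gap bit falls inside the power-of-two boundary mask. */
	static bool can_coalesce(const struct seg *s, int n, unsigned int boundary)
	{
		int bit = ffs(gap_bits(s, n));	/* 0 when no boundary bit is set */
		unsigned int mask = bit ? ~((1u << (bit - 1)) - 1) : 0;

		return !(mask & boundary);
	}

	int main(void)
	{
		/* Two 64K segments tiling contiguously at 64K granularity. */
		struct seg ok[] = { { 0x10000, 0x10000 }, { 0x20000, 0x10000 } };
		/* Second segment starts 512 bytes into a granule: cannot coalesce. */
		struct seg bad[] = { { 0x10000, 0x10000 }, { 0x20200, 0x10000 } };

		printf("aligned: %d\n", can_coalesce(ok, 2, 0xffff));	/* 1 */
		printf("gapped:  %d\n", can_coalesce(bad, 2, 0xffff));	/* 0 */
		return 0;
	}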

Signed-off-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Authored by Keith Busch, committed by Jens Axboe
2f6b2565 0739c2c6

+77 -5
block/bio.c (+1)

···
 	bio->bi_write_hint = 0;
 	bio->bi_write_stream = 0;
 	bio->bi_status = 0;
+	bio->bi_bvec_gap_bit = 0;
 	bio->bi_iter.bi_sector = 0;
 	bio->bi_iter.bi_size = 0;
 	bio->bi_iter.bi_idx = 0;
block/blk-map.c (+3)

···
 	if (rq->bio) {
 		if (!ll_back_merge_fn(rq, bio, nr_segs))
 			return -EINVAL;
+		rq->phys_gap_bit = bio_seg_gap(rq->q, rq->biotail, bio,
+					       rq->phys_gap_bit);
 		rq->biotail->bi_next = bio;
 		rq->biotail = bio;
 		rq->__data_len += bio->bi_iter.bi_size;
···
 	rq->nr_phys_segments = nr_segs;
 	rq->bio = rq->biotail = bio;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->phys_gap_bit = bio->bi_bvec_gap_bit;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_append_bio);
block/blk-merge.c (+36 -3)

···
 	return lim->logical_block_size;
 }

+static inline unsigned int bvec_seg_gap(struct bio_vec *bvprv,
+					struct bio_vec *bv)
+{
+	return bv->bv_offset | (bvprv->bv_offset + bvprv->bv_len);
+}
+
 /**
  * bio_split_io_at - check if and where to split a bio
  * @bio:	[in] bio to be split
···
 		unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
 {
 	struct bio_vec bv, bvprv, *bvprvp = NULL;
+	unsigned nsegs = 0, bytes = 0, gaps = 0;
 	struct bvec_iter iter;
-	unsigned nsegs = 0, bytes = 0;

 	bio_for_each_bvec(bv, bio, iter) {
 		if (bv.bv_offset & lim->dma_alignment ||
···
 		 * If the queue doesn't support SG gaps and adding this
 		 * offset would create a gap, disallow it.
 		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
-			goto split;
+		if (bvprvp) {
+			if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
+				goto split;
+			gaps |= bvec_seg_gap(bvprvp, &bv);
+		}

 		if (nsegs < lim->max_segments &&
 		    bytes + bv.bv_len <= max_bytes &&
···
 	}

 	*segs = nsegs;
+	bio->bi_bvec_gap_bit = ffs(gaps);
 	return 0;
 split:
 	if (bio->bi_opf & REQ_ATOMIC)
···
 	 * big IO can be trivial, disable iopoll when split needed.
 	 */
 	bio_clear_polled(bio);
+	bio->bi_bvec_gap_bit = ffs(gaps);
 	return bytes >> SECTOR_SHIFT;
 }
 EXPORT_SYMBOL_GPL(bio_split_io_at);
···
 	return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
 }

+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+	       u8 gaps_bit)
+{
+	struct bio_vec pb, nb;
+
+	gaps_bit = min_not_zero(gaps_bit, prev->bi_bvec_gap_bit);
+	gaps_bit = min_not_zero(gaps_bit, next->bi_bvec_gap_bit);
+
+	bio_get_last_bvec(prev, &pb);
+	bio_get_first_bvec(next, &nb);
+	if (!biovec_phys_mergeable(q, &pb, &nb))
+		gaps_bit = min_not_zero(gaps_bit, ffs(bvec_seg_gap(&pb, &nb)));
+	return gaps_bit;
+}
+
 /*
  * For non-mq, this has to be called with the request spinlock acquired.
  * For mq with scheduling, the appropriate queue wide lock should be held.
···
 	if (next->start_time_ns < req->start_time_ns)
 		req->start_time_ns = next->start_time_ns;

+	req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, next->bio,
+					min_not_zero(next->phys_gap_bit,
+						     req->phys_gap_bit));
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
···
 	if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
 		blk_zone_write_plug_bio_merged(bio);

+	req->phys_gap_bit = bio_seg_gap(req->q, req->biotail, bio,
+					req->phys_gap_bit);
 	req->biotail->bi_next = bio;
 	req->biotail = bio;
 	req->__data_len += bio->bi_iter.bi_size;
···
 	blk_update_mixed_merge(req, bio, true);

+	req->phys_gap_bit = bio_seg_gap(req->q, bio, req->bio,
+					req->phys_gap_bit);
 	bio->bi_next = req->bio;
 	req->bio = bio;
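
A small worked example may help with the two helpers added above (a hypothetical standalone harness, not kernel code): bvec_seg_gap() ORs "where the next vector starts" with "where the previous one ends", so the result carries a set bit at every alignment the pair breaks, and bio_seg_gap() folds ffs() results with min_not_zero() because zero means "no gap recorded" rather than "gap at bit zero".

	#include <assert.h>
	#include <strings.h>	/* ffs() */

	/* Same expression as bvec_seg_gap(), on bare offsets. */
	static unsigned int seg_gap(unsigned int prev_off, unsigned int prev_len,
				    unsigned int next_off)
	{
		return next_off | (prev_off + prev_len);
	}

	/* Zero means "no gap seen yet", so it never wins the comparison. */
	static unsigned char min_not_zero_u8(unsigned char a, unsigned char b)
	{
		if (!a)
			return b;
		if (!b)
			return a;
		return a < b ? a : b;
	}

	int main(void)
	{
		/* prev ends exactly on a 4K page boundary: lowest gap bit is 2^12,
		 * so only merge boundaries of 4K or less remain possible. */
		assert(ffs(seg_gap(0, 4096, 0)) == 13);
		/* next starts 512 bytes into a page: lowest gap bit drops to 2^9. */
		assert(ffs(seg_gap(0, 4096, 512)) == 10);
		/* Folding two bios keeps the smaller (more restrictive) nonzero bit. */
		assert(min_not_zero_u8(13, 0) == 13);
		assert(min_not_zero_u8(13, 10) == 10);
		return 0;
	}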
block/blk-mq-dma.c (+1 -2)

···
 static inline bool blk_can_dma_map_iova(struct request *req,
 		struct device *dma_dev)
 {
-	return !((queue_virt_boundary(req->q) + 1) &
-		 dma_get_merge_boundary(dma_dev));
+	return !(req_phys_gap_mask(req) & dma_get_merge_boundary(dma_dev));
 }

 static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
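
The new check above compares the request's gap mask to the device's merge boundary instead of the static queue virt boundary. A minimal userspace sketch of that comparison (the stand-in for req_phys_gap_mask() and the 64K granule value are illustrative assumptions):

	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for req_phys_gap_mask(): bits from the lowest gap bit upward.
	 * phys_gap_bit holds an ffs() result, so 0 means "no gap recorded" and
	 * expands to an empty mask. */
	static unsigned long phys_gap_mask(unsigned char phys_gap_bit)
	{
		return ~(((1UL << phys_gap_bit) >> 1) - 1);
	}

	/* Mirrors the blk_can_dma_map_iova() test: the merge boundary is
	 * granule - 1, or 0 when the device cannot merge at all. */
	static bool can_dma_map_iova(unsigned char phys_gap_bit,
				     unsigned long merge_boundary)
	{
		return !(phys_gap_mask(phys_gap_bit) & merge_boundary);
	}

	int main(void)
	{
		unsigned long granule_64k = 0xffff;	/* illustrative 64K granule */

		printf("%d\n", can_dma_map_iova(0, granule_64k));	/* 1: no gaps */
		printf("%d\n", can_dma_map_iova(17, granule_64k));	/* 1: gaps only at 2^16 */
		printf("%d\n", can_dma_map_iova(13, granule_64k));	/* 0: gap at 2^12 */
		return 0;
	}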
block/blk-mq.c (+6)

···
 	INIT_LIST_HEAD(&rq->queuelist);
 	rq->q = q;
 	rq->__sector = (sector_t) -1;
+	rq->phys_gap_bit = 0;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->tag = BLK_MQ_NO_TAG;
···
 		goto out_queue_exit;
 	}
 	rq->__data_len = 0;
+	rq->phys_gap_bit = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
···
 	rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
 	blk_mq_rq_time_init(rq, alloc_time_ns);
 	rq->__data_len = 0;
+	rq->phys_gap_bit = 0;
 	rq->__sector = (sector_t) -1;
 	rq->bio = rq->biotail = NULL;
 	return rq;
···
 	rq->bio = rq->biotail = bio;
 	rq->__sector = bio->bi_iter.bi_sector;
 	rq->__data_len = bio->bi_iter.bi_size;
+	rq->phys_gap_bit = bio->bi_bvec_gap_bit;
+
 	rq->nr_phys_segments = nr_segs;
 	if (bio_integrity(bio))
 		rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
···
 	}
 	rq->nr_phys_segments = rq_src->nr_phys_segments;
 	rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+	rq->phys_gap_bit = rq_src->phys_gap_bit;

 	if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
 		goto free_and_out;
include/linux/bio.h (+2)

···
 		gfp_t gfp, struct bio_set *bs);
 int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
 		unsigned *segs, unsigned max_bytes, unsigned len_align);
+u8 bio_seg_gap(struct request_queue *q, struct bio *prev, struct bio *next,
+	       u8 gaps_bit);

 /**
  * bio_next_split - get next @sectors from a bio, splitting if necessary
include/linux/blk-mq.h (+16)

···
 	unsigned short nr_phys_segments;
 	unsigned short nr_integrity_segments;

+	/*
+	 * The lowest set bit for address gaps between physical segments. This
+	 * provides information necessary for dma optimization opportunities,
+	 * like for testing if the segments can be coalesced against the
+	 * device's iommu granule.
+	 */
+	unsigned char phys_gap_bit;
+
 #ifdef CONFIG_BLK_INLINE_ENCRYPTION
 	struct bio_crypt_ctx *crypt_ctx;
 	struct blk_crypto_keyslot *crypt_keyslot;
···
 	rq_end_io_fn *end_io;
 	void *end_io_data;
 };
+
+/*
+ * Returns a mask with all bits starting at req->phys_gap_bit set to 1.
+ */
+static inline unsigned long req_phys_gap_mask(const struct request *req)
+{
+	return ~(((1 << req->phys_gap_bit) >> 1) - 1);
+}

 static inline enum req_op req_op(const struct request *req)
 {
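
Since phys_gap_bit stores an ffs() result (1-indexed, 0 meaning "no gap"), req_phys_gap_mask() above expands bit b into a mask covering bit b-1 and everything above it. A few spot checks of the arithmetic (standalone demo, same expression parameterized):

	#include <assert.h>

	/* Same arithmetic as req_phys_gap_mask(), taken out of the struct. */
	static unsigned long gap_mask(unsigned char bit)
	{
		return ~(((1UL << bit) >> 1) - 1);
	}

	int main(void)
	{
		/* bit == 0: no gap recorded, the subtraction wraps, mask is empty. */
		assert(gap_mask(0) == 0);
		/* ffs() is 1-indexed: bit == 1 means a gap at 2^0, full mask. */
		assert(gap_mask(1) == ~0UL);
		/* bit == 13 (gap at 2^12): bits 12 and up are set. */
		assert(gap_mask(13) == ~0xfffUL);
		return 0;
	}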
include/linux/blk_types.h (+12)

···
 	enum rw_hint bi_write_hint;
 	u8 bi_write_stream;
 	blk_status_t bi_status;
+
+	/*
+	 * The bvec gap bit indicates the lowest set bit in any address offset
+	 * between all bi_io_vecs. This field is initialized only after the bio
+	 * is split to the hardware limits (see bio_split_io_at()). The value
+	 * may be used to consider DMA optimization when performing that
+	 * mapping. The value is compared to a power of two mask where the
+	 * result depends on any bit set within the mask, so saving the lowest
+	 * bit is sufficient to know if any segment gap collides with the mask.
+	 */
+	u8 bi_bvec_gap_bit;
+
 	atomic_t __bi_remaining;

 	struct bvec_iter bi_iter;
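
The sufficiency claim in that comment can be checked exhaustively for small values (standalone demo, bounds are illustrative): against any power-of-two mask, testing the full gap word and testing only its lowest set bit always agree.

	#include <assert.h>
	#include <strings.h>	/* ffs() */

	int main(void)
	{
		for (unsigned int gaps = 1; gaps < (1u << 12); gaps++) {
			for (unsigned int k = 1; k <= 12; k++) {
				unsigned int mask = (1u << k) - 1;
				int bit = ffs(gaps);
				unsigned int low_mask = ~((1u << (bit - 1)) - 1);

				/* Full test and lowest-bit test always agree. */
				assert(!!(gaps & mask) == !!(low_mask & mask));
			}
		}
		return 0;
	}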