Merge tag 'io_uring-20190323' of git://git.kernel.dk/linux-block

+10

MAINTAINERS

··· 8096 8096 F: include/linux/of_iommu.h 8097 8097 F: include/linux/iova.h 8098 8098 8099 + IO_URING 8100 + M: Jens Axboe <axboe@kernel.dk> 8101 + L: linux-block@vger.kernel.org 8102 + L: linux-fsdevel@vger.kernel.org 8103 + T: git git://git.kernel.dk/linux-block 8104 + T: git git://git.kernel.dk/liburing 8105 + S: Maintained 8106 + F: fs/io_uring.c 8107 + F: include/uapi/linux/io_uring.h 8108 + 8099 8109 IP MASQUERADING 8100 8110 M: Juanjo Ciarlante <jjciarla@raiz.uncu.edu.ar> 8101 8111 S: Maintained

+24 -19

block/bio.c

··· 849 849 size = bio_add_page(bio, bv->bv_page, len, 850 850 bv->bv_offset + iter->iov_offset); 851 851 if (size == len) { 852 - struct page *page; 853 - int i; 852 + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 853 + struct page *page; 854 + int i; 854 855 855 - /* 856 - * For the normal O_DIRECT case, we could skip grabbing this 857 - * reference and then not have to put them again when IO 858 - * completes. But this breaks some in-kernel users, like 859 - * splicing to/from a loop device, where we release the pipe 860 - * pages unconditionally. If we can fix that case, we can 861 - * get rid of the get here and the need to call 862 - * bio_release_pages() at IO completion time. 863 - */ 864 - mp_bvec_for_each_page(page, bv, i) 865 - get_page(page); 856 + mp_bvec_for_each_page(page, bv, i) 857 + get_page(page); 858 + } 859 + 866 860 iov_iter_advance(iter, size); 867 861 return 0; 868 862 } ··· 919 925 * This takes either an iterator pointing to user memory, or one pointing to 920 926 * kernel pages (BVEC iterator). If we're adding user pages, we pin them and 921 927 * map them into the kernel. On IO completion, the caller should put those 922 - * pages. For now, when adding kernel pages, we still grab a reference to the 923 - * page. This isn't strictly needed for the common case, but some call paths 924 - * end up releasing pages from eg a pipe and we can't easily control these. 925 - * See comment in __bio_iov_bvec_add_pages(). 928 + * pages. If we're adding kernel pages, and the caller told us it's safe to 929 + * do so, we just have to add the pages to the bio directly. We don't grab an 930 + * extra reference to those pages (the user should already have that), and we 931 + * don't put the page on IO completion. The caller needs to check if the bio is 932 + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be 933 + * released. 926 934 * 927 935 * The function tries, but does not guarantee, to pin as many pages as 928 936 * fit into the bio, or are requested in *iter, whatever is smaller. If ··· 935 939 { 936 940 const bool is_bvec = iov_iter_is_bvec(iter); 937 941 unsigned short orig_vcnt = bio->bi_vcnt; 942 + 943 + /* 944 + * If this is a BVEC iter, then the pages are kernel pages. Don't 945 + * release them on IO completion, if the caller asked us to. 946 + */ 947 + if (is_bvec && iov_iter_bvec_no_ref(iter)) 948 + bio_set_flag(bio, BIO_NO_PAGE_REF); 938 949 939 950 do { 940 951 int ret; ··· 1699 1696 next = bio->bi_private; 1700 1697 1701 1698 bio_set_pages_dirty(bio); 1702 - bio_release_pages(bio); 1699 + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) 1700 + bio_release_pages(bio); 1703 1701 bio_put(bio); 1704 1702 } 1705 1703 } ··· 1717 1713 goto defer; 1718 1714 } 1719 1715 1720 - bio_release_pages(bio); 1716 + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) 1717 + bio_release_pages(bio); 1721 1718 bio_put(bio); 1722 1719 return; 1723 1720 defer:

+7 -5

fs/block_dev.c

··· 336 336 if (should_dirty) { 337 337 bio_check_pages_dirty(bio); 338 338 } else { 339 - struct bio_vec *bvec; 340 - int i; 341 - struct bvec_iter_all iter_all; 339 + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 340 + struct bvec_iter_all iter_all; 341 + struct bio_vec *bvec; 342 + int i; 342 343 343 - bio_for_each_segment_all(bvec, bio, i, iter_all) 344 - put_page(bvec->bv_page); 344 + bio_for_each_segment_all(bvec, bio, i, iter_all) 345 + put_page(bvec->bv_page); 346 + } 345 347 bio_put(bio); 346 348 } 347 349 }

+220 -227

fs/io_uring.c

··· 189 189 bool needs_fixed_file; 190 190 }; 191 191 192 + /* 193 + * First field must be the file pointer in all the 194 + * iocb unions! See also 'struct kiocb' in <linux/fs.h> 195 + */ 192 196 struct io_poll_iocb { 193 197 struct file *file; 194 198 struct wait_queue_head *head; 195 199 __poll_t events; 196 - bool woken; 200 + bool done; 197 201 bool canceled; 198 202 struct wait_queue_entry wait; 199 203 }; 200 204 205 + /* 206 + * NOTE! Each of the iocb union members has the file pointer 207 + * as the first entry in their struct definition. So you can 208 + * access the file pointer through any of the sub-structs, 209 + * or directly as just 'ki_filp' in this struct. 210 + */ 201 211 struct io_kiocb { 202 212 union { 213 + struct file *file; 203 214 struct kiocb rw; 204 215 struct io_poll_iocb poll; 205 216 }; ··· 225 214 #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ 226 215 #define REQ_F_FIXED_FILE 4 /* ctx owns file */ 227 216 #define REQ_F_SEQ_PREV 8 /* sequential with previous */ 217 + #define REQ_F_PREPPED 16 /* prep already done */ 228 218 u64 user_data; 229 219 u64 error; 230 220 ··· 367 355 } 368 356 } 369 357 370 - static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, 358 + static void io_cqring_ev_posted(struct io_ring_ctx *ctx) 359 + { 360 + if (waitqueue_active(&ctx->wait)) 361 + wake_up(&ctx->wait); 362 + if (waitqueue_active(&ctx->sqo_wait)) 363 + wake_up(&ctx->sqo_wait); 364 + } 365 + 366 + static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, 371 367 long res, unsigned ev_flags) 372 368 { 373 369 unsigned long flags; 374 370 375 371 spin_lock_irqsave(&ctx->completion_lock, flags); 376 - io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); 372 + io_cqring_fill_event(ctx, user_data, res, ev_flags); 377 373 io_commit_cqring(ctx); 378 374 spin_unlock_irqrestore(&ctx->completion_lock, flags); 379 375 380 - if (waitqueue_active(&ctx->wait)) 381 - wake_up(&ctx->wait); 382 - if (waitqueue_active(&ctx->sqo_wait)) 383 - wake_up(&ctx->sqo_wait); 376 + io_cqring_ev_posted(ctx); 384 377 } 385 378 386 379 static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) ··· 399 382 static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, 400 383 struct io_submit_state *state) 401 384 { 385 + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; 402 386 struct io_kiocb *req; 403 387 404 388 if (!percpu_ref_tryget(&ctx->refs)) 405 389 return NULL; 406 390 407 391 if (!state) { 408 - req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); 392 + req = kmem_cache_alloc(req_cachep, gfp); 409 393 if (unlikely(!req)) 410 394 goto out; 411 395 } else if (!state->free_reqs) { ··· 414 396 int ret; 415 397 416 398 sz = min_t(size_t, state->ios_left, ARRAY_SIZE(state->reqs)); 417 - ret = kmem_cache_alloc_bulk(req_cachep, __GFP_NOWARN, sz, 418 - state->reqs); 419 - if (unlikely(ret <= 0)) 420 - goto out; 399 + ret = kmem_cache_alloc_bulk(req_cachep, gfp, sz, state->reqs); 400 + 401 + /* 402 + * Bulk alloc is all-or-nothing. If we fail to get a batch, 403 + * retry single alloc to be on the safe side. 404 + */ 405 + if (unlikely(ret <= 0)) { 406 + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); 407 + if (!state->reqs[0]) 408 + goto out; 409 + ret = 1; 410 + } 421 411 state->free_reqs = ret - 1; 422 412 state->cur_req = 1; 423 413 req = state->reqs[0]; ··· 437 411 438 412 req->ctx = ctx; 439 413 req->flags = 0; 440 - refcount_set(&req->refs, 0); 414 + /* one is dropped after submission, the other at completion */ 415 + refcount_set(&req->refs, 2); 441 416 return req; 442 417 out: 443 418 io_ring_drop_ctx_refs(ctx, 1); ··· 456 429 457 430 static void io_free_req(struct io_kiocb *req) 458 431 { 459 - if (!refcount_read(&req->refs) || refcount_dec_and_test(&req->refs)) { 460 - io_ring_drop_ctx_refs(req->ctx, 1); 461 - kmem_cache_free(req_cachep, req); 462 - } 432 + if (req->file && !(req->flags & REQ_F_FIXED_FILE)) 433 + fput(req->file); 434 + io_ring_drop_ctx_refs(req->ctx, 1); 435 + kmem_cache_free(req_cachep, req); 436 + } 437 + 438 + static void io_put_req(struct io_kiocb *req) 439 + { 440 + if (refcount_dec_and_test(&req->refs)) 441 + io_free_req(req); 463 442 } 464 443 465 444 /* ··· 475 442 struct list_head *done) 476 443 { 477 444 void *reqs[IO_IOPOLL_BATCH]; 478 - int file_count, to_free; 479 - struct file *file = NULL; 480 445 struct io_kiocb *req; 446 + int to_free; 481 447 482 - file_count = to_free = 0; 448 + to_free = 0; 483 449 while (!list_empty(done)) { 484 450 req = list_first_entry(done, struct io_kiocb, list); 485 451 list_del(&req->list); 486 452 487 453 io_cqring_fill_event(ctx, req->user_data, req->error, 0); 488 - 489 - reqs[to_free++] = req; 490 454 (*nr_events)++; 491 455 492 - /* 493 - * Batched puts of the same file, to avoid dirtying the 494 - * file usage count multiple times, if avoidable. 495 - */ 496 - if (!(req->flags & REQ_F_FIXED_FILE)) { 497 - if (!file) { 498 - file = req->rw.ki_filp; 499 - file_count = 1; 500 - } else if (file == req->rw.ki_filp) { 501 - file_count++; 456 + if (refcount_dec_and_test(&req->refs)) { 457 + /* If we're not using fixed files, we have to pair the 458 + * completion part with the file put. Use regular 459 + * completions for those, only batch free for fixed 460 + * file. 461 + */ 462 + if (req->flags & REQ_F_FIXED_FILE) { 463 + reqs[to_free++] = req; 464 + if (to_free == ARRAY_SIZE(reqs)) 465 + io_free_req_many(ctx, reqs, &to_free); 502 466 } else { 503 - fput_many(file, file_count); 504 - file = req->rw.ki_filp; 505 - file_count = 1; 467 + io_free_req(req); 506 468 } 507 469 } 508 - 509 - if (to_free == ARRAY_SIZE(reqs)) 510 - io_free_req_many(ctx, reqs, &to_free); 511 470 } 512 - io_commit_cqring(ctx); 513 471 514 - if (file) 515 - fput_many(file, file_count); 472 + io_commit_cqring(ctx); 516 473 io_free_req_many(ctx, reqs, &to_free); 517 474 } 518 475 ··· 625 602 } 626 603 } 627 604 628 - static void io_fput(struct io_kiocb *req) 629 - { 630 - if (!(req->flags & REQ_F_FIXED_FILE)) 631 - fput(req->rw.ki_filp); 632 - } 633 - 634 605 static void io_complete_rw(struct kiocb *kiocb, long res, long res2) 635 606 { 636 607 struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); 637 608 638 609 kiocb_end_write(kiocb); 639 610 640 - io_fput(req); 641 611 io_cqring_add_event(req->ctx, req->user_data, res, 0); 642 - io_free_req(req); 612 + io_put_req(req); 643 613 } 644 614 645 615 static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) ··· 747 731 const struct io_uring_sqe *sqe = s->sqe; 748 732 struct io_ring_ctx *ctx = req->ctx; 749 733 struct kiocb *kiocb = &req->rw; 750 - unsigned ioprio, flags; 751 - int fd, ret; 734 + unsigned ioprio; 735 + int ret; 752 736 737 + if (!req->file) 738 + return -EBADF; 753 739 /* For -EAGAIN retry, everything is already prepped */ 754 - if (kiocb->ki_filp) 740 + if (req->flags & REQ_F_PREPPED) 755 741 return 0; 756 742 757 - flags = READ_ONCE(sqe->flags); 758 - fd = READ_ONCE(sqe->fd); 743 + if (force_nonblock && !io_file_supports_async(req->file)) 744 + force_nonblock = false; 759 745 760 - if (flags & IOSQE_FIXED_FILE) { 761 - if (unlikely(!ctx->user_files || 762 - (unsigned) fd >= ctx->nr_user_files)) 763 - return -EBADF; 764 - kiocb->ki_filp = ctx->user_files[fd]; 765 - req->flags |= REQ_F_FIXED_FILE; 766 - } else { 767 - if (s->needs_fixed_file) 768 - return -EBADF; 769 - kiocb->ki_filp = io_file_get(state, fd); 770 - if (unlikely(!kiocb->ki_filp)) 771 - return -EBADF; 772 - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) 773 - force_nonblock = false; 774 - } 775 746 kiocb->ki_pos = READ_ONCE(sqe->off); 776 747 kiocb->ki_flags = iocb_flags(kiocb->ki_filp); 777 748 kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); ··· 767 764 if (ioprio) { 768 765 ret = ioprio_check_cap(ioprio); 769 766 if (ret) 770 - goto out_fput; 767 + return ret; 771 768 772 769 kiocb->ki_ioprio = ioprio; 773 770 } else ··· 775 772 776 773 ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); 777 774 if (unlikely(ret)) 778 - goto out_fput; 775 + return ret; 779 776 if (force_nonblock) { 780 777 kiocb->ki_flags |= IOCB_NOWAIT; 781 778 req->flags |= REQ_F_FORCE_NONBLOCK; 782 779 } 783 780 if (ctx->flags & IORING_SETUP_IOPOLL) { 784 - ret = -EOPNOTSUPP; 785 781 if (!(kiocb->ki_flags & IOCB_DIRECT) || 786 782 !kiocb->ki_filp->f_op->iopoll) 787 - goto out_fput; 783 + return -EOPNOTSUPP; 788 784 789 785 req->error = 0; 790 786 kiocb->ki_flags |= IOCB_HIPRI; 791 787 kiocb->ki_complete = io_complete_rw_iopoll; 792 788 } else { 793 - if (kiocb->ki_flags & IOCB_HIPRI) { 794 - ret = -EINVAL; 795 - goto out_fput; 796 - } 789 + if (kiocb->ki_flags & IOCB_HIPRI) 790 + return -EINVAL; 797 791 kiocb->ki_complete = io_complete_rw; 798 792 } 793 + req->flags |= REQ_F_PREPPED; 799 794 return 0; 800 - out_fput: 801 - if (!(flags & IOSQE_FIXED_FILE)) { 802 - /* 803 - * in case of error, we didn't use this file reference. drop it. 804 - */ 805 - if (state) 806 - state->used_refs--; 807 - io_file_put(state, kiocb->ki_filp); 808 - } 809 - return ret; 810 795 } 811 796 812 797 static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) ··· 855 864 iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); 856 865 if (offset) 857 866 iov_iter_advance(iter, offset); 867 + 868 + /* don't drop a reference to these pages */ 869 + iter->type |= ITER_BVEC_FLAG_NO_REF; 858 870 return 0; 859 871 } 860 872 ··· 881 887 opcode = READ_ONCE(sqe->opcode); 882 888 if (opcode == IORING_OP_READ_FIXED || 883 889 opcode == IORING_OP_WRITE_FIXED) { 884 - ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); 890 + int ret = io_import_fixed(ctx, rw, sqe, iter); 885 891 *iovec = NULL; 886 892 return ret; 887 893 } ··· 939 945 async_list->io_end = io_end; 940 946 } 941 947 942 - static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, 943 - bool force_nonblock, struct io_submit_state *state) 948 + static int io_read(struct io_kiocb *req, const struct sqe_submit *s, 949 + bool force_nonblock, struct io_submit_state *state) 944 950 { 945 951 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 946 952 struct kiocb *kiocb = &req->rw; 947 953 struct iov_iter iter; 948 954 struct file *file; 949 955 size_t iov_count; 950 - ssize_t ret; 956 + int ret; 951 957 952 958 ret = io_prep_rw(req, s, force_nonblock, state); 953 959 if (ret) 954 960 return ret; 955 961 file = kiocb->ki_filp; 956 962 957 - ret = -EBADF; 958 963 if (unlikely(!(file->f_mode & FMODE_READ))) 959 - goto out_fput; 960 - ret = -EINVAL; 964 + return -EBADF; 961 965 if (unlikely(!file->f_op->read_iter)) 962 - goto out_fput; 966 + return -EINVAL; 963 967 964 968 ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); 965 969 if (ret) 966 - goto out_fput; 970 + return ret; 967 971 968 972 iov_count = iov_iter_count(&iter); 969 973 ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); ··· 983 991 } 984 992 } 985 993 kfree(iovec); 986 - out_fput: 987 - /* Hold on to the file for -EAGAIN */ 988 - if (unlikely(ret && ret != -EAGAIN)) 989 - io_fput(req); 990 994 return ret; 991 995 } 992 996 993 - static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, 994 - bool force_nonblock, struct io_submit_state *state) 997 + static int io_write(struct io_kiocb *req, const struct sqe_submit *s, 998 + bool force_nonblock, struct io_submit_state *state) 995 999 { 996 1000 struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; 997 1001 struct kiocb *kiocb = &req->rw; 998 1002 struct iov_iter iter; 999 1003 struct file *file; 1000 1004 size_t iov_count; 1001 - ssize_t ret; 1005 + int ret; 1002 1006 1003 1007 ret = io_prep_rw(req, s, force_nonblock, state); 1004 1008 if (ret) 1005 1009 return ret; 1006 1010 1007 - ret = -EBADF; 1008 1011 file = kiocb->ki_filp; 1009 1012 if (unlikely(!(file->f_mode & FMODE_WRITE))) 1010 - goto out_fput; 1011 - ret = -EINVAL; 1013 + return -EBADF; 1012 1014 if (unlikely(!file->f_op->write_iter)) 1013 - goto out_fput; 1015 + return -EINVAL; 1014 1016 1015 1017 ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); 1016 1018 if (ret) 1017 - goto out_fput; 1019 + return ret; 1018 1020 1019 1021 iov_count = iov_iter_count(&iter); 1020 1022 ··· 1040 1054 } 1041 1055 out_free: 1042 1056 kfree(iovec); 1043 - out_fput: 1044 - /* Hold on to the file for -EAGAIN */ 1045 - if (unlikely(ret && ret != -EAGAIN)) 1046 - io_fput(req); 1047 1057 return ret; 1048 1058 } 1049 1059 ··· 1054 1072 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) 1055 1073 return -EINVAL; 1056 1074 1057 - /* 1058 - * Twilight zone - it's possible that someone issued an opcode that 1059 - * has a file attached, then got -EAGAIN on submission, and changed 1060 - * the sqe before we retried it from async context. Avoid dropping 1061 - * a file reference for this malicious case, and flag the error. 1062 - */ 1063 - if (req->rw.ki_filp) { 1064 - err = -EBADF; 1065 - io_fput(req); 1066 - } 1067 1075 io_cqring_add_event(ctx, user_data, err, 0); 1068 - io_free_req(req); 1076 + io_put_req(req); 1069 1077 return 0; 1070 1078 } 1071 1079 1072 1080 static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) 1073 1081 { 1074 1082 struct io_ring_ctx *ctx = req->ctx; 1075 - unsigned flags; 1076 - int fd; 1077 1083 1078 - /* Prep already done */ 1079 - if (req->rw.ki_filp) 1084 + if (!req->file) 1085 + return -EBADF; 1086 + /* Prep already done (EAGAIN retry) */ 1087 + if (req->flags & REQ_F_PREPPED) 1080 1088 return 0; 1081 1089 1082 1090 if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) ··· 1074 1102 if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) 1075 1103 return -EINVAL; 1076 1104 1077 - fd = READ_ONCE(sqe->fd); 1078 - flags = READ_ONCE(sqe->flags); 1079 - 1080 - if (flags & IOSQE_FIXED_FILE) { 1081 - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) 1082 - return -EBADF; 1083 - req->rw.ki_filp = ctx->user_files[fd]; 1084 - req->flags |= REQ_F_FIXED_FILE; 1085 - } else { 1086 - req->rw.ki_filp = fget(fd); 1087 - if (unlikely(!req->rw.ki_filp)) 1088 - return -EBADF; 1089 - } 1090 - 1105 + req->flags |= REQ_F_PREPPED; 1091 1106 return 0; 1092 1107 } 1093 1108 ··· 1103 1144 end > 0 ? end : LLONG_MAX, 1104 1145 fsync_flags & IORING_FSYNC_DATASYNC); 1105 1146 1106 - io_fput(req); 1107 1147 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); 1108 - io_free_req(req); 1148 + io_put_req(req); 1109 1149 return 0; 1110 1150 } 1111 1151 ··· 1162 1204 spin_unlock_irq(&ctx->completion_lock); 1163 1205 1164 1206 io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); 1165 - io_free_req(req); 1207 + io_put_req(req); 1166 1208 return 0; 1167 1209 } 1168 1210 1169 - static void io_poll_complete(struct io_kiocb *req, __poll_t mask) 1211 + static void io_poll_complete(struct io_ring_ctx *ctx, struct io_kiocb *req, 1212 + __poll_t mask) 1170 1213 { 1171 - io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); 1172 - io_fput(req); 1173 - io_free_req(req); 1214 + req->poll.done = true; 1215 + io_cqring_fill_event(ctx, req->user_data, mangle_poll(mask), 0); 1216 + io_commit_cqring(ctx); 1174 1217 } 1175 1218 1176 1219 static void io_poll_complete_work(struct work_struct *work) ··· 1199 1240 return; 1200 1241 } 1201 1242 list_del_init(&req->list); 1243 + io_poll_complete(ctx, req, mask); 1202 1244 spin_unlock_irq(&ctx->completion_lock); 1203 1245 1204 - io_poll_complete(req, mask); 1246 + io_cqring_ev_posted(ctx); 1247 + io_put_req(req); 1205 1248 } 1206 1249 1207 1250 static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, ··· 1214 1253 struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); 1215 1254 struct io_ring_ctx *ctx = req->ctx; 1216 1255 __poll_t mask = key_to_poll(key); 1217 - 1218 - poll->woken = true; 1256 + unsigned long flags; 1219 1257 1220 1258 /* for instances that support it check for an event match first: */ 1221 - if (mask) { 1222 - unsigned long flags; 1223 - 1224 - if (!(mask & poll->events)) 1225 - return 0; 1226 - 1227 - /* try to complete the iocb inline if we can: */ 1228 - if (spin_trylock_irqsave(&ctx->completion_lock, flags)) { 1229 - list_del(&req->list); 1230 - spin_unlock_irqrestore(&ctx->completion_lock, flags); 1231 - 1232 - list_del_init(&poll->wait.entry); 1233 - io_poll_complete(req, mask); 1234 - return 1; 1235 - } 1236 - } 1259 + if (mask && !(mask & poll->events)) 1260 + return 0; 1237 1261 1238 1262 list_del_init(&poll->wait.entry); 1239 - queue_work(ctx->sqo_wq, &req->work); 1263 + 1264 + if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) { 1265 + list_del(&req->list); 1266 + io_poll_complete(ctx, req, mask); 1267 + spin_unlock_irqrestore(&ctx->completion_lock, flags); 1268 + 1269 + io_cqring_ev_posted(ctx); 1270 + io_put_req(req); 1271 + } else { 1272 + queue_work(ctx->sqo_wq, &req->work); 1273 + } 1274 + 1240 1275 return 1; 1241 1276 } 1242 1277 ··· 1262 1305 struct io_poll_iocb *poll = &req->poll; 1263 1306 struct io_ring_ctx *ctx = req->ctx; 1264 1307 struct io_poll_table ipt; 1265 - unsigned flags; 1308 + bool cancel = false; 1266 1309 __poll_t mask; 1267 1310 u16 events; 1268 - int fd; 1269 1311 1270 1312 if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) 1271 1313 return -EINVAL; 1272 1314 if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) 1273 1315 return -EINVAL; 1316 + if (!poll->file) 1317 + return -EBADF; 1274 1318 1275 1319 INIT_WORK(&req->work, io_poll_complete_work); 1276 1320 events = READ_ONCE(sqe->poll_events); 1277 1321 poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; 1278 1322 1279 - flags = READ_ONCE(sqe->flags); 1280 - fd = READ_ONCE(sqe->fd); 1281 - 1282 - if (flags & IOSQE_FIXED_FILE) { 1283 - if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) 1284 - return -EBADF; 1285 - poll->file = ctx->user_files[fd]; 1286 - req->flags |= REQ_F_FIXED_FILE; 1287 - } else { 1288 - poll->file = fget(fd); 1289 - } 1290 - if (unlikely(!poll->file)) 1291 - return -EBADF; 1292 - 1293 1323 poll->head = NULL; 1294 - poll->woken = false; 1324 + poll->done = false; 1295 1325 poll->canceled = false; 1296 1326 1297 1327 ipt.pt._qproc = io_poll_queue_proc; ··· 1290 1346 INIT_LIST_HEAD(&poll->wait.entry); 1291 1347 init_waitqueue_func_entry(&poll->wait, io_poll_wake); 1292 1348 1293 - /* one for removal from waitqueue, one for this function */ 1294 - refcount_set(&req->refs, 2); 1295 - 1296 1349 mask = vfs_poll(poll->file, &ipt.pt) & poll->events; 1297 - if (unlikely(!poll->head)) { 1298 - /* we did not manage to set up a waitqueue, done */ 1299 - goto out; 1300 - } 1301 1350 1302 1351 spin_lock_irq(&ctx->completion_lock); 1303 - spin_lock(&poll->head->lock); 1304 - if (poll->woken) { 1305 - /* wake_up context handles the rest */ 1306 - mask = 0; 1307 - ipt.error = 0; 1308 - } else if (mask || ipt.error) { 1309 - /* if we get an error or a mask we are done */ 1310 - WARN_ON_ONCE(list_empty(&poll->wait.entry)); 1311 - list_del_init(&poll->wait.entry); 1312 - } else { 1313 - /* actually waiting for an event */ 1314 - list_add_tail(&req->list, &ctx->cancel_list); 1352 + if (likely(poll->head)) { 1353 + spin_lock(&poll->head->lock); 1354 + if (unlikely(list_empty(&poll->wait.entry))) { 1355 + if (ipt.error) 1356 + cancel = true; 1357 + ipt.error = 0; 1358 + mask = 0; 1359 + } 1360 + if (mask || ipt.error) 1361 + list_del_init(&poll->wait.entry); 1362 + else if (cancel) 1363 + WRITE_ONCE(poll->canceled, true); 1364 + else if (!poll->done) /* actually waiting for an event */ 1365 + list_add_tail(&req->list, &ctx->cancel_list); 1366 + spin_unlock(&poll->head->lock); 1315 1367 } 1316 - spin_unlock(&poll->head->lock); 1368 + if (mask) { /* no async, we'd stolen it */ 1369 + req->error = mangle_poll(mask); 1370 + ipt.error = 0; 1371 + io_poll_complete(ctx, req, mask); 1372 + } 1317 1373 spin_unlock_irq(&ctx->completion_lock); 1318 1374 1319 - out: 1320 - if (unlikely(ipt.error)) { 1321 - if (!(flags & IOSQE_FIXED_FILE)) 1322 - fput(poll->file); 1323 - /* 1324 - * Drop one of our refs to this req, __io_submit_sqe() will 1325 - * drop the other one since we're returning an error. 1326 - */ 1327 - io_free_req(req); 1328 - return ipt.error; 1375 + if (mask) { 1376 + io_cqring_ev_posted(ctx); 1377 + io_put_req(req); 1329 1378 } 1330 - 1331 - if (mask) 1332 - io_poll_complete(req, mask); 1333 - io_free_req(req); 1334 - return 0; 1379 + return ipt.error; 1335 1380 } 1336 1381 1337 1382 static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, 1338 1383 const struct sqe_submit *s, bool force_nonblock, 1339 1384 struct io_submit_state *state) 1340 1385 { 1341 - ssize_t ret; 1342 - int opcode; 1386 + int ret, opcode; 1343 1387 1344 1388 if (unlikely(s->index >= ctx->sq_entries)) 1345 1389 return -EINVAL; ··· 1456 1524 break; 1457 1525 cond_resched(); 1458 1526 } while (1); 1527 + 1528 + /* drop submission reference */ 1529 + io_put_req(req); 1459 1530 } 1460 1531 if (ret) { 1461 1532 io_cqring_add_event(ctx, sqe->user_data, ret, 0); 1462 - io_free_req(req); 1533 + io_put_req(req); 1463 1534 } 1464 1535 1465 1536 /* async context always use a copy of the sqe */ ··· 1549 1614 return ret; 1550 1615 } 1551 1616 1617 + static bool io_op_needs_file(const struct io_uring_sqe *sqe) 1618 + { 1619 + int op = READ_ONCE(sqe->opcode); 1620 + 1621 + switch (op) { 1622 + case IORING_OP_NOP: 1623 + case IORING_OP_POLL_REMOVE: 1624 + return false; 1625 + default: 1626 + return true; 1627 + } 1628 + } 1629 + 1630 + static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, 1631 + struct io_submit_state *state, struct io_kiocb *req) 1632 + { 1633 + unsigned flags; 1634 + int fd; 1635 + 1636 + flags = READ_ONCE(s->sqe->flags); 1637 + fd = READ_ONCE(s->sqe->fd); 1638 + 1639 + if (!io_op_needs_file(s->sqe)) { 1640 + req->file = NULL; 1641 + return 0; 1642 + } 1643 + 1644 + if (flags & IOSQE_FIXED_FILE) { 1645 + if (unlikely(!ctx->user_files || 1646 + (unsigned) fd >= ctx->nr_user_files)) 1647 + return -EBADF; 1648 + req->file = ctx->user_files[fd]; 1649 + req->flags |= REQ_F_FIXED_FILE; 1650 + } else { 1651 + if (s->needs_fixed_file) 1652 + return -EBADF; 1653 + req->file = io_file_get(state, fd); 1654 + if (unlikely(!req->file)) 1655 + return -EBADF; 1656 + } 1657 + 1658 + return 0; 1659 + } 1660 + 1552 1661 static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, 1553 1662 struct io_submit_state *state) 1554 1663 { 1555 1664 struct io_kiocb *req; 1556 - ssize_t ret; 1665 + int ret; 1557 1666 1558 1667 /* enforce forwards compatibility on users */ 1559 1668 if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) ··· 1607 1628 if (unlikely(!req)) 1608 1629 return -EAGAIN; 1609 1630 1610 - req->rw.ki_filp = NULL; 1631 + ret = io_req_set_file(ctx, s, state, req); 1632 + if (unlikely(ret)) 1633 + goto out; 1611 1634 1612 1635 ret = __io_submit_sqe(ctx, req, s, true, state); 1613 1636 if (ret == -EAGAIN) { ··· 1630 1649 INIT_WORK(&req->work, io_sq_wq_submit_work); 1631 1650 queue_work(ctx->sqo_wq, &req->work); 1632 1651 } 1633 - ret = 0; 1652 + 1653 + /* 1654 + * Queued up for async execution, worker will release 1655 + * submit reference when the iocb is actually 1656 + * submitted. 1657 + */ 1658 + return 0; 1634 1659 } 1635 1660 } 1661 + 1662 + out: 1663 + /* drop submission reference */ 1664 + io_put_req(req); 1665 + 1666 + /* and drop final reference, if we failed */ 1636 1667 if (ret) 1637 - io_free_req(req); 1668 + io_put_req(req); 1638 1669 1639 1670 return ret; 1640 1671 }

+7 -5

fs/iomap.c

··· 1589 1589 if (should_dirty) { 1590 1590 bio_check_pages_dirty(bio); 1591 1591 } else { 1592 - struct bio_vec *bvec; 1593 - int i; 1594 - struct bvec_iter_all iter_all; 1592 + if (!bio_flagged(bio, BIO_NO_PAGE_REF)) { 1593 + struct bvec_iter_all iter_all; 1594 + struct bio_vec *bvec; 1595 + int i; 1595 1596 1596 - bio_for_each_segment_all(bvec, bio, i, iter_all) 1597 - put_page(bvec->bv_page); 1597 + bio_for_each_segment_all(bvec, bio, i, iter_all) 1598 + put_page(bvec->bv_page); 1599 + } 1598 1600 bio_put(bio); 1599 1601 } 1600 1602 }

+1

include/linux/blk_types.h

··· 215 215 /* 216 216 * bio flags 217 217 */ 218 + #define BIO_NO_PAGE_REF 0 /* don't put release vec pages */ 218 219 #define BIO_SEG_VALID 1 /* bi_phys_segments valid */ 219 220 #define BIO_CLONED 2 /* doesn't own data */ 220 221 #define BIO_BOUNCED 3 /* bio is a bounce bio */

+19 -5

include/linux/uio.h

··· 23 23 }; 24 24 25 25 enum iter_type { 26 - ITER_IOVEC = 0, 27 - ITER_KVEC = 2, 28 - ITER_BVEC = 4, 29 - ITER_PIPE = 8, 30 - ITER_DISCARD = 16, 26 + /* set if ITER_BVEC doesn't hold a bv_page ref */ 27 + ITER_BVEC_FLAG_NO_REF = 2, 28 + 29 + /* iter types */ 30 + ITER_IOVEC = 4, 31 + ITER_KVEC = 8, 32 + ITER_BVEC = 16, 33 + ITER_PIPE = 32, 34 + ITER_DISCARD = 64, 31 35 }; 32 36 33 37 struct iov_iter { 38 + /* 39 + * Bit 0 is the read/write bit, set if we're writing. 40 + * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and 41 + * the caller isn't expecting to drop a page reference when done. 42 + */ 34 43 unsigned int type; 35 44 size_t iov_offset; 36 45 size_t count; ··· 91 82 static inline unsigned char iov_iter_rw(const struct iov_iter *i) 92 83 { 93 84 return i->type & (READ | WRITE); 85 + } 86 + 87 + static inline bool iov_iter_bvec_no_ref(const struct iov_iter *i) 88 + { 89 + return (i->type & ITER_BVEC_FLAG_NO_REF) != 0; 94 90 } 95 91 96 92 /*

Configure Feed

Configure Feed