Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'ceph-for-4.17-rc5' of git://github.com/ceph/ceph-client

Pull ceph fixes from Ilya Dryomov:
"These patches fix two long-standing bugs in the DIO code path, one of
which is a crash trivially triggerable with splice()"

* tag 'ceph-for-4.17-rc5' of git://github.com/ceph/ceph-client:
  ceph: fix iov_iter issues in ceph_direct_read_write()
  libceph: add osd_req_op_extent_osd_data_bvecs()
  ceph: fix rsize/wsize capping in ceph_direct_read_write()

+158 -90
+3 -1
drivers/block/rbd.c
@@ -2366,7 +2366,9 @@
 	osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
 			    "copyup");
 	osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
-					  obj_req->copyup_bvecs, bytes);
+					  obj_req->copyup_bvecs,
+					  obj_req->copyup_bvec_count,
+					  bytes);
 
 	switch (obj_req->img_request->op_type) {
 	case OBJ_OP_WRITE:
+122 -83
fs/ceph/file.c
··· 70 70 */ 71 71 72 72 /* 73 - * Calculate the length sum of direct io vectors that can 74 - * be combined into one page vector. 73 + * How many pages to get in one call to iov_iter_get_pages(). This 74 + * determines the size of the on-stack array used as a buffer. 75 75 */ 76 - static size_t dio_get_pagev_size(const struct iov_iter *it) 77 - { 78 - const struct iovec *iov = it->iov; 79 - const struct iovec *iovend = iov + it->nr_segs; 80 - size_t size; 76 + #define ITER_GET_BVECS_PAGES 64 81 77 82 - size = iov->iov_len - it->iov_offset; 83 - /* 84 - * An iov can be page vectored when both the current tail 85 - * and the next base are page aligned. 86 - */ 87 - while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) && 88 - (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) { 89 - size += iov->iov_len; 90 - } 91 - dout("dio_get_pagevlen len = %zu\n", size); 92 - return size; 78 + static ssize_t __iter_get_bvecs(struct iov_iter *iter, size_t maxsize, 79 + struct bio_vec *bvecs) 80 + { 81 + size_t size = 0; 82 + int bvec_idx = 0; 83 + 84 + if (maxsize > iov_iter_count(iter)) 85 + maxsize = iov_iter_count(iter); 86 + 87 + while (size < maxsize) { 88 + struct page *pages[ITER_GET_BVECS_PAGES]; 89 + ssize_t bytes; 90 + size_t start; 91 + int idx = 0; 92 + 93 + bytes = iov_iter_get_pages(iter, pages, maxsize - size, 94 + ITER_GET_BVECS_PAGES, &start); 95 + if (bytes < 0) 96 + return size ?: bytes; 97 + 98 + iov_iter_advance(iter, bytes); 99 + size += bytes; 100 + 101 + for ( ; bytes; idx++, bvec_idx++) { 102 + struct bio_vec bv = { 103 + .bv_page = pages[idx], 104 + .bv_len = min_t(int, bytes, PAGE_SIZE - start), 105 + .bv_offset = start, 106 + }; 107 + 108 + bvecs[bvec_idx] = bv; 109 + bytes -= bv.bv_len; 110 + start = 0; 111 + } 112 + } 113 + 114 + return size; 93 115 } 94 116 95 117 /* 96 - * Allocate a page vector based on (@it, @nbytes). 97 - * The return value is the tuple describing a page vector, 98 - * that is (@pages, @page_align, @num_pages). 
118 + * iov_iter_get_pages() only considers one iov_iter segment, no matter 119 + * what maxsize or maxpages are given. For ITER_BVEC that is a single 120 + * page. 121 + * 122 + * Attempt to get up to @maxsize bytes worth of pages from @iter. 123 + * Return the number of bytes in the created bio_vec array, or an error. 99 124 */ 100 - static struct page ** 101 - dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, 102 - size_t *page_align, int *num_pages) 125 + static ssize_t iter_get_bvecs_alloc(struct iov_iter *iter, size_t maxsize, 126 + struct bio_vec **bvecs, int *num_bvecs) 103 127 { 104 - struct iov_iter tmp_it = *it; 105 - size_t align; 106 - struct page **pages; 107 - int ret = 0, idx, npages; 128 + struct bio_vec *bv; 129 + size_t orig_count = iov_iter_count(iter); 130 + ssize_t bytes; 131 + int npages; 108 132 109 - align = (unsigned long)(it->iov->iov_base + it->iov_offset) & 110 - (PAGE_SIZE - 1); 111 - npages = calc_pages_for(align, nbytes); 112 - pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); 113 - if (!pages) 114 - return ERR_PTR(-ENOMEM); 133 + iov_iter_truncate(iter, maxsize); 134 + npages = iov_iter_npages(iter, INT_MAX); 135 + iov_iter_reexpand(iter, orig_count); 115 136 116 - for (idx = 0; idx < npages; ) { 117 - size_t start; 118 - ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes, 119 - npages - idx, &start); 120 - if (ret < 0) 121 - goto fail; 137 + /* 138 + * __iter_get_bvecs() may populate only part of the array -- zero it 139 + * out. 140 + */ 141 + bv = kvmalloc_array(npages, sizeof(*bv), GFP_KERNEL | __GFP_ZERO); 142 + if (!bv) 143 + return -ENOMEM; 122 144 123 - iov_iter_advance(&tmp_it, ret); 124 - nbytes -= ret; 125 - idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE; 145 + bytes = __iter_get_bvecs(iter, maxsize, bv); 146 + if (bytes < 0) { 147 + /* 148 + * No pages were pinned -- just free the array. 
149 + */ 150 + kvfree(bv); 151 + return bytes; 126 152 } 127 153 128 - BUG_ON(nbytes != 0); 129 - *num_pages = npages; 130 - *page_align = align; 131 - dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align); 132 - return pages; 133 - fail: 134 - ceph_put_page_vector(pages, idx, false); 135 - return ERR_PTR(ret); 154 + *bvecs = bv; 155 + *num_bvecs = npages; 156 + return bytes; 157 + } 158 + 159 + static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) 160 + { 161 + int i; 162 + 163 + for (i = 0; i < num_bvecs; i++) { 164 + if (bvecs[i].bv_page) { 165 + if (should_dirty) 166 + set_page_dirty_lock(bvecs[i].bv_page); 167 + put_page(bvecs[i].bv_page); 168 + } 169 + } 170 + kvfree(bvecs); 136 171 } 137 172 138 173 /* ··· 781 746 struct inode *inode = req->r_inode; 782 747 struct ceph_aio_request *aio_req = req->r_priv; 783 748 struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); 784 - int num_pages = calc_pages_for((u64)osd_data->alignment, 785 - osd_data->length); 786 749 787 - dout("ceph_aio_complete_req %p rc %d bytes %llu\n", 788 - inode, rc, osd_data->length); 750 + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); 751 + BUG_ON(!osd_data->num_bvecs); 752 + 753 + dout("ceph_aio_complete_req %p rc %d bytes %u\n", 754 + inode, rc, osd_data->bvec_pos.iter.bi_size); 789 755 790 756 if (rc == -EOLDSNAPC) { 791 757 struct ceph_aio_work *aio_work; ··· 804 768 } else if (!aio_req->write) { 805 769 if (rc == -ENOENT) 806 770 rc = 0; 807 - if (rc >= 0 && osd_data->length > rc) { 808 - int zoff = osd_data->alignment + rc; 809 - int zlen = osd_data->length - rc; 771 + if (rc >= 0 && osd_data->bvec_pos.iter.bi_size > rc) { 772 + struct iov_iter i; 773 + int zlen = osd_data->bvec_pos.iter.bi_size - rc; 774 + 810 775 /* 811 776 * If read is satisfied by single OSD request, 812 777 * it can pass EOF. 
Otherwise read is within ··· 822 785 aio_req->total_len = rc + zlen; 823 786 } 824 787 825 - if (zlen > 0) 826 - ceph_zero_page_vector_range(zoff, zlen, 827 - osd_data->pages); 788 + iov_iter_bvec(&i, ITER_BVEC, osd_data->bvec_pos.bvecs, 789 + osd_data->num_bvecs, 790 + osd_data->bvec_pos.iter.bi_size); 791 + iov_iter_advance(&i, rc); 792 + iov_iter_zero(zlen, &i); 828 793 } 829 794 } 830 795 831 - ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); 796 + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, 797 + aio_req->should_dirty); 832 798 ceph_osdc_put_request(req); 833 799 834 800 if (rc < 0) ··· 919 879 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 920 880 struct ceph_vino vino; 921 881 struct ceph_osd_request *req; 922 - struct page **pages; 882 + struct bio_vec *bvecs; 923 883 struct ceph_aio_request *aio_req = NULL; 924 884 int num_pages = 0; 925 885 int flags; ··· 954 914 } 955 915 956 916 while (iov_iter_count(iter) > 0) { 957 - u64 size = dio_get_pagev_size(iter); 958 - size_t start = 0; 917 + u64 size = iov_iter_count(iter); 959 918 ssize_t len; 919 + 920 + if (write) 921 + size = min_t(u64, size, fsc->mount_options->wsize); 922 + else 923 + size = min_t(u64, size, fsc->mount_options->rsize); 960 924 961 925 vino = ceph_vino(inode); 962 926 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, ··· 977 933 break; 978 934 } 979 935 980 - if (write) 981 - size = min_t(u64, size, fsc->mount_options->wsize); 982 - else 983 - size = min_t(u64, size, fsc->mount_options->rsize); 984 - 985 - len = size; 986 - pages = dio_get_pages_alloc(iter, len, &start, &num_pages); 987 - if (IS_ERR(pages)) { 936 + len = iter_get_bvecs_alloc(iter, size, &bvecs, &num_pages); 937 + if (len < 0) { 988 938 ceph_osdc_put_request(req); 989 - ret = PTR_ERR(pages); 939 + ret = len; 990 940 break; 991 941 } 942 + if (len != size) 943 + osd_req_op_extent_update(req, 0, len); 992 944 993 945 /* 994 946 * To simplify error handling, allow 
AIO when IO within i_size ··· 1017 977 req->r_mtime = mtime; 1018 978 } 1019 979 1020 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, 1021 - false, false); 980 + osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len); 1022 981 1023 982 if (aio_req) { 1024 983 aio_req->total_len += len; ··· 1030 991 list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); 1031 992 1032 993 pos += len; 1033 - iov_iter_advance(iter, len); 1034 994 continue; 1035 995 } 1036 996 ··· 1042 1004 if (ret == -ENOENT) 1043 1005 ret = 0; 1044 1006 if (ret >= 0 && ret < len && pos + ret < size) { 1007 + struct iov_iter i; 1045 1008 int zlen = min_t(size_t, len - ret, 1046 1009 size - pos - ret); 1047 - ceph_zero_page_vector_range(start + ret, zlen, 1048 - pages); 1010 + 1011 + iov_iter_bvec(&i, ITER_BVEC, bvecs, num_pages, 1012 + len); 1013 + iov_iter_advance(&i, ret); 1014 + iov_iter_zero(zlen, &i); 1049 1015 ret += zlen; 1050 1016 } 1051 1017 if (ret >= 0) 1052 1018 len = ret; 1053 1019 } 1054 1020 1055 - ceph_put_page_vector(pages, num_pages, should_dirty); 1056 - 1021 + put_bvecs(bvecs, num_pages, should_dirty); 1057 1022 ceph_osdc_put_request(req); 1058 1023 if (ret < 0) 1059 1024 break; 1060 1025 1061 1026 pos += len; 1062 - iov_iter_advance(iter, len); 1063 - 1064 1027 if (!write && pos >= size) 1065 1028 break; 1066 1029
+10 -2
include/linux/ceph/osd_client.h
··· 77 77 u32 bio_length; 78 78 }; 79 79 #endif /* CONFIG_BLOCK */ 80 - struct ceph_bvec_iter bvec_pos; 80 + struct { 81 + struct ceph_bvec_iter bvec_pos; 82 + u32 num_bvecs; 83 + }; 81 84 }; 82 85 }; 83 86 ··· 415 412 struct ceph_bio_iter *bio_pos, 416 413 u32 bio_length); 417 414 #endif /* CONFIG_BLOCK */ 415 + void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, 416 + unsigned int which, 417 + struct bio_vec *bvecs, u32 num_bvecs, 418 + u32 bytes); 418 419 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, 419 420 unsigned int which, 420 421 struct ceph_bvec_iter *bvec_pos); ··· 433 426 bool own_pages); 434 427 void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, 435 428 unsigned int which, 436 - struct bio_vec *bvecs, u32 bytes); 429 + struct bio_vec *bvecs, u32 num_bvecs, 430 + u32 bytes); 437 431 extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, 438 432 unsigned int which, 439 433 struct page **pages, u64 length,
+23 -4
net/ceph/osd_client.c
··· 157 157 #endif /* CONFIG_BLOCK */ 158 158 159 159 static void ceph_osd_data_bvecs_init(struct ceph_osd_data *osd_data, 160 - struct ceph_bvec_iter *bvec_pos) 160 + struct ceph_bvec_iter *bvec_pos, 161 + u32 num_bvecs) 161 162 { 162 163 osd_data->type = CEPH_OSD_DATA_TYPE_BVECS; 163 164 osd_data->bvec_pos = *bvec_pos; 165 + osd_data->num_bvecs = num_bvecs; 164 166 } 165 167 166 168 #define osd_req_op_data(oreq, whch, typ, fld) \ ··· 239 237 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio); 240 238 #endif /* CONFIG_BLOCK */ 241 239 240 + void osd_req_op_extent_osd_data_bvecs(struct ceph_osd_request *osd_req, 241 + unsigned int which, 242 + struct bio_vec *bvecs, u32 num_bvecs, 243 + u32 bytes) 244 + { 245 + struct ceph_osd_data *osd_data; 246 + struct ceph_bvec_iter it = { 247 + .bvecs = bvecs, 248 + .iter = { .bi_size = bytes }, 249 + }; 250 + 251 + osd_data = osd_req_op_data(osd_req, which, extent, osd_data); 252 + ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); 253 + } 254 + EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvecs); 255 + 242 256 void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, 243 257 unsigned int which, 244 258 struct ceph_bvec_iter *bvec_pos) ··· 262 244 struct ceph_osd_data *osd_data; 263 245 264 246 osd_data = osd_req_op_data(osd_req, which, extent, osd_data); 265 - ceph_osd_data_bvecs_init(osd_data, bvec_pos); 247 + ceph_osd_data_bvecs_init(osd_data, bvec_pos, 0); 266 248 } 267 249 EXPORT_SYMBOL(osd_req_op_extent_osd_data_bvec_pos); 268 250 ··· 305 287 306 288 void osd_req_op_cls_request_data_bvecs(struct ceph_osd_request *osd_req, 307 289 unsigned int which, 308 - struct bio_vec *bvecs, u32 bytes) 290 + struct bio_vec *bvecs, u32 num_bvecs, 291 + u32 bytes) 309 292 { 310 293 struct ceph_osd_data *osd_data; 311 294 struct ceph_bvec_iter it = { ··· 315 296 }; 316 297 317 298 osd_data = osd_req_op_data(osd_req, which, cls, request_data); 318 - ceph_osd_data_bvecs_init(osd_data, &it); 299 + 
ceph_osd_data_bvecs_init(osd_data, &it, num_bvecs); 319 300 osd_req->r_ops[which].cls.indata_len += bytes; 320 301 osd_req->r_ops[which].indata_len += bytes; 321 302 }