Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
"There is quite a bit here, including some overdue refactoring and
cleanup on the mon_client and osd_client code from Ilya, scattered
writeback support for CephFS and a pile of bug fixes from Zheng, and a
few random cleanups and fixes from others"

[ I already decided not to pull this because of it having been rebased
recently, but ended up changing my mind after all. Next time I'll
really hold people to it. Oh well. - Linus ]

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (34 commits)
libceph: use KMEM_CACHE macro
ceph: use kmem_cache_zalloc
rbd: use KMEM_CACHE macro
ceph: use lookup request to revalidate dentry
ceph: kill ceph_get_dentry_parent_inode()
ceph: fix security xattr deadlock
ceph: don't request vxattrs from MDS
ceph: fix mounting same fs multiple times
ceph: remove unnecessary NULL check
ceph: avoid updating directory inode's i_size accidentally
ceph: fix race during filling readdir cache
libceph: use sizeof_footer() more
ceph: kill ceph_empty_snapc
ceph: fix a wrong comparison
ceph: replace CURRENT_TIME by current_fs_time()
ceph: scattered page writeback
libceph: add helper that duplicates last extent operation
libceph: enable large, variable-sized OSD requests
libceph: osdc->req_mempool should be backed by a slab pool
libceph: make r_request msg_size calculation clearer
...

+807 -515
+3 -11
drivers/block/rbd.c
··· 1847 1847 if (osd_req->r_result < 0) 1848 1848 obj_request->result = osd_req->r_result; 1849 1849 1850 - rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1851 - 1852 1850 /* 1853 1851 * We support a 64-bit length, but ultimately it has to be 1854 1852 * passed to the block layer, which just supports a 32-bit 1855 1853 * length field. 1856 1854 */ 1857 - obj_request->xferred = osd_req->r_reply_op_len[0]; 1855 + obj_request->xferred = osd_req->r_ops[0].outdata_len; 1858 1856 rbd_assert(obj_request->xferred < (u64)UINT_MAX); 1859 1857 1860 1858 opcode = osd_req->r_ops[0].op; ··· 5641 5643 static int rbd_slab_init(void) 5642 5644 { 5643 5645 rbd_assert(!rbd_img_request_cache); 5644 - rbd_img_request_cache = kmem_cache_create("rbd_img_request", 5645 - sizeof (struct rbd_img_request), 5646 - __alignof__(struct rbd_img_request), 5647 - 0, NULL); 5646 + rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 5648 5647 if (!rbd_img_request_cache) 5649 5648 return -ENOMEM; 5650 5649 5651 5650 rbd_assert(!rbd_obj_request_cache); 5652 - rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5653 - sizeof (struct rbd_obj_request), 5654 - __alignof__(struct rbd_obj_request), 5655 - 0, NULL); 5651 + rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 5656 5652 if (!rbd_obj_request_cache) 5657 5653 goto out_err; 5658 5654
+203 -119
fs/ceph/addr.c
··· 175 175 176 176 static int ceph_releasepage(struct page *page, gfp_t g) 177 177 { 178 - struct inode *inode = page->mapping ? page->mapping->host : NULL; 179 - dout("%p releasepage %p idx %lu\n", inode, page, page->index); 178 + dout("%p releasepage %p idx %lu\n", page->mapping->host, 179 + page, page->index); 180 180 WARN_ON(PageDirty(page)); 181 181 182 182 /* Can we release the page from the cache? */ ··· 276 276 for (i = 0; i < num_pages; i++) { 277 277 struct page *page = osd_data->pages[i]; 278 278 279 - if (rc < 0 && rc != ENOENT) 279 + if (rc < 0 && rc != -ENOENT) 280 280 goto unlock; 281 281 if (bytes < (int)PAGE_CACHE_SIZE) { 282 282 /* zero (remainder of) page */ ··· 606 606 struct inode *inode = req->r_inode; 607 607 struct ceph_inode_info *ci = ceph_inode(inode); 608 608 struct ceph_osd_data *osd_data; 609 - unsigned wrote; 610 609 struct page *page; 611 - int num_pages; 612 - int i; 610 + int num_pages, total_pages = 0; 611 + int i, j; 612 + int rc = req->r_result; 613 613 struct ceph_snap_context *snapc = req->r_snapc; 614 614 struct address_space *mapping = inode->i_mapping; 615 - int rc = req->r_result; 616 - u64 bytes = req->r_ops[0].extent.length; 617 615 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 618 - long writeback_stat; 619 - unsigned issued = ceph_caps_issued(ci); 616 + bool remove_page; 620 617 621 - osd_data = osd_req_op_extent_osd_data(req, 0); 622 - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 623 - num_pages = calc_pages_for((u64)osd_data->alignment, 624 - (u64)osd_data->length); 625 - if (rc >= 0) { 626 - /* 627 - * Assume we wrote the pages we originally sent. The 628 - * osd might reply with fewer pages if our writeback 629 - * raced with a truncation and was adjusted at the osd, 630 - * so don't believe the reply. 631 - */ 632 - wrote = num_pages; 633 - } else { 634 - wrote = 0; 618 + 619 + dout("writepages_finish %p rc %d\n", inode, rc); 620 + if (rc < 0) 635 621 mapping_set_error(mapping, rc); 636 - } 637 - dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", 638 - inode, rc, bytes, wrote); 622 + 623 + /* 624 + * We lost the cache cap, need to truncate the page before 625 + * it is unlocked, otherwise we'd truncate it later in the 626 + * page truncation thread, possibly losing some data that 627 + * raced its way in 628 + */ 629 + remove_page = !(ceph_caps_issued(ci) & 630 + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); 639 631 640 632 /* clean all pages */ 641 - for (i = 0; i < num_pages; i++) { 642 - page = osd_data->pages[i]; 643 - BUG_ON(!page); 644 - WARN_ON(!PageUptodate(page)); 633 + for (i = 0; i < req->r_num_ops; i++) { 634 + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) 635 + break; 645 636 646 - writeback_stat = 647 - atomic_long_dec_return(&fsc->writeback_count); 648 - if (writeback_stat < 649 - CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 650 - clear_bdi_congested(&fsc->backing_dev_info, 651 - BLK_RW_ASYNC); 637 + osd_data = osd_req_op_extent_osd_data(req, i); 638 + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 639 + num_pages = calc_pages_for((u64)osd_data->alignment, 640 + (u64)osd_data->length); 641 + total_pages += num_pages; 642 + for (j = 0; j < num_pages; j++) { 643 + page = osd_data->pages[j]; 644 + BUG_ON(!page); 645 + WARN_ON(!PageUptodate(page)); 652 646 653 - ceph_put_snap_context(page_snap_context(page)); 654 - page->private = 0; 655 - ClearPagePrivate(page); 656 - dout("unlocking %d %p\n", i, page); 657 - end_page_writeback(page); 647 + if (atomic_long_dec_return(&fsc->writeback_count) < 648 + CONGESTION_OFF_THRESH( 649 + fsc->mount_options->congestion_kb)) 650 + clear_bdi_congested(&fsc->backing_dev_info, 651 + BLK_RW_ASYNC); 658 652 659 - /* 660 - * We lost the cache cap, need to truncate the page before 661 - * it is unlocked, otherwise we'd truncate it later in the 662 - * page truncation thread, possibly losing some data that 663 - * raced its way in 664 - */ 665 - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 666 - generic_error_remove_page(inode->i_mapping, page); 653 + ceph_put_snap_context(page_snap_context(page)); 654 + page->private = 0; 655 + ClearPagePrivate(page); 656 + dout("unlocking %p\n", page); 657 + end_page_writeback(page); 667 658 668 - unlock_page(page); 659 + if (remove_page) 660 + generic_error_remove_page(inode->i_mapping, 661 + page); 662 + 663 + unlock_page(page); 664 + } 665 + dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", 666 + inode, osd_data->length, rc >= 0 ? num_pages : 0); 667 + 668 + ceph_release_pages(osd_data->pages, num_pages); 669 669 } 670 - dout("%p wrote+cleaned %d pages\n", inode, wrote); 671 - ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); 672 670 673 - ceph_release_pages(osd_data->pages, num_pages); 671 + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); 672 + 673 + osd_data = osd_req_op_extent_osd_data(req, 0); 674 674 if (osd_data->pages_from_pool) 675 675 mempool_free(osd_data->pages, 676 676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); ··· 778 778 while (!done && index <= end) { 779 779 unsigned i; 780 780 int first; 781 - pgoff_t next; 782 - int pvec_pages, locked_pages; 783 - struct page **pages = NULL; 781 + pgoff_t strip_unit_end = 0; 782 + int num_ops = 0, op_idx; 783 + int pvec_pages, locked_pages = 0; 784 + struct page **pages = NULL, **data_pages; 784 785 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 785 786 struct page *page; 786 787 int want; 787 - u64 offset, len; 788 - long writeback_stat; 788 + u64 offset = 0, len = 0; 789 789 790 - next = 0; 791 - locked_pages = 0; 792 790 max_pages = max_pages_ever; 793 791 794 792 get_more_pages: ··· 822 824 unlock_page(page); 823 825 break; 824 826 } 825 - if (next && (page->index != next)) { 826 - dout("not consecutive %p\n", page); 827 + if (strip_unit_end && (page->index > strip_unit_end)) { 828 + dout("end of strip unit %p\n", page); 827 829 unlock_page(page); 828 830 break; 829 831 } ··· 865 867 /* 866 868 * We have something to write. If this is 867 869 * the first locked page this time through, 868 - * allocate an osd request and a page array 869 - * that it will use. 870 + * calculate max possinle write size and 871 + * allocate a page array 870 872 */ 871 873 if (locked_pages == 0) { 872 - BUG_ON(pages); 874 + u64 objnum; 875 + u64 objoff; 876 + 873 877 /* prepare async write request */ 874 878 offset = (u64)page_offset(page); 875 879 len = wsize; 876 - req = ceph_osdc_new_request(&fsc->client->osdc, 877 - &ci->i_layout, vino, 878 - offset, &len, 0, 879 - do_sync ? 2 : 1, 880 - CEPH_OSD_OP_WRITE, 881 - CEPH_OSD_FLAG_WRITE | 882 - CEPH_OSD_FLAG_ONDISK, 883 - snapc, truncate_seq, 884 - truncate_size, true); 885 - if (IS_ERR(req)) { 886 - rc = PTR_ERR(req); 880 + 881 + rc = ceph_calc_file_object_mapping(&ci->i_layout, 882 + offset, len, 883 + &objnum, &objoff, 884 + &len); 885 + if (rc < 0) { 887 886 unlock_page(page); 888 887 break; 889 888 } 890 889 891 - if (do_sync) 892 - osd_req_op_init(req, 1, 893 - CEPH_OSD_OP_STARTSYNC, 0); 890 + num_ops = 1 + do_sync; 891 + strip_unit_end = page->index + 892 + ((len - 1) >> PAGE_CACHE_SHIFT); 894 893 895 - req->r_callback = writepages_finish; 896 - req->r_inode = inode; 897 - 894 + BUG_ON(pages); 898 895 max_pages = calc_pages_for(0, (u64)len); 899 896 pages = kmalloc(max_pages * sizeof (*pages), 900 897 GFP_NOFS); ··· 898 905 pages = mempool_alloc(pool, GFP_NOFS); 899 906 BUG_ON(!pages); 900 907 } 908 + 909 + len = 0; 910 + } else if (page->index != 911 + (offset + len) >> PAGE_CACHE_SHIFT) { 912 + if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS : 913 + CEPH_OSD_MAX_OPS)) { 914 + redirty_page_for_writepage(wbc, page); 915 + unlock_page(page); 916 + break; 917 + } 918 + 919 + num_ops++; 920 + offset = (u64)page_offset(page); 921 + len = 0; 901 922 } 902 923 903 924 /* note position of first page in pvec */ ··· 920 913 dout("%p will write page %p idx %lu\n", 921 914 inode, page, page->index); 922 915 923 - writeback_stat = 924 - atomic_long_inc_return(&fsc->writeback_count); 925 - if (writeback_stat > CONGESTION_ON_THRESH( 916 + if (atomic_long_inc_return(&fsc->writeback_count) > 917 + CONGESTION_ON_THRESH( 926 918 fsc->mount_options->congestion_kb)) { 927 919 set_bdi_congested(&fsc->backing_dev_info, 928 920 BLK_RW_ASYNC); 929 921 } 930 922 931 - set_page_writeback(page); 932 923 pages[locked_pages] = page; 933 924 locked_pages++; 934 - next = page->index + 1; 925 + len += PAGE_CACHE_SIZE; 935 926 } 936 927 937 928 /* did we get anything? */ ··· 949 944 /* shift unused pages over in the pvec... we 950 945 * will need to release them below. */ 951 946 for (j = i; j < pvec_pages; j++) { 952 - dout(" pvec leftover page %p\n", 953 - pvec.pages[j]); 947 + dout(" pvec leftover page %p\n", pvec.pages[j]); 954 948 pvec.pages[j-i+first] = pvec.pages[j]; 955 949 } 956 950 pvec.nr -= i-first; 957 951 } 958 952 959 - /* Format the osd request message and submit the write */ 953 + new_request: 960 954 offset = page_offset(pages[0]); 961 - len = (u64)locked_pages << PAGE_CACHE_SHIFT; 962 - if (snap_size == -1) { 963 - len = min(len, (u64)i_size_read(inode) - offset); 964 - /* writepages_finish() clears writeback pages 965 - * according to the data length, so make sure 966 - * data length covers all locked pages */ 967 - len = max(len, 1 + 968 - ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); 969 - } else { 970 - len = min(len, snap_size - offset); 955 + len = wsize; 956 + 957 + req = ceph_osdc_new_request(&fsc->client->osdc, 958 + &ci->i_layout, vino, 959 + offset, &len, 0, num_ops, 960 + CEPH_OSD_OP_WRITE, 961 + CEPH_OSD_FLAG_WRITE | 962 + CEPH_OSD_FLAG_ONDISK, 963 + snapc, truncate_seq, 964 + truncate_size, false); 965 + if (IS_ERR(req)) { 966 + req = ceph_osdc_new_request(&fsc->client->osdc, 967 + &ci->i_layout, vino, 968 + offset, &len, 0, 969 + min(num_ops, 970 + CEPH_OSD_SLAB_OPS), 971 + CEPH_OSD_OP_WRITE, 972 + CEPH_OSD_FLAG_WRITE | 973 + CEPH_OSD_FLAG_ONDISK, 974 + snapc, truncate_seq, 975 + truncate_size, true); 976 + BUG_ON(IS_ERR(req)); 971 977 } 972 - dout("writepages got %d pages at %llu~%llu\n", 973 - locked_pages, offset, len); 978 + BUG_ON(len < page_offset(pages[locked_pages - 1]) + 979 + PAGE_CACHE_SIZE - offset); 974 980 975 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 981 + req->r_callback = writepages_finish; 982 + req->r_inode = inode; 983 + 984 + /* Format the osd request message and submit the write */ 985 + len = 0; 986 + data_pages = pages; 987 + op_idx = 0; 988 + for (i = 0; i < locked_pages; i++) { 989 + u64 cur_offset = page_offset(pages[i]); 990 + if (offset + len != cur_offset) { 991 + if (op_idx + do_sync + 1 == req->r_num_ops) 992 + break; 993 + osd_req_op_extent_dup_last(req, op_idx, 994 + cur_offset - offset); 995 + dout("writepages got pages at %llu~%llu\n", 996 + offset, len); 997 + osd_req_op_extent_osd_data_pages(req, op_idx, 998 + data_pages, len, 0, 976 999 !!pool, false); 1000 + osd_req_op_extent_update(req, op_idx, len); 977 1001 978 - pages = NULL; /* request message now owns the pages array */ 1002 + len = 0; 1003 + offset = cur_offset; 1004 + data_pages = pages + i; 1005 + op_idx++; 1006 + } 1007 + 1008 + set_page_writeback(pages[i]); 1009 + len += PAGE_CACHE_SIZE; 1010 + } 1011 + 1012 + if (snap_size != -1) { 1013 + len = min(len, snap_size - offset); 1014 + } else if (i == locked_pages) { 1015 + /* writepages_finish() clears writeback pages 1016 + * according to the data length, so make sure 1017 + * data length covers all locked pages */ 1018 + u64 min_len = len + 1 - PAGE_CACHE_SIZE; 1019 + len = min(len, (u64)i_size_read(inode) - offset); 1020 + len = max(len, min_len); 1021 + } 1022 + dout("writepages got pages at %llu~%llu\n", offset, len); 1023 + 1024 + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 1025 + 0, !!pool, false); 1026 + osd_req_op_extent_update(req, op_idx, len); 1027 + 1028 + if (do_sync) { 1029 + op_idx++; 1030 + osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); 1031 + } 1032 + BUG_ON(op_idx + 1 != req->r_num_ops); 1033 + 979 1034 pool = NULL; 1035 + if (i < locked_pages) { 1036 + BUG_ON(num_ops <= req->r_num_ops); 1037 + num_ops -= req->r_num_ops; 1038 + num_ops += do_sync; 1039 + locked_pages -= i; 980 1040 981 - /* Update the write op length in case we changed it */ 982 - 983 - osd_req_op_extent_update(req, 0, len); 1041 + /* allocate new pages array for next request */ 1042 + data_pages = pages; 1043 + pages = kmalloc(locked_pages * sizeof (*pages), 1044 + GFP_NOFS); 1045 + if (!pages) { 1046 + pool = fsc->wb_pagevec_pool; 1047 + pages = mempool_alloc(pool, GFP_NOFS); 1048 + BUG_ON(!pages); 1049 + } 1050 + memcpy(pages, data_pages + i, 1051 + locked_pages * sizeof(*pages)); 1052 + memset(data_pages + i, 0, 1053 + locked_pages * sizeof(*pages)); 1054 + } else { 1055 + BUG_ON(num_ops != req->r_num_ops); 1056 + index = pages[i - 1]->index + 1; 1057 + /* request message now owns the pages array */ 1058 + pages = NULL; 1059 + } 984 1060 985 1061 vino = ceph_vino(inode); 986 1062 ceph_osdc_build_request(req, offset, snapc, vino.snap, ··· 1071 985 BUG_ON(rc); 1072 986 req = NULL; 1073 987 1074 - /* continue? */ 1075 - index = next; 1076 - wbc->nr_to_write -= locked_pages; 988 + wbc->nr_to_write -= i; 989 + if (pages) 990 + goto new_request; 991 + 1077 992 if (wbc->nr_to_write <= 0) 1078 993 done = 1; 1079 994 ··· 1609 1522 ceph_vino(inode), 0, &len, 0, 1, 1610 1523 CEPH_OSD_OP_CREATE, 1611 1524 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1612 - ceph_empty_snapc, 0, 0, false); 1525 + NULL, 0, 0, false); 1613 1526 if (IS_ERR(req)) { 1614 1527 err = PTR_ERR(req); 1615 1528 goto out; ··· 1627 1540 ceph_vino(inode), 0, &len, 1, 3, 1628 1541 CEPH_OSD_OP_WRITE, 1629 1542 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1630 - ceph_empty_snapc, 1631 - ci->i_truncate_seq, ci->i_truncate_size, 1632 - false); 1543 + NULL, ci->i_truncate_seq, 1544 + ci->i_truncate_size, false); 1633 1545 if (IS_ERR(req)) { 1634 1546 err = PTR_ERR(req); 1635 1547 goto out; ··· 1749 1663 goto out; 1750 1664 } 1751 1665 1752 - rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1753 - ceph_empty_snapc, 1666 + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1754 1667 1, false, GFP_NOFS); 1755 1668 if (!rd_req) { 1756 1669 err = -ENOMEM; ··· 1763 1678 "%llx.00000000", ci->i_vino.ino); 1764 1679 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); 1765 1680 1766 - wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1767 - ceph_empty_snapc, 1681 + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1768 1682 1, false, GFP_NOFS); 1769 1683 if (!wr_req) { 1770 1684 err = -ENOMEM;
+7 -4
fs/ceph/caps.c
··· 991 991 u32 seq, u64 flush_tid, u64 oldest_flush_tid, 992 992 u32 issue_seq, u32 mseq, u64 size, u64 max_size, 993 993 struct timespec *mtime, struct timespec *atime, 994 - u64 time_warp_seq, 994 + struct timespec *ctime, u64 time_warp_seq, 995 995 kuid_t uid, kgid_t gid, umode_t mode, 996 996 u64 xattr_version, 997 997 struct ceph_buffer *xattrs_buf, ··· 1042 1042 ceph_encode_timespec(&fc->mtime, mtime); 1043 1043 if (atime) 1044 1044 ceph_encode_timespec(&fc->atime, atime); 1045 + if (ctime) 1046 + ceph_encode_timespec(&fc->ctime, ctime); 1045 1047 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 1046 1048 1047 1049 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); ··· 1118 1116 int held, revoking, dropping, keep; 1119 1117 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1120 1118 u64 size, max_size; 1121 - struct timespec mtime, atime; 1119 + struct timespec mtime, atime, ctime; 1122 1120 int wake = 0; 1123 1121 umode_t mode; 1124 1122 kuid_t uid; ··· 1182 1180 ci->i_requested_max_size = max_size; 1183 1181 mtime = inode->i_mtime; 1184 1182 atime = inode->i_atime; 1183 + ctime = inode->i_ctime; 1185 1184 time_warp_seq = ci->i_time_warp_seq; 1186 1185 uid = inode->i_uid; 1187 1186 gid = inode->i_gid; ··· 1201 1198 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1202 1199 op, keep, want, flushing, seq, 1203 1200 flush_tid, oldest_flush_tid, issue_seq, mseq, 1204 - size, max_size, &mtime, &atime, time_warp_seq, 1201 + size, max_size, &mtime, &atime, &ctime, time_warp_seq, 1205 1202 uid, gid, mode, xattr_version, xattr_blob, 1206 1203 follows, inline_data); 1207 1204 if (ret < 0) { ··· 1323 1320 capsnap->dirty, 0, capsnap->flush_tid, 0, 1324 1321 0, mseq, capsnap->size, 0, 1325 1322 &capsnap->mtime, &capsnap->atime, 1326 - capsnap->time_warp_seq, 1323 + &capsnap->ctime, capsnap->time_warp_seq, 1327 1324 capsnap->uid, capsnap->gid, capsnap->mode, 1328 1325 capsnap->xattr_version, capsnap->xattr_blob, 1329 1326 capsnap->follows, capsnap->inline_data);
+47 -22
fs/ceph/dir.c
··· 38 38 if (dentry->d_fsdata) 39 39 return 0; 40 40 41 - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); 41 + di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); 42 42 if (!di) 43 43 return -ENOMEM; /* oh well */ 44 44 ··· 67 67 spin_unlock(&dentry->d_lock); 68 68 return 0; 69 69 } 70 - 71 - struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 72 - { 73 - struct inode *inode = NULL; 74 - 75 - if (!dentry) 76 - return NULL; 77 - 78 - spin_lock(&dentry->d_lock); 79 - if (!IS_ROOT(dentry)) { 80 - inode = d_inode(dentry->d_parent); 81 - ihold(inode); 82 - } 83 - spin_unlock(&dentry->d_lock); 84 - return inode; 85 - } 86 - 87 70 88 71 /* 89 72 * for readdir, we encode the directory frag and offset within that ··· 607 624 struct ceph_mds_client *mdsc = fsc->mdsc; 608 625 struct ceph_mds_request *req; 609 626 int op; 627 + int mask; 610 628 int err; 611 629 612 630 dout("lookup %p dentry %p '%pd'\n", ··· 650 666 return ERR_CAST(req); 651 667 req->r_dentry = dget(dentry); 652 668 req->r_num_caps = 2; 653 - /* we only need inode linkage */ 654 - req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 669 + 670 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 671 + if (ceph_security_xattr_wanted(dir)) 672 + mask |= CEPH_CAP_XATTR_SHARED; 673 + req->r_args.getattr.mask = cpu_to_le32(mask); 674 + 655 675 req->r_locked_dir = dir; 656 676 err = ceph_mdsc_do_request(mdsc, NULL, req); 657 677 err = ceph_handle_snapdir(req, dentry, err); ··· 1083 1095 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1084 1096 { 1085 1097 int valid = 0; 1098 + struct dentry *parent; 1086 1099 struct inode *dir; 1087 1100 1088 1101 if (flags & LOOKUP_RCU) ··· 1092 1103 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, 1093 1104 dentry, d_inode(dentry), ceph_dentry(dentry)->offset); 1094 1105 1095 - dir = ceph_get_dentry_parent_inode(dentry); 1106 + parent = dget_parent(dentry); 1107 + dir = d_inode(parent); 1096 1108 1097 1109 /* always trust cached snapped dentries, snapdir dentry */ 1098 1110 if (ceph_snap(dir) != CEPH_NOSNAP) { ··· 1111 1121 valid = 1; 1112 1122 } 1113 1123 1124 + if (!valid) { 1125 + struct ceph_mds_client *mdsc = 1126 + ceph_sb_to_client(dir->i_sb)->mdsc; 1127 + struct ceph_mds_request *req; 1128 + int op, mask, err; 1129 + 1130 + op = ceph_snap(dir) == CEPH_SNAPDIR ? 1131 + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 1132 + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 1133 + if (!IS_ERR(req)) { 1134 + req->r_dentry = dget(dentry); 1135 + req->r_num_caps = 2; 1136 + 1137 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 1138 + if (ceph_security_xattr_wanted(dir)) 1139 + mask |= CEPH_CAP_XATTR_SHARED; 1140 + req->r_args.getattr.mask = mask; 1141 + 1142 + req->r_locked_dir = dir; 1143 + err = ceph_mdsc_do_request(mdsc, NULL, req); 1144 + if (err == 0 || err == -ENOENT) { 1145 + if (dentry == req->r_dentry) { 1146 + valid = !d_unhashed(dentry); 1147 + } else { 1148 + d_invalidate(req->r_dentry); 1149 + err = -EAGAIN; 1150 + } 1151 + } 1152 + ceph_mdsc_put_request(req); 1153 + dout("d_revalidate %p lookup result=%d\n", 1154 + dentry, err); 1155 + } 1156 + } 1157 + 1114 1158 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1115 1159 if (valid) { 1116 1160 ceph_dentry_lru_touch(dentry); 1117 1161 } else { 1118 1162 ceph_dir_clear_complete(dir); 1119 1163 } 1120 - iput(dir); 1164 + 1165 + dput(parent); 1121 1166 return valid; 1122 1167 } 1123 1168
+13
fs/ceph/export.c
··· 71 71 inode = ceph_find_inode(sb, vino); 72 72 if (!inode) { 73 73 struct ceph_mds_request *req; 74 + int mask; 74 75 75 76 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, 76 77 USE_ANY_MDS); 77 78 if (IS_ERR(req)) 78 79 return ERR_CAST(req); 80 + 81 + mask = CEPH_STAT_CAP_INODE; 82 + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) 83 + mask |= CEPH_CAP_XATTR_SHARED; 84 + req->r_args.getattr.mask = cpu_to_le32(mask); 79 85 80 86 req->r_ino1 = vino; 81 87 req->r_num_caps = 1; ··· 134 128 struct ceph_mds_request *req; 135 129 struct inode *inode; 136 130 struct dentry *dentry; 131 + int mask; 137 132 int err; 138 133 139 134 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, ··· 151 144 .snap = CEPH_NOSNAP, 152 145 }; 153 146 } 147 + 148 + mask = CEPH_STAT_CAP_INODE; 149 + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) 150 + mask |= CEPH_CAP_XATTR_SHARED; 151 + req->r_args.getattr.mask = cpu_to_le32(mask); 152 + 154 153 req->r_num_caps = 1; 155 154 err = ceph_mdsc_do_request(mdsc, NULL, req); 156 155 inode = req->r_target_inode;
+10 -5
fs/ceph/file.c
··· 157 157 case S_IFDIR: 158 158 dout("init_file %p %p 0%o (regular)\n", inode, file, 159 159 inode->i_mode); 160 - cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); 160 + cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 161 161 if (cf == NULL) { 162 162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 163 163 return -ENOMEM; ··· 300 300 struct ceph_mds_request *req; 301 301 struct dentry *dn; 302 302 struct ceph_acls_info acls = {}; 303 + int mask; 303 304 int err; 304 305 305 306 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", ··· 336 335 acls.pagelist = NULL; 337 336 } 338 337 } 338 + 339 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 340 + if (ceph_security_xattr_wanted(dir)) 341 + mask |= CEPH_CAP_XATTR_SHARED; 342 + req->r_args.open.mask = cpu_to_le32(mask); 343 + 339 344 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 340 345 err = ceph_mdsc_do_request(mdsc, 341 346 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, ··· 732 725 ret = ceph_osdc_start_request(req->r_osdc, req, false); 733 726 out: 734 727 if (ret < 0) { 735 - BUG_ON(ret == -EOLDSNAPC); 736 728 req->r_result = ret; 737 729 ceph_aio_complete_req(req, NULL); 738 730 } ··· 789 783 int num_pages = 0; 790 784 int flags; 791 785 int ret; 792 - struct timespec mtime = CURRENT_TIME; 786 + struct timespec mtime = current_fs_time(inode->i_sb); 793 787 size_t count = iov_iter_count(iter); 794 788 loff_t pos = iocb->ki_pos; 795 789 bool write = iov_iter_rw(iter) == WRITE; ··· 955 949 ret = ceph_osdc_start_request(req->r_osdc, 956 950 req, false); 957 951 if (ret < 0) { 958 - BUG_ON(ret == -EOLDSNAPC); 959 952 req->r_result = ret; 960 953 ceph_aio_complete_req(req, NULL); 961 954 } ··· 993 988 int flags; 994 989 int check_caps = 0; 995 990 int ret; 996 - struct timespec mtime = CURRENT_TIME; 991 + struct timespec mtime = current_fs_time(inode->i_sb); 997 992 size_t count = iov_iter_count(from); 998 993 999 994 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+27 -7
fs/ceph/inode.c
··· 549 549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 550 550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { 551 551 dout("size %lld -> %llu\n", inode->i_size, size); 552 + if (size > 0 && S_ISDIR(inode->i_mode)) { 553 + pr_err("fill_file_size non-zero size for directory\n"); 554 + size = 0; 555 + } 552 556 i_size_write(inode, size); 553 557 inode->i_blocks = (size + (1<<9) - 1) >> 9; 554 558 ci->i_reported_size = size; ··· 1265 1261 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1266 1262 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1267 1263 ceph_vinop(in)); 1264 + d_invalidate(dn); 1268 1265 have_lease = false; 1269 1266 } 1270 1267 ··· 1354 1349 1355 1350 if (!ctl->page || pgoff != page_index(ctl->page)) { 1356 1351 ceph_readdir_cache_release(ctl); 1357 - ctl->page = grab_cache_page(&dir->i_data, pgoff); 1352 + if (idx == 0) 1353 + ctl->page = grab_cache_page(&dir->i_data, pgoff); 1354 + else 1355 + ctl->page = find_lock_page(&dir->i_data, pgoff); 1358 1356 if (!ctl->page) { 1359 1357 ctl->index = -1; 1360 - return -ENOMEM; 1358 + return idx == 0 ? -ENOMEM : 0; 1361 1359 } 1362 1360 /* reading/filling the cache are serialized by 1363 1361 * i_mutex, no need to use page lock */ 1364 1362 unlock_page(ctl->page); 1365 1363 ctl->dentries = kmap(ctl->page); 1364 + if (idx == 0) 1365 + memset(ctl->dentries, 0, PAGE_CACHE_SIZE); 1366 1366 } 1367 1367 1368 1368 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && ··· 1390 1380 struct qstr dname; 1391 1381 struct dentry *dn; 1392 1382 struct inode *in; 1393 - int err = 0, ret, i; 1383 + int err = 0, skipped = 0, ret, i; 1394 1384 struct inode *snapdir = NULL; 1395 1385 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1396 1386 struct ceph_dentry_info *di; ··· 1502 1492 } 1503 1493 1504 1494 if (d_really_is_negative(dn)) { 1505 - struct dentry *realdn = splice_dentry(dn, in); 1495 + struct dentry *realdn; 1496 + 1497 + if (ceph_security_xattr_deadlock(in)) { 1498 + dout(" skip splicing dn %p to inode %p" 1499 + " (security xattr deadlock)\n", dn, in); 1500 + iput(in); 1501 + skipped++; 1502 + goto next_item; 1503 + } 1504 + 1505 + realdn = splice_dentry(dn, in); 1506 1506 if (IS_ERR(realdn)) { 1507 1507 err = PTR_ERR(realdn); 1508 1508 d_drop(dn); ··· 1529 1509 req->r_session, 1530 1510 req->r_request_started); 1531 1511 1532 - if (err == 0 && cache_ctl.index >= 0) { 1512 + if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1533 1513 ret = fill_readdir_cache(d_inode(parent), dn, 1534 1514 &cache_ctl, req); 1535 1515 if (ret < 0) ··· 1540 1520 dput(dn); 1541 1521 } 1542 1522 out: 1543 - if (err == 0) { 1523 + if (err == 0 && skipped == 0) { 1544 1524 req->r_did_prepopulate = true; 1545 1525 req->r_readdir_cache_idx = cache_ctl.index; 1546 1526 } ··· 1970 1950 if (dirtied) { 1971 1951 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 1972 1952 &prealloc_cf); 1973 - inode->i_ctime = CURRENT_TIME; 1953 + inode->i_ctime = current_fs_time(inode->i_sb); 1974 1954 } 1975 1955 1976 1956 release &= issued;
+5 -2
fs/ceph/mds_client.c
··· 1729 1729 init_completion(&req->r_safe_completion); 1730 1730 INIT_LIST_HEAD(&req->r_unsafe_item); 1731 1731 1732 - req->r_stamp = CURRENT_TIME; 1732 + req->r_stamp = current_fs_time(mdsc->fsc->sb); 1733 1733 1734 1734 req->r_op = op; 1735 1735 req->r_direct_mode = mode; ··· 2540 2540 2541 2541 /* insert trace into our cache */ 2542 2542 mutex_lock(&req->r_fill_mutex); 2543 + current->journal_info = req; 2543 2544 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2544 2545 if (err == 0) { 2545 2546 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || ··· 2548 2547 ceph_readdir_prepopulate(req, req->r_session); 2549 2548 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2550 2549 } 2550 + current->journal_info = NULL; 2551 2551 mutex_unlock(&req->r_fill_mutex); 2552 2552 2553 2553 up_read(&mdsc->snap_rwsem); ··· 3766 3764 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3767 3765 3768 3766 /* do we need it? */ 3769 - ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); 3770 3767 mutex_lock(&mdsc->mutex); 3771 3768 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3772 3769 dout("handle_map epoch %u <= our %u\n", ··· 3792 3791 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3793 3792 3794 3793 __wake_requests(mdsc, &mdsc->waiting_for_map); 3794 + ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, 3795 + mdsc->mdsmap->m_epoch); 3795 3796 3796 3797 mutex_unlock(&mdsc->mutex); 3797 3798 schedule_delayed(mdsc);
-16
fs/ceph/snap.c
··· 296 296 } 297 297 298 298 299 - struct ceph_snap_context *ceph_empty_snapc; 300 - 301 299 /* 302 300 * build the snap context for a given realm. 303 301 */ ··· 984 986 if (locked_rwsem) 985 987 up_write(&mdsc->snap_rwsem); 986 988 return; 987 - } 988 - 989 - int __init ceph_snap_init(void) 990 - { 991 - ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); 992 - if (!ceph_empty_snapc) 993 - return -ENOMEM; 994 - ceph_empty_snapc->seq = 1; 995 - return 0; 996 - } 997 - 998 - void ceph_snap_exit(void) 999 - { 1000 - ceph_put_snap_context(ceph_empty_snapc); 1001 989 }
+19 -28
fs/ceph/super.c
··· 439 439 440 440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 441 441 seq_puts(m, ",dirstat"); 442 - if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) 443 - seq_puts(m, ",norbytes"); 442 + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES)) 443 + seq_puts(m, ",rbytes"); 444 444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 445 445 seq_puts(m, ",noasyncreaddir"); 446 446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) ··· 530 530 goto fail; 531 531 } 532 532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 533 - fsc->client->monc.want_mdsmap = 1; 533 + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 534 534 535 535 fsc->mount_options = fsopt; 536 536 ··· 793 793 struct dentry *root; 794 794 int first = 0; /* first vfsmount for this super_block */ 795 795 796 - dout("mount start\n"); 796 + dout("mount start %p\n", fsc); 797 797 mutex_lock(&fsc->client->mount_mutex); 798 798 799 - err = __ceph_open_session(fsc->client, started); 800 - if (err < 0) 801 - goto out; 799 + if (!fsc->sb->s_root) { 800 + err = __ceph_open_session(fsc->client, started); 801 + if (err < 0) 802 + goto out; 802 803 803 - dout("mount opening root\n"); 804 - root = open_root_dentry(fsc, "", started); 805 - if (IS_ERR(root)) { 806 - err = PTR_ERR(root); 807 - goto out; 808 - } 809 - if (fsc->sb->s_root) { 810 - dput(root); 811 - } else { 804 + dout("mount opening root\n"); 805 + root = open_root_dentry(fsc, "", started); 806 + if (IS_ERR(root)) { 807 + err = PTR_ERR(root); 808 + goto out; 809 + } 812 810 fsc->sb->s_root = root; 813 811 first = 1; 814 812 ··· 816 818 } 817 819 818 820 if (path[0] == 0) { 821 + root = fsc->sb->s_root; 819 822 dget(root); 820 823 } else { 821 824 dout("mount opening base mountpoint\n"); ··· 832 833 mutex_unlock(&fsc->client->mount_mutex); 833 834 return root; 834 835 835 - out: 836 - mutex_unlock(&fsc->client->mount_mutex); 837 - return ERR_PTR(err); 838 - 839 836 fail: 840 837 if (first) { 841 838 dput(fsc->sb->s_root); 842 839 fsc->sb->s_root = NULL; 843 840 } 844 - goto out; 841 + out: 842 + mutex_unlock(&fsc->client->mount_mutex); 843 + return ERR_PTR(err); 845 844 } 846 845 847 846 static int ceph_set_super(struct super_block *s, void *data) ··· 1039 1042 1040 1043 ceph_flock_init(); 1041 1044 ceph_xattr_init(); 1042 - ret = ceph_snap_init(); 1043 - if (ret) 1044 - goto out_xattr; 1045 1045 ret = register_filesystem(&ceph_fs_type); 1046 1046 if (ret) 1047 - goto out_snap; 1047 + goto out_xattr; 1048 1048 1049 1049 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1050 1050 1051 1051 return 0; 1052 1052 1053 - out_snap: 1054 - ceph_snap_exit(); 1055 1053 out_xattr: 1056 1054 ceph_xattr_exit(); 1057 1055 destroy_caches(); ··· 1058 1066 { 1059 1067 dout("exit_ceph\n"); 1060 1068 unregister_filesystem(&ceph_fs_type); 1061 - ceph_snap_exit(); 1062 1069 ceph_xattr_exit(); 1063 1070 destroy_caches(); 1064 1071 }
+16 -7
fs/ceph/super.h
··· 37 37 #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 38 38 #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 39 39 40 - #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ 41 - CEPH_MOUNT_OPT_DCACHE) 40 + #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE 42 41 43 42 #define ceph_set_mount_opt(fsc, opt) \ 44 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; ··· 468 469 #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 469 470 #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 470 471 #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 471 - 472 + #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 472 473 473 474 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 474 475 long long release_count, ··· 720 721 721 722 722 723 /* snap.c */ 723 - extern struct ceph_snap_context *ceph_empty_snapc; 724 724 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 725 725 u64 ino); 726 726 extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, ··· 736 738 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 737 739 struct ceph_cap_snap *capsnap); 738 740 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 739 - extern int ceph_snap_init(void); 740 - extern void ceph_snap_exit(void); 741 741 742 742 /* 743 743 * a cap_snap is "pending" if it is still awaiting an in-progress ··· 803 807 extern void __init ceph_xattr_init(void); 804 808 extern void ceph_xattr_exit(void); 805 809 extern const struct xattr_handler *ceph_xattr_handlers[]; 810 + 811 + #ifdef CONFIG_SECURITY 812 + extern bool ceph_security_xattr_deadlock(struct inode *in); 813 + extern bool ceph_security_xattr_wanted(struct inode *in); 814 + #else 815 + static inline bool ceph_security_xattr_deadlock(struct inode *in) 816 + { 817 + return false; 818 + } 819 + static inline bool ceph_security_xattr_wanted(struct inode *in) 820 + { 821 + return false; 822 + } 823 + #endif 806 824 807 825 /* acl.c */ 808 826 struct ceph_acls_info { ··· 957 947 extern void ceph_dentry_lru_del(struct dentry *dn); 958 948 extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 959 949 extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); 960 - extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); 961 950 extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); 962 951 963 952 /*
+71 -7
fs/ceph/xattr.c
··· 714 714 } 715 715 } 716 716 717 + static inline int __get_request_mask(struct inode *in) { 718 + struct ceph_mds_request *req = current->journal_info; 719 + int mask = 0; 720 + if (req && req->r_target_inode == in) { 721 + if (req->r_op == CEPH_MDS_OP_LOOKUP || 722 + req->r_op == CEPH_MDS_OP_LOOKUPINO || 723 + req->r_op == CEPH_MDS_OP_LOOKUPPARENT || 724 + req->r_op == CEPH_MDS_OP_GETATTR) { 725 + mask = le32_to_cpu(req->r_args.getattr.mask); 726 + } else if (req->r_op == CEPH_MDS_OP_OPEN || 727 + req->r_op == CEPH_MDS_OP_CREATE) { 728 + mask = le32_to_cpu(req->r_args.open.mask); 729 + } 730 + } 731 + return mask; 732 + } 733 + 717 734 ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 718 735 size_t size) 719 736 { 720 737 struct ceph_inode_info *ci = ceph_inode(inode); 721 - int err; 722 738 struct ceph_inode_xattr *xattr; 723 739 struct ceph_vxattr *vxattr = NULL; 740 + int req_mask; 741 + int err; 724 742 725 743 if (!ceph_is_valid_xattr(name)) 726 744 return -ENODATA; 727 745 728 746 /* let's see if a virtual xattr was requested */ 729 747 vxattr = ceph_match_vxattr(inode, name); 730 - if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 731 - err = vxattr->getxattr_cb(ci, value, size); 748 + if (vxattr) { 749 + err = -ENODATA; 750 + if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) 751 + err = vxattr->getxattr_cb(ci, value, size); 732 752 return err; 733 753 } 754 + 755 + req_mask = __get_request_mask(inode); 734 756 735 757 spin_lock(&ci->i_ceph_lock); 736 758 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 737 759 ci->i_xattrs.version, ci->i_xattrs.index_version); 738 760 739 761 if (ci->i_xattrs.version == 0 || 740 - !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { 762 + !((req_mask & CEPH_CAP_XATTR_SHARED) || 763 + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) { 741 764 spin_unlock(&ci->i_ceph_lock); 765 + 766 + /* security module gets xattr while filling trace */ 767 + if (current->journal_info != NULL) { 768 + pr_warn_ratelimited("sync getxattr %p " 769 + "during filling trace\n", inode); 770 + return -EBUSY; 771 + } 772 + 742 773 /* get xattrs from mds (if we don't already have them) */ 743 774 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); 744 775 if (err) ··· 796 765 797 766 memcpy(value, xattr->val, xattr->val_len); 798 767 768 + if (current->journal_info != NULL && 769 + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 770 + ci->i_ceph_flags |= CEPH_I_SEC_INITED; 799 771 out: 800 772 spin_unlock(&ci->i_ceph_lock); 801 773 return err; ··· 1033 999 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1034 1000 &prealloc_cf); 1035 1001 ci->i_xattrs.dirty = true; 1036 - inode->i_ctime = CURRENT_TIME; 1002 + inode->i_ctime = current_fs_time(inode->i_sb); 1037 1003 } 1038 1004 1039 1005 spin_unlock(&ci->i_ceph_lock); ··· 1049 1015 do_sync_unlocked: 1050 1016 if (lock_snap_rwsem) 1051 1017 up_read(&mdsc->snap_rwsem); 1052 - err = ceph_sync_setxattr(dentry, name, value, size, flags); 1018 + 1019 + /* security module set xattr while filling trace */ 1020 + if (current->journal_info != NULL) { 1021 + pr_warn_ratelimited("sync setxattr %p " 1022 + "during filling trace\n", inode); 1023 + err = -EBUSY; 1024 + } else { 1025 + err = ceph_sync_setxattr(dentry, name, value, size, flags); 1026 + } 1053 1027 out: 1054 1028 ceph_free_cap_flush(prealloc_cf); 1055 1029 kfree(newname); ··· 1178 1136 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1179 1137 &prealloc_cf); 1180 1138 ci->i_xattrs.dirty = true; 1181 - inode->i_ctime = CURRENT_TIME; 1139 + inode->i_ctime = current_fs_time(inode->i_sb); 1182 1140 spin_unlock(&ci->i_ceph_lock); 1183 1141 if (lock_snap_rwsem) 1184 1142 up_read(&mdsc->snap_rwsem); ··· 1206 1164 1207 1165 return __ceph_removexattr(dentry, name); 1208 1166 } 1167 + 1168 + #ifdef CONFIG_SECURITY 1169 + bool ceph_security_xattr_wanted(struct inode *in) 1170 + { 1171 + return in->i_security != NULL; 1172 + } 1173 + 1174 + bool ceph_security_xattr_deadlock(struct inode *in) 1175 + { 1176 + struct ceph_inode_info *ci; 1177 + bool ret; 1178 + if (in->i_security == NULL) 1179 + return false; 1180 + ci = ceph_inode(in); 1181 + spin_lock(&ci->i_ceph_lock); 1182 + ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) && 1183 + !(ci->i_xattrs.version > 0 && 1184 + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)); 1185 + spin_unlock(&ci->i_ceph_lock); 1186 + return ret; 1187 + } 1188 + #endif
+2
include/linux/ceph/ceph_features.h
··· 105 105 */ 106 106 #define CEPH_FEATURES_SUPPORTED_DEFAULT \ 107 107 (CEPH_FEATURE_NOSRCADDR | \ 108 + CEPH_FEATURE_SUBSCRIBE2 | \ 108 109 CEPH_FEATURE_RECONNECT_SEQ | \ 109 110 CEPH_FEATURE_PGID64 | \ 110 111 CEPH_FEATURE_PGPOOL3 | \ ··· 128 127 129 128 #define CEPH_FEATURES_REQUIRED_DEFAULT \ 130 129 (CEPH_FEATURE_NOSRCADDR | \ 130 + CEPH_FEATURE_SUBSCRIBE2 | \ 131 131 CEPH_FEATURE_RECONNECT_SEQ | \ 132 132 CEPH_FEATURE_PGID64 | \ 133 133 CEPH_FEATURE_PGPOOL3 | \
+4 -3
include/linux/ceph/ceph_fs.h
··· 198 198 #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ 199 199 200 200 struct ceph_mon_subscribe_item { 201 - __le64 have_version; __le64 have; 202 - __u8 onetime; 201 + __le64 start; 202 + __u8 flags; 203 203 } __attribute__ ((packed)); 204 204 205 205 struct ceph_mon_subscribe_ack { ··· 376 376 __le32 stripe_count; /* ... */ 377 377 __le32 object_size; 378 378 __le32 file_replication; 379 - __le32 unused; /* used to be preferred osd */ 379 + __le32 mask; /* CEPH_CAP_* */ 380 + __le32 old_size; 380 381 } __attribute__ ((packed)) open; 381 382 struct { 382 383 __le32 flags;
+6 -2
include/linux/ceph/libceph.h
··· 47 47 unsigned long mount_timeout; /* jiffies */ 48 48 unsigned long osd_idle_ttl; /* jiffies */ 49 49 unsigned long osd_keepalive_timeout; /* jiffies */ 50 - unsigned long monc_ping_timeout; /* jiffies */ 51 50 52 51 /* 53 52 * any type that can't be simply compared or doesn't need need ··· 67 68 #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) 68 69 #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) 69 70 #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) 70 - #define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000) 71 + 72 + #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) 73 + #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) 74 + #define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) 75 + #define CEPH_MONC_HUNT_BACKOFF 2 76 + #define CEPH_MONC_HUNT_MAX_MULT 10 71 77 72 78 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 73 79 #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
+23 -8
include/linux/ceph/mon_client.h
··· 68 68 69 69 bool hunting; 70 70 int cur_mon; /* last monitor i contacted */ 71 - unsigned long sub_sent, sub_renew_after; 71 + unsigned long sub_renew_after; 72 + unsigned long sub_renew_sent; 72 73 struct ceph_connection con; 74 + 75 + bool had_a_connection; 76 + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ 73 77 74 78 /* pending generic requests */ 75 79 struct rb_root generic_request_tree; 76 80 int num_generic_requests; 77 81 u64 last_tid; 78 82 79 - /* mds/osd map */ 80 - int want_mdsmap; 81 - int want_next_osdmap; /* 1 = want, 2 = want+asked */ 82 - u32 have_osdmap, have_mdsmap; 83 + /* subs, indexed with CEPH_SUB_* */ 84 + struct { 85 + struct ceph_mon_subscribe_item item; 86 + bool want; 87 + u32 have; /* epoch */ 88 + } subs[3]; 83 89 84 90 #ifdef CONFIG_DEBUG_FS 85 91 struct dentry *debugfs_file; ··· 99 93 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); 100 94 extern void ceph_monc_stop(struct ceph_mon_client *monc); 101 95 96 + enum { 97 + CEPH_SUB_MDSMAP = 0, 98 + CEPH_SUB_MONMAP, 99 + CEPH_SUB_OSDMAP, 100 + }; 101 + 102 + extern const char *ceph_sub_str[]; 103 + 102 104 /* 103 105 * The model here is to indicate that we need a new map of at least 104 - * epoch @want, and also call in when we receive a map. We will 106 + * epoch @epoch, and also call in when we receive a map. We will 105 107 * periodically rerequest the map from the monitor cluster until we 106 108 * get what we want. 107 109 */ 108 - extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); 109 - extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); 110 + bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 111 + bool continuous); 112 + void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 110 113 111 114 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); 112 115 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
+10 -5
include/linux/ceph/osd_client.h
··· 43 43 }; 44 44 45 45 46 - #define CEPH_OSD_MAX_OP 3 46 + #define CEPH_OSD_SLAB_OPS 2 47 + #define CEPH_OSD_MAX_OPS 16 47 48 48 49 enum ceph_osd_data_type { 49 50 CEPH_OSD_DATA_TYPE_NONE = 0, ··· 78 77 struct ceph_osd_req_op { 79 78 u16 op; /* CEPH_OSD_OP_* */ 80 79 u32 flags; /* CEPH_OSD_OP_FLAG_* */ 81 - u32 payload_len; 80 + u32 indata_len; /* request */ 81 + u32 outdata_len; /* reply */ 82 + s32 rval; 83 + 82 84 union { 83 85 struct ceph_osd_data raw_data_in; 84 86 struct { ··· 140 136 141 137 /* request osd ops array */ 142 138 unsigned int r_num_ops; 143 - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; 144 139 145 140 /* these are updated on each send */ 146 141 __le32 *r_request_osdmap_epoch; ··· 151 148 struct ceph_eversion *r_request_reassert_version; 152 149 153 150 int r_result; 154 - int r_reply_op_len[CEPH_OSD_MAX_OP]; 155 - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; 156 151 int r_got_reply; 157 152 int r_linger; 158 153 ··· 175 174 unsigned long r_stamp; /* send OR check time */ 176 175 177 176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 177 + 178 + struct ceph_osd_req_op r_ops[]; 178 179 }; 179 180 180 181 struct ceph_request_redirect { ··· 266 263 u64 truncate_size, u32 truncate_seq); 267 264 extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, 268 265 unsigned int which, u64 length); 266 + extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, 267 + unsigned int which, u64 offset_inc); 269 268 270 269 extern struct ceph_osd_data *osd_req_op_extent_osd_data( 271 270 struct ceph_osd_request *osd_req,
+3 -1
net/ceph/ceph_common.c
··· 361 361 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 362 362 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 363 363 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 364 - opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; 365 364 366 365 /* get mon ip(s) */ 367 366 /* ip1[:port1][,ip2[:port2]...] */ ··· 684 685 if (client->auth_err < 0) 685 686 return client->auth_err; 686 687 } 688 + 689 + pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); 690 + ceph_debugfs_client_init(client); 687 691 688 692 return 0; 689 693 }
+11 -6
net/ceph/debugfs.c
··· 112 112 struct ceph_mon_generic_request *req; 113 113 struct ceph_mon_client *monc = &client->monc; 114 114 struct rb_node *rp; 115 + int i; 115 116 116 117 mutex_lock(&monc->mutex); 117 118 118 - if (monc->have_mdsmap) 119 - seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); 120 - if (monc->have_osdmap) 121 - seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); 122 - if (monc->want_next_osdmap) 123 - seq_printf(s, "want next osdmap\n"); 119 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 120 + seq_printf(s, "have %s %u", ceph_sub_str[i], 121 + monc->subs[i].have); 122 + if (monc->subs[i].want) 123 + seq_printf(s, " want %llu%s", 124 + le64_to_cpu(monc->subs[i].item.start), 125 + (monc->subs[i].item.flags & 126 + CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 127 + seq_putc(s, '\n'); 128 + } 124 129 125 130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 126 131 __u16 op;
+5 -24
net/ceph/messenger.c
··· 235 235 static int ceph_msgr_slab_init(void) 236 236 { 237 237 BUG_ON(ceph_msg_cache); 238 - ceph_msg_cache = kmem_cache_create("ceph_msg", 239 - sizeof (struct ceph_msg), 240 - __alignof__(struct ceph_msg), 0, NULL); 241 - 238 + ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); 242 239 if (!ceph_msg_cache) 243 240 return -ENOMEM; 244 241 245 242 BUG_ON(ceph_msg_data_cache); 246 - ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", 247 - sizeof (struct ceph_msg_data), 248 - __alignof__(struct ceph_msg_data), 249 - 0, NULL); 243 + ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); 250 244 if (ceph_msg_data_cache) 251 245 return 0; 252 246 ··· 1215 1221 static void prepare_write_message_footer(struct ceph_connection *con) 1216 1222 { 1217 1223 struct ceph_msg *m = con->out_msg; 1218 - int v = con->out_kvec_left; 1219 1224 1220 1225 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; 1221 1226 1222 1227 dout("prepare_write_message_footer %p\n", con); 1223 - con->out_kvec[v].iov_base = &m->footer; 1228 + con_out_kvec_add(con, sizeof_footer(con), &m->footer); 1224 1229 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 1225 1230 if (con->ops->sign_message) 1226 1231 con->ops->sign_message(m); 1227 1232 else 1228 1233 m->footer.sig = 0; 1229 - con->out_kvec[v].iov_len = sizeof(m->footer); 1230 - con->out_kvec_bytes += sizeof(m->footer); 1231 1234 } else { 1232 1235 m->old_footer.flags = m->footer.flags; 1233 - con->out_kvec[v].iov_len = sizeof(m->old_footer); 1234 - con->out_kvec_bytes += sizeof(m->old_footer); 1235 1236 } 1236 - con->out_kvec_left++; 1237 1237 con->out_more = m->more_to_follow; 1238 1238 con->out_msg_done = true; 1239 1239 } ··· 2397 2409 } 2398 2410 2399 2411 /* footer */ 2400 - if (need_sign) 2401 - size = sizeof(m->footer); 2402 - else 2403 - size = sizeof(m->old_footer); 2404 - 2412 + size = sizeof_footer(con); 2405 2413 end += size; 2406 2414 ret = read_partial(con, end, size, &m->footer); 2407 2415 if (ret <= 0) ··· 3073 3089 con->out_skip += con_out_kvec_skip(con); 3074 3090 } else { 3075 3091 BUG_ON(!msg->data_length); 3076 - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) 3077 - con->out_skip += sizeof(msg->footer); 3078 - else 3079 - con->out_skip += sizeof(msg->old_footer); 3092 + con->out_skip += sizeof_footer(con); 3080 3093 } 3081 3094 /* data, middle, front */ 3082 3095 if (msg->data_length)
+249 -202
net/ceph/mon_client.c
··· 122 122 ceph_msg_revoke(monc->m_subscribe); 123 123 ceph_msg_revoke_incoming(monc->m_subscribe_ack); 124 124 ceph_con_close(&monc->con); 125 - monc->cur_mon = -1; 125 + 126 126 monc->pending_auth = 0; 127 127 ceph_auth_reset(monc->auth); 128 128 } 129 129 130 130 /* 131 - * Open a session with a (new) monitor. 131 + * Pick a new monitor at random and set cur_mon. If we are repicking 132 + * (i.e. cur_mon is already set), be sure to pick a different one. 132 133 */ 133 - static int __open_session(struct ceph_mon_client *monc) 134 + static void pick_new_mon(struct ceph_mon_client *monc) 134 135 { 135 - char r; 136 - int ret; 136 + int old_mon = monc->cur_mon; 137 137 138 - if (monc->cur_mon < 0) { 139 - get_random_bytes(&r, 1); 140 - monc->cur_mon = r % monc->monmap->num_mon; 141 - dout("open_session num=%d r=%d -> mon%d\n", 142 - monc->monmap->num_mon, r, monc->cur_mon); 143 - monc->sub_sent = 0; 144 - monc->sub_renew_after = jiffies; /* i.e., expired */ 145 - monc->want_next_osdmap = !!monc->want_next_osdmap; 138 + BUG_ON(monc->monmap->num_mon < 1); 146 139 147 - dout("open_session mon%d opening\n", monc->cur_mon); 148 - ceph_con_open(&monc->con, 149 - CEPH_ENTITY_TYPE_MON, monc->cur_mon, 150 - &monc->monmap->mon_inst[monc->cur_mon].addr); 151 - 152 - /* send an initial keepalive to ensure our timestamp is 153 - * valid by the time we are in an OPENED state */ 154 - ceph_con_keepalive(&monc->con); 155 - 156 - /* initiatiate authentication handshake */ 157 - ret = ceph_auth_build_hello(monc->auth, 158 - monc->m_auth->front.iov_base, 159 - monc->m_auth->front_alloc_len); 160 - __send_prepared_auth_request(monc, ret); 140 + if (monc->monmap->num_mon == 1) { 141 + monc->cur_mon = 0; 161 142 } else { 162 - dout("open_session mon%d already open\n", monc->cur_mon); 143 + int max = monc->monmap->num_mon; 144 + int o = -1; 145 + int n; 146 + 147 + if (monc->cur_mon >= 0) { 148 + if (monc->cur_mon < monc->monmap->num_mon) 149 + o = monc->cur_mon; 150 + if (o >= 0) 151 + max--; 152 + } 153 + 154 + n = prandom_u32() % max; 155 + if (o >= 0 && n >= o) 156 + n++; 157 + 158 + monc->cur_mon = n; 163 159 } 164 - return 0; 160 + 161 + dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon, 162 + monc->cur_mon, monc->monmap->num_mon); 165 163 } 166 164 167 - static bool __sub_expired(struct ceph_mon_client *monc) 165 + /* 166 + * Open a session with a new monitor. 167 + */ 168 + static void __open_session(struct ceph_mon_client *monc) 168 169 { 169 - return time_after_eq(jiffies, monc->sub_renew_after); 170 + int ret; 171 + 172 + pick_new_mon(monc); 173 + 174 + monc->hunting = true; 175 + if (monc->had_a_connection) { 176 + monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; 177 + if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) 178 + monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT; 179 + } 180 + 181 + monc->sub_renew_after = jiffies; /* i.e., expired */ 182 + monc->sub_renew_sent = 0; 183 + 184 + dout("%s opening mon%d\n", __func__, monc->cur_mon); 185 + ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon, 186 + &monc->monmap->mon_inst[monc->cur_mon].addr); 187 + 188 + /* 189 + * send an initial keepalive to ensure our timestamp is valid 190 + * by the time we are in an OPENED state 191 + */ 192 + ceph_con_keepalive(&monc->con); 193 + 194 + /* initiate authentication handshake */ 195 + ret = ceph_auth_build_hello(monc->auth, 196 + monc->m_auth->front.iov_base, 197 + monc->m_auth->front_alloc_len); 198 + BUG_ON(ret <= 0); 199 + __send_prepared_auth_request(monc, ret); 200 + } 201 + 202 + static void reopen_session(struct ceph_mon_client *monc) 203 + { 204 + if (!monc->hunting) 205 + pr_info("mon%d %s session lost, hunting for new mon\n", 206 + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); 207 + 208 + __close_session(monc); 209 + __open_session(monc); 170 210 } 171 211 172 212 /* ··· 214 174 */ 215 175 static void __schedule_delayed(struct ceph_mon_client *monc) 216 176 { 217 - struct ceph_options *opt = monc->client->options; 218 177 unsigned long delay; 219 178 220 - if (monc->cur_mon < 0 || __sub_expired(monc)) { 221 - delay = 10 * HZ; 222 - } else { 223 - delay = 20 * HZ; 224 - if (opt->monc_ping_timeout > 0) 225 - delay = min(delay, opt->monc_ping_timeout / 3); 226 - } 179 + if (monc->hunting) 180 + delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult; 181 + else 182 + delay = CEPH_MONC_PING_INTERVAL; 183 + 227 184 dout("__schedule_delayed after %lu\n", delay); 228 - schedule_delayed_work(&monc->delayed_work, 229 - round_jiffies_relative(delay)); 185 + mod_delayed_work(system_wq, &monc->delayed_work, 186 + round_jiffies_relative(delay)); 230 187 } 231 188 189 + const char *ceph_sub_str[] = { 190 + [CEPH_SUB_MDSMAP] = "mdsmap", 191 + [CEPH_SUB_MONMAP] = "monmap", 192 + [CEPH_SUB_OSDMAP] = "osdmap", 193 + }; 194 + 232 195 /* 233 - * Send subscribe request for mdsmap and/or osdmap. 196 + * Send subscribe request for one or more maps, according to 197 + * monc->subs. 234 198 */ 235 199 static void __send_subscribe(struct ceph_mon_client *monc) 236 200 { 237 - dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 238 - (unsigned int)monc->sub_sent, __sub_expired(monc), 239 - monc->want_next_osdmap); 240 - if ((__sub_expired(monc) && !monc->sub_sent) || 241 - monc->want_next_osdmap == 1) { 242 - struct ceph_msg *msg = monc->m_subscribe; 243 - struct ceph_mon_subscribe_item *i; 244 - void *p, *end; 245 - int num; 201 + struct ceph_msg *msg = monc->m_subscribe; 202 + void *p = msg->front.iov_base; 203 + void *const end = p + msg->front_alloc_len; 204 + int num = 0; 205 + int i; 246 206 247 - p = msg->front.iov_base; 248 - end = p + msg->front_alloc_len; 207 + dout("%s sent %lu\n", __func__, monc->sub_renew_sent); 249 208 250 - num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 251 - ceph_encode_32(&p, num); 209 + BUG_ON(monc->cur_mon < 0); 252 210 253 - if (monc->want_next_osdmap) { 254 - dout("__send_subscribe to 'osdmap' %u\n", 255 - (unsigned int)monc->have_osdmap); 256 - ceph_encode_string(&p, end, "osdmap", 6); 257 - i = p; 258 - i->have = cpu_to_le64(monc->have_osdmap); 259 - i->onetime = 1; 260 - p += sizeof(*i); 261 - monc->want_next_osdmap = 2; /* requested */ 262 - } 263 - if (monc->want_mdsmap) { 264 - dout("__send_subscribe to 'mdsmap' %u+\n", 265 - (unsigned int)monc->have_mdsmap); 266 - ceph_encode_string(&p, end, "mdsmap", 6); 267 - i = p; 268 - i->have = cpu_to_le64(monc->have_mdsmap); 269 - i->onetime = 0; 270 - p += sizeof(*i); 271 - } 272 - ceph_encode_string(&p, end, "monmap", 6); 273 - i = p; 274 - i->have = 0; 275 - i->onetime = 0; 276 - p += sizeof(*i); 211 + if (!monc->sub_renew_sent) 212 + monc->sub_renew_sent = jiffies | 1; /* never 0 */ 277 213 278 - msg->front.iov_len = p - msg->front.iov_base; 279 - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 280 - ceph_msg_revoke(msg); 281 - ceph_con_send(&monc->con, ceph_msg_get(msg)); 214 + msg->hdr.version = cpu_to_le16(2); 282 215 283 - monc->sub_sent = jiffies | 1; /* never 0 */ 216 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 217 + if (monc->subs[i].want) 218 + num++; 284 219 } 220 + BUG_ON(num < 1); /* monmap sub is always there */ 221 + ceph_encode_32(&p, num); 222 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 223 + const char *s = ceph_sub_str[i]; 224 + 225 + if (!monc->subs[i].want) 226 + continue; 227 + 228 + dout("%s %s start %llu flags 0x%x\n", __func__, s, 229 + le64_to_cpu(monc->subs[i].item.start), 230 + monc->subs[i].item.flags); 231 + ceph_encode_string(&p, end, s, strlen(s)); 232 + memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 233 + p += sizeof(monc->subs[i].item); 234 + } 235 + 236 + BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 237 + msg->front.iov_len = p - msg->front.iov_base; 238 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 239 + ceph_msg_revoke(msg); 240 + ceph_con_send(&monc->con, ceph_msg_get(msg)); 285 241 } 286 242 287 243 static void handle_subscribe_ack(struct ceph_mon_client *monc, ··· 291 255 seconds = le32_to_cpu(h->duration); 292 256 293 257 mutex_lock(&monc->mutex); 294 - if (monc->hunting) { 295 - pr_info("mon%d %s session established\n", 296 - monc->cur_mon, 297 - ceph_pr_addr(&monc->con.peer_addr.in_addr)); 298 - monc->hunting = false; 258 + if (monc->sub_renew_sent) { 259 + monc->sub_renew_after = monc->sub_renew_sent + 260 + (seconds >> 1) * HZ - 1; 261 + dout("%s sent %lu duration %d renew after %lu\n", __func__, 262 + monc->sub_renew_sent, seconds, monc->sub_renew_after); 263 + monc->sub_renew_sent = 0; 264 + } else { 265 + dout("%s sent %lu renew after %lu, ignoring\n", __func__, 266 + monc->sub_renew_sent, monc->sub_renew_after); 299 267 } 300 - dout("handle_subscribe_ack after %d seconds\n", seconds); 301 - monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; 302 - monc->sub_sent = 0; 303 268 mutex_unlock(&monc->mutex); 304 269 return; 305 270 bad: ··· 309 272 } 310 273 311 274 /* 312 - * Keep track of which maps we have 275 + * Register interest in a map 276 + * 277 + * @sub: one of CEPH_SUB_* 278 + * @epoch: X for "every map since X", or 0 for "just the latest" 313 279 */ 314 - int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) 280 + static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub, 281 + u32 epoch, bool continuous) 315 282 { 316 - mutex_lock(&monc->mutex); 317 - monc->have_mdsmap = got; 318 - mutex_unlock(&monc->mutex); 319 - return 0; 320 - } 321 - EXPORT_SYMBOL(ceph_monc_got_mdsmap); 283 + __le64 start = cpu_to_le64(epoch); 284 + u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0; 322 285 323 - int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 286 + dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub], 287 + epoch, continuous); 288 + 289 + if (monc->subs[sub].want && 290 + monc->subs[sub].item.start == start && 291 + monc->subs[sub].item.flags == flags) 292 + return false; 293 + 294 + monc->subs[sub].item.start = start; 295 + monc->subs[sub].item.flags = flags; 296 + monc->subs[sub].want = true; 297 + 298 + return true; 299 + } 300 + 301 + bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 302 + bool continuous) 303 + { 304 + bool need_request; 305 + 306 + mutex_lock(&monc->mutex); 307 + need_request = __ceph_monc_want_map(monc, sub, epoch, continuous); 308 + mutex_unlock(&monc->mutex); 309 + 310 + return need_request; 311 + } 312 + EXPORT_SYMBOL(ceph_monc_want_map); 313 + 314 + /* 315 + * Keep track of which maps we have 316 + * 317 + * @sub: one of CEPH_SUB_* 318 + */ 319 + static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub, 320 + u32 epoch) 321 + { 322 + dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch); 323 + 324 + if (monc->subs[sub].want) { 325 + if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME) 326 + monc->subs[sub].want = false; 327 + else 328 + monc->subs[sub].item.start = cpu_to_le64(epoch + 1); 329 + } 330 + 331 + monc->subs[sub].have = epoch; 332 + } 333 + 334 + void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) 324 335 { 325 336 mutex_lock(&monc->mutex); 326 - monc->have_osdmap = got; 327 - monc->want_next_osdmap = 0; 337 + __ceph_monc_got_map(monc, sub, epoch); 328 338 mutex_unlock(&monc->mutex); 329 - return 0; 330 339 } 340 + EXPORT_SYMBOL(ceph_monc_got_map); 331 341 332 342 /* 333 343 * Register interest in the next osdmap 334 344 */ 335 345 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) 336 346 { 337 - dout("request_next_osdmap have %u\n", monc->have_osdmap); 347 + dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); 338 348 mutex_lock(&monc->mutex); 339 - if (!monc->want_next_osdmap) 340 - monc->want_next_osdmap = 1; 341 - if (monc->want_next_osdmap < 2) 349 + if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 350 + monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) 342 351 __send_subscribe(monc); 343 352 mutex_unlock(&monc->mutex); 344 353 } ··· 403 320 long ret; 404 321 405 322 mutex_lock(&monc->mutex); 406 - while (monc->have_osdmap < epoch) { 323 + while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) { 407 324 mutex_unlock(&monc->mutex); 408 325 409 326 if (timeout && time_after_eq(jiffies, started + timeout)) 410 327 return -ETIMEDOUT; 411 328 412 329 ret = wait_event_interruptible_timeout(monc->client->auth_wq, 413 - monc->have_osdmap >= epoch, 414 - ceph_timeout_jiffies(timeout)); 330 + monc->subs[CEPH_SUB_OSDMAP].have >= epoch, 331 + ceph_timeout_jiffies(timeout)); 415 332 if (ret < 0) 416 333 return ret; 417 334 ··· 424 341 EXPORT_SYMBOL(ceph_monc_wait_osdmap); 425 342 426 343 /* 427 - * 344 + * Open a session with a random monitor. Request monmap and osdmap, 345 + * which are waited upon in __ceph_open_session(). 428 346 */ 429 347 int ceph_monc_open_session(struct ceph_mon_client *monc) 430 348 { 431 349 mutex_lock(&monc->mutex); 350 + __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true); 351 + __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false); 432 352 __open_session(monc); 433 353 __schedule_delayed(monc); 434 354 mutex_unlock(&monc->mutex); ··· 439 353 } 440 354 EXPORT_SYMBOL(ceph_monc_open_session); 441 355 442 - /* 443 - * We require the fsid and global_id in order to initialize our 444 - * debugfs dir. 445 - */ 446 - static bool have_debugfs_info(struct ceph_mon_client *monc) 447 - { 448 - dout("have_debugfs_info fsid %d globalid %lld\n", 449 - (int)monc->client->have_fsid, monc->auth->global_id); 450 - return monc->client->have_fsid && monc->auth->global_id > 0; 451 - } 452 - 453 356 static void ceph_monc_handle_map(struct ceph_mon_client *monc, 454 357 struct ceph_msg *msg) 455 358 { 456 359 struct ceph_client *client = monc->client; 457 360 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 458 361 void *p, *end; 459 - int had_debugfs_info, init_debugfs = 0; 460 362 461 363 mutex_lock(&monc->mutex); 462 - 463 - had_debugfs_info = have_debugfs_info(monc); 464 364 465 365 dout("handle_monmap\n"); 466 366 p = msg->front.iov_base; ··· 467 395 client->monc.monmap = monmap; 468 396 kfree(old); 469 397 470 - if (!client->have_fsid) { 471 - client->have_fsid = true; 472 - if (!had_debugfs_info && have_debugfs_info(monc)) { 473 - pr_info("client%lld fsid %pU\n", 474 - ceph_client_id(monc->client), 475 - &monc->client->fsid); 476 - init_debugfs = 1; 477 - } 478 - mutex_unlock(&monc->mutex); 398 + __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch); 399 + client->have_fsid = true; 479 400 480 - if (init_debugfs) { 481 - /* 482 - * do debugfs initialization without mutex to avoid 483 - * creating a locking dependency 484 - */ 485 - ceph_debugfs_client_init(monc->client); 486 - } 487 - 488 - goto out_unlocked; 489 - } 490 401 out: 491 402 mutex_unlock(&monc->mutex); 492 - out_unlocked: 493 403 wake_up_all(&client->auth_wq); 494 404 } 495 405 ··· 799 745 dout("monc delayed_work\n"); 800 746 mutex_lock(&monc->mutex); 801 747 if (monc->hunting) { 802 - __close_session(monc); 803 - __open_session(monc); /* continue hunting */ 748 + dout("%s continuing hunt\n", __func__); 749 + reopen_session(monc); 804 750 } else { 805 - struct ceph_options *opt = monc->client->options; 806 751 int is_auth = ceph_auth_is_authenticated(monc->auth); 807 752 if (ceph_con_keepalive_expired(&monc->con, 808 - opt->monc_ping_timeout)) { 753 + CEPH_MONC_PING_TIMEOUT)) { 809 754 dout("monc keepalive timeout\n"); 810 755 is_auth = 0; 811 - __close_session(monc); 812 - monc->hunting = true; 813 - __open_session(monc); 756 + reopen_session(monc); 814 757 } 815 758 816 759 if (!monc->hunting) { ··· 815 764 __validate_auth(monc); 816 765 } 817 766 818 - if (is_auth) 819 - __send_subscribe(monc); 767 + if (is_auth) { 768 + unsigned long now = jiffies; 769 + 770 + dout("%s renew subs? now %lu renew after %lu\n", 771 + __func__, now, monc->sub_renew_after); 772 + if (time_after_eq(now, monc->sub_renew_after)) 773 + __send_subscribe(monc); 774 + } 820 775 } 821 776 __schedule_delayed(monc); 822 777 mutex_unlock(&monc->mutex); ··· 909 852 &monc->client->msgr); 910 853 911 854 monc->cur_mon = -1; 912 - monc->hunting = true; 913 - monc->sub_renew_after = jiffies; 914 - monc->sub_sent = 0; 855 + monc->had_a_connection = false; 856 + monc->hunt_mult = 1; 915 857 916 858 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 917 859 monc->generic_request_tree = RB_ROOT; 918 860 monc->num_generic_requests = 0; 919 861 monc->last_tid = 0; 920 862 921 - monc->have_mdsmap = 0; 922 - monc->have_osdmap = 0; 923 - monc->want_next_osdmap = 1; 924 863 return 0; 925 864 926 865 out_auth_reply: ··· 941 888 942 889 mutex_lock(&monc->mutex); 943 890 __close_session(monc); 944 - 891 + monc->cur_mon = -1; 945 892 mutex_unlock(&monc->mutex); 946 893 947 894 /* ··· 963 910 } 964 911 EXPORT_SYMBOL(ceph_monc_stop); 965 912 913 + static void finish_hunting(struct ceph_mon_client *monc) 914 + { 915 + if (monc->hunting) { 916 + dout("%s found mon%d\n", __func__, monc->cur_mon); 917 + monc->hunting = false; 918 + monc->had_a_connection = true; 919 + monc->hunt_mult /= 2; /* reduce by 50% */ 920 + if (monc->hunt_mult < 1) 921 + monc->hunt_mult = 1; 922 + } 923 + } 924 + 966 925 static void handle_auth_reply(struct ceph_mon_client *monc, 967 926 struct ceph_msg *msg) 968 927 { 969 928 int ret; 970 929 int was_auth = 0; 971 - int had_debugfs_info, init_debugfs = 0; 972 930 973 931 mutex_lock(&monc->mutex); 974 - had_debugfs_info = have_debugfs_info(monc); 975 932 was_auth = ceph_auth_is_authenticated(monc->auth); 976 933 monc->pending_auth = 0; 977 934 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 978 935 msg->front.iov_len, 979 936 monc->m_auth->front.iov_base, 980 937 monc->m_auth->front_alloc_len); 938 + if (ret > 0) { 939 + __send_prepared_auth_request(monc, ret); 940 + goto out; 941 + } 942 + 943 + finish_hunting(monc); 944 + 981 945 if (ret < 0) { 982 946 monc->client->auth_err = ret; 983 - wake_up_all(&monc->client->auth_wq); 984 - } else if (ret > 0) { 985 - __send_prepared_auth_request(monc, ret); 986 947 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { 987 948 dout("authenticated, starting session\n"); 988 949 ··· 1006 939 1007 940 __send_subscribe(monc); 1008 941 __resend_generic_request(monc); 942 + 943 + pr_info("mon%d %s session established\n", monc->cur_mon, 944 + ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1009 945 } 1010 946 1011 - if (!had_debugfs_info && have_debugfs_info(monc)) { 1012 - pr_info("client%lld fsid %pU\n", 1013 - ceph_client_id(monc->client), 1014 - &monc->client->fsid); 1015 - init_debugfs = 1; 1016 - } 947 + out: 1017 948 mutex_unlock(&monc->mutex); 1018 - 1019 - if (init_debugfs) { 1020 - /* 1021 - * do debugfs initialization without mutex to avoid 1022 - * creating a locking dependency 1023 - */ 1024 - ceph_debugfs_client_init(monc->client); 1025 - } 949 + if (monc->client->auth_err < 0) 950 + wake_up_all(&monc->client->auth_wq); 1026 951 } 1027 952 1028 953 static int __validate_auth(struct ceph_mon_client *monc) ··· 1155 1096 { 1156 1097 struct ceph_mon_client *monc = con->private; 1157 1098 1158 - if (!monc) 1159 - return; 1160 - 1161 - dout("mon_fault\n"); 1162 1099 mutex_lock(&monc->mutex); 1163 - if (!con->private) 1164 - goto out; 1165 - 1166 - if (!monc->hunting) 1167 - pr_info("mon%d %s session lost, " 1168 - "hunting for new mon\n", monc->cur_mon, 1169 - ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1170 - 1171 - __close_session(monc); 1172 - if (!monc->hunting) { 1173 - /* start hunting */ 1174 - monc->hunting = true; 1175 - __open_session(monc); 1176 - } else { 1177 - /* already hunting, let's wait a bit */ 1178 - __schedule_delayed(monc); 1100 + dout("%s mon%d\n", __func__, monc->cur_mon); 1101 + if (monc->cur_mon >= 0) { 1102 + if (!monc->hunting) { 1103 + dout("%s hunting for new mon\n", __func__); 1104 + reopen_session(monc); 1105 + __schedule_delayed(monc); 1106 + } else { 1107 + dout("%s already hunting\n", __func__); 1108 + } 1179 1109 } 1180 - out: 1181 1110 mutex_unlock(&monc->mutex); 1182 1111 } 1183 1112
+73 -36
net/ceph/osd_client.c
··· 338 338 ceph_put_snap_context(req->r_snapc); 339 339 if (req->r_mempool) 340 340 mempool_free(req, req->r_osdc->req_mempool); 341 - else 341 + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 342 342 kmem_cache_free(ceph_osd_request_cache, req); 343 - 343 + else 344 + kfree(req); 344 345 } 345 346 346 347 void ceph_osdc_get_request(struct ceph_osd_request *req) ··· 370 369 struct ceph_msg *msg; 371 370 size_t msg_size; 372 371 373 - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); 374 - BUG_ON(num_ops > CEPH_OSD_MAX_OP); 375 - 376 - msg_size = 4 + 4 + 8 + 8 + 4+8; 377 - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 378 - msg_size += 1 + 8 + 4 + 4; /* pg_t */ 379 - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 380 - msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); 381 - msg_size += 8; /* snapid */ 382 - msg_size += 8; /* snap_seq */ 383 - msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 384 - msg_size += 4; 385 - 386 372 if (use_mempool) { 373 + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 387 374 req = mempool_alloc(osdc->req_mempool, gfp_flags); 388 - memset(req, 0, sizeof(*req)); 375 + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { 376 + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); 389 377 } else { 390 - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); 378 + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); 379 + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), 380 + gfp_flags); 391 381 } 392 - if (req == NULL) 382 + if (unlikely(!req)) 393 383 return NULL; 384 + 385 + /* req only, each op is zeroed in _osd_req_op_init() */ 386 + memset(req, 0, sizeof(*req)); 394 387 395 388 req->r_osdc = osdc; 396 389 req->r_mempool = use_mempool; ··· 403 408 req->r_base_oloc.pool = -1; 404 409 req->r_target_oloc.pool = -1; 405 410 411 + msg_size = OSD_OPREPLY_FRONT_LEN; 412 + if (num_ops > CEPH_OSD_SLAB_OPS) { 413 + /* ceph_osd_op and rval */ 414 + msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 415 + (sizeof(struct ceph_osd_op) + 4); 416 + } 417 + 406 418 /* create reply message */ 407 419 if (use_mempool) 408 420 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 409 421 else 410 - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 411 - OSD_OPREPLY_FRONT_LEN, gfp_flags, true); 422 + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, 423 + gfp_flags, true); 412 424 if (!msg) { 413 425 ceph_osdc_put_request(req); 414 426 return NULL; 415 427 } 416 428 req->r_reply = msg; 429 + 430 + msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 431 + msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 432 + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 433 + msg_size += 1 + 8 + 4 + 4; /* pgid */ 434 + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 435 + msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 436 + msg_size += 8; /* snapid */ 437 + msg_size += 8; /* snap_seq */ 438 + msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 439 + msg_size += 4; /* retry_attempt */ 417 440 418 441 /* create request message; allow space for oid */ 419 442 if (use_mempool) ··· 511 498 if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) 512 499 payload_len += length; 513 500 514 - op->payload_len = payload_len; 501 + op->indata_len = payload_len; 515 502 } 516 503 EXPORT_SYMBOL(osd_req_op_extent_init); 517 504 ··· 530 517 BUG_ON(length > previous); 531 518 532 519 op->extent.length = length; 533 - op->payload_len -= previous - length; 520 + op->indata_len -= previous - length; 534 521 } 535 522 EXPORT_SYMBOL(osd_req_op_extent_update); 523 + 524 + void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, 525 + unsigned int which, u64 offset_inc) 526 + { 527 + struct ceph_osd_req_op *op, *prev_op; 528 + 529 + BUG_ON(which + 1 >= osd_req->r_num_ops); 530 + 531 + prev_op = &osd_req->r_ops[which]; 532 + op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); 533 + /* dup previous one */ 534 + op->indata_len = prev_op->indata_len; 535 + op->outdata_len = prev_op->outdata_len; 536 + op->extent = prev_op->extent; 537 + /* adjust offset */ 538 + op->extent.offset += offset_inc; 539 + op->extent.length -= offset_inc; 540 + 541 + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) 542 + op->indata_len -= offset_inc; 543 + } 544 + EXPORT_SYMBOL(osd_req_op_extent_dup_last); 536 545 537 546 void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, 538 547 u16 opcode, const char *class, const char *method) ··· 589 554 590 555 op->cls.argc = 0; /* currently unused */ 591 556 592 - op->payload_len = payload_len; 557 + op->indata_len = payload_len; 593 558 } 594 559 EXPORT_SYMBOL(osd_req_op_cls_init); 595 560 ··· 622 587 op->xattr.cmp_mode = cmp_mode; 623 588 624 589 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 625 - op->payload_len = payload_len; 590 + op->indata_len = payload_len; 626 591 return 0; 627 592 } 628 593 EXPORT_SYMBOL(osd_req_op_xattr_init); ··· 742 707 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); 743 708 dst->cls.indata_len = cpu_to_le32(data_length); 744 709 ceph_osdc_msg_data_add(req->r_request, osd_data); 745 - src->payload_len += data_length; 710 + src->indata_len += data_length; 746 711 request_data_len += data_length; 747 712 } 748 713 osd_data = &src->cls.response_data; ··· 785 750 786 751 dst->op = cpu_to_le16(src->op); 787 752 dst->flags = cpu_to_le32(src->flags); 788 - dst->payload_len = cpu_to_le32(src->payload_len); 753 + dst->payload_len = cpu_to_le32(src->indata_len); 789 754 790 755 return request_data_len; 791 756 } ··· 1845 1810 1846 1811 ceph_decode_need(&p, end, 4, bad_put); 1847 1812 numops = ceph_decode_32(&p); 1848 - if (numops > CEPH_OSD_MAX_OP) 1813 + if (numops > CEPH_OSD_MAX_OPS) 1849 1814 goto bad_put; 1850 1815 if (numops != req->r_num_ops) 1851 1816 goto bad_put; ··· 1856 1821 int len; 1857 1822 1858 1823 len = le32_to_cpu(op->payload_len); 1859 - req->r_reply_op_len[i] = len; 1824 + req->r_ops[i].outdata_len = len; 1860 1825 dout(" op %d has %d bytes\n", i, len); 1861 1826 payload_len += len; 1862 1827 p += sizeof(*op); ··· 1871 1836 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 1872 1837 retry_attempt = ceph_decode_32(&p); 1873 1838 for (i = 0; i < numops; i++) 1874 - req->r_reply_op_result[i] = ceph_decode_32(&p); 1839 + req->r_ops[i].rval = ceph_decode_32(&p); 1875 1840 1876 1841 if (le16_to_cpu(msg->hdr.version) >= 6) { 1877 1842 p += 8 + 4; /* skip replay_version */ ··· 2222 2187 goto bad; 2223 2188 done: 2224 2189 downgrade_write(&osdc->map_sem); 2225 - ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); 2190 + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, 2191 + osdc->osdmap->epoch); 2226 2192 2227 2193 /* 2228 2194 * subscribe to subsequent osdmap updates if full to ensure ··· 2682 2646 round_jiffies_relative(osdc->client->options->osd_idle_ttl)); 2683 2647 2684 2648 err = -ENOMEM; 2685 - osdc->req_mempool = mempool_create_kmalloc_pool(10, 2686 - sizeof(struct ceph_osd_request)); 2649 + osdc->req_mempool = mempool_create_slab_pool(10, 2650 + ceph_osd_request_cache); 2687 2651 if (!osdc->req_mempool) 2688 2652 goto out; 2689 2653 ··· 2818 2782 2819 2783 int ceph_osdc_setup(void) 2820 2784 { 2785 + size_t size = sizeof(struct ceph_osd_request) + 2786 + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); 2787 + 2821 2788 BUG_ON(ceph_osd_request_cache); 2822 - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", 2823 - sizeof (struct ceph_osd_request), 2824 - __alignof__(struct ceph_osd_request), 2825 - 0, NULL); 2789 + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, 2790 + 0, 0, NULL); 2826 2791 2827 2792 return ceph_osd_request_cache ? 0 : -ENOMEM; 2828 2793 }