Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

+3 -11

drivers/block/rbd.c

··· 1847 1847 if (osd_req->r_result < 0) 1848 1848 obj_request->result = osd_req->r_result; 1849 1849 1850 - rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP); 1851 - 1852 1850 /* 1853 1851 * We support a 64-bit length, but ultimately it has to be 1854 1852 * passed to the block layer, which just supports a 32-bit 1855 1853 * length field. 1856 1854 */ 1857 - obj_request->xferred = osd_req->r_reply_op_len[0]; 1855 + obj_request->xferred = osd_req->r_ops[0].outdata_len; 1858 1856 rbd_assert(obj_request->xferred < (u64)UINT_MAX); 1859 1857 1860 1858 opcode = osd_req->r_ops[0].op; ··· 5641 5643 static int rbd_slab_init(void) 5642 5644 { 5643 5645 rbd_assert(!rbd_img_request_cache); 5644 - rbd_img_request_cache = kmem_cache_create("rbd_img_request", 5645 - sizeof (struct rbd_img_request), 5646 - __alignof__(struct rbd_img_request), 5647 - 0, NULL); 5646 + rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); 5648 5647 if (!rbd_img_request_cache) 5649 5648 return -ENOMEM; 5650 5649 5651 5650 rbd_assert(!rbd_obj_request_cache); 5652 - rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", 5653 - sizeof (struct rbd_obj_request), 5654 - __alignof__(struct rbd_obj_request), 5655 - 0, NULL); 5651 + rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0); 5656 5652 if (!rbd_obj_request_cache) 5657 5653 goto out_err; 5658 5654

+203 -119

fs/ceph/addr.c

··· 175 175 176 176 static int ceph_releasepage(struct page *page, gfp_t g) 177 177 { 178 - struct inode *inode = page->mapping ? page->mapping->host : NULL; 179 - dout("%p releasepage %p idx %lu\n", inode, page, page->index); 178 + dout("%p releasepage %p idx %lu\n", page->mapping->host, 179 + page, page->index); 180 180 WARN_ON(PageDirty(page)); 181 181 182 182 /* Can we release the page from the cache? */ ··· 276 276 for (i = 0; i < num_pages; i++) { 277 277 struct page *page = osd_data->pages[i]; 278 278 279 - if (rc < 0 && rc != ENOENT) 279 + if (rc < 0 && rc != -ENOENT) 280 280 goto unlock; 281 281 if (bytes < (int)PAGE_CACHE_SIZE) { 282 282 /* zero (remainder of) page */ ··· 606 606 struct inode *inode = req->r_inode; 607 607 struct ceph_inode_info *ci = ceph_inode(inode); 608 608 struct ceph_osd_data *osd_data; 609 - unsigned wrote; 610 609 struct page *page; 611 - int num_pages; 612 - int i; 610 + int num_pages, total_pages = 0; 611 + int i, j; 612 + int rc = req->r_result; 613 613 struct ceph_snap_context *snapc = req->r_snapc; 614 614 struct address_space *mapping = inode->i_mapping; 615 - int rc = req->r_result; 616 - u64 bytes = req->r_ops[0].extent.length; 617 615 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 618 - long writeback_stat; 619 - unsigned issued = ceph_caps_issued(ci); 616 + bool remove_page; 620 617 621 - osd_data = osd_req_op_extent_osd_data(req, 0); 622 - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 623 - num_pages = calc_pages_for((u64)osd_data->alignment, 624 - (u64)osd_data->length); 625 - if (rc >= 0) { 626 - /* 627 - * Assume we wrote the pages we originally sent. The 628 - * osd might reply with fewer pages if our writeback 629 - * raced with a truncation and was adjusted at the osd, 630 - * so don't believe the reply. 
631 - */ 632 - wrote = num_pages; 633 - } else { 634 - wrote = 0; 618 + 619 + dout("writepages_finish %p rc %d\n", inode, rc); 620 + if (rc < 0) 635 621 mapping_set_error(mapping, rc); 636 - } 637 - dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n", 638 - inode, rc, bytes, wrote); 622 + 623 + /* 624 + * We lost the cache cap, need to truncate the page before 625 + * it is unlocked, otherwise we'd truncate it later in the 626 + * page truncation thread, possibly losing some data that 627 + * raced its way in 628 + */ 629 + remove_page = !(ceph_caps_issued(ci) & 630 + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); 639 631 640 632 /* clean all pages */ 641 - for (i = 0; i < num_pages; i++) { 642 - page = osd_data->pages[i]; 643 - BUG_ON(!page); 644 - WARN_ON(!PageUptodate(page)); 633 + for (i = 0; i < req->r_num_ops; i++) { 634 + if (req->r_ops[i].op != CEPH_OSD_OP_WRITE) 635 + break; 645 636 646 - writeback_stat = 647 - atomic_long_dec_return(&fsc->writeback_count); 648 - if (writeback_stat < 649 - CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 650 - clear_bdi_congested(&fsc->backing_dev_info, 651 - BLK_RW_ASYNC); 637 + osd_data = osd_req_op_extent_osd_data(req, i); 638 + BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); 639 + num_pages = calc_pages_for((u64)osd_data->alignment, 640 + (u64)osd_data->length); 641 + total_pages += num_pages; 642 + for (j = 0; j < num_pages; j++) { 643 + page = osd_data->pages[j]; 644 + BUG_ON(!page); 645 + WARN_ON(!PageUptodate(page)); 652 646 653 - ceph_put_snap_context(page_snap_context(page)); 654 - page->private = 0; 655 - ClearPagePrivate(page); 656 - dout("unlocking %d %p\n", i, page); 657 - end_page_writeback(page); 647 + if (atomic_long_dec_return(&fsc->writeback_count) < 648 + CONGESTION_OFF_THRESH( 649 + fsc->mount_options->congestion_kb)) 650 + clear_bdi_congested(&fsc->backing_dev_info, 651 + BLK_RW_ASYNC); 658 652 659 - /* 660 - * We lost the cache cap, need to truncate the page before 661 - * it 
is unlocked, otherwise we'd truncate it later in the 662 - * page truncation thread, possibly losing some data that 663 - * raced its way in 664 - */ 665 - if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) 666 - generic_error_remove_page(inode->i_mapping, page); 653 + ceph_put_snap_context(page_snap_context(page)); 654 + page->private = 0; 655 + ClearPagePrivate(page); 656 + dout("unlocking %p\n", page); 657 + end_page_writeback(page); 667 658 668 - unlock_page(page); 659 + if (remove_page) 660 + generic_error_remove_page(inode->i_mapping, 661 + page); 662 + 663 + unlock_page(page); 664 + } 665 + dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", 666 + inode, osd_data->length, rc >= 0 ? num_pages : 0); 667 + 668 + ceph_release_pages(osd_data->pages, num_pages); 669 669 } 670 - dout("%p wrote+cleaned %d pages\n", inode, wrote); 671 - ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); 672 670 673 - ceph_release_pages(osd_data->pages, num_pages); 671 + ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); 672 + 673 + osd_data = osd_req_op_extent_osd_data(req, 0); 674 674 if (osd_data->pages_from_pool) 675 675 mempool_free(osd_data->pages, 676 676 ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); ··· 778 778 while (!done && index <= end) { 779 779 unsigned i; 780 780 int first; 781 - pgoff_t next; 782 - int pvec_pages, locked_pages; 783 - struct page **pages = NULL; 781 + pgoff_t strip_unit_end = 0; 782 + int num_ops = 0, op_idx; 783 + int pvec_pages, locked_pages = 0; 784 + struct page **pages = NULL, **data_pages; 784 785 mempool_t *pool = NULL; /* Becomes non-null if mempool used */ 785 786 struct page *page; 786 787 int want; 787 - u64 offset, len; 788 - long writeback_stat; 788 + u64 offset = 0, len = 0; 789 789 790 - next = 0; 791 - locked_pages = 0; 792 790 max_pages = max_pages_ever; 793 791 794 792 get_more_pages: ··· 822 824 unlock_page(page); 823 825 break; 824 826 } 825 - if (next && (page->index != next)) { 826 - dout("not 
consecutive %p\n", page); 827 + if (strip_unit_end && (page->index > strip_unit_end)) { 828 + dout("end of strip unit %p\n", page); 827 829 unlock_page(page); 828 830 break; 829 831 } ··· 865 867 /* 866 868 * We have something to write. If this is 867 869 * the first locked page this time through, 868 - * allocate an osd request and a page array 869 - * that it will use. 870 + * calculate max possible write size and 871 + * allocate a page array 870 872 */ 871 873 if (locked_pages == 0) { 872 - BUG_ON(pages); 874 + u64 objnum; 875 + u64 objoff; 876 + 873 877 /* prepare async write request */ 874 878 offset = (u64)page_offset(page); 875 879 len = wsize; 876 - req = ceph_osdc_new_request(&fsc->client->osdc, 877 - &ci->i_layout, vino, 878 - offset, &len, 0, 879 - do_sync ? 2 : 1, 880 - CEPH_OSD_OP_WRITE, 881 - CEPH_OSD_FLAG_WRITE | 882 - CEPH_OSD_FLAG_ONDISK, 883 - snapc, truncate_seq, 884 - truncate_size, true); 885 - if (IS_ERR(req)) { 886 - rc = PTR_ERR(req); 880 + 881 + rc = ceph_calc_file_object_mapping(&ci->i_layout, 882 + offset, len, 883 + &objnum, &objoff, 884 + &len); 885 + if (rc < 0) { 887 886 unlock_page(page); 888 887 break; 889 888 } 890 889 891 - if (do_sync) 892 - osd_req_op_init(req, 1, 893 - CEPH_OSD_OP_STARTSYNC, 0); 890 + num_ops = 1 + do_sync; 891 + strip_unit_end = page->index + 892 + ((len - 1) >> PAGE_CACHE_SHIFT); 894 893 895 - req->r_callback = writepages_finish; 896 - req->r_inode = inode; 897 - 894 + BUG_ON(pages); 898 895 max_pages = calc_pages_for(0, (u64)len); 899 896 pages = kmalloc(max_pages * sizeof (*pages), 900 897 GFP_NOFS); ··· 898 905 pages = mempool_alloc(pool, GFP_NOFS); 899 906 BUG_ON(!pages); 900 907 } 908 + 909 + len = 0; 910 + } else if (page->index != 911 + (offset + len) >> PAGE_CACHE_SHIFT) { 912 + if (num_ops >= (pool ? 
CEPH_OSD_SLAB_OPS : 913 + CEPH_OSD_MAX_OPS)) { 914 + redirty_page_for_writepage(wbc, page); 915 + unlock_page(page); 916 + break; 917 + } 918 + 919 + num_ops++; 920 + offset = (u64)page_offset(page); 921 + len = 0; 901 922 } 902 923 903 924 /* note position of first page in pvec */ ··· 920 913 dout("%p will write page %p idx %lu\n", 921 914 inode, page, page->index); 922 915 923 - writeback_stat = 924 - atomic_long_inc_return(&fsc->writeback_count); 925 - if (writeback_stat > CONGESTION_ON_THRESH( 916 + if (atomic_long_inc_return(&fsc->writeback_count) > 917 + CONGESTION_ON_THRESH( 926 918 fsc->mount_options->congestion_kb)) { 927 919 set_bdi_congested(&fsc->backing_dev_info, 928 920 BLK_RW_ASYNC); 929 921 } 930 922 931 - set_page_writeback(page); 932 923 pages[locked_pages] = page; 933 924 locked_pages++; 934 - next = page->index + 1; 925 + len += PAGE_CACHE_SIZE; 935 926 } 936 927 937 928 /* did we get anything? */ ··· 949 944 /* shift unused pages over in the pvec... we 950 945 * will need to release them below. 
*/ 951 946 for (j = i; j < pvec_pages; j++) { 952 - dout(" pvec leftover page %p\n", 953 - pvec.pages[j]); 947 + dout(" pvec leftover page %p\n", pvec.pages[j]); 954 948 pvec.pages[j-i+first] = pvec.pages[j]; 955 949 } 956 950 pvec.nr -= i-first; 957 951 } 958 952 959 - /* Format the osd request message and submit the write */ 953 + new_request: 960 954 offset = page_offset(pages[0]); 961 - len = (u64)locked_pages << PAGE_CACHE_SHIFT; 962 - if (snap_size == -1) { 963 - len = min(len, (u64)i_size_read(inode) - offset); 964 - /* writepages_finish() clears writeback pages 965 - * according to the data length, so make sure 966 - * data length covers all locked pages */ 967 - len = max(len, 1 + 968 - ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); 969 - } else { 970 - len = min(len, snap_size - offset); 955 + len = wsize; 956 + 957 + req = ceph_osdc_new_request(&fsc->client->osdc, 958 + &ci->i_layout, vino, 959 + offset, &len, 0, num_ops, 960 + CEPH_OSD_OP_WRITE, 961 + CEPH_OSD_FLAG_WRITE | 962 + CEPH_OSD_FLAG_ONDISK, 963 + snapc, truncate_seq, 964 + truncate_size, false); 965 + if (IS_ERR(req)) { 966 + req = ceph_osdc_new_request(&fsc->client->osdc, 967 + &ci->i_layout, vino, 968 + offset, &len, 0, 969 + min(num_ops, 970 + CEPH_OSD_SLAB_OPS), 971 + CEPH_OSD_OP_WRITE, 972 + CEPH_OSD_FLAG_WRITE | 973 + CEPH_OSD_FLAG_ONDISK, 974 + snapc, truncate_seq, 975 + truncate_size, true); 976 + BUG_ON(IS_ERR(req)); 971 977 } 972 - dout("writepages got %d pages at %llu~%llu\n", 973 - locked_pages, offset, len); 978 + BUG_ON(len < page_offset(pages[locked_pages - 1]) + 979 + PAGE_CACHE_SIZE - offset); 974 980 975 - osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, 981 + req->r_callback = writepages_finish; 982 + req->r_inode = inode; 983 + 984 + /* Format the osd request message and submit the write */ 985 + len = 0; 986 + data_pages = pages; 987 + op_idx = 0; 988 + for (i = 0; i < locked_pages; i++) { 989 + u64 cur_offset = page_offset(pages[i]); 990 + if (offset + len != 
cur_offset) { 991 + if (op_idx + do_sync + 1 == req->r_num_ops) 992 + break; 993 + osd_req_op_extent_dup_last(req, op_idx, 994 + cur_offset - offset); 995 + dout("writepages got pages at %llu~%llu\n", 996 + offset, len); 997 + osd_req_op_extent_osd_data_pages(req, op_idx, 998 + data_pages, len, 0, 976 999 !!pool, false); 1000 + osd_req_op_extent_update(req, op_idx, len); 977 1001 978 - pages = NULL; /* request message now owns the pages array */ 1002 + len = 0; 1003 + offset = cur_offset; 1004 + data_pages = pages + i; 1005 + op_idx++; 1006 + } 1007 + 1008 + set_page_writeback(pages[i]); 1009 + len += PAGE_CACHE_SIZE; 1010 + } 1011 + 1012 + if (snap_size != -1) { 1013 + len = min(len, snap_size - offset); 1014 + } else if (i == locked_pages) { 1015 + /* writepages_finish() clears writeback pages 1016 + * according to the data length, so make sure 1017 + * data length covers all locked pages */ 1018 + u64 min_len = len + 1 - PAGE_CACHE_SIZE; 1019 + len = min(len, (u64)i_size_read(inode) - offset); 1020 + len = max(len, min_len); 1021 + } 1022 + dout("writepages got pages at %llu~%llu\n", offset, len); 1023 + 1024 + osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 1025 + 0, !!pool, false); 1026 + osd_req_op_extent_update(req, op_idx, len); 1027 + 1028 + if (do_sync) { 1029 + op_idx++; 1030 + osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); 1031 + } 1032 + BUG_ON(op_idx + 1 != req->r_num_ops); 1033 + 979 1034 pool = NULL; 1035 + if (i < locked_pages) { 1036 + BUG_ON(num_ops <= req->r_num_ops); 1037 + num_ops -= req->r_num_ops; 1038 + num_ops += do_sync; 1039 + locked_pages -= i; 980 1040 981 - /* Update the write op length in case we changed it */ 982 - 983 - osd_req_op_extent_update(req, 0, len); 1041 + /* allocate new pages array for next request */ 1042 + data_pages = pages; 1043 + pages = kmalloc(locked_pages * sizeof (*pages), 1044 + GFP_NOFS); 1045 + if (!pages) { 1046 + pool = fsc->wb_pagevec_pool; 1047 + pages = mempool_alloc(pool, 
GFP_NOFS); 1048 + BUG_ON(!pages); 1049 + } 1050 + memcpy(pages, data_pages + i, 1051 + locked_pages * sizeof(*pages)); 1052 + memset(data_pages + i, 0, 1053 + locked_pages * sizeof(*pages)); 1054 + } else { 1055 + BUG_ON(num_ops != req->r_num_ops); 1056 + index = pages[i - 1]->index + 1; 1057 + /* request message now owns the pages array */ 1058 + pages = NULL; 1059 + } 984 1060 985 1061 vino = ceph_vino(inode); 986 1062 ceph_osdc_build_request(req, offset, snapc, vino.snap, ··· 1071 985 BUG_ON(rc); 1072 986 req = NULL; 1073 987 1074 - /* continue? */ 1075 - index = next; 1076 - wbc->nr_to_write -= locked_pages; 988 + wbc->nr_to_write -= i; 989 + if (pages) 990 + goto new_request; 991 + 1077 992 if (wbc->nr_to_write <= 0) 1078 993 done = 1; 1079 994 ··· 1609 1522 ceph_vino(inode), 0, &len, 0, 1, 1610 1523 CEPH_OSD_OP_CREATE, 1611 1524 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1612 - ceph_empty_snapc, 0, 0, false); 1525 + NULL, 0, 0, false); 1613 1526 if (IS_ERR(req)) { 1614 1527 err = PTR_ERR(req); 1615 1528 goto out; ··· 1627 1540 ceph_vino(inode), 0, &len, 1, 3, 1628 1541 CEPH_OSD_OP_WRITE, 1629 1542 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1630 - ceph_empty_snapc, 1631 - ci->i_truncate_seq, ci->i_truncate_size, 1632 - false); 1543 + NULL, ci->i_truncate_seq, 1544 + ci->i_truncate_size, false); 1633 1545 if (IS_ERR(req)) { 1634 1546 err = PTR_ERR(req); 1635 1547 goto out; ··· 1749 1663 goto out; 1750 1664 } 1751 1665 1752 - rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1753 - ceph_empty_snapc, 1666 + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1754 1667 1, false, GFP_NOFS); 1755 1668 if (!rd_req) { 1756 1669 err = -ENOMEM; ··· 1763 1678 "%llx.00000000", ci->i_vino.ino); 1764 1679 rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); 1765 1680 1766 - wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, 1767 - ceph_empty_snapc, 1681 + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL, 1768 1682 1, false, GFP_NOFS); 1769 
1683 if (!wr_req) { 1770 1684 err = -ENOMEM;

+7 -4

fs/ceph/caps.c

··· 991 991 u32 seq, u64 flush_tid, u64 oldest_flush_tid, 992 992 u32 issue_seq, u32 mseq, u64 size, u64 max_size, 993 993 struct timespec *mtime, struct timespec *atime, 994 - u64 time_warp_seq, 994 + struct timespec *ctime, u64 time_warp_seq, 995 995 kuid_t uid, kgid_t gid, umode_t mode, 996 996 u64 xattr_version, 997 997 struct ceph_buffer *xattrs_buf, ··· 1042 1042 ceph_encode_timespec(&fc->mtime, mtime); 1043 1043 if (atime) 1044 1044 ceph_encode_timespec(&fc->atime, atime); 1045 + if (ctime) 1046 + ceph_encode_timespec(&fc->ctime, ctime); 1045 1047 fc->time_warp_seq = cpu_to_le32(time_warp_seq); 1046 1048 1047 1049 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); ··· 1118 1116 int held, revoking, dropping, keep; 1119 1117 u64 seq, issue_seq, mseq, time_warp_seq, follows; 1120 1118 u64 size, max_size; 1121 - struct timespec mtime, atime; 1119 + struct timespec mtime, atime, ctime; 1122 1120 int wake = 0; 1123 1121 umode_t mode; 1124 1122 kuid_t uid; ··· 1182 1180 ci->i_requested_max_size = max_size; 1183 1181 mtime = inode->i_mtime; 1184 1182 atime = inode->i_atime; 1183 + ctime = inode->i_ctime; 1185 1184 time_warp_seq = ci->i_time_warp_seq; 1186 1185 uid = inode->i_uid; 1187 1186 gid = inode->i_gid; ··· 1201 1198 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1202 1199 op, keep, want, flushing, seq, 1203 1200 flush_tid, oldest_flush_tid, issue_seq, mseq, 1204 - size, max_size, &mtime, &atime, time_warp_seq, 1201 + size, max_size, &mtime, &atime, &ctime, time_warp_seq, 1205 1202 uid, gid, mode, xattr_version, xattr_blob, 1206 1203 follows, inline_data); 1207 1204 if (ret < 0) { ··· 1323 1320 capsnap->dirty, 0, capsnap->flush_tid, 0, 1324 1321 0, mseq, capsnap->size, 0, 1325 1322 &capsnap->mtime, &capsnap->atime, 1326 - capsnap->time_warp_seq, 1323 + &capsnap->ctime, capsnap->time_warp_seq, 1327 1324 capsnap->uid, capsnap->gid, capsnap->mode, 1328 1325 capsnap->xattr_version, capsnap->xattr_blob, 1329 1326 capsnap->follows, 
capsnap->inline_data);

+47 -22

fs/ceph/dir.c

··· 38 38 if (dentry->d_fsdata) 39 39 return 0; 40 40 41 - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); 41 + di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); 42 42 if (!di) 43 43 return -ENOMEM; /* oh well */ 44 44 ··· 67 67 spin_unlock(&dentry->d_lock); 68 68 return 0; 69 69 } 70 - 71 - struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) 72 - { 73 - struct inode *inode = NULL; 74 - 75 - if (!dentry) 76 - return NULL; 77 - 78 - spin_lock(&dentry->d_lock); 79 - if (!IS_ROOT(dentry)) { 80 - inode = d_inode(dentry->d_parent); 81 - ihold(inode); 82 - } 83 - spin_unlock(&dentry->d_lock); 84 - return inode; 85 - } 86 - 87 70 88 71 /* 89 72 * for readdir, we encode the directory frag and offset within that ··· 607 624 struct ceph_mds_client *mdsc = fsc->mdsc; 608 625 struct ceph_mds_request *req; 609 626 int op; 627 + int mask; 610 628 int err; 611 629 612 630 dout("lookup %p dentry %p '%pd'\n", ··· 650 666 return ERR_CAST(req); 651 667 req->r_dentry = dget(dentry); 652 668 req->r_num_caps = 2; 653 - /* we only need inode linkage */ 654 - req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 669 + 670 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 671 + if (ceph_security_xattr_wanted(dir)) 672 + mask |= CEPH_CAP_XATTR_SHARED; 673 + req->r_args.getattr.mask = cpu_to_le32(mask); 674 + 655 675 req->r_locked_dir = dir; 656 676 err = ceph_mdsc_do_request(mdsc, NULL, req); 657 677 err = ceph_handle_snapdir(req, dentry, err); ··· 1083 1095 static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) 1084 1096 { 1085 1097 int valid = 0; 1098 + struct dentry *parent; 1086 1099 struct inode *dir; 1087 1100 1088 1101 if (flags & LOOKUP_RCU) ··· 1092 1103 dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, 1093 1104 dentry, d_inode(dentry), ceph_dentry(dentry)->offset); 1094 1105 1095 - dir = ceph_get_dentry_parent_inode(dentry); 1106 + parent = dget_parent(dentry); 1107 + dir = d_inode(parent); 1096 1108 
1097 1109 /* always trust cached snapped dentries, snapdir dentry */ 1098 1110 if (ceph_snap(dir) != CEPH_NOSNAP) { ··· 1111 1121 valid = 1; 1112 1122 } 1113 1123 1124 + if (!valid) { 1125 + struct ceph_mds_client *mdsc = 1126 + ceph_sb_to_client(dir->i_sb)->mdsc; 1127 + struct ceph_mds_request *req; 1128 + int op, mask, err; 1129 + 1130 + op = ceph_snap(dir) == CEPH_SNAPDIR ? 1131 + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 1132 + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 1133 + if (!IS_ERR(req)) { 1134 + req->r_dentry = dget(dentry); 1135 + req->r_num_caps = 2; 1136 + 1137 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 1138 + if (ceph_security_xattr_wanted(dir)) 1139 + mask |= CEPH_CAP_XATTR_SHARED; 1140 + req->r_args.getattr.mask = mask; 1141 + 1142 + req->r_locked_dir = dir; 1143 + err = ceph_mdsc_do_request(mdsc, NULL, req); 1144 + if (err == 0 || err == -ENOENT) { 1145 + if (dentry == req->r_dentry) { 1146 + valid = !d_unhashed(dentry); 1147 + } else { 1148 + d_invalidate(req->r_dentry); 1149 + err = -EAGAIN; 1150 + } 1151 + } 1152 + ceph_mdsc_put_request(req); 1153 + dout("d_revalidate %p lookup result=%d\n", 1154 + dentry, err); 1155 + } 1156 + } 1157 + 1114 1158 dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); 1115 1159 if (valid) { 1116 1160 ceph_dentry_lru_touch(dentry); 1117 1161 } else { 1118 1162 ceph_dir_clear_complete(dir); 1119 1163 } 1120 - iput(dir); 1164 + 1165 + dput(parent); 1121 1166 return valid; 1122 1167 } 1123 1168

+13

fs/ceph/export.c

··· 71 71 inode = ceph_find_inode(sb, vino); 72 72 if (!inode) { 73 73 struct ceph_mds_request *req; 74 + int mask; 74 75 75 76 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, 76 77 USE_ANY_MDS); 77 78 if (IS_ERR(req)) 78 79 return ERR_CAST(req); 80 + 81 + mask = CEPH_STAT_CAP_INODE; 82 + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) 83 + mask |= CEPH_CAP_XATTR_SHARED; 84 + req->r_args.getattr.mask = cpu_to_le32(mask); 79 85 80 86 req->r_ino1 = vino; 81 87 req->r_num_caps = 1; ··· 134 128 struct ceph_mds_request *req; 135 129 struct inode *inode; 136 130 struct dentry *dentry; 131 + int mask; 137 132 int err; 138 133 139 134 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, ··· 151 144 .snap = CEPH_NOSNAP, 152 145 }; 153 146 } 147 + 148 + mask = CEPH_STAT_CAP_INODE; 149 + if (ceph_security_xattr_wanted(d_inode(sb->s_root))) 150 + mask |= CEPH_CAP_XATTR_SHARED; 151 + req->r_args.getattr.mask = cpu_to_le32(mask); 152 + 154 153 req->r_num_caps = 1; 155 154 err = ceph_mdsc_do_request(mdsc, NULL, req); 156 155 inode = req->r_target_inode;

+10 -5

fs/ceph/file.c

··· 157 157 case S_IFDIR: 158 158 dout("init_file %p %p 0%o (regular)\n", inode, file, 159 159 inode->i_mode); 160 - cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); 160 + cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 161 161 if (cf == NULL) { 162 162 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 163 163 return -ENOMEM; ··· 300 300 struct ceph_mds_request *req; 301 301 struct dentry *dn; 302 302 struct ceph_acls_info acls = {}; 303 + int mask; 303 304 int err; 304 305 305 306 dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", ··· 336 335 acls.pagelist = NULL; 337 336 } 338 337 } 338 + 339 + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; 340 + if (ceph_security_xattr_wanted(dir)) 341 + mask |= CEPH_CAP_XATTR_SHARED; 342 + req->r_args.open.mask = cpu_to_le32(mask); 343 + 339 344 req->r_locked_dir = dir; /* caller holds dir->i_mutex */ 340 345 err = ceph_mdsc_do_request(mdsc, 341 346 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, ··· 732 725 ret = ceph_osdc_start_request(req->r_osdc, req, false); 733 726 out: 734 727 if (ret < 0) { 735 - BUG_ON(ret == -EOLDSNAPC); 736 728 req->r_result = ret; 737 729 ceph_aio_complete_req(req, NULL); 738 730 } ··· 789 783 int num_pages = 0; 790 784 int flags; 791 785 int ret; 792 - struct timespec mtime = CURRENT_TIME; 786 + struct timespec mtime = current_fs_time(inode->i_sb); 793 787 size_t count = iov_iter_count(iter); 794 788 loff_t pos = iocb->ki_pos; 795 789 bool write = iov_iter_rw(iter) == WRITE; ··· 955 949 ret = ceph_osdc_start_request(req->r_osdc, 956 950 req, false); 957 951 if (ret < 0) { 958 - BUG_ON(ret == -EOLDSNAPC); 959 952 req->r_result = ret; 960 953 ceph_aio_complete_req(req, NULL); 961 954 } ··· 993 988 int flags; 994 989 int check_caps = 0; 995 990 int ret; 996 - struct timespec mtime = CURRENT_TIME; 991 + struct timespec mtime = current_fs_time(inode->i_sb); 997 992 size_t count = iov_iter_count(from); 998 993 999 994 if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)

+27 -7

fs/ceph/inode.c

··· 549 549 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || 550 550 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { 551 551 dout("size %lld -> %llu\n", inode->i_size, size); 552 + if (size > 0 && S_ISDIR(inode->i_mode)) { 553 + pr_err("fill_file_size non-zero size for directory\n"); 554 + size = 0; 555 + } 552 556 i_size_write(inode, size); 553 557 inode->i_blocks = (size + (1<<9) - 1) >> 9; 554 558 ci->i_reported_size = size; ··· 1265 1261 dout(" %p links to %p %llx.%llx, not %llx.%llx\n", 1266 1262 dn, d_inode(dn), ceph_vinop(d_inode(dn)), 1267 1263 ceph_vinop(in)); 1264 + d_invalidate(dn); 1268 1265 have_lease = false; 1269 1266 } 1270 1267 ··· 1354 1349 1355 1350 if (!ctl->page || pgoff != page_index(ctl->page)) { 1356 1351 ceph_readdir_cache_release(ctl); 1357 - ctl->page = grab_cache_page(&dir->i_data, pgoff); 1352 + if (idx == 0) 1353 + ctl->page = grab_cache_page(&dir->i_data, pgoff); 1354 + else 1355 + ctl->page = find_lock_page(&dir->i_data, pgoff); 1358 1356 if (!ctl->page) { 1359 1357 ctl->index = -1; 1360 - return -ENOMEM; 1358 + return idx == 0 ? 
-ENOMEM : 0; 1361 1359 } 1362 1360 /* reading/filling the cache are serialized by 1363 1361 * i_mutex, no need to use page lock */ 1364 1362 unlock_page(ctl->page); 1365 1363 ctl->dentries = kmap(ctl->page); 1364 + if (idx == 0) 1365 + memset(ctl->dentries, 0, PAGE_CACHE_SIZE); 1366 1366 } 1367 1367 1368 1368 if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && ··· 1390 1380 struct qstr dname; 1391 1381 struct dentry *dn; 1392 1382 struct inode *in; 1393 - int err = 0, ret, i; 1383 + int err = 0, skipped = 0, ret, i; 1394 1384 struct inode *snapdir = NULL; 1395 1385 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1396 1386 struct ceph_dentry_info *di; ··· 1502 1492 } 1503 1493 1504 1494 if (d_really_is_negative(dn)) { 1505 - struct dentry *realdn = splice_dentry(dn, in); 1495 + struct dentry *realdn; 1496 + 1497 + if (ceph_security_xattr_deadlock(in)) { 1498 + dout(" skip splicing dn %p to inode %p" 1499 + " (security xattr deadlock)\n", dn, in); 1500 + iput(in); 1501 + skipped++; 1502 + goto next_item; 1503 + } 1504 + 1505 + realdn = splice_dentry(dn, in); 1506 1506 if (IS_ERR(realdn)) { 1507 1507 err = PTR_ERR(realdn); 1508 1508 d_drop(dn); ··· 1529 1509 req->r_session, 1530 1510 req->r_request_started); 1531 1511 1532 - if (err == 0 && cache_ctl.index >= 0) { 1512 + if (err == 0 && skipped == 0 && cache_ctl.index >= 0) { 1533 1513 ret = fill_readdir_cache(d_inode(parent), dn, 1534 1514 &cache_ctl, req); 1535 1515 if (ret < 0) ··· 1540 1520 dput(dn); 1541 1521 } 1542 1522 out: 1543 - if (err == 0) { 1523 + if (err == 0 && skipped == 0) { 1544 1524 req->r_did_prepopulate = true; 1545 1525 req->r_readdir_cache_idx = cache_ctl.index; 1546 1526 } ··· 1970 1950 if (dirtied) { 1971 1951 inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, 1972 1952 &prealloc_cf); 1973 - inode->i_ctime = CURRENT_TIME; 1953 + inode->i_ctime = current_fs_time(inode->i_sb); 1974 1954 } 1975 1955 1976 1956 release &= issued;

+5 -2

fs/ceph/mds_client.c

··· 1729 1729 init_completion(&req->r_safe_completion); 1730 1730 INIT_LIST_HEAD(&req->r_unsafe_item); 1731 1731 1732 - req->r_stamp = CURRENT_TIME; 1732 + req->r_stamp = current_fs_time(mdsc->fsc->sb); 1733 1733 1734 1734 req->r_op = op; 1735 1735 req->r_direct_mode = mode; ··· 2540 2540 2541 2541 /* insert trace into our cache */ 2542 2542 mutex_lock(&req->r_fill_mutex); 2543 + current->journal_info = req; 2543 2544 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); 2544 2545 if (err == 0) { 2545 2546 if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || ··· 2548 2547 ceph_readdir_prepopulate(req, req->r_session); 2549 2548 ceph_unreserve_caps(mdsc, &req->r_caps_reservation); 2550 2549 } 2550 + current->journal_info = NULL; 2551 2551 mutex_unlock(&req->r_fill_mutex); 2552 2552 2553 2553 up_read(&mdsc->snap_rwsem); ··· 3766 3764 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3767 3765 3768 3766 /* do we need it? */ 3769 - ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch); 3770 3767 mutex_lock(&mdsc->mutex); 3771 3768 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3772 3769 dout("handle_map epoch %u <= our %u\n", ··· 3792 3791 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3793 3792 3794 3793 __wake_requests(mdsc, &mdsc->waiting_for_map); 3794 + ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, 3795 + mdsc->mdsmap->m_epoch); 3795 3796 3796 3797 mutex_unlock(&mdsc->mutex); 3797 3798 schedule_delayed(mdsc);

-16

fs/ceph/snap.c

··· 296 296 } 297 297 298 298 299 - struct ceph_snap_context *ceph_empty_snapc; 300 - 301 299 /* 302 300 * build the snap context for a given realm. 303 301 */ ··· 984 986 if (locked_rwsem) 985 987 up_write(&mdsc->snap_rwsem); 986 988 return; 987 - } 988 - 989 - int __init ceph_snap_init(void) 990 - { 991 - ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); 992 - if (!ceph_empty_snapc) 993 - return -ENOMEM; 994 - ceph_empty_snapc->seq = 1; 995 - return 0; 996 - } 997 - 998 - void ceph_snap_exit(void) 999 - { 1000 - ceph_put_snap_context(ceph_empty_snapc); 1001 989 }

+19 -28

fs/ceph/super.c

··· 439 439 440 440 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) 441 441 seq_puts(m, ",dirstat"); 442 - if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) 443 - seq_puts(m, ",norbytes"); 442 + if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES)) 443 + seq_puts(m, ",rbytes"); 444 444 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) 445 445 seq_puts(m, ",noasyncreaddir"); 446 446 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) ··· 530 530 goto fail; 531 531 } 532 532 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 533 - fsc->client->monc.want_mdsmap = 1; 533 + ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); 534 534 535 535 fsc->mount_options = fsopt; 536 536 ··· 793 793 struct dentry *root; 794 794 int first = 0; /* first vfsmount for this super_block */ 795 795 796 - dout("mount start\n"); 796 + dout("mount start %p\n", fsc); 797 797 mutex_lock(&fsc->client->mount_mutex); 798 798 799 - err = __ceph_open_session(fsc->client, started); 800 - if (err < 0) 801 - goto out; 799 + if (!fsc->sb->s_root) { 800 + err = __ceph_open_session(fsc->client, started); 801 + if (err < 0) 802 + goto out; 802 803 803 - dout("mount opening root\n"); 804 - root = open_root_dentry(fsc, "", started); 805 - if (IS_ERR(root)) { 806 - err = PTR_ERR(root); 807 - goto out; 808 - } 809 - if (fsc->sb->s_root) { 810 - dput(root); 811 - } else { 804 + dout("mount opening root\n"); 805 + root = open_root_dentry(fsc, "", started); 806 + if (IS_ERR(root)) { 807 + err = PTR_ERR(root); 808 + goto out; 809 + } 812 810 fsc->sb->s_root = root; 813 811 first = 1; 814 812 ··· 816 818 } 817 819 818 820 if (path[0] == 0) { 821 + root = fsc->sb->s_root; 819 822 dget(root); 820 823 } else { 821 824 dout("mount opening base mountpoint\n"); ··· 832 833 mutex_unlock(&fsc->client->mount_mutex); 833 834 return root; 834 835 835 - out: 836 - mutex_unlock(&fsc->client->mount_mutex); 837 - return ERR_PTR(err); 838 - 839 836 fail: 840 837 if (first) { 841 838 dput(fsc->sb->s_root); 842 839 fsc->sb->s_root = 
NULL; 843 840 } 844 - goto out; 841 + out: 842 + mutex_unlock(&fsc->client->mount_mutex); 843 + return ERR_PTR(err); 845 844 } 846 845 847 846 static int ceph_set_super(struct super_block *s, void *data) ··· 1039 1042 1040 1043 ceph_flock_init(); 1041 1044 ceph_xattr_init(); 1042 - ret = ceph_snap_init(); 1043 - if (ret) 1044 - goto out_xattr; 1045 1045 ret = register_filesystem(&ceph_fs_type); 1046 1046 if (ret) 1047 - goto out_snap; 1047 + goto out_xattr; 1048 1048 1049 1049 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1050 1050 1051 1051 return 0; 1052 1052 1053 - out_snap: 1054 - ceph_snap_exit(); 1055 1053 out_xattr: 1056 1054 ceph_xattr_exit(); 1057 1055 destroy_caches(); ··· 1058 1066 { 1059 1067 dout("exit_ceph\n"); 1060 1068 unregister_filesystem(&ceph_fs_type); 1061 - ceph_snap_exit(); 1062 1069 ceph_xattr_exit(); 1063 1070 destroy_caches(); 1064 1071 }

+16 -7

fs/ceph/super.h

··· 37 37 #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ 38 38 #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ 39 39 40 - #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ 41 - CEPH_MOUNT_OPT_DCACHE) 40 + #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE 42 41 43 42 #define ceph_set_mount_opt(fsc, opt) \ 44 43 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; ··· 468 469 #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 469 470 #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 470 471 #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 471 - 472 + #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 472 473 473 474 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 474 475 long long release_count, ··· 720 721 721 722 722 723 /* snap.c */ 723 - extern struct ceph_snap_context *ceph_empty_snapc; 724 724 struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 725 725 u64 ino); 726 726 extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, ··· 736 738 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 737 739 struct ceph_cap_snap *capsnap); 738 740 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 739 - extern int ceph_snap_init(void); 740 - extern void ceph_snap_exit(void); 741 741 742 742 /* 743 743 * a cap_snap is "pending" if it is still awaiting an in-progress ··· 803 807 extern void __init ceph_xattr_init(void); 804 808 extern void ceph_xattr_exit(void); 805 809 extern const struct xattr_handler *ceph_xattr_handlers[]; 810 + 811 + #ifdef CONFIG_SECURITY 812 + extern bool ceph_security_xattr_deadlock(struct inode *in); 813 + extern bool ceph_security_xattr_wanted(struct inode *in); 814 + #else 815 + static inline bool ceph_security_xattr_deadlock(struct inode *in) 816 + { 817 + return false; 818 + } 819 + static inline bool ceph_security_xattr_wanted(struct inode *in) 820 + { 821 + return false; 822 + } 
823 + #endif 806 824 807 825 /* acl.c */ 808 826 struct ceph_acls_info { ··· 957 947 extern void ceph_dentry_lru_del(struct dentry *dn); 958 948 extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 959 949 extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); 960 - extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); 961 950 extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); 962 951 963 952 /*

+71 -7

fs/ceph/xattr.c

··· 714 714 } 715 715 } 716 716 717 + static inline int __get_request_mask(struct inode *in) { 718 + struct ceph_mds_request *req = current->journal_info; 719 + int mask = 0; 720 + if (req && req->r_target_inode == in) { 721 + if (req->r_op == CEPH_MDS_OP_LOOKUP || 722 + req->r_op == CEPH_MDS_OP_LOOKUPINO || 723 + req->r_op == CEPH_MDS_OP_LOOKUPPARENT || 724 + req->r_op == CEPH_MDS_OP_GETATTR) { 725 + mask = le32_to_cpu(req->r_args.getattr.mask); 726 + } else if (req->r_op == CEPH_MDS_OP_OPEN || 727 + req->r_op == CEPH_MDS_OP_CREATE) { 728 + mask = le32_to_cpu(req->r_args.open.mask); 729 + } 730 + } 731 + return mask; 732 + } 733 + 717 734 ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, 718 735 size_t size) 719 736 { 720 737 struct ceph_inode_info *ci = ceph_inode(inode); 721 - int err; 722 738 struct ceph_inode_xattr *xattr; 723 739 struct ceph_vxattr *vxattr = NULL; 740 + int req_mask; 741 + int err; 724 742 725 743 if (!ceph_is_valid_xattr(name)) 726 744 return -ENODATA; 727 745 728 746 /* let's see if a virtual xattr was requested */ 729 747 vxattr = ceph_match_vxattr(inode, name); 730 - if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { 731 - err = vxattr->getxattr_cb(ci, value, size); 748 + if (vxattr) { 749 + err = -ENODATA; 750 + if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) 751 + err = vxattr->getxattr_cb(ci, value, size); 732 752 return err; 733 753 } 754 + 755 + req_mask = __get_request_mask(inode); 734 756 735 757 spin_lock(&ci->i_ceph_lock); 736 758 dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 737 759 ci->i_xattrs.version, ci->i_xattrs.index_version); 738 760 739 761 if (ci->i_xattrs.version == 0 || 740 - !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { 762 + !((req_mask & CEPH_CAP_XATTR_SHARED) || 763 + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) { 741 764 spin_unlock(&ci->i_ceph_lock); 765 + 766 + /* security module gets xattr while filling trace */ 767 + if 
(current->journal_info != NULL) { 768 + pr_warn_ratelimited("sync getxattr %p " 769 + "during filling trace\n", inode); 770 + return -EBUSY; 771 + } 772 + 742 773 /* get xattrs from mds (if we don't already have them) */ 743 774 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); 744 775 if (err) ··· 796 765 797 766 memcpy(value, xattr->val, xattr->val_len); 798 767 768 + if (current->journal_info != NULL && 769 + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 770 + ci->i_ceph_flags |= CEPH_I_SEC_INITED; 799 771 out: 800 772 spin_unlock(&ci->i_ceph_lock); 801 773 return err; ··· 1033 999 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1034 1000 &prealloc_cf); 1035 1001 ci->i_xattrs.dirty = true; 1036 - inode->i_ctime = CURRENT_TIME; 1002 + inode->i_ctime = current_fs_time(inode->i_sb); 1037 1003 } 1038 1004 1039 1005 spin_unlock(&ci->i_ceph_lock); ··· 1049 1015 do_sync_unlocked: 1050 1016 if (lock_snap_rwsem) 1051 1017 up_read(&mdsc->snap_rwsem); 1052 - err = ceph_sync_setxattr(dentry, name, value, size, flags); 1018 + 1019 + /* security module set xattr while filling trace */ 1020 + if (current->journal_info != NULL) { 1021 + pr_warn_ratelimited("sync setxattr %p " 1022 + "during filling trace\n", inode); 1023 + err = -EBUSY; 1024 + } else { 1025 + err = ceph_sync_setxattr(dentry, name, value, size, flags); 1026 + } 1053 1027 out: 1054 1028 ceph_free_cap_flush(prealloc_cf); 1055 1029 kfree(newname); ··· 1178 1136 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, 1179 1137 &prealloc_cf); 1180 1138 ci->i_xattrs.dirty = true; 1181 - inode->i_ctime = CURRENT_TIME; 1139 + inode->i_ctime = current_fs_time(inode->i_sb); 1182 1140 spin_unlock(&ci->i_ceph_lock); 1183 1141 if (lock_snap_rwsem) 1184 1142 up_read(&mdsc->snap_rwsem); ··· 1206 1164 1207 1165 return __ceph_removexattr(dentry, name); 1208 1166 } 1167 + 1168 + #ifdef CONFIG_SECURITY 1169 + bool ceph_security_xattr_wanted(struct inode *in) 1170 + { 1171 + return in->i_security 
!= NULL; 1172 + } 1173 + 1174 + bool ceph_security_xattr_deadlock(struct inode *in) 1175 + { 1176 + struct ceph_inode_info *ci; 1177 + bool ret; 1178 + if (in->i_security == NULL) 1179 + return false; 1180 + ci = ceph_inode(in); 1181 + spin_lock(&ci->i_ceph_lock); 1182 + ret = !(ci->i_ceph_flags & CEPH_I_SEC_INITED) && 1183 + !(ci->i_xattrs.version > 0 && 1184 + __ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0)); 1185 + spin_unlock(&ci->i_ceph_lock); 1186 + return ret; 1187 + } 1188 + #endif

+2

include/linux/ceph/ceph_features.h

+4 -3

include/linux/ceph/ceph_fs.h

··· 198 198 #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ 199 199 200 200 struct ceph_mon_subscribe_item { 201 - __le64 have_version; __le64 have; 202 - __u8 onetime; 201 + __le64 start; 202 + __u8 flags; 203 203 } __attribute__ ((packed)); 204 204 205 205 struct ceph_mon_subscribe_ack { ··· 376 376 __le32 stripe_count; /* ... */ 377 377 __le32 object_size; 378 378 __le32 file_replication; 379 - __le32 unused; /* used to be preferred osd */ 379 + __le32 mask; /* CEPH_CAP_* */ 380 + __le32 old_size; 380 381 } __attribute__ ((packed)) open; 381 382 struct { 382 383 __le32 flags;

+6 -2

include/linux/ceph/libceph.h

··· 47 47 unsigned long mount_timeout; /* jiffies */ 48 48 unsigned long osd_idle_ttl; /* jiffies */ 49 49 unsigned long osd_keepalive_timeout; /* jiffies */ 50 - unsigned long monc_ping_timeout; /* jiffies */ 51 50 52 51 /* 53 52 * any type that can't be simply compared or doesn't need need ··· 67 68 #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) 68 69 #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) 69 70 #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) 70 - #define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000) 71 + 72 + #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) 73 + #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) 74 + #define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000) 75 + #define CEPH_MONC_HUNT_BACKOFF 2 76 + #define CEPH_MONC_HUNT_MAX_MULT 10 71 77 72 78 #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 73 79 #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)

+23 -8

include/linux/ceph/mon_client.h

··· 68 68 69 69 bool hunting; 70 70 int cur_mon; /* last monitor i contacted */ 71 - unsigned long sub_sent, sub_renew_after; 71 + unsigned long sub_renew_after; 72 + unsigned long sub_renew_sent; 72 73 struct ceph_connection con; 74 + 75 + bool had_a_connection; 76 + int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */ 73 77 74 78 /* pending generic requests */ 75 79 struct rb_root generic_request_tree; 76 80 int num_generic_requests; 77 81 u64 last_tid; 78 82 79 - /* mds/osd map */ 80 - int want_mdsmap; 81 - int want_next_osdmap; /* 1 = want, 2 = want+asked */ 82 - u32 have_osdmap, have_mdsmap; 83 + /* subs, indexed with CEPH_SUB_* */ 84 + struct { 85 + struct ceph_mon_subscribe_item item; 86 + bool want; 87 + u32 have; /* epoch */ 88 + } subs[3]; 83 89 84 90 #ifdef CONFIG_DEBUG_FS 85 91 struct dentry *debugfs_file; ··· 99 93 extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); 100 94 extern void ceph_monc_stop(struct ceph_mon_client *monc); 101 95 96 + enum { 97 + CEPH_SUB_MDSMAP = 0, 98 + CEPH_SUB_MONMAP, 99 + CEPH_SUB_OSDMAP, 100 + }; 101 + 102 + extern const char *ceph_sub_str[]; 103 + 102 104 /* 103 105 * The model here is to indicate that we need a new map of at least 104 - * epoch @want, and also call in when we receive a map. We will 106 + * epoch @epoch, and also call in when we receive a map. We will 105 107 * periodically rerequest the map from the monitor cluster until we 106 108 * get what we want. 107 109 */ 108 - extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); 109 - extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); 110 + bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 111 + bool continuous); 112 + void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch); 110 113 111 114 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); 112 115 extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,

+10 -5

include/linux/ceph/osd_client.h

··· 43 43 }; 44 44 45 45 46 - #define CEPH_OSD_MAX_OP 3 46 + #define CEPH_OSD_SLAB_OPS 2 47 + #define CEPH_OSD_MAX_OPS 16 47 48 48 49 enum ceph_osd_data_type { 49 50 CEPH_OSD_DATA_TYPE_NONE = 0, ··· 78 77 struct ceph_osd_req_op { 79 78 u16 op; /* CEPH_OSD_OP_* */ 80 79 u32 flags; /* CEPH_OSD_OP_FLAG_* */ 81 - u32 payload_len; 80 + u32 indata_len; /* request */ 81 + u32 outdata_len; /* reply */ 82 + s32 rval; 83 + 82 84 union { 83 85 struct ceph_osd_data raw_data_in; 84 86 struct { ··· 140 136 141 137 /* request osd ops array */ 142 138 unsigned int r_num_ops; 143 - struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP]; 144 139 145 140 /* these are updated on each send */ 146 141 __le32 *r_request_osdmap_epoch; ··· 151 148 struct ceph_eversion *r_request_reassert_version; 152 149 153 150 int r_result; 154 - int r_reply_op_len[CEPH_OSD_MAX_OP]; 155 - s32 r_reply_op_result[CEPH_OSD_MAX_OP]; 156 151 int r_got_reply; 157 152 int r_linger; 158 153 ··· 175 174 unsigned long r_stamp; /* send OR check time */ 176 175 177 176 struct ceph_snap_context *r_snapc; /* snap context for writes */ 177 + 178 + struct ceph_osd_req_op r_ops[]; 178 179 }; 179 180 180 181 struct ceph_request_redirect { ··· 266 263 u64 truncate_size, u32 truncate_seq); 267 264 extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, 268 265 unsigned int which, u64 length); 266 + extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, 267 + unsigned int which, u64 offset_inc); 269 268 270 269 extern struct ceph_osd_data *osd_req_op_extent_osd_data( 271 270 struct ceph_osd_request *osd_req,

+3 -1

net/ceph/ceph_common.c

··· 361 361 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 362 362 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 363 363 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 364 - opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT; 365 364 366 365 /* get mon ip(s) */ 367 366 /* ip1[:port1][,ip2[:port2]...] */ ··· 684 685 if (client->auth_err < 0) 685 686 return client->auth_err; 686 687 } 688 + 689 + pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid); 690 + ceph_debugfs_client_init(client); 687 691 688 692 return 0; 689 693 }

+11 -6

net/ceph/debugfs.c

··· 112 112 struct ceph_mon_generic_request *req; 113 113 struct ceph_mon_client *monc = &client->monc; 114 114 struct rb_node *rp; 115 + int i; 115 116 116 117 mutex_lock(&monc->mutex); 117 118 118 - if (monc->have_mdsmap) 119 - seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); 120 - if (monc->have_osdmap) 121 - seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); 122 - if (monc->want_next_osdmap) 123 - seq_printf(s, "want next osdmap\n"); 119 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 120 + seq_printf(s, "have %s %u", ceph_sub_str[i], 121 + monc->subs[i].have); 122 + if (monc->subs[i].want) 123 + seq_printf(s, " want %llu%s", 124 + le64_to_cpu(monc->subs[i].item.start), 125 + (monc->subs[i].item.flags & 126 + CEPH_SUBSCRIBE_ONETIME ? "" : "+")); 127 + seq_putc(s, '\n'); 128 + } 124 129 125 130 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { 126 131 __u16 op;

+5 -24

net/ceph/messenger.c

··· 235 235 static int ceph_msgr_slab_init(void) 236 236 { 237 237 BUG_ON(ceph_msg_cache); 238 - ceph_msg_cache = kmem_cache_create("ceph_msg", 239 - sizeof (struct ceph_msg), 240 - __alignof__(struct ceph_msg), 0, NULL); 241 - 238 + ceph_msg_cache = KMEM_CACHE(ceph_msg, 0); 242 239 if (!ceph_msg_cache) 243 240 return -ENOMEM; 244 241 245 242 BUG_ON(ceph_msg_data_cache); 246 - ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", 247 - sizeof (struct ceph_msg_data), 248 - __alignof__(struct ceph_msg_data), 249 - 0, NULL); 243 + ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); 250 244 if (ceph_msg_data_cache) 251 245 return 0; 252 246 ··· 1215 1221 static void prepare_write_message_footer(struct ceph_connection *con) 1216 1222 { 1217 1223 struct ceph_msg *m = con->out_msg; 1218 - int v = con->out_kvec_left; 1219 1224 1220 1225 m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; 1221 1226 1222 1227 dout("prepare_write_message_footer %p\n", con); 1223 - con->out_kvec[v].iov_base = &m->footer; 1228 + con_out_kvec_add(con, sizeof_footer(con), &m->footer); 1224 1229 if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 1225 1230 if (con->ops->sign_message) 1226 1231 con->ops->sign_message(m); 1227 1232 else 1228 1233 m->footer.sig = 0; 1229 - con->out_kvec[v].iov_len = sizeof(m->footer); 1230 - con->out_kvec_bytes += sizeof(m->footer); 1231 1234 } else { 1232 1235 m->old_footer.flags = m->footer.flags; 1233 - con->out_kvec[v].iov_len = sizeof(m->old_footer); 1234 - con->out_kvec_bytes += sizeof(m->old_footer); 1235 1236 } 1236 - con->out_kvec_left++; 1237 1237 con->out_more = m->more_to_follow; 1238 1238 con->out_msg_done = true; 1239 1239 } ··· 2397 2409 } 2398 2410 2399 2411 /* footer */ 2400 - if (need_sign) 2401 - size = sizeof(m->footer); 2402 - else 2403 - size = sizeof(m->old_footer); 2404 - 2412 + size = sizeof_footer(con); 2405 2413 end += size; 2406 2414 ret = read_partial(con, end, size, &m->footer); 2407 2415 if (ret <= 0) ··· 3073 3089 con->out_skip += 
con_out_kvec_skip(con); 3074 3090 } else { 3075 3091 BUG_ON(!msg->data_length); 3076 - if (con->peer_features & CEPH_FEATURE_MSG_AUTH) 3077 - con->out_skip += sizeof(msg->footer); 3078 - else 3079 - con->out_skip += sizeof(msg->old_footer); 3092 + con->out_skip += sizeof_footer(con); 3080 3093 } 3081 3094 /* data, middle, front */ 3082 3095 if (msg->data_length)

+249 -202

net/ceph/mon_client.c

··· 122 122 ceph_msg_revoke(monc->m_subscribe); 123 123 ceph_msg_revoke_incoming(monc->m_subscribe_ack); 124 124 ceph_con_close(&monc->con); 125 - monc->cur_mon = -1; 125 + 126 126 monc->pending_auth = 0; 127 127 ceph_auth_reset(monc->auth); 128 128 } 129 129 130 130 /* 131 - * Open a session with a (new) monitor. 131 + * Pick a new monitor at random and set cur_mon. If we are repicking 132 + * (i.e. cur_mon is already set), be sure to pick a different one. 132 133 */ 133 - static int __open_session(struct ceph_mon_client *monc) 134 + static void pick_new_mon(struct ceph_mon_client *monc) 134 135 { 135 - char r; 136 - int ret; 136 + int old_mon = monc->cur_mon; 137 137 138 - if (monc->cur_mon < 0) { 139 - get_random_bytes(&r, 1); 140 - monc->cur_mon = r % monc->monmap->num_mon; 141 - dout("open_session num=%d r=%d -> mon%d\n", 142 - monc->monmap->num_mon, r, monc->cur_mon); 143 - monc->sub_sent = 0; 144 - monc->sub_renew_after = jiffies; /* i.e., expired */ 145 - monc->want_next_osdmap = !!monc->want_next_osdmap; 138 + BUG_ON(monc->monmap->num_mon < 1); 146 139 147 - dout("open_session mon%d opening\n", monc->cur_mon); 148 - ceph_con_open(&monc->con, 149 - CEPH_ENTITY_TYPE_MON, monc->cur_mon, 150 - &monc->monmap->mon_inst[monc->cur_mon].addr); 151 - 152 - /* send an initial keepalive to ensure our timestamp is 153 - * valid by the time we are in an OPENED state */ 154 - ceph_con_keepalive(&monc->con); 155 - 156 - /* initiatiate authentication handshake */ 157 - ret = ceph_auth_build_hello(monc->auth, 158 - monc->m_auth->front.iov_base, 159 - monc->m_auth->front_alloc_len); 160 - __send_prepared_auth_request(monc, ret); 140 + if (monc->monmap->num_mon == 1) { 141 + monc->cur_mon = 0; 161 142 } else { 162 - dout("open_session mon%d already open\n", monc->cur_mon); 143 + int max = monc->monmap->num_mon; 144 + int o = -1; 145 + int n; 146 + 147 + if (monc->cur_mon >= 0) { 148 + if (monc->cur_mon < monc->monmap->num_mon) 149 + o = monc->cur_mon; 150 + if (o >= 0) 151 + 
max--; 152 + } 153 + 154 + n = prandom_u32() % max; 155 + if (o >= 0 && n >= o) 156 + n++; 157 + 158 + monc->cur_mon = n; 163 159 } 164 - return 0; 160 + 161 + dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon, 162 + monc->cur_mon, monc->monmap->num_mon); 165 163 } 166 164 167 - static bool __sub_expired(struct ceph_mon_client *monc) 165 + /* 166 + * Open a session with a new monitor. 167 + */ 168 + static void __open_session(struct ceph_mon_client *monc) 168 169 { 169 - return time_after_eq(jiffies, monc->sub_renew_after); 170 + int ret; 171 + 172 + pick_new_mon(monc); 173 + 174 + monc->hunting = true; 175 + if (monc->had_a_connection) { 176 + monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF; 177 + if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT) 178 + monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT; 179 + } 180 + 181 + monc->sub_renew_after = jiffies; /* i.e., expired */ 182 + monc->sub_renew_sent = 0; 183 + 184 + dout("%s opening mon%d\n", __func__, monc->cur_mon); 185 + ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon, 186 + &monc->monmap->mon_inst[monc->cur_mon].addr); 187 + 188 + /* 189 + * send an initial keepalive to ensure our timestamp is valid 190 + * by the time we are in an OPENED state 191 + */ 192 + ceph_con_keepalive(&monc->con); 193 + 194 + /* initiate authentication handshake */ 195 + ret = ceph_auth_build_hello(monc->auth, 196 + monc->m_auth->front.iov_base, 197 + monc->m_auth->front_alloc_len); 198 + BUG_ON(ret <= 0); 199 + __send_prepared_auth_request(monc, ret); 200 + } 201 + 202 + static void reopen_session(struct ceph_mon_client *monc) 203 + { 204 + if (!monc->hunting) 205 + pr_info("mon%d %s session lost, hunting for new mon\n", 206 + monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr)); 207 + 208 + __close_session(monc); 209 + __open_session(monc); 170 210 } 171 211 172 212 /* ··· 214 174 */ 215 175 static void __schedule_delayed(struct ceph_mon_client *monc) 216 176 { 217 - struct ceph_options *opt = monc->client->options; 
218 177 unsigned long delay; 219 178 220 - if (monc->cur_mon < 0 || __sub_expired(monc)) { 221 - delay = 10 * HZ; 222 - } else { 223 - delay = 20 * HZ; 224 - if (opt->monc_ping_timeout > 0) 225 - delay = min(delay, opt->monc_ping_timeout / 3); 226 - } 179 + if (monc->hunting) 180 + delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult; 181 + else 182 + delay = CEPH_MONC_PING_INTERVAL; 183 + 227 184 dout("__schedule_delayed after %lu\n", delay); 228 - schedule_delayed_work(&monc->delayed_work, 229 - round_jiffies_relative(delay)); 185 + mod_delayed_work(system_wq, &monc->delayed_work, 186 + round_jiffies_relative(delay)); 230 187 } 231 188 189 + const char *ceph_sub_str[] = { 190 + [CEPH_SUB_MDSMAP] = "mdsmap", 191 + [CEPH_SUB_MONMAP] = "monmap", 192 + [CEPH_SUB_OSDMAP] = "osdmap", 193 + }; 194 + 232 195 /* 233 - * Send subscribe request for mdsmap and/or osdmap. 196 + * Send subscribe request for one or more maps, according to 197 + * monc->subs. 234 198 */ 235 199 static void __send_subscribe(struct ceph_mon_client *monc) 236 200 { 237 - dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n", 238 - (unsigned int)monc->sub_sent, __sub_expired(monc), 239 - monc->want_next_osdmap); 240 - if ((__sub_expired(monc) && !monc->sub_sent) || 241 - monc->want_next_osdmap == 1) { 242 - struct ceph_msg *msg = monc->m_subscribe; 243 - struct ceph_mon_subscribe_item *i; 244 - void *p, *end; 245 - int num; 201 + struct ceph_msg *msg = monc->m_subscribe; 202 + void *p = msg->front.iov_base; 203 + void *const end = p + msg->front_alloc_len; 204 + int num = 0; 205 + int i; 246 206 247 - p = msg->front.iov_base; 248 - end = p + msg->front_alloc_len; 207 + dout("%s sent %lu\n", __func__, monc->sub_renew_sent); 249 208 250 - num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; 251 - ceph_encode_32(&p, num); 209 + BUG_ON(monc->cur_mon < 0); 252 210 253 - if (monc->want_next_osdmap) { 254 - dout("__send_subscribe to 'osdmap' %u\n", 255 - (unsigned int)monc->have_osdmap); 256 - 
ceph_encode_string(&p, end, "osdmap", 6); 257 - i = p; 258 - i->have = cpu_to_le64(monc->have_osdmap); 259 - i->onetime = 1; 260 - p += sizeof(*i); 261 - monc->want_next_osdmap = 2; /* requested */ 262 - } 263 - if (monc->want_mdsmap) { 264 - dout("__send_subscribe to 'mdsmap' %u+\n", 265 - (unsigned int)monc->have_mdsmap); 266 - ceph_encode_string(&p, end, "mdsmap", 6); 267 - i = p; 268 - i->have = cpu_to_le64(monc->have_mdsmap); 269 - i->onetime = 0; 270 - p += sizeof(*i); 271 - } 272 - ceph_encode_string(&p, end, "monmap", 6); 273 - i = p; 274 - i->have = 0; 275 - i->onetime = 0; 276 - p += sizeof(*i); 211 + if (!monc->sub_renew_sent) 212 + monc->sub_renew_sent = jiffies | 1; /* never 0 */ 277 213 278 - msg->front.iov_len = p - msg->front.iov_base; 279 - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 280 - ceph_msg_revoke(msg); 281 - ceph_con_send(&monc->con, ceph_msg_get(msg)); 214 + msg->hdr.version = cpu_to_le16(2); 282 215 283 - monc->sub_sent = jiffies | 1; /* never 0 */ 216 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 217 + if (monc->subs[i].want) 218 + num++; 284 219 } 220 + BUG_ON(num < 1); /* monmap sub is always there */ 221 + ceph_encode_32(&p, num); 222 + for (i = 0; i < ARRAY_SIZE(monc->subs); i++) { 223 + const char *s = ceph_sub_str[i]; 224 + 225 + if (!monc->subs[i].want) 226 + continue; 227 + 228 + dout("%s %s start %llu flags 0x%x\n", __func__, s, 229 + le64_to_cpu(monc->subs[i].item.start), 230 + monc->subs[i].item.flags); 231 + ceph_encode_string(&p, end, s, strlen(s)); 232 + memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item)); 233 + p += sizeof(monc->subs[i].item); 234 + } 235 + 236 + BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19)); 237 + msg->front.iov_len = p - msg->front.iov_base; 238 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 239 + ceph_msg_revoke(msg); 240 + ceph_con_send(&monc->con, ceph_msg_get(msg)); 285 241 } 286 242 287 243 static void handle_subscribe_ack(struct ceph_mon_client *monc, 
··· 291 255 seconds = le32_to_cpu(h->duration); 292 256 293 257 mutex_lock(&monc->mutex); 294 - if (monc->hunting) { 295 - pr_info("mon%d %s session established\n", 296 - monc->cur_mon, 297 - ceph_pr_addr(&monc->con.peer_addr.in_addr)); 298 - monc->hunting = false; 258 + if (monc->sub_renew_sent) { 259 + monc->sub_renew_after = monc->sub_renew_sent + 260 + (seconds >> 1) * HZ - 1; 261 + dout("%s sent %lu duration %d renew after %lu\n", __func__, 262 + monc->sub_renew_sent, seconds, monc->sub_renew_after); 263 + monc->sub_renew_sent = 0; 264 + } else { 265 + dout("%s sent %lu renew after %lu, ignoring\n", __func__, 266 + monc->sub_renew_sent, monc->sub_renew_after); 299 267 } 300 - dout("handle_subscribe_ack after %d seconds\n", seconds); 301 - monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1; 302 - monc->sub_sent = 0; 303 268 mutex_unlock(&monc->mutex); 304 269 return; 305 270 bad: ··· 309 272 } 310 273 311 274 /* 312 - * Keep track of which maps we have 275 + * Register interest in a map 276 + * 277 + * @sub: one of CEPH_SUB_* 278 + * @epoch: X for "every map since X", or 0 for "just the latest" 313 279 */ 314 - int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) 280 + static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub, 281 + u32 epoch, bool continuous) 315 282 { 316 - mutex_lock(&monc->mutex); 317 - monc->have_mdsmap = got; 318 - mutex_unlock(&monc->mutex); 319 - return 0; 320 - } 321 - EXPORT_SYMBOL(ceph_monc_got_mdsmap); 283 + __le64 start = cpu_to_le64(epoch); 284 + u8 flags = !continuous ? 
CEPH_SUBSCRIBE_ONETIME : 0; 322 285 323 - int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 286 + dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub], 287 + epoch, continuous); 288 + 289 + if (monc->subs[sub].want && 290 + monc->subs[sub].item.start == start && 291 + monc->subs[sub].item.flags == flags) 292 + return false; 293 + 294 + monc->subs[sub].item.start = start; 295 + monc->subs[sub].item.flags = flags; 296 + monc->subs[sub].want = true; 297 + 298 + return true; 299 + } 300 + 301 + bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch, 302 + bool continuous) 303 + { 304 + bool need_request; 305 + 306 + mutex_lock(&monc->mutex); 307 + need_request = __ceph_monc_want_map(monc, sub, epoch, continuous); 308 + mutex_unlock(&monc->mutex); 309 + 310 + return need_request; 311 + } 312 + EXPORT_SYMBOL(ceph_monc_want_map); 313 + 314 + /* 315 + * Keep track of which maps we have 316 + * 317 + * @sub: one of CEPH_SUB_* 318 + */ 319 + static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub, 320 + u32 epoch) 321 + { 322 + dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch); 323 + 324 + if (monc->subs[sub].want) { 325 + if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME) 326 + monc->subs[sub].want = false; 327 + else 328 + monc->subs[sub].item.start = cpu_to_le64(epoch + 1); 329 + } 330 + 331 + monc->subs[sub].have = epoch; 332 + } 333 + 334 + void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch) 324 335 { 325 336 mutex_lock(&monc->mutex); 326 - monc->have_osdmap = got; 327 - monc->want_next_osdmap = 0; 337 + __ceph_monc_got_map(monc, sub, epoch); 328 338 mutex_unlock(&monc->mutex); 329 - return 0; 330 339 } 340 + EXPORT_SYMBOL(ceph_monc_got_map); 331 341 332 342 /* 333 343 * Register interest in the next osdmap 334 344 */ 335 345 void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) 336 346 { 337 - dout("request_next_osdmap have %u\n", monc->have_osdmap); 347 + 
dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have); 338 348 mutex_lock(&monc->mutex); 339 - if (!monc->want_next_osdmap) 340 - monc->want_next_osdmap = 1; 341 - if (monc->want_next_osdmap < 2) 349 + if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 350 + monc->subs[CEPH_SUB_OSDMAP].have + 1, false)) 342 351 __send_subscribe(monc); 343 352 mutex_unlock(&monc->mutex); 344 353 } ··· 403 320 long ret; 404 321 405 322 mutex_lock(&monc->mutex); 406 - while (monc->have_osdmap < epoch) { 323 + while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) { 407 324 mutex_unlock(&monc->mutex); 408 325 409 326 if (timeout && time_after_eq(jiffies, started + timeout)) 410 327 return -ETIMEDOUT; 411 328 412 329 ret = wait_event_interruptible_timeout(monc->client->auth_wq, 413 - monc->have_osdmap >= epoch, 414 - ceph_timeout_jiffies(timeout)); 330 + monc->subs[CEPH_SUB_OSDMAP].have >= epoch, 331 + ceph_timeout_jiffies(timeout)); 415 332 if (ret < 0) 416 333 return ret; 417 334 ··· 424 341 EXPORT_SYMBOL(ceph_monc_wait_osdmap); 425 342 426 343 /* 427 - * 344 + * Open a session with a random monitor. Request monmap and osdmap, 345 + * which are waited upon in __ceph_open_session(). 428 346 */ 429 347 int ceph_monc_open_session(struct ceph_mon_client *monc) 430 348 { 431 349 mutex_lock(&monc->mutex); 350 + __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true); 351 + __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false); 432 352 __open_session(monc); 433 353 __schedule_delayed(monc); 434 354 mutex_unlock(&monc->mutex); ··· 439 353 } 440 354 EXPORT_SYMBOL(ceph_monc_open_session); 441 355 442 - /* 443 - * We require the fsid and global_id in order to initialize our 444 - * debugfs dir. 
445 - */ 446 - static bool have_debugfs_info(struct ceph_mon_client *monc) 447 - { 448 - dout("have_debugfs_info fsid %d globalid %lld\n", 449 - (int)monc->client->have_fsid, monc->auth->global_id); 450 - return monc->client->have_fsid && monc->auth->global_id > 0; 451 - } 452 - 453 356 static void ceph_monc_handle_map(struct ceph_mon_client *monc, 454 357 struct ceph_msg *msg) 455 358 { 456 359 struct ceph_client *client = monc->client; 457 360 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 458 361 void *p, *end; 459 - int had_debugfs_info, init_debugfs = 0; 460 362 461 363 mutex_lock(&monc->mutex); 462 - 463 - had_debugfs_info = have_debugfs_info(monc); 464 364 465 365 dout("handle_monmap\n"); 466 366 p = msg->front.iov_base; ··· 467 395 client->monc.monmap = monmap; 468 396 kfree(old); 469 397 470 - if (!client->have_fsid) { 471 - client->have_fsid = true; 472 - if (!had_debugfs_info && have_debugfs_info(monc)) { 473 - pr_info("client%lld fsid %pU\n", 474 - ceph_client_id(monc->client), 475 - &monc->client->fsid); 476 - init_debugfs = 1; 477 - } 478 - mutex_unlock(&monc->mutex); 398 + __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch); 399 + client->have_fsid = true; 479 400 480 - if (init_debugfs) { 481 - /* 482 - * do debugfs initialization without mutex to avoid 483 - * creating a locking dependency 484 - */ 485 - ceph_debugfs_client_init(monc->client); 486 - } 487 - 488 - goto out_unlocked; 489 - } 490 401 out: 491 402 mutex_unlock(&monc->mutex); 492 - out_unlocked: 493 403 wake_up_all(&client->auth_wq); 494 404 } 495 405 ··· 799 745 dout("monc delayed_work\n"); 800 746 mutex_lock(&monc->mutex); 801 747 if (monc->hunting) { 802 - __close_session(monc); 803 - __open_session(monc); /* continue hunting */ 748 + dout("%s continuing hunt\n", __func__); 749 + reopen_session(monc); 804 750 } else { 805 - struct ceph_options *opt = monc->client->options; 806 751 int is_auth = ceph_auth_is_authenticated(monc->auth); 807 752 if 
(ceph_con_keepalive_expired(&monc->con, 808 - opt->monc_ping_timeout)) { 753 + CEPH_MONC_PING_TIMEOUT)) { 809 754 dout("monc keepalive timeout\n"); 810 755 is_auth = 0; 811 - __close_session(monc); 812 - monc->hunting = true; 813 - __open_session(monc); 756 + reopen_session(monc); 814 757 } 815 758 816 759 if (!monc->hunting) { ··· 815 764 __validate_auth(monc); 816 765 } 817 766 818 - if (is_auth) 819 - __send_subscribe(monc); 767 + if (is_auth) { 768 + unsigned long now = jiffies; 769 + 770 + dout("%s renew subs? now %lu renew after %lu\n", 771 + __func__, now, monc->sub_renew_after); 772 + if (time_after_eq(now, monc->sub_renew_after)) 773 + __send_subscribe(monc); 774 + } 820 775 } 821 776 __schedule_delayed(monc); 822 777 mutex_unlock(&monc->mutex); ··· 909 852 &monc->client->msgr); 910 853 911 854 monc->cur_mon = -1; 912 - monc->hunting = true; 913 - monc->sub_renew_after = jiffies; 914 - monc->sub_sent = 0; 855 + monc->had_a_connection = false; 856 + monc->hunt_mult = 1; 915 857 916 858 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); 917 859 monc->generic_request_tree = RB_ROOT; 918 860 monc->num_generic_requests = 0; 919 861 monc->last_tid = 0; 920 862 921 - monc->have_mdsmap = 0; 922 - monc->have_osdmap = 0; 923 - monc->want_next_osdmap = 1; 924 863 return 0; 925 864 926 865 out_auth_reply: ··· 941 888 942 889 mutex_lock(&monc->mutex); 943 890 __close_session(monc); 944 - 891 + monc->cur_mon = -1; 945 892 mutex_unlock(&monc->mutex); 946 893 947 894 /* ··· 963 910 } 964 911 EXPORT_SYMBOL(ceph_monc_stop); 965 912 913 + static void finish_hunting(struct ceph_mon_client *monc) 914 + { 915 + if (monc->hunting) { 916 + dout("%s found mon%d\n", __func__, monc->cur_mon); 917 + monc->hunting = false; 918 + monc->had_a_connection = true; 919 + monc->hunt_mult /= 2; /* reduce by 50% */ 920 + if (monc->hunt_mult < 1) 921 + monc->hunt_mult = 1; 922 + } 923 + } 924 + 966 925 static void handle_auth_reply(struct ceph_mon_client *monc, 967 926 struct ceph_msg *msg) 
968 927 { 969 928 int ret; 970 929 int was_auth = 0; 971 - int had_debugfs_info, init_debugfs = 0; 972 930 973 931 mutex_lock(&monc->mutex); 974 - had_debugfs_info = have_debugfs_info(monc); 975 932 was_auth = ceph_auth_is_authenticated(monc->auth); 976 933 monc->pending_auth = 0; 977 934 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 978 935 msg->front.iov_len, 979 936 monc->m_auth->front.iov_base, 980 937 monc->m_auth->front_alloc_len); 938 + if (ret > 0) { 939 + __send_prepared_auth_request(monc, ret); 940 + goto out; 941 + } 942 + 943 + finish_hunting(monc); 944 + 981 945 if (ret < 0) { 982 946 monc->client->auth_err = ret; 983 - wake_up_all(&monc->client->auth_wq); 984 - } else if (ret > 0) { 985 - __send_prepared_auth_request(monc, ret); 986 947 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { 987 948 dout("authenticated, starting session\n"); 988 949 ··· 1006 939 1007 940 __send_subscribe(monc); 1008 941 __resend_generic_request(monc); 942 + 943 + pr_info("mon%d %s session established\n", monc->cur_mon, 944 + ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1009 945 } 1010 946 1011 - if (!had_debugfs_info && have_debugfs_info(monc)) { 1012 - pr_info("client%lld fsid %pU\n", 1013 - ceph_client_id(monc->client), 1014 - &monc->client->fsid); 1015 - init_debugfs = 1; 1016 - } 947 + out: 1017 948 mutex_unlock(&monc->mutex); 1018 - 1019 - if (init_debugfs) { 1020 - /* 1021 - * do debugfs initialization without mutex to avoid 1022 - * creating a locking dependency 1023 - */ 1024 - ceph_debugfs_client_init(monc->client); 1025 - } 949 + if (monc->client->auth_err < 0) 950 + wake_up_all(&monc->client->auth_wq); 1026 951 } 1027 952 1028 953 static int __validate_auth(struct ceph_mon_client *monc) ··· 1155 1096 { 1156 1097 struct ceph_mon_client *monc = con->private; 1157 1098 1158 - if (!monc) 1159 - return; 1160 - 1161 - dout("mon_fault\n"); 1162 1099 mutex_lock(&monc->mutex); 1163 - if (!con->private) 1164 - goto out; 1165 - 1166 - if 
(!monc->hunting) 1167 - pr_info("mon%d %s session lost, " 1168 - "hunting for new mon\n", monc->cur_mon, 1169 - ceph_pr_addr(&monc->con.peer_addr.in_addr)); 1170 - 1171 - __close_session(monc); 1172 - if (!monc->hunting) { 1173 - /* start hunting */ 1174 - monc->hunting = true; 1175 - __open_session(monc); 1176 - } else { 1177 - /* already hunting, let's wait a bit */ 1178 - __schedule_delayed(monc); 1100 + dout("%s mon%d\n", __func__, monc->cur_mon); 1101 + if (monc->cur_mon >= 0) { 1102 + if (!monc->hunting) { 1103 + dout("%s hunting for new mon\n", __func__); 1104 + reopen_session(monc); 1105 + __schedule_delayed(monc); 1106 + } else { 1107 + dout("%s already hunting\n", __func__); 1108 + } 1179 1109 } 1180 - out: 1181 1110 mutex_unlock(&monc->mutex); 1182 1111 } 1183 1112

+73 -36

net/ceph/osd_client.c

··· 338 338 ceph_put_snap_context(req->r_snapc); 339 339 if (req->r_mempool) 340 340 mempool_free(req, req->r_osdc->req_mempool); 341 - else 341 + else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS) 342 342 kmem_cache_free(ceph_osd_request_cache, req); 343 - 343 + else 344 + kfree(req); 344 345 } 345 346 346 347 void ceph_osdc_get_request(struct ceph_osd_request *req) ··· 370 369 struct ceph_msg *msg; 371 370 size_t msg_size; 372 371 373 - BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX); 374 - BUG_ON(num_ops > CEPH_OSD_MAX_OP); 375 - 376 - msg_size = 4 + 4 + 8 + 8 + 4+8; 377 - msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 378 - msg_size += 1 + 8 + 4 + 4; /* pg_t */ 379 - msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 380 - msg_size += 2 + num_ops*sizeof(struct ceph_osd_op); 381 - msg_size += 8; /* snapid */ 382 - msg_size += 8; /* snap_seq */ 383 - msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 384 - msg_size += 4; 385 - 386 372 if (use_mempool) { 373 + BUG_ON(num_ops > CEPH_OSD_SLAB_OPS); 387 374 req = mempool_alloc(osdc->req_mempool, gfp_flags); 388 - memset(req, 0, sizeof(*req)); 375 + } else if (num_ops <= CEPH_OSD_SLAB_OPS) { 376 + req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags); 389 377 } else { 390 - req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); 378 + BUG_ON(num_ops > CEPH_OSD_MAX_OPS); 379 + req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]), 380 + gfp_flags); 391 381 } 392 - if (req == NULL) 382 + if (unlikely(!req)) 393 383 return NULL; 384 + 385 + /* req only, each op is zeroed in _osd_req_op_init() */ 386 + memset(req, 0, sizeof(*req)); 394 387 395 388 req->r_osdc = osdc; 396 389 req->r_mempool = use_mempool; ··· 403 408 req->r_base_oloc.pool = -1; 404 409 req->r_target_oloc.pool = -1; 405 410 411 + msg_size = OSD_OPREPLY_FRONT_LEN; 412 + if (num_ops > CEPH_OSD_SLAB_OPS) { 413 + /* ceph_osd_op and rval */ 414 + msg_size += (num_ops - CEPH_OSD_SLAB_OPS) * 415 + (sizeof(struct ceph_osd_op) + 4); 416 + } 417 + 406 418 /* 
create reply message */ 407 419 if (use_mempool) 408 420 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 409 421 else 410 - msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 411 - OSD_OPREPLY_FRONT_LEN, gfp_flags, true); 422 + msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, 423 + gfp_flags, true); 412 424 if (!msg) { 413 425 ceph_osdc_put_request(req); 414 426 return NULL; 415 427 } 416 428 req->r_reply = msg; 429 + 430 + msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 431 + msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 432 + msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 433 + msg_size += 1 + 8 + 4 + 4; /* pgid */ 434 + msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */ 435 + msg_size += 2 + num_ops * sizeof(struct ceph_osd_op); 436 + msg_size += 8; /* snapid */ 437 + msg_size += 8; /* snap_seq */ 438 + msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */ 439 + msg_size += 4; /* retry_attempt */ 417 440 418 441 /* create request message; allow space for oid */ 419 442 if (use_mempool) ··· 511 498 if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) 512 499 payload_len += length; 513 500 514 - op->payload_len = payload_len; 501 + op->indata_len = payload_len; 515 502 } 516 503 EXPORT_SYMBOL(osd_req_op_extent_init); 517 504 ··· 530 517 BUG_ON(length > previous); 531 518 532 519 op->extent.length = length; 533 - op->payload_len -= previous - length; 520 + op->indata_len -= previous - length; 534 521 } 535 522 EXPORT_SYMBOL(osd_req_op_extent_update); 523 + 524 + void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, 525 + unsigned int which, u64 offset_inc) 526 + { 527 + struct ceph_osd_req_op *op, *prev_op; 528 + 529 + BUG_ON(which + 1 >= osd_req->r_num_ops); 530 + 531 + prev_op = &osd_req->r_ops[which]; 532 + op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags); 533 + /* dup previous one */ 534 + op->indata_len = prev_op->indata_len; 535 + op->outdata_len = prev_op->outdata_len; 536 + op->extent = 
prev_op->extent; 537 + /* adjust offset */ 538 + op->extent.offset += offset_inc; 539 + op->extent.length -= offset_inc; 540 + 541 + if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL) 542 + op->indata_len -= offset_inc; 543 + } 544 + EXPORT_SYMBOL(osd_req_op_extent_dup_last); 536 545 537 546 void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, 538 547 u16 opcode, const char *class, const char *method) ··· 589 554 590 555 op->cls.argc = 0; /* currently unused */ 591 556 592 - op->payload_len = payload_len; 557 + op->indata_len = payload_len; 593 558 } 594 559 EXPORT_SYMBOL(osd_req_op_cls_init); 595 560 ··· 622 587 op->xattr.cmp_mode = cmp_mode; 623 588 624 589 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 625 - op->payload_len = payload_len; 590 + op->indata_len = payload_len; 626 591 return 0; 627 592 } 628 593 EXPORT_SYMBOL(osd_req_op_xattr_init); ··· 742 707 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); 743 708 dst->cls.indata_len = cpu_to_le32(data_length); 744 709 ceph_osdc_msg_data_add(req->r_request, osd_data); 745 - src->payload_len += data_length; 710 + src->indata_len += data_length; 746 711 request_data_len += data_length; 747 712 } 748 713 osd_data = &src->cls.response_data; ··· 785 750 786 751 dst->op = cpu_to_le16(src->op); 787 752 dst->flags = cpu_to_le32(src->flags); 788 - dst->payload_len = cpu_to_le32(src->payload_len); 753 + dst->payload_len = cpu_to_le32(src->indata_len); 789 754 790 755 return request_data_len; 791 756 } ··· 1845 1810 1846 1811 ceph_decode_need(&p, end, 4, bad_put); 1847 1812 numops = ceph_decode_32(&p); 1848 - if (numops > CEPH_OSD_MAX_OP) 1813 + if (numops > CEPH_OSD_MAX_OPS) 1849 1814 goto bad_put; 1850 1815 if (numops != req->r_num_ops) 1851 1816 goto bad_put; ··· 1856 1821 int len; 1857 1822 1858 1823 len = le32_to_cpu(op->payload_len); 1859 - req->r_reply_op_len[i] = len; 1824 + req->r_ops[i].outdata_len = len; 1860 1825 dout(" op %d has %d bytes\n", i, len); 
1861 1826 payload_len += len; 1862 1827 p += sizeof(*op); ··· 1871 1836 ceph_decode_need(&p, end, 4 + numops * 4, bad_put); 1872 1837 retry_attempt = ceph_decode_32(&p); 1873 1838 for (i = 0; i < numops; i++) 1874 - req->r_reply_op_result[i] = ceph_decode_32(&p); 1839 + req->r_ops[i].rval = ceph_decode_32(&p); 1875 1840 1876 1841 if (le16_to_cpu(msg->hdr.version) >= 6) { 1877 1842 p += 8 + 4; /* skip replay_version */ ··· 2222 2187 goto bad; 2223 2188 done: 2224 2189 downgrade_write(&osdc->map_sem); 2225 - ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); 2190 + ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, 2191 + osdc->osdmap->epoch); 2226 2192 2227 2193 /* 2228 2194 * subscribe to subsequent osdmap updates if full to ensure ··· 2682 2646 round_jiffies_relative(osdc->client->options->osd_idle_ttl)); 2683 2647 2684 2648 err = -ENOMEM; 2685 - osdc->req_mempool = mempool_create_kmalloc_pool(10, 2686 - sizeof(struct ceph_osd_request)); 2649 + osdc->req_mempool = mempool_create_slab_pool(10, 2650 + ceph_osd_request_cache); 2687 2651 if (!osdc->req_mempool) 2688 2652 goto out; 2689 2653 ··· 2818 2782 2819 2783 int ceph_osdc_setup(void) 2820 2784 { 2785 + size_t size = sizeof(struct ceph_osd_request) + 2786 + CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op); 2787 + 2821 2788 BUG_ON(ceph_osd_request_cache); 2822 - ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", 2823 - sizeof (struct ceph_osd_request), 2824 - __alignof__(struct ceph_osd_request), 2825 - 0, NULL); 2789 + ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size, 2790 + 0, 0, NULL); 2826 2791 2827 2792 return ceph_osd_request_cache ? 0 : -ENOMEM; 2828 2793 }

Configure Feed

Configure Feed