Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client

+9 -12

fs/ceph/addr.c

··· 530 530 long writeback_stat; 531 531 u64 truncate_size; 532 532 u32 truncate_seq; 533 - int err = 0, len = PAGE_SIZE; 533 + int err, len = PAGE_SIZE; 534 534 535 535 dout("writepage %p idx %lu\n", page, page->index); 536 536 537 - if (!page->mapping || !page->mapping->host) { 538 - dout("writepage %p - no mapping\n", page); 539 - return -EFAULT; 540 - } 541 537 inode = page->mapping->host; 542 538 ci = ceph_inode(inode); 543 539 fsc = ceph_inode_to_client(inode); ··· 543 547 snapc = page_snap_context(page); 544 548 if (snapc == NULL) { 545 549 dout("writepage %p page %p not dirty?\n", inode, page); 546 - goto out; 550 + return 0; 547 551 } 548 552 oldest = get_oldest_context(inode, &snap_size, 549 553 &truncate_size, &truncate_seq); ··· 551 555 dout("writepage %p page %p snapc %p not writeable - noop\n", 552 556 inode, page, snapc); 553 557 /* we should only noop if called by kswapd */ 554 - WARN_ON((current->flags & PF_MEMALLOC) == 0); 558 + WARN_ON(!(current->flags & PF_MEMALLOC)); 555 559 ceph_put_snap_context(oldest); 556 - goto out; 560 + redirty_page_for_writepage(wbc, page); 561 + return 0; 557 562 } 558 563 ceph_put_snap_context(oldest); 559 564 ··· 564 567 /* is this a partial page at end of file? 
*/ 565 568 if (page_off >= snap_size) { 566 569 dout("%p page eof %llu\n", page, snap_size); 567 - goto out; 570 + return 0; 568 571 } 572 + 569 573 if (snap_size < page_off + len) 570 574 len = snap_size - page_off; 571 575 ··· 593 595 dout("writepage interrupted page %p\n", page); 594 596 redirty_page_for_writepage(wbc, page); 595 597 end_page_writeback(page); 596 - goto out; 598 + return err; 597 599 } 598 600 dout("writepage setting page/mapping error %d %p\n", 599 601 err, page); ··· 609 611 end_page_writeback(page); 610 612 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 611 613 ceph_put_snap_context(snapc); /* page's reference */ 612 - out: 613 614 return err; 614 615 } 615 616 ··· 1315 1318 struct page *page, void *fsdata) 1316 1319 { 1317 1320 struct inode *inode = file_inode(file); 1318 - int check_cap = 0; 1321 + bool check_cap = false; 1319 1322 1320 1323 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, 1321 1324 inode, page, (int)pos, (int)copied, (int)len);

+83 -9

fs/ceph/cache.c

··· 35 35 .version = 0, 36 36 }; 37 37 38 + static DEFINE_MUTEX(ceph_fscache_lock); 39 + static LIST_HEAD(ceph_fscache_list); 40 + 41 + struct ceph_fscache_entry { 42 + struct list_head list; 43 + struct fscache_cookie *fscache; 44 + struct ceph_fsid fsid; 45 + size_t uniq_len; 46 + char uniquifier[0]; 47 + }; 48 + 38 49 static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, 39 50 void *buffer, uint16_t maxbuf) 40 51 { 41 52 const struct ceph_fs_client* fsc = cookie_netfs_data; 42 - uint16_t klen; 53 + const char *fscache_uniq = fsc->mount_options->fscache_uniq; 54 + uint16_t fsid_len, uniq_len; 43 55 44 - klen = sizeof(fsc->client->fsid); 45 - if (klen > maxbuf) 56 + fsid_len = sizeof(fsc->client->fsid); 57 + uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0; 58 + if (fsid_len + uniq_len > maxbuf) 46 59 return 0; 47 60 48 - memcpy(buffer, &fsc->client->fsid, klen); 49 - return klen; 61 + memcpy(buffer, &fsc->client->fsid, fsid_len); 62 + if (uniq_len) 63 + memcpy(buffer + fsid_len, fscache_uniq, uniq_len); 64 + 65 + return fsid_len + uniq_len; 50 66 } 51 67 52 68 static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { ··· 83 67 84 68 int ceph_fscache_register_fs(struct ceph_fs_client* fsc) 85 69 { 70 + const struct ceph_fsid *fsid = &fsc->client->fsid; 71 + const char *fscache_uniq = fsc->mount_options->fscache_uniq; 72 + size_t uniq_len = fscache_uniq ? 
strlen(fscache_uniq) : 0; 73 + struct ceph_fscache_entry *ent; 74 + int err = 0; 75 + 76 + mutex_lock(&ceph_fscache_lock); 77 + list_for_each_entry(ent, &ceph_fscache_list, list) { 78 + if (memcmp(&ent->fsid, fsid, sizeof(*fsid))) 79 + continue; 80 + if (ent->uniq_len != uniq_len) 81 + continue; 82 + if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len)) 83 + continue; 84 + 85 + pr_err("fscache cookie already registered for fsid %pU\n", fsid); 86 + pr_err(" use fsc=%%s mount option to specify a uniquifier\n"); 87 + err = -EBUSY; 88 + goto out_unlock; 89 + } 90 + 91 + ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL); 92 + if (!ent) { 93 + err = -ENOMEM; 94 + goto out_unlock; 95 + } 96 + 86 97 fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, 87 98 &ceph_fscache_fsid_object_def, 88 99 fsc, true); 89 - if (!fsc->fscache) 90 - pr_err("Unable to register fsid: %p fscache cookie\n", fsc); 91 100 92 - return 0; 101 + if (fsc->fscache) { 102 + memcpy(&ent->fsid, fsid, sizeof(*fsid)); 103 + if (uniq_len > 0) { 104 + memcpy(&ent->uniquifier, fscache_uniq, uniq_len); 105 + ent->uniq_len = uniq_len; 106 + } 107 + ent->fscache = fsc->fscache; 108 + list_add_tail(&ent->list, &ceph_fscache_list); 109 + } else { 110 + kfree(ent); 111 + pr_err("unable to register fscache cookie for fsid %pU\n", 112 + fsid); 113 + /* all other fs ignore this error */ 114 + } 115 + out_unlock: 116 + mutex_unlock(&ceph_fscache_lock); 117 + return err; 93 118 } 94 119 95 120 static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, ··· 406 349 407 350 void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) 408 351 { 409 - fscache_relinquish_cookie(fsc->fscache, 0); 352 + if (fscache_cookie_valid(fsc->fscache)) { 353 + struct ceph_fscache_entry *ent; 354 + bool found = false; 355 + 356 + mutex_lock(&ceph_fscache_lock); 357 + list_for_each_entry(ent, &ceph_fscache_list, list) { 358 + if (ent->fscache == fsc->fscache) { 359 + list_del(&ent->list); 360 + 
kfree(ent); 361 + found = true; 362 + break; 363 + } 364 + } 365 + WARN_ON_ONCE(!found); 366 + mutex_unlock(&ceph_fscache_lock); 367 + 368 + __fscache_relinquish_cookie(fsc->fscache, 0); 369 + } 410 370 fsc->fscache = NULL; 411 371 } 412 372

+33 -7

fs/ceph/caps.c

··· 1653 1653 return -1; 1654 1654 } 1655 1655 1656 + bool __ceph_should_report_size(struct ceph_inode_info *ci) 1657 + { 1658 + loff_t size = ci->vfs_inode.i_size; 1659 + /* mds will adjust max size according to the reported size */ 1660 + if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) 1661 + return false; 1662 + if (size >= ci->i_max_size) 1663 + return true; 1664 + /* half of previous max_size increment has been used */ 1665 + if (ci->i_max_size > ci->i_reported_size && 1666 + (size << 1) >= ci->i_max_size + ci->i_reported_size) 1667 + return true; 1668 + return false; 1669 + } 1670 + 1656 1671 /* 1657 1672 * Swiss army knife function to examine currently used and wanted 1658 1673 * versus held caps. Release, flush, ack revoked caps to mds as ··· 1821 1806 } 1822 1807 1823 1808 /* approaching file_max? */ 1824 - if ((inode->i_size << 1) >= ci->i_max_size && 1825 - (ci->i_reported_size << 1) < ci->i_max_size) { 1809 + if (__ceph_should_report_size(ci)) { 1826 1810 dout("i_size approaching max_size\n"); 1827 1811 goto ack; 1828 1812 } ··· 3041 3027 le32_to_cpu(grant->truncate_seq), 3042 3028 le64_to_cpu(grant->truncate_size), 3043 3029 size); 3044 - /* max size increase? 
*/ 3045 - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 3030 + } 3031 + 3032 + if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { 3033 + if (max_size != ci->i_max_size) { 3046 3034 dout("max_size %lld -> %llu\n", 3047 3035 ci->i_max_size, max_size); 3048 3036 ci->i_max_size = max_size; ··· 3052 3036 ci->i_wanted_max_size = 0; /* reset */ 3053 3037 ci->i_requested_max_size = 0; 3054 3038 } 3039 + wake = true; 3040 + } else if (ci->i_wanted_max_size > ci->i_max_size && 3041 + ci->i_wanted_max_size > ci->i_requested_max_size) { 3042 + /* CEPH_CAP_OP_IMPORT */ 3055 3043 wake = true; 3056 3044 } 3057 3045 } ··· 3574 3554 } 3575 3555 3576 3556 /* make sure we re-request max_size, if necessary */ 3577 - ci->i_wanted_max_size = 0; 3578 3557 ci->i_requested_max_size = 0; 3579 3558 3580 3559 *old_issued = issued; ··· 3809 3790 */ 3810 3791 void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) 3811 3792 { 3793 + struct inode *inode; 3812 3794 struct ceph_inode_info *ci; 3813 3795 int flags = CHECK_CAPS_NODELAY; 3814 3796 ··· 3825 3805 time_before(jiffies, ci->i_hold_caps_max)) 3826 3806 break; 3827 3807 list_del_init(&ci->i_cap_delay_list); 3808 + 3809 + inode = igrab(&ci->vfs_inode); 3828 3810 spin_unlock(&mdsc->cap_delay_lock); 3829 - dout("check_delayed_caps on %p\n", &ci->vfs_inode); 3830 - ceph_check_caps(ci, flags, NULL); 3811 + 3812 + if (inode) { 3813 + dout("check_delayed_caps on %p\n", inode); 3814 + ceph_check_caps(ci, flags, NULL); 3815 + iput(inode); 3816 + } 3831 3817 } 3832 3818 spin_unlock(&mdsc->cap_delay_lock); 3833 3819 }

+1 -1

fs/ceph/file.c

··· 1040 1040 int num_pages; 1041 1041 int written = 0; 1042 1042 int flags; 1043 - int check_caps = 0; 1044 1043 int ret; 1044 + bool check_caps = false; 1045 1045 struct timespec mtime = current_time(inode); 1046 1046 size_t count = iov_iter_count(from); 1047 1047

+10 -8

fs/ceph/inode.c

··· 1016 1016 long unsigned ttl = from_time + (duration * HZ) / 1000; 1017 1017 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; 1018 1018 struct inode *dir; 1019 + struct ceph_mds_session *old_lease_session = NULL; 1019 1020 1020 1021 /* 1021 1022 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that ··· 1052 1051 time_before(ttl, di->time)) 1053 1052 goto out_unlock; /* we already have a newer lease. */ 1054 1053 1055 - if (di->lease_session && di->lease_session != session) 1056 - goto out_unlock; 1054 + if (di->lease_session && di->lease_session != session) { 1055 + old_lease_session = di->lease_session; 1056 + di->lease_session = NULL; 1057 + } 1057 1058 1058 1059 ceph_dentry_lru_touch(dentry); 1059 1060 ··· 1068 1065 di->time = ttl; 1069 1066 out_unlock: 1070 1067 spin_unlock(&dentry->d_lock); 1068 + if (old_lease_session) 1069 + ceph_put_mds_session(old_lease_session); 1071 1070 return; 1072 1071 } 1073 1072 ··· 1658 1653 return err; 1659 1654 } 1660 1655 1661 - int ceph_inode_set_size(struct inode *inode, loff_t size) 1656 + bool ceph_inode_set_size(struct inode *inode, loff_t size) 1662 1657 { 1663 1658 struct ceph_inode_info *ci = ceph_inode(inode); 1664 - int ret = 0; 1659 + bool ret; 1665 1660 1666 1661 spin_lock(&ci->i_ceph_lock); 1667 1662 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); 1668 1663 i_size_write(inode, size); 1669 1664 inode->i_blocks = calc_inode_blocks(size); 1670 1665 1671 - /* tell the MDS if we are approaching max_size */ 1672 - if ((size << 1) >= ci->i_max_size && 1673 - (ci->i_reported_size << 1) < ci->i_max_size) 1674 - ret = 1; 1666 + ret = __ceph_should_report_size(ci); 1675 1667 1676 1668 spin_unlock(&ci->i_ceph_lock); 1677 1669 return ret;

+24 -1

fs/ceph/locks.c

··· 127 127 dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", 128 128 req->r_tid); 129 129 130 + mutex_lock(&mdsc->mutex); 131 + if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { 132 + err = 0; 133 + } else { 134 + /* 135 + * ensure we aren't running concurrently with 136 + * ceph_fill_trace or ceph_readdir_prepopulate, which 137 + * rely on locks (dir mutex) held by our caller. 138 + */ 139 + mutex_lock(&req->r_fill_mutex); 140 + req->r_err = err; 141 + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); 142 + mutex_unlock(&req->r_fill_mutex); 143 + 144 + if (!req->r_session) { 145 + // haven't sent the request 146 + err = 0; 147 + } 148 + } 149 + mutex_unlock(&mdsc->mutex); 150 + if (!err) 151 + return 0; 152 + 130 153 intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, 131 154 USE_AUTH_MDS); 132 155 if (IS_ERR(intr_req)) ··· 169 146 if (err && err != -ERESTARTSYS) 170 147 return err; 171 148 172 - wait_for_completion(&req->r_completion); 149 + wait_for_completion_killable(&req->r_safe_completion); 173 150 return 0; 174 151 } 175 152

+2 -2

fs/ceph/mds_client.c

··· 3769 3769 void ceph_mdsc_destroy(struct ceph_fs_client *fsc) 3770 3770 { 3771 3771 struct ceph_mds_client *mdsc = fsc->mdsc; 3772 - 3773 3772 dout("mdsc_destroy %p\n", mdsc); 3774 - ceph_mdsc_stop(mdsc); 3775 3773 3776 3774 /* flush out any connection work with references to us */ 3777 3775 ceph_msgr_flush(); 3776 + 3777 + ceph_mdsc_stop(mdsc); 3778 3778 3779 3779 fsc->mdsc = NULL; 3780 3780 kfree(mdsc);

+33 -14

fs/ceph/super.c

··· 121 121 /* int args above */ 122 122 Opt_snapdirname, 123 123 Opt_mds_namespace, 124 + Opt_fscache_uniq, 124 125 Opt_last_string, 125 126 /* string args above */ 126 127 Opt_dirstat, ··· 159 158 /* int args above */ 160 159 {Opt_snapdirname, "snapdirname=%s"}, 161 160 {Opt_mds_namespace, "mds_namespace=%s"}, 161 + {Opt_fscache_uniq, "fsc=%s"}, 162 162 /* string args above */ 163 163 {Opt_dirstat, "dirstat"}, 164 164 {Opt_nodirstat, "nodirstat"}, ··· 224 222 GFP_KERNEL); 225 223 if (!fsopt->mds_namespace) 226 224 return -ENOMEM; 225 + break; 226 + case Opt_fscache_uniq: 227 + fsopt->fscache_uniq = kstrndup(argstr[0].from, 228 + argstr[0].to-argstr[0].from, 229 + GFP_KERNEL); 230 + if (!fsopt->fscache_uniq) 231 + return -ENOMEM; 232 + fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE; 227 233 break; 228 234 /* misc */ 229 235 case Opt_wsize: ··· 327 317 kfree(args->snapdir_name); 328 318 kfree(args->mds_namespace); 329 319 kfree(args->server_path); 320 + kfree(args->fscache_uniq); 330 321 kfree(args); 331 322 } 332 323 ··· 361 350 ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); 362 351 if (ret) 363 352 return ret; 364 - 365 353 ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); 354 + if (ret) 355 + return ret; 356 + ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); 366 357 if (ret) 367 358 return ret; 368 359 ··· 488 475 seq_puts(m, ",noasyncreaddir"); 489 476 if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) 490 477 seq_puts(m, ",nodcache"); 491 - if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) 492 - seq_puts(m, ",fsc"); 478 + if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) { 479 + if (fsopt->fscache_uniq) 480 + seq_printf(m, ",fsc=%s", fsopt->fscache_uniq); 481 + else 482 + seq_puts(m, ",fsc"); 483 + } 493 484 if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) 494 485 seq_puts(m, ",nopoolperm"); 495 486 ··· 614 597 if (!fsc->wb_pagevec_pool) 615 598 goto fail_trunc_wq; 616 599 617 - /* setup fscache */ 618 - if ((fsopt->flags & 
CEPH_MOUNT_OPT_FSCACHE) && 619 - (ceph_fscache_register_fs(fsc) != 0)) 620 - goto fail_fscache; 621 - 622 600 /* caps */ 623 601 fsc->min_caps = fsopt->max_readdir; 624 602 625 603 return fsc; 626 604 627 - fail_fscache: 628 - ceph_fscache_unregister_fs(fsc); 629 605 fail_trunc_wq: 630 606 destroy_workqueue(fsc->trunc_wq); 631 607 fail_pg_inv_wq: ··· 636 626 { 637 627 dout("destroy_fs_client %p\n", fsc); 638 628 639 - ceph_fscache_unregister_fs(fsc); 640 - 641 629 destroy_workqueue(fsc->wb_wq); 642 630 destroy_workqueue(fsc->pg_inv_wq); 643 631 destroy_workqueue(fsc->trunc_wq); ··· 643 635 mempool_destroy(fsc->wb_pagevec_pool); 644 636 645 637 destroy_mount_options(fsc->mount_options); 646 - 647 - ceph_fs_debugfs_cleanup(fsc); 648 638 649 639 ceph_destroy_client(fsc->client); 650 640 ··· 827 821 err = __ceph_open_session(fsc->client, started); 828 822 if (err < 0) 829 823 goto out; 824 + 825 + /* setup fscache */ 826 + if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) { 827 + err = ceph_fscache_register_fs(fsc); 828 + if (err < 0) 829 + goto out; 830 + } 830 831 831 832 if (!fsc->mount_options->server_path) { 832 833 path = ""; ··· 1053 1040 1054 1041 ceph_mdsc_pre_umount(fsc->mdsc); 1055 1042 generic_shutdown_super(s); 1043 + 1044 + fsc->client->extra_mon_dispatch = NULL; 1045 + ceph_fs_debugfs_cleanup(fsc); 1046 + 1047 + ceph_fscache_unregister_fs(fsc); 1048 + 1056 1049 ceph_mdsc_destroy(fsc); 1057 1050 1058 1051 destroy_fs_client(fsc);

+3 -1

fs/ceph/super.h

··· 73 73 char *snapdir_name; /* default ".snap" */ 74 74 char *mds_namespace; /* default NULL */ 75 75 char *server_path; /* default "/" */ 76 + char *fscache_uniq; /* default NULL */ 76 77 }; 77 78 78 79 struct ceph_fs_client { ··· 794 793 795 794 extern int ceph_inode_holds_cap(struct inode *inode, int mask); 796 795 797 - extern int ceph_inode_set_size(struct inode *inode, loff_t size); 796 + extern bool ceph_inode_set_size(struct inode *inode, loff_t size); 798 797 extern void __ceph_do_pending_vmtruncate(struct inode *inode); 799 798 extern void ceph_queue_vmtruncate(struct inode *inode); 800 799 ··· 919 918 struct ceph_snap_context *snapc); 920 919 extern void ceph_flush_snaps(struct ceph_inode_info *ci, 921 920 struct ceph_mds_session **psession); 921 + extern bool __ceph_should_report_size(struct ceph_inode_info *ci); 922 922 extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, 923 923 struct ceph_mds_session *session); 924 924 extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);

+3

fs/ceph/xattr.c

··· 756 756 /* let's see if a virtual xattr was requested */ 757 757 vxattr = ceph_match_vxattr(inode, name); 758 758 if (vxattr) { 759 + err = ceph_do_getattr(inode, 0, true); 760 + if (err) 761 + return err; 759 762 err = -ENODATA; 760 763 if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) 761 764 err = vxattr->getxattr_cb(ci, value, size);

+172 -92

include/linux/ceph/ceph_features.h

··· 2 2 #define __CEPH_FEATURES 3 3 4 4 /* 5 - * feature bits 5 + * Each time we reclaim bits for reuse we need to specify another bit 6 + * that, if present, indicates we have the new incarnation of that 7 + * feature. Base case is 1 (first use). 6 8 */ 7 - #define CEPH_FEATURE_UID (1ULL<<0) 8 - #define CEPH_FEATURE_NOSRCADDR (1ULL<<1) 9 - #define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2) 10 - #define CEPH_FEATURE_FLOCK (1ULL<<3) 11 - #define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) 12 - #define CEPH_FEATURE_MONNAMES (1ULL<<5) 13 - #define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) 14 - #define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7) 15 - #define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) 16 - #define CEPH_FEATURE_PGID64 (1ULL<<9) 17 - #define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) 18 - #define CEPH_FEATURE_PGPOOL3 (1ULL<<11) 19 - #define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) 20 - #define CEPH_FEATURE_OSDENC (1ULL<<13) 21 - #define CEPH_FEATURE_OMAP (1ULL<<14) 22 - #define CEPH_FEATURE_MONENC (1ULL<<15) 23 - #define CEPH_FEATURE_QUERY_T (1ULL<<16) 24 - #define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17) 25 - #define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18) 26 - #define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19) 27 - #define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20) 28 - #define CEPH_FEATURE_MON_GV (1ULL<<21) 29 - #define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22) 30 - #define CEPH_FEATURE_MSG_AUTH (1ULL<<23) 31 - #define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24) 32 - #define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25) 33 - #define CEPH_FEATURE_CREATEPOOLID (1ULL<<26) 34 - #define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27) 35 - #define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28) 36 - #define CEPH_FEATURE_MDSENC (1ULL<<29) 37 - #define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30) 38 - #define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31) 39 - #define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32) 40 - #define CEPH_FEATURE_MON_SCRUB (1ULL<<33) 41 - #define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34) 42 - #define CEPH_FEATURE_OSD_CACHEPOOL 
(1ULL<<35) 43 - #define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */ 44 - #define CEPH_FEATURE_EXPORT_PEER (1ULL<<37) 45 - #define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38) 46 - #define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */ 47 - /* The process supports new-style OSDMap encoding. Monitors also use 48 - this bit to determine if peers support NAK messages. */ 49 - #define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39) 50 - #define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40) 51 - #define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41) 52 - #define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */ 53 - #define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42) 54 - #define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43) 55 - #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44) 56 - #define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45) 57 - #define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46) 58 - #define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */ 59 - #define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */ 60 - #define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */ 61 - #define CEPH_FEATURE_MDS_QUOTA (1ULL<<47) 62 - #define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */ 63 - #define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49) 64 - // duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY 65 - #define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */ 66 - #define CEPH_FEATURE_MON_METADATA (1ULL<<50) 67 - #define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */ 68 - #define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52) 69 - #define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53) 70 - #define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54) 71 - #define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55) 72 - #define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */ 73 - #define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful 
mon subscription */ 74 - #define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */ 75 - #define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */ 76 - // duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5 77 - #define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */ 78 - #define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */ 9 + #define CEPH_FEATURE_INCARNATION_1 (0ull) 10 + #define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL 11 + 12 + #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \ 13 + const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \ 14 + const static uint64_t CEPH_FEATUREMASK_##name = \ 15 + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); 16 + 17 + /* this bit is ignored but still advertised by release *when* */ 18 + #define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \ 19 + const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \ 20 + const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \ 21 + (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation); 79 22 80 23 /* 81 - * The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature 82 - * vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 83 - * to mean 33 bit ~0, and introduce a helper below to do the 84 - * translation. 
85 - * 86 - * This was introduced by ceph.git commit 87 - * 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 88 - * and fixed by ceph.git commit 89 - * 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c 24 + * this bit is ignored by release *unused* and not advertised by 25 + * release *unadvertised* 90 26 */ 91 - #define CEPH_FEATURE_RESERVED (1ULL<<63) 27 + #define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised) 92 28 93 - static inline u64 ceph_sanitize_features(u64 features) 94 - { 95 - if (features & CEPH_FEATURE_RESERVED) { 96 - /* everything through OSD_SNAPMAPPER */ 97 - return 0x1ffffffffull; 98 - } else { 99 - return features; 100 - } 101 - } 29 + 30 + /* 31 + * test for a feature. this test is safer than a typical mask against 32 + * the bit because it ensures that we have the bit AND the marker for the 33 + * bit's incarnation. this must be used in any case where the features 34 + * bits may include an old meaning of the bit. 35 + */ 36 + #define CEPH_HAVE_FEATURE(x, name) \ 37 + (((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name)) 38 + 39 + 40 + /* 41 + * Notes on deprecation: 42 + * 43 + * A *major* release is a release through which all upgrades must pass 44 + * (e.g., jewel). For example, no pre-jewel server will ever talk to 45 + * a post-jewel server (mon, osd, etc). 46 + * 47 + * For feature bits used *only* on the server-side: 48 + * 49 + * - In the first phase we indicate that a feature is DEPRECATED as of 50 + * a particular release. This is the first major release X (say, 51 + * jewel) that does not depend on its peers advertising the feature. 52 + * That is, it safely assumes its peers all have the feature. We 53 + * indicate this with the DEPRECATED macro. For example, 54 + * 55 + * DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL) 56 + * 57 + * because 10.2.z (jewel) did not care if its peers advertised this 58 + * feature bit. 
59 + * 60 + * - In the second phase we stop advertising the bit and call it 61 + * RETIRED. This can normally be done in the *next* major release 62 + * following the one in which we marked the feature DEPRECATED. In 63 + * the above example, for 12.0.z (luminous) we can say: 64 + * 65 + * DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) 66 + * 67 + * - The bit can be reused in the first post-luminous release, 13.0.z 68 + * (m). 69 + * 70 + * This ensures that no two versions who have different meanings for 71 + * the bit ever speak to each other. 72 + */ 73 + 74 + DEFINE_CEPH_FEATURE( 0, 1, UID) 75 + DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR) 76 + DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS) 77 + 78 + DEFINE_CEPH_FEATURE( 3, 1, FLOCK) 79 + DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2) 80 + DEFINE_CEPH_FEATURE( 5, 1, MONNAMES) 81 + DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ) 82 + DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH) 83 + DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR) 84 + DEFINE_CEPH_FEATURE( 9, 1, PGID64) 85 + DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP) 86 + DEFINE_CEPH_FEATURE(11, 1, PGPOOL3) 87 + DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX) 88 + DEFINE_CEPH_FEATURE(13, 1, OSDENC) 89 + DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL) 90 + DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN) 91 + DEFINE_CEPH_FEATURE(15, 1, MONENC) 92 + DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS) 93 + 94 + DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS) 95 + 96 + DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES) 97 + DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS) 98 + 99 + DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS) 100 + 101 + DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL) 102 + DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS) 103 + DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap 104 + DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap 105 + DEFINE_CEPH_FEATURE(21, 2, 
OSDMAP_PG_UPMAP) // overlap 106 + DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap 107 + DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS) 108 + 109 + DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH) 110 + DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS) 111 + 112 + DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2) 113 + DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID) 114 + DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE) 115 + DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL) 116 + DEFINE_CEPH_FEATURE(28, 2, SERVER_M) 117 + DEFINE_CEPH_FEATURE(29, 1, MDSENC) 118 + DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL) 119 + DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me 120 + DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS) 121 + 122 + DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS) 123 + 124 + DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS) 125 + 126 + DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL) 127 + DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2) 128 + DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER) 129 + DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES) 130 + DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap 131 + DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC) 132 + DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA) 133 + DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3) 134 + DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap 135 + DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2) 136 + DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND) 137 + DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2) 138 + DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS) 139 + 140 + DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS) 141 + DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap 142 + DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap 143 + DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap 144 + 145 + 
DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA) 146 + DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4) 147 + DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS) 148 + DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap 149 + 150 + DEFINE_CEPH_FEATURE(50, 1, MON_METADATA) 151 + DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT) 152 + DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES) 153 + DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3) 154 + DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT) 155 + DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4) 156 + DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING) 157 + DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB) 158 + DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap 159 + DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap 160 + DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap 161 + DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5) 162 + DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap 163 + DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap 164 + DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) 165 + DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap 166 + DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap 167 + DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit* 168 + 169 + DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down! 170 + DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel 171 + DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing 172 + 173 103 104 174 /* 104 175 * Features supported. 
··· 184 113 CEPH_FEATURE_PGPOOL3 | \ 185 114 CEPH_FEATURE_OSDENC | \ 186 115 CEPH_FEATURE_CRUSH_TUNABLES | \ 116 + CEPH_FEATURE_SERVER_LUMINOUS | \ 117 + CEPH_FEATURE_RESEND_ON_SPLIT | \ 118 + CEPH_FEATURE_RADOS_BACKOFF | \ 119 + CEPH_FEATURE_OSDMAP_PG_UPMAP | \ 120 + CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \ 187 121 CEPH_FEATURE_MSG_AUTH | \ 188 122 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 189 123 CEPH_FEATURE_REPLY_CREATE_INODE | \ ··· 202 126 CEPH_FEATURE_CRUSH_TUNABLES3 | \ 203 127 CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ 204 128 CEPH_FEATURE_MSGR_KEEPALIVE2 | \ 129 + CEPH_FEATURE_OSD_POOLRESEND | \ 205 130 CEPH_FEATURE_CRUSH_V4 | \ 131 + CEPH_FEATURE_NEW_OSDOP_ENCODING | \ 132 + CEPH_FEATURE_SERVER_JEWEL | \ 133 + CEPH_FEATURE_MON_STATEFUL_SUB | \ 206 134 CEPH_FEATURE_CRUSH_TUNABLES5 | \ 207 135 CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) 208 136

+1

include/linux/ceph/ceph_fs.h

··· 147 147 #define CEPH_MSG_OSD_OP 42 148 148 #define CEPH_MSG_OSD_OPREPLY 43 149 149 #define CEPH_MSG_WATCH_NOTIFY 44 150 + #define CEPH_MSG_OSD_BACKOFF 61 150 151 151 152 152 153 /* watch-notify operations */

+60

include/linux/ceph/decode.h

··· 133 133 } 134 134 135 135 /* 136 + * skip helpers 137 + */ 138 + #define ceph_decode_skip_n(p, end, n, bad) \ 139 + do { \ 140 + ceph_decode_need(p, end, n, bad); \ 141 + *p += n; \ 142 + } while (0) 143 + 144 + #define ceph_decode_skip_64(p, end, bad) \ 145 + ceph_decode_skip_n(p, end, sizeof(u64), bad) 146 + 147 + #define ceph_decode_skip_32(p, end, bad) \ 148 + ceph_decode_skip_n(p, end, sizeof(u32), bad) 149 + 150 + #define ceph_decode_skip_16(p, end, bad) \ 151 + ceph_decode_skip_n(p, end, sizeof(u16), bad) 152 + 153 + #define ceph_decode_skip_8(p, end, bad) \ 154 + ceph_decode_skip_n(p, end, sizeof(u8), bad) 155 + 156 + #define ceph_decode_skip_string(p, end, bad) \ 157 + do { \ 158 + u32 len; \ 159 + \ 160 + ceph_decode_32_safe(p, end, len, bad); \ 161 + ceph_decode_skip_n(p, end, len, bad); \ 162 + } while (0) 163 + 164 + #define ceph_decode_skip_set(p, end, type, bad) \ 165 + do { \ 166 + u32 len; \ 167 + \ 168 + ceph_decode_32_safe(p, end, len, bad); \ 169 + while (len--) \ 170 + ceph_decode_skip_##type(p, end, bad); \ 171 + } while (0) 172 + 173 + #define ceph_decode_skip_map(p, end, ktype, vtype, bad) \ 174 + do { \ 175 + u32 len; \ 176 + \ 177 + ceph_decode_32_safe(p, end, len, bad); \ 178 + while (len--) { \ 179 + ceph_decode_skip_##ktype(p, end, bad); \ 180 + ceph_decode_skip_##vtype(p, end, bad); \ 181 + } \ 182 + } while (0) 183 + 184 + #define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \ 185 + do { \ 186 + u32 len; \ 187 + \ 188 + ceph_decode_32_safe(p, end, len, bad); \ 189 + while (len--) { \ 190 + ceph_decode_skip_##ktype1(p, end, bad); \ 191 + ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \ 192 + } \ 193 + } while (0) 194 + 195 + /* 136 196 * struct ceph_timespec <-> struct timespec 137 197 */ 138 198 static inline void ceph_decode_timespec(struct timespec *ts,

+37 -12

include/linux/ceph/libceph.h

··· 184 184 (off >> PAGE_SHIFT); 185 185 } 186 186 187 - /* 188 - * These are not meant to be generic - an integer key is assumed. 189 - */ 190 - #define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ 187 + #define RB_BYVAL(a) (a) 188 + #define RB_BYPTR(a) (&(a)) 189 + #define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b)) 190 + 191 + #define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ 191 192 static void insert_##name(struct rb_root *root, type *t) \ 192 193 { \ 193 194 struct rb_node **n = &root->rb_node; \ ··· 198 197 \ 199 198 while (*n) { \ 200 199 type *cur = rb_entry(*n, type, nodefld); \ 200 + int cmp; \ 201 201 \ 202 202 parent = *n; \ 203 - if (t->keyfld < cur->keyfld) \ 203 + cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \ 204 + if (cmp < 0) \ 204 205 n = &(*n)->rb_left; \ 205 - else if (t->keyfld > cur->keyfld) \ 206 + else if (cmp > 0) \ 206 207 n = &(*n)->rb_right; \ 207 208 else \ 208 209 BUG(); \ ··· 220 217 RB_CLEAR_NODE(&t->nodefld); \ 221 218 } 222 219 223 - #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ 224 - extern type __lookup_##name##_key; \ 225 - static type *lookup_##name(struct rb_root *root, \ 226 - typeof(__lookup_##name##_key.keyfld) key) \ 220 + /* 221 + * @lookup_param_type is a parameter and not constructed from (@type, 222 + * @keyfld) with typeof() because adding const is too unwieldy. 
223 + */ 224 + #define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ 225 + lookup_param_type, nodefld) \ 226 + static type *lookup_##name(struct rb_root *root, lookup_param_type key) \ 227 227 { \ 228 228 struct rb_node *n = root->rb_node; \ 229 229 \ 230 230 while (n) { \ 231 231 type *cur = rb_entry(n, type, nodefld); \ 232 + int cmp; \ 232 233 \ 233 - if (key < cur->keyfld) \ 234 + cmp = cmpexp(key, keyexp(cur->keyfld)); \ 235 + if (cmp < 0) \ 234 236 n = n->rb_left; \ 235 - else if (key > cur->keyfld) \ 237 + else if (cmp > 0) \ 236 238 n = n->rb_right; \ 237 239 else \ 238 240 return cur; \ ··· 245 237 \ 246 238 return NULL; \ 247 239 } 240 + 241 + #define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \ 242 + lookup_param_type, nodefld) \ 243 + DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \ 244 + DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \ 245 + lookup_param_type, nodefld) 246 + 247 + /* 248 + * Shorthands for integer keys. 249 + */ 250 + #define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ 251 + DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld) 252 + 253 + #define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ 254 + extern type __lookup_##name##_key; \ 255 + DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \ 256 + typeof(__lookup_##name##_key.keyfld), nodefld) 248 257 249 258 #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ 250 259 DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \

+2

include/linux/ceph/messenger.h

··· 44 44 struct ceph_msg_header *hdr, 45 45 int *skip); 46 46 47 + void (*reencode_message) (struct ceph_msg *msg); 48 + 47 49 int (*sign_message) (struct ceph_msg *msg); 48 50 int (*check_message_signature) (struct ceph_msg *msg); 49 51 };

+67 -3

include/linux/ceph/osd_client.h

··· 1 1 #ifndef _FS_CEPH_OSD_CLIENT_H 2 2 #define _FS_CEPH_OSD_CLIENT_H 3 3 4 + #include <linux/bitrev.h> 4 5 #include <linux/completion.h> 5 6 #include <linux/kref.h> 6 7 #include <linux/mempool.h> ··· 37 36 struct ceph_connection o_con; 38 37 struct rb_root o_requests; 39 38 struct rb_root o_linger_requests; 39 + struct rb_root o_backoff_mappings; 40 + struct rb_root o_backoffs_by_id; 40 41 struct list_head o_osd_lru; 41 42 struct ceph_auth_handshake o_auth; 42 43 unsigned long lru_ttl; ··· 139 136 struct ceph_object_id target_oid; 140 137 struct ceph_object_locator target_oloc; 141 138 142 - struct ceph_pg pgid; 139 + struct ceph_pg pgid; /* last raw pg we mapped to */ 140 + struct ceph_spg spgid; /* last actual spg we mapped to */ 143 141 u32 pg_num; 144 142 u32 pg_num_mask; 145 143 struct ceph_osds acting; ··· 151 147 152 148 unsigned int flags; /* CEPH_OSD_FLAG_* */ 153 149 bool paused; 150 + 151 + u32 epoch; 152 + u32 last_force_resend; 154 153 155 154 int osd; 156 155 }; ··· 200 193 unsigned long r_stamp; /* jiffies, send or check time */ 201 194 unsigned long r_start_stamp; /* jiffies */ 202 195 int r_attempts; 203 - u32 r_last_force_resend; 204 196 u32 r_map_dne_bound; 205 197 206 198 struct ceph_osd_req_op r_ops[]; ··· 208 202 struct ceph_request_redirect { 209 203 struct ceph_object_locator oloc; 210 204 }; 205 + 206 + /* 207 + * osd request identifier 208 + * 209 + * caller name + incarnation# + tid to unique identify this request 210 + */ 211 + struct ceph_osd_reqid { 212 + struct ceph_entity_name name; 213 + __le64 tid; 214 + __le32 inc; 215 + } __packed; 216 + 217 + struct ceph_blkin_trace_info { 218 + __le64 trace_id; 219 + __le64 span_id; 220 + __le64 parent_span_id; 221 + } __packed; 211 222 212 223 typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, 213 224 u64 notifier_id, void *data, size_t data_len); ··· 244 221 struct list_head pending_lworks; 245 222 246 223 struct ceph_osd_request_target t; 247 - u32 last_force_resend; 
248 224 u32 map_dne_bound; 249 225 250 226 struct timespec mtime; ··· 276 254 struct ceph_entity_name name; 277 255 u64 cookie; 278 256 struct ceph_entity_addr addr; 257 + }; 258 + 259 + struct ceph_spg_mapping { 260 + struct rb_node node; 261 + struct ceph_spg spgid; 262 + 263 + struct rb_root backoffs; 264 + }; 265 + 266 + struct ceph_hobject_id { 267 + void *key; 268 + size_t key_len; 269 + void *oid; 270 + size_t oid_len; 271 + u64 snapid; 272 + u32 hash; 273 + u8 is_max; 274 + void *nspace; 275 + size_t nspace_len; 276 + s64 pool; 277 + 278 + /* cache */ 279 + u32 hash_reverse_bits; 280 + }; 281 + 282 + static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid) 283 + { 284 + hoid->hash_reverse_bits = bitrev32(hoid->hash); 285 + } 286 + 287 + /* 288 + * PG-wide backoff: [begin, end) 289 + * per-object backoff: begin == end 290 + */ 291 + struct ceph_osd_backoff { 292 + struct rb_node spg_node; 293 + struct rb_node id_node; 294 + 295 + struct ceph_spg spgid; 296 + u64 id; 297 + struct ceph_hobject_id *begin; 298 + struct ceph_hobject_id *end; 279 299 }; 280 300 281 301 #define CEPH_LINGER_ID_START 0xffff000000000000ULL

+35 -6

include/linux/ceph/osdmap.h

··· 24 24 uint32_t seed; 25 25 }; 26 26 27 + #define CEPH_SPG_NOSHARD -1 28 + 29 + struct ceph_spg { 30 + struct ceph_pg pgid; 31 + s8 shard; 32 + }; 33 + 27 34 int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); 35 + int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs); 28 36 29 37 #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id 30 38 together */ ··· 143 135 struct { 144 136 int len; 145 137 int osds[]; 146 - } pg_temp; 138 + } pg_temp, pg_upmap; 147 139 struct { 148 140 int osd; 149 141 } primary_temp; 142 + struct { 143 + int len; 144 + int from_to[][2]; 145 + } pg_upmap_items; 150 146 }; 151 147 }; 152 148 ··· 162 150 u32 flags; /* CEPH_OSDMAP_* */ 163 151 164 152 u32 max_osd; /* size of osd_state, _offload, _addr arrays */ 165 - u8 *osd_state; /* CEPH_OSD_* */ 153 + u32 *osd_state; /* CEPH_OSD_* */ 166 154 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ 167 155 struct ceph_entity_addr *osd_addr; 168 156 169 157 struct rb_root pg_temp; 170 158 struct rb_root primary_temp; 159 + 160 + /* remap (post-CRUSH, pre-up) */ 161 + struct rb_root pg_upmap; /* PG := raw set */ 162 + struct rb_root pg_upmap_items; /* from -> to within raw set */ 171 163 172 164 u32 *osd_primary_affinity; 173 165 ··· 203 187 return !ceph_osd_is_up(map, osd); 204 188 } 205 189 206 - extern char *ceph_osdmap_state_str(char *str, int len, int state); 190 + char *ceph_osdmap_state_str(char *str, int len, u32 state); 207 191 extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); 208 192 209 193 static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, ··· 214 198 return &map->osd_addr[osd]; 215 199 } 216 200 201 + #define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4) 202 + 217 203 static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) 218 204 { 219 205 __u8 version; 220 206 221 - if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { 207 + if (!ceph_has_room(p, end, 
CEPH_PGID_ENCODING_LEN)) { 222 208 pr_warn("incomplete pg encoding\n"); 223 209 return -EINVAL; 224 210 } ··· 258 240 259 241 void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); 260 242 243 + bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, 244 + u32 new_pg_num); 261 245 bool ceph_is_new_interval(const struct ceph_osds *old_acting, 262 246 const struct ceph_osds *new_acting, 263 247 const struct ceph_osds *old_up, ··· 282 262 u64 off, u64 len, 283 263 u64 *bno, u64 *oxoff, u64 *oxlen); 284 264 265 + int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 266 + const struct ceph_object_id *oid, 267 + const struct ceph_object_locator *oloc, 268 + struct ceph_pg *raw_pgid); 285 269 int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, 286 - struct ceph_object_id *oid, 287 - struct ceph_object_locator *oloc, 270 + const struct ceph_object_id *oid, 271 + const struct ceph_object_locator *oloc, 288 272 struct ceph_pg *raw_pgid); 289 273 290 274 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, 275 + struct ceph_pg_pool_info *pi, 291 276 const struct ceph_pg *raw_pgid, 292 277 struct ceph_osds *up, 293 278 struct ceph_osds *acting); 279 + bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, 280 + struct ceph_pg_pool_info *pi, 281 + const struct ceph_pg *raw_pgid, 282 + struct ceph_spg *spgid); 294 283 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, 295 284 const struct ceph_pg *raw_pgid); 296 285

+6

include/linux/ceph/rados.h

··· 439 439 440 440 const char *ceph_osd_watch_op_name(int o); 441 441 442 + enum { 443 + CEPH_OSD_BACKOFF_OP_BLOCK = 1, 444 + CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2, 445 + CEPH_OSD_BACKOFF_OP_UNBLOCK = 3, 446 + }; 447 + 442 448 /* 443 449 * an individual object operation. each may be accompanied by some data 444 450 * payload

+66

include/linux/crush/crush.h

··· 2 2 #define CEPH_CRUSH_CRUSH_H 3 3 4 4 #ifdef __KERNEL__ 5 + # include <linux/rbtree.h> 5 6 # include <linux/types.h> 6 7 #else 7 8 # include "crush_compat.h" ··· 138 137 139 138 }; 140 139 140 + /** @ingroup API 141 + * 142 + * Replacement weights for each item in a bucket. The size of the 143 + * array must be exactly the size of the straw2 bucket, just as the 144 + * item_weights array. 145 + * 146 + */ 147 + struct crush_weight_set { 148 + __u32 *weights; /*!< 16.16 fixed point weights 149 + in the same order as items */ 150 + __u32 size; /*!< size of the __weights__ array */ 151 + }; 152 + 153 + /** @ingroup API 154 + * 155 + * Replacement weights and ids for a given straw2 bucket, for 156 + * placement purposes. 157 + * 158 + * When crush_do_rule() chooses the Nth item from a straw2 bucket, the 159 + * replacement weights found at __weight_set[N]__ are used instead of 160 + * the weights from __item_weights__. If __N__ is greater than 161 + * __weight_set_size__, the weights found at __weight_set_size-1__ are 162 + * used instead. For instance if __weight_set__ is: 163 + * 164 + * [ [ 0x10000, 0x20000 ], // position 0 165 + * [ 0x20000, 0x40000 ] ] // position 1 166 + * 167 + * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ] 168 + * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ] 169 + * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ] 170 + * etc. 171 + * 172 + */ 173 + struct crush_choose_arg { 174 + __s32 *ids; /*!< values to use instead of items */ 175 + __u32 ids_size; /*!< size of the __ids__ array */ 176 + struct crush_weight_set *weight_set; /*!< weight replacements for 177 + a given position */ 178 + __u32 weight_set_size; /*!< size of the __weight_set__ array */ 179 + }; 180 + 181 + /** @ingroup API 182 + * 183 + * Replacement weights and ids for each bucket in the crushmap. The 184 + * __size__ of the __args__ array must be exactly the same as the 185 + * __map->max_buckets__. 
186 + * 187 + * The __crush_choose_arg__ at index N will be used when choosing 188 + * an item from the bucket __map->buckets[N]__ bucket, provided it 189 + * is a straw2 bucket. 190 + * 191 + */ 192 + struct crush_choose_arg_map { 193 + #ifdef __KERNEL__ 194 + struct rb_node node; 195 + u64 choose_args_index; 196 + #endif 197 + struct crush_choose_arg *args; /*!< replacement for each bucket 198 + in the crushmap */ 199 + __u32 size; /*!< size of the __args__ array */ 200 + }; 201 + 141 202 struct crush_bucket_uniform { 142 203 struct crush_bucket h; 143 204 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */ ··· 299 236 __u32 allowed_bucket_algs; 300 237 301 238 __u32 *choose_tries; 239 + #else 240 + /* CrushWrapper::choose_args */ 241 + struct rb_root choose_args; 302 242 #endif 303 243 }; 304 244

+4 -5

include/linux/crush/mapper.h

··· 11 11 #include "crush.h" 12 12 13 13 extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); 14 - extern int crush_do_rule(const struct crush_map *map, 15 - int ruleno, 16 - int x, int *result, int result_max, 17 - const __u32 *weights, int weight_max, 18 - void *cwin); 14 + int crush_do_rule(const struct crush_map *map, 15 + int ruleno, int x, int *result, int result_max, 16 + const __u32 *weight, int weight_max, 17 + void *cwin, const struct crush_choose_arg *choose_args); 19 18 20 19 /* 21 20 * Returns the exact amount of workspace that will need to be used

+1

net/ceph/ceph_common.c

··· 85 85 case CEPH_MSG_OSD_OP: return "osd_op"; 86 86 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; 87 87 case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; 88 + case CEPH_MSG_OSD_BACKOFF: return "osd_backoff"; 88 89 default: return "unknown"; 89 90 } 90 91 }

+3

net/ceph/crush/crush.c

··· 1 1 #ifdef __KERNEL__ 2 2 # include <linux/slab.h> 3 3 # include <linux/crush/crush.h> 4 + void clear_choose_args(struct crush_map *c); 4 5 #else 5 6 # include "crush_compat.h" 6 7 # include "crush.h" ··· 128 127 129 128 #ifndef __KERNEL__ 130 129 kfree(map->choose_tries); 130 + #else 131 + clear_choose_args(map); 131 132 #endif 132 133 kfree(map); 133 134 }

+57 -24

net/ceph/crush/mapper.c

··· 302 302 * 303 303 */ 304 304 305 + static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket, 306 + const struct crush_choose_arg *arg, 307 + int position) 308 + { 309 + if (!arg || !arg->weight_set || arg->weight_set_size == 0) 310 + return bucket->item_weights; 311 + 312 + if (position >= arg->weight_set_size) 313 + position = arg->weight_set_size - 1; 314 + return arg->weight_set[position].weights; 315 + } 316 + 317 + static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket, 318 + const struct crush_choose_arg *arg) 319 + { 320 + if (!arg || !arg->ids) 321 + return bucket->h.items; 322 + 323 + return arg->ids; 324 + } 325 + 305 326 static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, 306 - int x, int r) 327 + int x, int r, 328 + const struct crush_choose_arg *arg, 329 + int position) 307 330 { 308 331 unsigned int i, high = 0; 309 332 unsigned int u; 310 - unsigned int w; 311 333 __s64 ln, draw, high_draw = 0; 334 + __u32 *weights = get_choose_arg_weights(bucket, arg, position); 335 + __s32 *ids = get_choose_arg_ids(bucket, arg); 312 336 313 337 for (i = 0; i < bucket->h.size; i++) { 314 - w = bucket->item_weights[i]; 315 - if (w) { 316 - u = crush_hash32_3(bucket->h.hash, x, 317 - bucket->h.items[i], r); 338 + dprintk("weight 0x%x item %d\n", weights[i], ids[i]); 339 + if (weights[i]) { 340 + u = crush_hash32_3(bucket->h.hash, x, ids[i], r); 318 341 u &= 0xffff; 319 342 320 343 /* ··· 358 335 * weight means a larger (less negative) value 359 336 * for draw. 
360 337 */ 361 - draw = div64_s64(ln, w); 338 + draw = div64_s64(ln, weights[i]); 362 339 } else { 363 340 draw = S64_MIN; 364 341 } ··· 375 352 376 353 static int crush_bucket_choose(const struct crush_bucket *in, 377 354 struct crush_work_bucket *work, 378 - int x, int r) 355 + int x, int r, 356 + const struct crush_choose_arg *arg, 357 + int position) 379 358 { 380 359 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 381 360 BUG_ON(in->size == 0); ··· 399 374 case CRUSH_BUCKET_STRAW2: 400 375 return bucket_straw2_choose( 401 376 (const struct crush_bucket_straw2 *)in, 402 - x, r); 377 + x, r, arg, position); 403 378 default: 404 379 dprintk("unknown bucket %d alg %d\n", in->id, in->alg); 405 380 return in->items[0]; ··· 461 436 unsigned int vary_r, 462 437 unsigned int stable, 463 438 int *out2, 464 - int parent_r) 439 + int parent_r, 440 + const struct crush_choose_arg *choose_args) 465 441 { 466 442 int rep; 467 443 unsigned int ftotal, flocal; ··· 512 486 else 513 487 item = crush_bucket_choose( 514 488 in, work->work[-1-in->id], 515 - x, r); 489 + x, r, 490 + (choose_args ? 491 + &choose_args[-1-in->id] : 0), 492 + outpos); 516 493 if (item >= map->max_devices) { 517 494 dprintk(" bad item %d\n", item); 518 495 skip_rep = 1; ··· 572 543 vary_r, 573 544 stable, 574 545 NULL, 575 - sub_r) <= outpos) 546 + sub_r, 547 + choose_args) <= outpos) 576 548 /* didn't get leaf */ 577 549 reject = 1; 578 550 } else { ··· 650 620 unsigned int recurse_tries, 651 621 int recurse_to_leaf, 652 622 int *out2, 653 - int parent_r) 623 + int parent_r, 624 + const struct crush_choose_arg *choose_args) 654 625 { 655 626 const struct crush_bucket *in = bucket; 656 627 int endpos = outpos + left; ··· 723 692 724 693 item = crush_bucket_choose( 725 694 in, work->work[-1-in->id], 726 - x, r); 695 + x, r, 696 + (choose_args ? 
697 + &choose_args[-1-in->id] : 0), 698 + outpos); 727 699 if (item >= map->max_devices) { 728 700 dprintk(" bad item %d\n", item); 729 701 out[rep] = CRUSH_ITEM_NONE; ··· 780 746 x, 1, numrep, 0, 781 747 out2, rep, 782 748 recurse_tries, 0, 783 - 0, NULL, r); 749 + 0, NULL, r, 750 + choose_args); 784 751 if (out2[rep] == CRUSH_ITEM_NONE) { 785 752 /* placed nothing; no leaf */ 786 753 break; ··· 858 823 * set the pointer first and then reserve the space for it to 859 824 * point to by incrementing the point. 860 825 */ 861 - v += sizeof(struct crush_work *); 826 + v += sizeof(struct crush_work); 862 827 w->work = v; 863 828 v += map->max_buckets * sizeof(struct crush_work_bucket *); 864 829 for (b = 0; b < map->max_buckets; ++b) { ··· 889 854 * @weight: weight vector (for map leaves) 890 855 * @weight_max: size of weight vector 891 856 * @cwin: pointer to at least crush_work_size() bytes of memory 857 + * @choose_args: weights and ids for each known bucket 892 858 */ 893 859 int crush_do_rule(const struct crush_map *map, 894 860 int ruleno, int x, int *result, int result_max, 895 861 const __u32 *weight, int weight_max, 896 - void *cwin) 862 + void *cwin, const struct crush_choose_arg *choose_args) 897 863 { 898 864 int result_len; 899 865 struct crush_work *cw = cwin; ··· 1004 968 1005 969 for (i = 0; i < wsize; i++) { 1006 970 int bno; 1007 - /* 1008 - * see CRUSH_N, CRUSH_N_MINUS macros. 1009 - * basically, numrep <= 0 means relative to 1010 - * the provided result_max 1011 - */ 1012 971 numrep = curstep->arg1; 1013 972 if (numrep <= 0) { 1014 973 numrep += result_max; ··· 1044 1013 vary_r, 1045 1014 stable, 1046 1015 c+osize, 1047 - 0); 1016 + 0, 1017 + choose_args); 1048 1018 } else { 1049 1019 out_size = ((numrep < (result_max-osize)) ? 1050 1020 numrep : (result_max-osize)); ··· 1062 1030 choose_leaf_tries : 1, 1063 1031 recurse_to_leaf, 1064 1032 c+osize, 1065 - 0); 1033 + 0, 1034 + choose_args); 1066 1035 osize += out_size; 1067 1036 } 1068 1037 }

+109 -3

net/ceph/debugfs.c

··· 77 77 } 78 78 for (i = 0; i < map->max_osd; i++) { 79 79 struct ceph_entity_addr *addr = &map->osd_addr[i]; 80 - int state = map->osd_state[i]; 80 + u32 state = map->osd_state[i]; 81 81 char sb[64]; 82 82 83 83 seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", ··· 103 103 104 104 seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, 105 105 pg->pgid.seed, pg->primary_temp.osd); 106 + } 107 + for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) { 108 + struct ceph_pg_mapping *pg = 109 + rb_entry(n, struct ceph_pg_mapping, node); 110 + 111 + seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool, 112 + pg->pgid.seed); 113 + for (i = 0; i < pg->pg_upmap.len; i++) 114 + seq_printf(s, "%s%d", (i == 0 ? "" : ","), 115 + pg->pg_upmap.osds[i]); 116 + seq_printf(s, "]\n"); 117 + } 118 + for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) { 119 + struct ceph_pg_mapping *pg = 120 + rb_entry(n, struct ceph_pg_mapping, node); 121 + 122 + seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool, 123 + pg->pgid.seed); 124 + for (i = 0; i < pg->pg_upmap_items.len; i++) 125 + seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","), 126 + pg->pg_upmap_items.from_to[i][0], 127 + pg->pg_upmap_items.from_to[i][1]); 128 + seq_printf(s, "]\n"); 106 129 } 107 130 108 131 up_read(&osdc->lock); ··· 170 147 return 0; 171 148 } 172 149 150 + static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid) 151 + { 152 + seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed); 153 + if (spgid->shard != CEPH_SPG_NOSHARD) 154 + seq_printf(s, "s%d", spgid->shard); 155 + } 156 + 173 157 static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) 174 158 { 175 159 int i; 176 160 177 - seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); 161 + seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed); 162 + dump_spgid(s, &t->spgid); 163 + seq_puts(s, "\t["); 178 164 for (i = 0; i < t->up.size; i++) 179 165 seq_printf(s, "%s%d", (!i ? 
"" : ","), t->up.osds[i]); 180 166 seq_printf(s, "]/%d\t[", t->up.primary); 181 167 for (i = 0; i < t->acting.size; i++) 182 168 seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); 183 - seq_printf(s, "]/%d\t", t->acting.primary); 169 + seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch); 184 170 if (t->target_oloc.pool_ns) { 185 171 seq_printf(s, "%*pE/%*pE\t0x%x", 186 172 (int)t->target_oloc.pool_ns->len, ··· 266 234 mutex_unlock(&osd->lock); 267 235 } 268 236 237 + static void dump_snapid(struct seq_file *s, u64 snapid) 238 + { 239 + if (snapid == CEPH_NOSNAP) 240 + seq_puts(s, "head"); 241 + else if (snapid == CEPH_SNAPDIR) 242 + seq_puts(s, "snapdir"); 243 + else 244 + seq_printf(s, "%llx", snapid); 245 + } 246 + 247 + static void dump_name_escaped(struct seq_file *s, unsigned char *name, 248 + size_t len) 249 + { 250 + size_t i; 251 + 252 + for (i = 0; i < len; i++) { 253 + if (name[i] == '%' || name[i] == ':' || name[i] == '/' || 254 + name[i] < 32 || name[i] >= 127) { 255 + seq_printf(s, "%%%02x", name[i]); 256 + } else { 257 + seq_putc(s, name[i]); 258 + } 259 + } 260 + } 261 + 262 + static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid) 263 + { 264 + if (hoid->snapid == 0 && hoid->hash == 0 && !hoid->is_max && 265 + hoid->pool == S64_MIN) { 266 + seq_puts(s, "MIN"); 267 + return; 268 + } 269 + if (hoid->is_max) { 270 + seq_puts(s, "MAX"); 271 + return; 272 + } 273 + seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits); 274 + dump_name_escaped(s, hoid->nspace, hoid->nspace_len); 275 + seq_putc(s, ':'); 276 + dump_name_escaped(s, hoid->key, hoid->key_len); 277 + seq_putc(s, ':'); 278 + dump_name_escaped(s, hoid->oid, hoid->oid_len); 279 + seq_putc(s, ':'); 280 + dump_snapid(s, hoid->snapid); 281 + } 282 + 283 + static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd) 284 + { 285 + struct rb_node *n; 286 + 287 + mutex_lock(&osd->lock); 288 + for (n = rb_first(&osd->o_backoffs_by_id); n; n = 
rb_next(n)) { 289 + struct ceph_osd_backoff *backoff = 290 + rb_entry(n, struct ceph_osd_backoff, id_node); 291 + 292 + seq_printf(s, "osd%d\t", osd->o_osd); 293 + dump_spgid(s, &backoff->spgid); 294 + seq_printf(s, "\t%llu\t", backoff->id); 295 + dump_hoid(s, backoff->begin); 296 + seq_putc(s, '\t'); 297 + dump_hoid(s, backoff->end); 298 + seq_putc(s, '\n'); 299 + } 300 + 301 + mutex_unlock(&osd->lock); 302 + } 303 + 269 304 static int osdc_show(struct seq_file *s, void *pp) 270 305 { 271 306 struct ceph_client *client = s->private; ··· 357 258 dump_linger_requests(s, osd); 358 259 } 359 260 dump_linger_requests(s, &osdc->homeless_osd); 261 + 262 + seq_puts(s, "BACKOFFS\n"); 263 + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { 264 + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); 265 + 266 + dump_backoffs(s, osd); 267 + } 360 268 361 269 up_read(&osdc->lock); 362 270 return 0;

+6 -4

net/ceph/messenger.c

··· 1288 1288 m->hdr.seq = cpu_to_le64(++con->out_seq); 1289 1289 m->needs_out_seq = false; 1290 1290 } 1291 - WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); 1291 + 1292 + if (con->ops->reencode_message) 1293 + con->ops->reencode_message(m); 1292 1294 1293 1295 dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", 1294 1296 m, con->out_seq, le16_to_cpu(m->hdr.type), 1295 1297 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), 1296 1298 m->data_length); 1297 - BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 1299 + WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len)); 1300 + WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len)); 1298 1301 1299 1302 /* tag + hdr + front + middle */ 1300 1303 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); ··· 2036 2033 { 2037 2034 u64 sup_feat = from_msgr(con->msgr)->supported_features; 2038 2035 u64 req_feat = from_msgr(con->msgr)->required_features; 2039 - u64 server_feat = ceph_sanitize_features( 2040 - le64_to_cpu(con->in_reply.features)); 2036 + u64 server_feat = le64_to_cpu(con->in_reply.features); 2041 2037 int ret; 2042 2038 2043 2039 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);

+7 -1

net/ceph/mon_client.c

··· 6 6 #include <linux/random.h> 7 7 #include <linux/sched.h> 8 8 9 + #include <linux/ceph/ceph_features.h> 9 10 #include <linux/ceph/mon_client.h> 10 11 #include <linux/ceph/libceph.h> 11 12 #include <linux/ceph/debugfs.h> ··· 298 297 299 298 mutex_lock(&monc->mutex); 300 299 if (monc->sub_renew_sent) { 300 + /* 301 + * This is only needed for legacy (infernalis or older) 302 + * MONs -- see delayed_work(). 303 + */ 301 304 monc->sub_renew_after = monc->sub_renew_sent + 302 305 (seconds >> 1) * HZ - 1; 303 306 dout("%s sent %lu duration %d renew after %lu\n", __func__, ··· 960 955 __validate_auth(monc); 961 956 } 962 957 963 - if (is_auth) { 958 + if (is_auth && 959 + !(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) { 964 960 unsigned long now = jiffies; 965 961 966 962 dout("%s renew subs? now %lu renew after %lu\n",

+825 -82

net/ceph/osd_client.c

··· 12 12 #include <linux/bio.h> 13 13 #endif 14 14 15 + #include <linux/ceph/ceph_features.h> 15 16 #include <linux/ceph/libceph.h> 16 17 #include <linux/ceph/osd_client.h> 17 18 #include <linux/ceph/messenger.h> ··· 50 49 struct ceph_osd_linger_request *lreq); 51 50 static void unlink_linger(struct ceph_osd *osd, 52 51 struct ceph_osd_linger_request *lreq); 52 + static void clear_backoffs(struct ceph_osd *osd); 53 53 54 54 #if 1 55 55 static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) ··· 375 373 ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); 376 374 377 375 dest->pgid = src->pgid; /* struct */ 376 + dest->spgid = src->spgid; /* struct */ 378 377 dest->pg_num = src->pg_num; 379 378 dest->pg_num_mask = src->pg_num_mask; 380 379 ceph_osds_copy(&dest->acting, &src->acting); ··· 386 383 387 384 dest->flags = src->flags; 388 385 dest->paused = src->paused; 386 + 387 + dest->epoch = src->epoch; 388 + dest->last_force_resend = src->last_force_resend; 389 389 390 390 dest->osd = src->osd; 391 391 } ··· 543 537 } 544 538 EXPORT_SYMBOL(ceph_osdc_alloc_request); 545 539 546 - static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) 540 + static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) 547 541 { 548 542 return 8 + 4 + 4 + 4 + (oloc->pool_ns ? 
oloc->pool_ns->len : 0); 549 543 } ··· 558 552 WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); 559 553 560 554 /* create request message */ 561 - msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ 562 - msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ 555 + msg_size = CEPH_ENCODING_START_BLK_LEN + 556 + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ 557 + msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */ 558 + msg_size += CEPH_ENCODING_START_BLK_LEN + 559 + sizeof(struct ceph_osd_reqid); /* reqid */ 560 + msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */ 561 + msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */ 563 562 msg_size += CEPH_ENCODING_START_BLK_LEN + 564 563 ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ 565 - msg_size += 1 + 8 + 4 + 4; /* pgid */ 566 564 msg_size += 4 + req->r_base_oid.name_len; /* oid */ 567 565 msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); 568 566 msg_size += 8; /* snapid */ 569 567 msg_size += 8; /* snap_seq */ 570 568 msg_size += 4 + 8 * (req->r_snapc ? 
req->r_snapc->num_snaps : 0); 571 - msg_size += 4; /* retry_attempt */ 569 + msg_size += 4 + 8; /* retry_attempt, features */ 572 570 573 571 if (req->r_mempool) 574 572 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); ··· 1020 1010 RB_CLEAR_NODE(&osd->o_node); 1021 1011 osd->o_requests = RB_ROOT; 1022 1012 osd->o_linger_requests = RB_ROOT; 1013 + osd->o_backoff_mappings = RB_ROOT; 1014 + osd->o_backoffs_by_id = RB_ROOT; 1023 1015 INIT_LIST_HEAD(&osd->o_osd_lru); 1024 1016 INIT_LIST_HEAD(&osd->o_keepalive_item); 1025 1017 osd->o_incarnation = 1; ··· 1033 1021 WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); 1034 1022 WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); 1035 1023 WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); 1024 + WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings)); 1025 + WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id)); 1036 1026 WARN_ON(!list_empty(&osd->o_osd_lru)); 1037 1027 WARN_ON(!list_empty(&osd->o_keepalive_item)); 1038 1028 ··· 1155 1141 unlink_linger(osd, lreq); 1156 1142 link_linger(&osdc->homeless_osd, lreq); 1157 1143 } 1144 + clear_backoffs(osd); 1158 1145 1159 1146 __remove_osd_from_lru(osd); 1160 1147 erase_osd(&osdc->osds, osd); ··· 1312 1297 ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || 1313 1298 __pool_full(pi); 1314 1299 1315 - WARN_ON(pi->id != t->base_oloc.pool); 1300 + WARN_ON(pi->id != t->target_oloc.pool); 1316 1301 return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || 1317 1302 ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || 1318 1303 (osdc->osdmap->epoch < osdc->epoch_barrier); ··· 1326 1311 1327 1312 static enum calc_target_result calc_target(struct ceph_osd_client *osdc, 1328 1313 struct ceph_osd_request_target *t, 1329 - u32 *last_force_resend, 1314 + struct ceph_connection *con, 1330 1315 bool any_change) 1331 1316 { 1332 1317 struct ceph_pg_pool_info *pi; 1333 1318 struct ceph_pg pgid, last_pgid; 1334 1319 struct ceph_osds up, acting; 1335 1320 bool force_resend = false; 1336 - bool need_check_tiering = false; 1337 - bool need_resend = 
false; 1321 + bool unpaused = false; 1322 + bool legacy_change; 1323 + bool split = false; 1338 1324 bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); 1339 1325 enum calc_target_result ct_res; 1340 1326 int ret; 1341 1327 1328 + t->epoch = osdc->osdmap->epoch; 1342 1329 pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); 1343 1330 if (!pi) { 1344 1331 t->osd = CEPH_HOMELESS_OSD; ··· 1349 1332 } 1350 1333 1351 1334 if (osdc->osdmap->epoch == pi->last_force_request_resend) { 1352 - if (last_force_resend && 1353 - *last_force_resend < pi->last_force_request_resend) { 1354 - *last_force_resend = pi->last_force_request_resend; 1335 + if (t->last_force_resend < pi->last_force_request_resend) { 1336 + t->last_force_resend = pi->last_force_request_resend; 1355 1337 force_resend = true; 1356 - } else if (!last_force_resend) { 1338 + } else if (t->last_force_resend == 0) { 1357 1339 force_resend = true; 1358 1340 } 1359 1341 } 1360 - if (ceph_oid_empty(&t->target_oid) || force_resend) { 1361 - ceph_oid_copy(&t->target_oid, &t->base_oid); 1362 - need_check_tiering = true; 1363 - } 1364 - if (ceph_oloc_empty(&t->target_oloc) || force_resend) { 1365 - ceph_oloc_copy(&t->target_oloc, &t->base_oloc); 1366 - need_check_tiering = true; 1367 - } 1368 1342 1369 - if (need_check_tiering && 1370 - (t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 1343 + /* apply tiering */ 1344 + ceph_oid_copy(&t->target_oid, &t->base_oid); 1345 + ceph_oloc_copy(&t->target_oloc, &t->base_oloc); 1346 + if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 1371 1347 if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) 1372 1348 t->target_oloc.pool = pi->read_tier; 1373 1349 if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) 1374 1350 t->target_oloc.pool = pi->write_tier; 1351 + 1352 + pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); 1353 + if (!pi) { 1354 + t->osd = CEPH_HOMELESS_OSD; 1355 + ct_res = CALC_TARGET_POOL_DNE; 1356 + goto out; 1357 + } 1375 1358 
} 1376 1359 1377 - ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, 1378 - &t->target_oloc, &pgid); 1360 + ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc, 1361 + &pgid); 1379 1362 if (ret) { 1380 1363 WARN_ON(ret != -ENOENT); 1381 1364 t->osd = CEPH_HOMELESS_OSD; ··· 1385 1368 last_pgid.pool = pgid.pool; 1386 1369 last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); 1387 1370 1388 - ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); 1371 + ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting); 1389 1372 if (any_change && 1390 1373 ceph_is_new_interval(&t->acting, 1391 1374 &acting, ··· 1404 1387 1405 1388 if (t->paused && !target_should_be_paused(osdc, t, pi)) { 1406 1389 t->paused = false; 1407 - need_resend = true; 1390 + unpaused = true; 1408 1391 } 1392 + legacy_change = ceph_pg_compare(&t->pgid, &pgid) || 1393 + ceph_osds_changed(&t->acting, &acting, any_change); 1394 + if (t->pg_num) 1395 + split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); 1409 1396 1410 - if (ceph_pg_compare(&t->pgid, &pgid) || 1411 - ceph_osds_changed(&t->acting, &acting, any_change) || 1412 - force_resend) { 1397 + if (legacy_change || force_resend || split) { 1413 1398 t->pgid = pgid; /* struct */ 1399 + ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid); 1414 1400 ceph_osds_copy(&t->acting, &acting); 1415 1401 ceph_osds_copy(&t->up, &up); 1416 1402 t->size = pi->size; ··· 1423 1403 t->sort_bitwise = sort_bitwise; 1424 1404 1425 1405 t->osd = acting.primary; 1426 - need_resend = true; 1427 1406 } 1428 1407 1429 - ct_res = need_resend ? 
CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; 1408 + if (unpaused || legacy_change || force_resend || 1409 + (split && con && CEPH_HAVE_FEATURE(con->peer_features, 1410 + RESEND_ON_SPLIT))) 1411 + ct_res = CALC_TARGET_NEED_RESEND; 1412 + else 1413 + ct_res = CALC_TARGET_NO_ACTION; 1414 + 1430 1415 out: 1431 1416 dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); 1432 1417 return ct_res; 1418 + } 1419 + 1420 + static struct ceph_spg_mapping *alloc_spg_mapping(void) 1421 + { 1422 + struct ceph_spg_mapping *spg; 1423 + 1424 + spg = kmalloc(sizeof(*spg), GFP_NOIO); 1425 + if (!spg) 1426 + return NULL; 1427 + 1428 + RB_CLEAR_NODE(&spg->node); 1429 + spg->backoffs = RB_ROOT; 1430 + return spg; 1431 + } 1432 + 1433 + static void free_spg_mapping(struct ceph_spg_mapping *spg) 1434 + { 1435 + WARN_ON(!RB_EMPTY_NODE(&spg->node)); 1436 + WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs)); 1437 + 1438 + kfree(spg); 1439 + } 1440 + 1441 + /* 1442 + * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to 1443 + * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is 1444 + * defined only within a specific spgid; it does not pass anything to 1445 + * children on split, or to another primary. 1446 + */ 1447 + DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare, 1448 + RB_BYPTR, const struct ceph_spg *, node) 1449 + 1450 + static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid) 1451 + { 1452 + return hoid->is_max ? 
0x100000000ull : hoid->hash_reverse_bits; 1453 + } 1454 + 1455 + static void hoid_get_effective_key(const struct ceph_hobject_id *hoid, 1456 + void **pkey, size_t *pkey_len) 1457 + { 1458 + if (hoid->key_len) { 1459 + *pkey = hoid->key; 1460 + *pkey_len = hoid->key_len; 1461 + } else { 1462 + *pkey = hoid->oid; 1463 + *pkey_len = hoid->oid_len; 1464 + } 1465 + } 1466 + 1467 + static int compare_names(const void *name1, size_t name1_len, 1468 + const void *name2, size_t name2_len) 1469 + { 1470 + int ret; 1471 + 1472 + ret = memcmp(name1, name2, min(name1_len, name2_len)); 1473 + if (!ret) { 1474 + if (name1_len < name2_len) 1475 + ret = -1; 1476 + else if (name1_len > name2_len) 1477 + ret = 1; 1478 + } 1479 + return ret; 1480 + } 1481 + 1482 + static int hoid_compare(const struct ceph_hobject_id *lhs, 1483 + const struct ceph_hobject_id *rhs) 1484 + { 1485 + void *effective_key1, *effective_key2; 1486 + size_t effective_key1_len, effective_key2_len; 1487 + int ret; 1488 + 1489 + if (lhs->is_max < rhs->is_max) 1490 + return -1; 1491 + if (lhs->is_max > rhs->is_max) 1492 + return 1; 1493 + 1494 + if (lhs->pool < rhs->pool) 1495 + return -1; 1496 + if (lhs->pool > rhs->pool) 1497 + return 1; 1498 + 1499 + if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs)) 1500 + return -1; 1501 + if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs)) 1502 + return 1; 1503 + 1504 + ret = compare_names(lhs->nspace, lhs->nspace_len, 1505 + rhs->nspace, rhs->nspace_len); 1506 + if (ret) 1507 + return ret; 1508 + 1509 + hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len); 1510 + hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len); 1511 + ret = compare_names(effective_key1, effective_key1_len, 1512 + effective_key2, effective_key2_len); 1513 + if (ret) 1514 + return ret; 1515 + 1516 + ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len); 1517 + if (ret) 1518 + return ret; 1519 + 1520 + if (lhs->snapid < rhs->snapid) 1521 + return -1; 
1522 + if (lhs->snapid > rhs->snapid) 1523 + return 1; 1524 + 1525 + return 0; 1526 + } 1527 + 1528 + /* 1529 + * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX 1530 + * compat stuff here. 1531 + * 1532 + * Assumes @hoid is zero-initialized. 1533 + */ 1534 + static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid) 1535 + { 1536 + u8 struct_v; 1537 + u32 struct_len; 1538 + int ret; 1539 + 1540 + ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v, 1541 + &struct_len); 1542 + if (ret) 1543 + return ret; 1544 + 1545 + if (struct_v < 4) { 1546 + pr_err("got struct_v %d < 4 of hobject_t\n", struct_v); 1547 + goto e_inval; 1548 + } 1549 + 1550 + hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len, 1551 + GFP_NOIO); 1552 + if (IS_ERR(hoid->key)) { 1553 + ret = PTR_ERR(hoid->key); 1554 + hoid->key = NULL; 1555 + return ret; 1556 + } 1557 + 1558 + hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len, 1559 + GFP_NOIO); 1560 + if (IS_ERR(hoid->oid)) { 1561 + ret = PTR_ERR(hoid->oid); 1562 + hoid->oid = NULL; 1563 + return ret; 1564 + } 1565 + 1566 + ceph_decode_64_safe(p, end, hoid->snapid, e_inval); 1567 + ceph_decode_32_safe(p, end, hoid->hash, e_inval); 1568 + ceph_decode_8_safe(p, end, hoid->is_max, e_inval); 1569 + 1570 + hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len, 1571 + GFP_NOIO); 1572 + if (IS_ERR(hoid->nspace)) { 1573 + ret = PTR_ERR(hoid->nspace); 1574 + hoid->nspace = NULL; 1575 + return ret; 1576 + } 1577 + 1578 + ceph_decode_64_safe(p, end, hoid->pool, e_inval); 1579 + 1580 + ceph_hoid_build_hash_cache(hoid); 1581 + return 0; 1582 + 1583 + e_inval: 1584 + return -EINVAL; 1585 + } 1586 + 1587 + static int hoid_encoding_size(const struct ceph_hobject_id *hoid) 1588 + { 1589 + return 8 + 4 + 1 + 8 + /* snapid, hash, is_max, pool */ 1590 + 4 + hoid->key_len + 4 + hoid->oid_len + 4 + hoid->nspace_len; 1591 + } 1592 + 1593 + static void encode_hoid(void **p, void *end, const 
struct ceph_hobject_id *hoid) 1594 + { 1595 + ceph_start_encoding(p, 4, 3, hoid_encoding_size(hoid)); 1596 + ceph_encode_string(p, end, hoid->key, hoid->key_len); 1597 + ceph_encode_string(p, end, hoid->oid, hoid->oid_len); 1598 + ceph_encode_64(p, hoid->snapid); 1599 + ceph_encode_32(p, hoid->hash); 1600 + ceph_encode_8(p, hoid->is_max); 1601 + ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len); 1602 + ceph_encode_64(p, hoid->pool); 1603 + } 1604 + 1605 + static void free_hoid(struct ceph_hobject_id *hoid) 1606 + { 1607 + if (hoid) { 1608 + kfree(hoid->key); 1609 + kfree(hoid->oid); 1610 + kfree(hoid->nspace); 1611 + kfree(hoid); 1612 + } 1613 + } 1614 + 1615 + static struct ceph_osd_backoff *alloc_backoff(void) 1616 + { 1617 + struct ceph_osd_backoff *backoff; 1618 + 1619 + backoff = kzalloc(sizeof(*backoff), GFP_NOIO); 1620 + if (!backoff) 1621 + return NULL; 1622 + 1623 + RB_CLEAR_NODE(&backoff->spg_node); 1624 + RB_CLEAR_NODE(&backoff->id_node); 1625 + return backoff; 1626 + } 1627 + 1628 + static void free_backoff(struct ceph_osd_backoff *backoff) 1629 + { 1630 + WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node)); 1631 + WARN_ON(!RB_EMPTY_NODE(&backoff->id_node)); 1632 + 1633 + free_hoid(backoff->begin); 1634 + free_hoid(backoff->end); 1635 + kfree(backoff); 1636 + } 1637 + 1638 + /* 1639 + * Within a specific spgid, backoffs are managed by ->begin hoid. 
1640 + */ 1641 + DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare, 1642 + RB_BYVAL, spg_node); 1643 + 1644 + static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root, 1645 + const struct ceph_hobject_id *hoid) 1646 + { 1647 + struct rb_node *n = root->rb_node; 1648 + 1649 + while (n) { 1650 + struct ceph_osd_backoff *cur = 1651 + rb_entry(n, struct ceph_osd_backoff, spg_node); 1652 + int cmp; 1653 + 1654 + cmp = hoid_compare(hoid, cur->begin); 1655 + if (cmp < 0) { 1656 + n = n->rb_left; 1657 + } else if (cmp > 0) { 1658 + if (hoid_compare(hoid, cur->end) < 0) 1659 + return cur; 1660 + 1661 + n = n->rb_right; 1662 + } else { 1663 + return cur; 1664 + } 1665 + } 1666 + 1667 + return NULL; 1668 + } 1669 + 1670 + /* 1671 + * Each backoff has a unique id within its OSD session. 1672 + */ 1673 + DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node) 1674 + 1675 + static void clear_backoffs(struct ceph_osd *osd) 1676 + { 1677 + while (!RB_EMPTY_ROOT(&osd->o_backoff_mappings)) { 1678 + struct ceph_spg_mapping *spg = 1679 + rb_entry(rb_first(&osd->o_backoff_mappings), 1680 + struct ceph_spg_mapping, node); 1681 + 1682 + while (!RB_EMPTY_ROOT(&spg->backoffs)) { 1683 + struct ceph_osd_backoff *backoff = 1684 + rb_entry(rb_first(&spg->backoffs), 1685 + struct ceph_osd_backoff, spg_node); 1686 + 1687 + erase_backoff(&spg->backoffs, backoff); 1688 + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); 1689 + free_backoff(backoff); 1690 + } 1691 + erase_spg_mapping(&osd->o_backoff_mappings, spg); 1692 + free_spg_mapping(spg); 1693 + } 1694 + } 1695 + 1696 + /* 1697 + * Set up a temporary, non-owning view into @t. 
1698 + */ 1699 + static void hoid_fill_from_target(struct ceph_hobject_id *hoid, 1700 + const struct ceph_osd_request_target *t) 1701 + { 1702 + hoid->key = NULL; 1703 + hoid->key_len = 0; 1704 + hoid->oid = t->target_oid.name; 1705 + hoid->oid_len = t->target_oid.name_len; 1706 + hoid->snapid = CEPH_NOSNAP; 1707 + hoid->hash = t->pgid.seed; 1708 + hoid->is_max = false; 1709 + if (t->target_oloc.pool_ns) { 1710 + hoid->nspace = t->target_oloc.pool_ns->str; 1711 + hoid->nspace_len = t->target_oloc.pool_ns->len; 1712 + } else { 1713 + hoid->nspace = NULL; 1714 + hoid->nspace_len = 0; 1715 + } 1716 + hoid->pool = t->target_oloc.pool; 1717 + ceph_hoid_build_hash_cache(hoid); 1718 + } 1719 + 1720 + static bool should_plug_request(struct ceph_osd_request *req) 1721 + { 1722 + struct ceph_osd *osd = req->r_osd; 1723 + struct ceph_spg_mapping *spg; 1724 + struct ceph_osd_backoff *backoff; 1725 + struct ceph_hobject_id hoid; 1726 + 1727 + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid); 1728 + if (!spg) 1729 + return false; 1730 + 1731 + hoid_fill_from_target(&hoid, &req->r_t); 1732 + backoff = lookup_containing_backoff(&spg->backoffs, &hoid); 1733 + if (!backoff) 1734 + return false; 1735 + 1736 + dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n", 1737 + __func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool, 1738 + backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id); 1739 + return true; 1433 1740 } 1434 1741 1435 1742 static void setup_request_data(struct ceph_osd_request *req, ··· 1830 1483 WARN_ON(data_len != msg->data_length); 1831 1484 } 1832 1485 1833 - static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) 1486 + static void encode_pgid(void **p, const struct ceph_pg *pgid) 1487 + { 1488 + ceph_encode_8(p, 1); 1489 + ceph_encode_64(p, pgid->pool); 1490 + ceph_encode_32(p, pgid->seed); 1491 + ceph_encode_32(p, -1); /* preferred */ 1492 + } 1493 + 1494 + static void encode_spgid(void **p, 
const struct ceph_spg *spgid) 1495 + { 1496 + ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1); 1497 + encode_pgid(p, &spgid->pgid); 1498 + ceph_encode_8(p, spgid->shard); 1499 + } 1500 + 1501 + static void encode_oloc(void **p, void *end, 1502 + const struct ceph_object_locator *oloc) 1503 + { 1504 + ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc)); 1505 + ceph_encode_64(p, oloc->pool); 1506 + ceph_encode_32(p, -1); /* preferred */ 1507 + ceph_encode_32(p, 0); /* key len */ 1508 + if (oloc->pool_ns) 1509 + ceph_encode_string(p, end, oloc->pool_ns->str, 1510 + oloc->pool_ns->len); 1511 + else 1512 + ceph_encode_32(p, 0); 1513 + } 1514 + 1515 + static void encode_request_partial(struct ceph_osd_request *req, 1516 + struct ceph_msg *msg) 1834 1517 { 1835 1518 void *p = msg->front.iov_base; 1836 1519 void *const end = p + msg->front_alloc_len; ··· 1877 1500 1878 1501 setup_request_data(req, msg); 1879 1502 1880 - ceph_encode_32(&p, 1); /* client_inc, always 1 */ 1503 + encode_spgid(&p, &req->r_t.spgid); /* actual spg */ 1504 + ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ 1881 1505 ceph_encode_32(&p, req->r_osdc->osdmap->epoch); 1882 1506 ceph_encode_32(&p, req->r_flags); 1507 + 1508 + /* reqid */ 1509 + ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid)); 1510 + memset(p, 0, sizeof(struct ceph_osd_reqid)); 1511 + p += sizeof(struct ceph_osd_reqid); 1512 + 1513 + /* trace */ 1514 + memset(p, 0, sizeof(struct ceph_blkin_trace_info)); 1515 + p += sizeof(struct ceph_blkin_trace_info); 1516 + 1517 + ceph_encode_32(&p, 0); /* client_inc, always 0 */ 1883 1518 ceph_encode_timespec(p, &req->r_mtime); 1884 1519 p += sizeof(struct ceph_timespec); 1885 1520 1886 - /* reassert_version */ 1887 - memset(p, 0, sizeof(struct ceph_eversion)); 1888 - p += sizeof(struct ceph_eversion); 1889 - 1890 - /* oloc */ 1891 - ceph_start_encoding(&p, 5, 4, 1892 - ceph_oloc_encoding_size(&req->r_t.target_oloc)); 1893 - ceph_encode_64(&p, 
req->r_t.target_oloc.pool); 1894 - ceph_encode_32(&p, -1); /* preferred */ 1895 - ceph_encode_32(&p, 0); /* key len */ 1896 - if (req->r_t.target_oloc.pool_ns) 1897 - ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str, 1898 - req->r_t.target_oloc.pool_ns->len); 1899 - else 1900 - ceph_encode_32(&p, 0); 1901 - 1902 - /* pgid */ 1903 - ceph_encode_8(&p, 1); 1904 - ceph_encode_64(&p, req->r_t.pgid.pool); 1905 - ceph_encode_32(&p, req->r_t.pgid.seed); 1906 - ceph_encode_32(&p, -1); /* preferred */ 1907 - 1908 - /* oid */ 1909 - ceph_encode_32(&p, req->r_t.target_oid.name_len); 1910 - memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); 1911 - p += req->r_t.target_oid.name_len; 1521 + encode_oloc(&p, end, &req->r_t.target_oloc); 1522 + ceph_encode_string(&p, end, req->r_t.target_oid.name, 1523 + req->r_t.target_oid.name_len); 1912 1524 1913 1525 /* ops, can imply data */ 1914 1526 ceph_encode_16(&p, req->r_num_ops); ··· 1918 1552 } 1919 1553 1920 1554 ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ 1555 + BUG_ON(p != end - 8); /* space for features */ 1921 1556 1922 - BUG_ON(p > end); 1923 - msg->front.iov_len = p - msg->front.iov_base; 1924 - msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ 1925 - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1557 + msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */ 1558 + /* front_len is finalized in encode_request_finish() */ 1926 1559 msg->hdr.data_len = cpu_to_le32(data_len); 1927 1560 /* 1928 1561 * The header "data_off" is a hint to the receiver allowing it ··· 1930 1565 */ 1931 1566 msg->hdr.data_off = cpu_to_le16(req->r_data_offset); 1932 1567 1933 - dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, 1934 - req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, 1935 - msg->front.iov_len, data_len); 1568 + dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg, 1569 + req->r_t.target_oid.name, req->r_t.target_oid.name_len); 1570 + } 1571 + 1572 + static 
void encode_request_finish(struct ceph_msg *msg) 1573 + { 1574 + void *p = msg->front.iov_base; 1575 + void *const end = p + msg->front_alloc_len; 1576 + 1577 + if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) { 1578 + /* luminous OSD -- encode features and be done */ 1579 + p = end - 8; 1580 + ceph_encode_64(&p, msg->con->peer_features); 1581 + } else { 1582 + struct { 1583 + char spgid[CEPH_ENCODING_START_BLK_LEN + 1584 + CEPH_PGID_ENCODING_LEN + 1]; 1585 + __le32 hash; 1586 + __le32 epoch; 1587 + __le32 flags; 1588 + char reqid[CEPH_ENCODING_START_BLK_LEN + 1589 + sizeof(struct ceph_osd_reqid)]; 1590 + char trace[sizeof(struct ceph_blkin_trace_info)]; 1591 + __le32 client_inc; 1592 + struct ceph_timespec mtime; 1593 + } __packed head; 1594 + struct ceph_pg pgid; 1595 + void *oloc, *oid, *tail; 1596 + int oloc_len, oid_len, tail_len; 1597 + int len; 1598 + 1599 + /* 1600 + * Pre-luminous OSD -- reencode v8 into v4 using @head 1601 + * as a temporary buffer. Encode the raw PG; the rest 1602 + * is just a matter of moving oloc, oid and tail blobs 1603 + * around. 
1604 + */ 1605 + memcpy(&head, p, sizeof(head)); 1606 + p += sizeof(head); 1607 + 1608 + oloc = p; 1609 + p += CEPH_ENCODING_START_BLK_LEN; 1610 + pgid.pool = ceph_decode_64(&p); 1611 + p += 4 + 4; /* preferred, key len */ 1612 + len = ceph_decode_32(&p); 1613 + p += len; /* nspace */ 1614 + oloc_len = p - oloc; 1615 + 1616 + oid = p; 1617 + len = ceph_decode_32(&p); 1618 + p += len; 1619 + oid_len = p - oid; 1620 + 1621 + tail = p; 1622 + tail_len = (end - p) - 8; 1623 + 1624 + p = msg->front.iov_base; 1625 + ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc)); 1626 + ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch)); 1627 + ceph_encode_copy(&p, &head.flags, sizeof(head.flags)); 1628 + ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime)); 1629 + 1630 + /* reassert_version */ 1631 + memset(p, 0, sizeof(struct ceph_eversion)); 1632 + p += sizeof(struct ceph_eversion); 1633 + 1634 + BUG_ON(p >= oloc); 1635 + memmove(p, oloc, oloc_len); 1636 + p += oloc_len; 1637 + 1638 + pgid.seed = le32_to_cpu(head.hash); 1639 + encode_pgid(&p, &pgid); /* raw pg */ 1640 + 1641 + BUG_ON(p >= oid); 1642 + memmove(p, oid, oid_len); 1643 + p += oid_len; 1644 + 1645 + /* tail -- ops, snapid, snapc, retry_attempt */ 1646 + BUG_ON(p >= tail); 1647 + memmove(p, tail, tail_len); 1648 + p += tail_len; 1649 + 1650 + msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */ 1651 + } 1652 + 1653 + BUG_ON(p > end); 1654 + msg->front.iov_len = p - msg->front.iov_base; 1655 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 1656 + 1657 + dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg, 1658 + le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len), 1659 + le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len), 1660 + le16_to_cpu(msg->hdr.version)); 1936 1661 } 1937 1662 1938 1663 /* ··· 2034 1579 2035 1580 verify_osd_locked(osd); 2036 1581 WARN_ON(osd->o_osd != req->r_t.osd); 1582 + 1583 + /* backoff? 
*/ 1584 + if (should_plug_request(req)) 1585 + return; 2037 1586 2038 1587 /* 2039 1588 * We may have a previously queued request message hanging ··· 2052 1593 else 2053 1594 WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); 2054 1595 2055 - encode_request(req, req->r_request); 1596 + encode_request_partial(req, req->r_request); 2056 1597 2057 - dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", 1598 + dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n", 2058 1599 __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, 2059 - req->r_t.osd, req->r_flags, req->r_attempts); 1600 + req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed, 1601 + req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags, 1602 + req->r_attempts); 2060 1603 2061 1604 req->r_t.paused = false; 2062 1605 req->r_stamp = jiffies; ··· 2106 1645 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 2107 1646 2108 1647 again: 2109 - ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); 1648 + ct_res = calc_target(osdc, &req->r_t, NULL, false); 2110 1649 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) 2111 1650 goto promote; 2112 1651 ··· 2198 1737 static void finish_request(struct ceph_osd_request *req) 2199 1738 { 2200 1739 struct ceph_osd_client *osdc = req->r_osdc; 2201 - struct ceph_osd *osd = req->r_osd; 2202 - 2203 - verify_osd_locked(osd); 2204 - dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 2205 1740 2206 1741 WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); 2207 - unlink_request(osd, req); 1742 + dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 1743 + 1744 + if (req->r_osd) 1745 + unlink_request(req->r_osd, req); 2208 1746 atomic_dec(&osdc->num_requests); 2209 1747 2210 1748 /* ··· 2901 2441 struct ceph_osd_client *osdc = lreq->osdc; 2902 2442 struct ceph_osd *osd; 2903 2443 2904 - calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); 2444 + calc_target(osdc, 
&lreq->t, NULL, false); 2905 2445 osd = lookup_create_osd(osdc, lreq->t.osd, true); 2906 2446 link_linger(osd, lreq); 2907 2447 ··· 3519 3059 struct ceph_osd_client *osdc = lreq->osdc; 3520 3060 enum calc_target_result ct_res; 3521 3061 3522 - ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); 3062 + ct_res = calc_target(osdc, &lreq->t, NULL, true); 3523 3063 if (ct_res == CALC_TARGET_NEED_RESEND) { 3524 3064 struct ceph_osd *osd; 3525 3065 ··· 3577 3117 list_add_tail(&lreq->scan_item, need_resend_linger); 3578 3118 break; 3579 3119 case CALC_TARGET_POOL_DNE: 3120 + list_del_init(&lreq->scan_item); 3580 3121 check_linger_pool_dne(lreq); 3581 3122 break; 3582 3123 } ··· 3591 3130 n = rb_next(n); /* unlink_request(), check_pool_dne() */ 3592 3131 3593 3132 dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 3594 - ct_res = calc_target(osdc, &req->r_t, 3595 - &req->r_last_force_resend, false); 3133 + ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, 3134 + false); 3596 3135 switch (ct_res) { 3597 3136 case CALC_TARGET_NO_ACTION: 3598 3137 force_resend_writes = cleared_full || ··· 3690 3229 struct list_head *need_resend_linger) 3691 3230 { 3692 3231 struct ceph_osd_linger_request *lreq, *nlreq; 3232 + enum calc_target_result ct_res; 3693 3233 struct rb_node *n; 3234 + 3235 + /* make sure need_resend targets reflect latest map */ 3236 + for (n = rb_first(need_resend); n; ) { 3237 + struct ceph_osd_request *req = 3238 + rb_entry(n, struct ceph_osd_request, r_node); 3239 + 3240 + n = rb_next(n); 3241 + 3242 + if (req->r_t.epoch < osdc->osdmap->epoch) { 3243 + ct_res = calc_target(osdc, &req->r_t, NULL, false); 3244 + if (ct_res == CALC_TARGET_POOL_DNE) { 3245 + erase_request(need_resend, req); 3246 + check_pool_dne(req); 3247 + } 3248 + } 3249 + } 3694 3250 3695 3251 for (n = rb_first(need_resend); n; ) { 3696 3252 struct ceph_osd_request *req = ··· 3717 3239 n = rb_next(n); 3718 3240 erase_request(need_resend, req); /* before 
link_request() */ 3719 3241 3720 - WARN_ON(req->r_osd); 3721 - calc_target(osdc, &req->r_t, NULL, false); 3722 3242 osd = lookup_create_osd(osdc, req->r_t.osd, true); 3723 3243 link_request(osd, req); 3724 3244 if (!req->r_linger) { ··· 3859 3383 { 3860 3384 struct rb_node *n; 3861 3385 3386 + clear_backoffs(osd); 3387 + 3862 3388 for (n = rb_first(&osd->o_requests); n; ) { 3863 3389 struct ceph_osd_request *req = 3864 3390 rb_entry(n, struct ceph_osd_request, r_node); ··· 3904 3426 3905 3427 out_unlock: 3906 3428 up_write(&osdc->lock); 3429 + } 3430 + 3431 + struct MOSDBackoff { 3432 + struct ceph_spg spgid; 3433 + u32 map_epoch; 3434 + u8 op; 3435 + u64 id; 3436 + struct ceph_hobject_id *begin; 3437 + struct ceph_hobject_id *end; 3438 + }; 3439 + 3440 + static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m) 3441 + { 3442 + void *p = msg->front.iov_base; 3443 + void *const end = p + msg->front.iov_len; 3444 + u8 struct_v; 3445 + u32 struct_len; 3446 + int ret; 3447 + 3448 + ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len); 3449 + if (ret) 3450 + return ret; 3451 + 3452 + ret = ceph_decode_pgid(&p, end, &m->spgid.pgid); 3453 + if (ret) 3454 + return ret; 3455 + 3456 + ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval); 3457 + ceph_decode_32_safe(&p, end, m->map_epoch, e_inval); 3458 + ceph_decode_8_safe(&p, end, m->op, e_inval); 3459 + ceph_decode_64_safe(&p, end, m->id, e_inval); 3460 + 3461 + m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO); 3462 + if (!m->begin) 3463 + return -ENOMEM; 3464 + 3465 + ret = decode_hoid(&p, end, m->begin); 3466 + if (ret) { 3467 + free_hoid(m->begin); 3468 + return ret; 3469 + } 3470 + 3471 + m->end = kzalloc(sizeof(*m->end), GFP_NOIO); 3472 + if (!m->end) { 3473 + free_hoid(m->begin); 3474 + return -ENOMEM; 3475 + } 3476 + 3477 + ret = decode_hoid(&p, end, m->end); 3478 + if (ret) { 3479 + free_hoid(m->begin); 3480 + free_hoid(m->end); 3481 + return ret; 3482 + } 3483 + 3484 + return 
0; 3485 + 3486 + e_inval: 3487 + return -EINVAL; 3488 + } 3489 + 3490 + static struct ceph_msg *create_backoff_message( 3491 + const struct ceph_osd_backoff *backoff, 3492 + u32 map_epoch) 3493 + { 3494 + struct ceph_msg *msg; 3495 + void *p, *end; 3496 + int msg_size; 3497 + 3498 + msg_size = CEPH_ENCODING_START_BLK_LEN + 3499 + CEPH_PGID_ENCODING_LEN + 1; /* spgid */ 3500 + msg_size += 4 + 1 + 8; /* map_epoch, op, id */ 3501 + msg_size += CEPH_ENCODING_START_BLK_LEN + 3502 + hoid_encoding_size(backoff->begin); 3503 + msg_size += CEPH_ENCODING_START_BLK_LEN + 3504 + hoid_encoding_size(backoff->end); 3505 + 3506 + msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true); 3507 + if (!msg) 3508 + return NULL; 3509 + 3510 + p = msg->front.iov_base; 3511 + end = p + msg->front_alloc_len; 3512 + 3513 + encode_spgid(&p, &backoff->spgid); 3514 + ceph_encode_32(&p, map_epoch); 3515 + ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK); 3516 + ceph_encode_64(&p, backoff->id); 3517 + encode_hoid(&p, end, backoff->begin); 3518 + encode_hoid(&p, end, backoff->end); 3519 + BUG_ON(p != end); 3520 + 3521 + msg->front.iov_len = p - msg->front.iov_base; 3522 + msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */ 3523 + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 3524 + 3525 + return msg; 3526 + } 3527 + 3528 + static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m) 3529 + { 3530 + struct ceph_spg_mapping *spg; 3531 + struct ceph_osd_backoff *backoff; 3532 + struct ceph_msg *msg; 3533 + 3534 + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, 3535 + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); 3536 + 3537 + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid); 3538 + if (!spg) { 3539 + spg = alloc_spg_mapping(); 3540 + if (!spg) { 3541 + pr_err("%s failed to allocate spg\n", __func__); 3542 + return; 3543 + } 3544 + spg->spgid = m->spgid; /* struct */ 3545 + 
insert_spg_mapping(&osd->o_backoff_mappings, spg); 3546 + } 3547 + 3548 + backoff = alloc_backoff(); 3549 + if (!backoff) { 3550 + pr_err("%s failed to allocate backoff\n", __func__); 3551 + return; 3552 + } 3553 + backoff->spgid = m->spgid; /* struct */ 3554 + backoff->id = m->id; 3555 + backoff->begin = m->begin; 3556 + m->begin = NULL; /* backoff now owns this */ 3557 + backoff->end = m->end; 3558 + m->end = NULL; /* ditto */ 3559 + 3560 + insert_backoff(&spg->backoffs, backoff); 3561 + insert_backoff_by_id(&osd->o_backoffs_by_id, backoff); 3562 + 3563 + /* 3564 + * Ack with original backoff's epoch so that the OSD can 3565 + * discard this if there was a PG split. 3566 + */ 3567 + msg = create_backoff_message(backoff, m->map_epoch); 3568 + if (!msg) { 3569 + pr_err("%s failed to allocate msg\n", __func__); 3570 + return; 3571 + } 3572 + ceph_con_send(&osd->o_con, msg); 3573 + } 3574 + 3575 + static bool target_contained_by(const struct ceph_osd_request_target *t, 3576 + const struct ceph_hobject_id *begin, 3577 + const struct ceph_hobject_id *end) 3578 + { 3579 + struct ceph_hobject_id hoid; 3580 + int cmp; 3581 + 3582 + hoid_fill_from_target(&hoid, t); 3583 + cmp = hoid_compare(&hoid, begin); 3584 + return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0); 3585 + } 3586 + 3587 + static void handle_backoff_unblock(struct ceph_osd *osd, 3588 + const struct MOSDBackoff *m) 3589 + { 3590 + struct ceph_spg_mapping *spg; 3591 + struct ceph_osd_backoff *backoff; 3592 + struct rb_node *n; 3593 + 3594 + dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd, 3595 + m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id); 3596 + 3597 + backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id); 3598 + if (!backoff) { 3599 + pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n", 3600 + __func__, osd->o_osd, m->spgid.pgid.pool, 3601 + m->spgid.pgid.seed, m->spgid.shard, m->id); 3602 + return; 3603 + } 3604 + 3605 + if (hoid_compare(backoff->begin, 
m->begin) && 3606 + hoid_compare(backoff->end, m->end)) { 3607 + pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n", 3608 + __func__, osd->o_osd, m->spgid.pgid.pool, 3609 + m->spgid.pgid.seed, m->spgid.shard, m->id); 3610 + /* unblock it anyway... */ 3611 + } 3612 + 3613 + spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid); 3614 + BUG_ON(!spg); 3615 + 3616 + erase_backoff(&spg->backoffs, backoff); 3617 + erase_backoff_by_id(&osd->o_backoffs_by_id, backoff); 3618 + free_backoff(backoff); 3619 + 3620 + if (RB_EMPTY_ROOT(&spg->backoffs)) { 3621 + erase_spg_mapping(&osd->o_backoff_mappings, spg); 3622 + free_spg_mapping(spg); 3623 + } 3624 + 3625 + for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) { 3626 + struct ceph_osd_request *req = 3627 + rb_entry(n, struct ceph_osd_request, r_node); 3628 + 3629 + if (!ceph_spg_compare(&req->r_t.spgid, &m->spgid)) { 3630 + /* 3631 + * Match against @m, not @backoff -- the PG may 3632 + * have split on the OSD. 3633 + */ 3634 + if (target_contained_by(&req->r_t, m->begin, m->end)) { 3635 + /* 3636 + * If no other installed backoff applies, 3637 + * resend. 
3638 + */ 3639 + send_request(req); 3640 + } 3641 + } 3642 + } 3643 + } 3644 + 3645 + static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg) 3646 + { 3647 + struct ceph_osd_client *osdc = osd->o_osdc; 3648 + struct MOSDBackoff m; 3649 + int ret; 3650 + 3651 + down_read(&osdc->lock); 3652 + if (!osd_registered(osd)) { 3653 + dout("%s osd%d unknown\n", __func__, osd->o_osd); 3654 + up_read(&osdc->lock); 3655 + return; 3656 + } 3657 + WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num)); 3658 + 3659 + mutex_lock(&osd->lock); 3660 + ret = decode_MOSDBackoff(msg, &m); 3661 + if (ret) { 3662 + pr_err("failed to decode MOSDBackoff: %d\n", ret); 3663 + ceph_msg_dump(msg); 3664 + goto out_unlock; 3665 + } 3666 + 3667 + switch (m.op) { 3668 + case CEPH_OSD_BACKOFF_OP_BLOCK: 3669 + handle_backoff_block(osd, &m); 3670 + break; 3671 + case CEPH_OSD_BACKOFF_OP_UNBLOCK: 3672 + handle_backoff_unblock(osd, &m); 3673 + break; 3674 + default: 3675 + pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op); 3676 + } 3677 + 3678 + free_hoid(m.begin); 3679 + free_hoid(m.end); 3680 + 3681 + out_unlock: 3682 + mutex_unlock(&osd->lock); 3683 + up_read(&osdc->lock); 3907 3684 } 3908 3685 3909 3686 /* ··· 5098 4365 case CEPH_MSG_OSD_OPREPLY: 5099 4366 handle_reply(osd, msg); 5100 4367 break; 4368 + case CEPH_MSG_OSD_BACKOFF: 4369 + handle_backoff(osd, msg); 4370 + break; 5101 4371 case CEPH_MSG_WATCH_NOTIFY: 5102 4372 handle_watch_notify(osdc, msg); 5103 4373 break; ··· 5223 4487 *skip = 0; 5224 4488 switch (type) { 5225 4489 case CEPH_MSG_OSD_MAP: 4490 + case CEPH_MSG_OSD_BACKOFF: 5226 4491 case CEPH_MSG_WATCH_NOTIFY: 5227 4492 return alloc_msg_with_page_vector(hdr); 5228 4493 case CEPH_MSG_OSD_OPREPLY: ··· 5308 4571 return ceph_monc_validate_auth(&osdc->client->monc); 5309 4572 } 5310 4573 4574 + static void osd_reencode_message(struct ceph_msg *msg) 4575 + { 4576 + encode_request_finish(msg); 4577 + } 4578 + 5311 4579 static int osd_sign_message(struct ceph_msg *msg) 
5312 4580 { 5313 4581 struct ceph_osd *o = msg->con->private; ··· 5337 4595 .verify_authorizer_reply = verify_authorizer_reply, 5338 4596 .invalidate_authorizer = invalidate_authorizer, 5339 4597 .alloc_msg = alloc_msg, 4598 + .reencode_message = osd_reencode_message, 5340 4599 .sign_message = osd_sign_message, 5341 4600 .check_message_signature = osd_check_message_signature, 5342 4601 .fault = osd_fault,

+660 -200

net/ceph/osdmap.c

··· 11 11 #include <linux/crush/hash.h> 12 12 #include <linux/crush/mapper.h> 13 13 14 - char *ceph_osdmap_state_str(char *str, int len, int state) 14 + char *ceph_osdmap_state_str(char *str, int len, u32 state) 15 15 { 16 16 if (!len) 17 17 return str; ··· 138 138 return -EINVAL; 139 139 } 140 140 141 - static int skip_name_map(void **p, void *end) 141 + static struct crush_choose_arg_map *alloc_choose_arg_map(void) 142 142 { 143 - int len; 144 - ceph_decode_32_safe(p, end, len ,bad); 145 - while (len--) { 146 - int strlen; 147 - *p += sizeof(u32); 148 - ceph_decode_32_safe(p, end, strlen, bad); 149 - *p += strlen; 143 + struct crush_choose_arg_map *arg_map; 144 + 145 + arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); 146 + if (!arg_map) 147 + return NULL; 148 + 149 + RB_CLEAR_NODE(&arg_map->node); 150 + return arg_map; 150 151 } 151 - return 0; 152 - bad: 153 - return -EINVAL; 152 + 153 + static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) 154 + { 155 + if (arg_map) { 156 + int i, j; 157 + 158 + WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); 159 + 160 + for (i = 0; i < arg_map->size; i++) { 161 + struct crush_choose_arg *arg = &arg_map->args[i]; 162 + 163 + for (j = 0; j < arg->weight_set_size; j++) 164 + kfree(arg->weight_set[j].weights); 165 + kfree(arg->weight_set); 166 + kfree(arg->ids); 167 + } 168 + kfree(arg_map->args); 169 + kfree(arg_map); 170 + } 171 + } 172 + 173 + DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, 174 + node); 175 + 176 + void clear_choose_args(struct crush_map *c) 177 + { 178 + while (!RB_EMPTY_ROOT(&c->choose_args)) { 179 + struct crush_choose_arg_map *arg_map = 180 + rb_entry(rb_first(&c->choose_args), 181 + struct crush_choose_arg_map, node); 182 + 183 + erase_choose_arg_map(&c->choose_args, arg_map); 184 + free_choose_arg_map(arg_map); 185 + } 186 + } 187 + 188 + static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) 189 + { 190 + u32 *a = NULL; 191 + u32 len; 192 + int ret; 193 + 
194 + ceph_decode_32_safe(p, end, len, e_inval); 195 + if (len) { 196 + u32 i; 197 + 198 + a = kmalloc_array(len, sizeof(u32), GFP_NOIO); 199 + if (!a) { 200 + ret = -ENOMEM; 201 + goto fail; 202 + } 203 + 204 + ceph_decode_need(p, end, len * sizeof(u32), e_inval); 205 + for (i = 0; i < len; i++) 206 + a[i] = ceph_decode_32(p); 207 + } 208 + 209 + *plen = len; 210 + return a; 211 + 212 + e_inval: 213 + ret = -EINVAL; 214 + fail: 215 + kfree(a); 216 + return ERR_PTR(ret); 217 + } 218 + 219 + /* 220 + * Assumes @arg is zero-initialized. 221 + */ 222 + static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) 223 + { 224 + int ret; 225 + 226 + ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); 227 + if (arg->weight_set_size) { 228 + u32 i; 229 + 230 + arg->weight_set = kmalloc_array(arg->weight_set_size, 231 + sizeof(*arg->weight_set), 232 + GFP_NOIO); 233 + if (!arg->weight_set) 234 + return -ENOMEM; 235 + 236 + for (i = 0; i < arg->weight_set_size; i++) { 237 + struct crush_weight_set *w = &arg->weight_set[i]; 238 + 239 + w->weights = decode_array_32_alloc(p, end, &w->size); 240 + if (IS_ERR(w->weights)) { 241 + ret = PTR_ERR(w->weights); 242 + w->weights = NULL; 243 + return ret; 244 + } 245 + } 246 + } 247 + 248 + arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); 249 + if (IS_ERR(arg->ids)) { 250 + ret = PTR_ERR(arg->ids); 251 + arg->ids = NULL; 252 + return ret; 253 + } 254 + 255 + return 0; 256 + 257 + e_inval: 258 + return -EINVAL; 259 + } 260 + 261 + static int decode_choose_args(void **p, void *end, struct crush_map *c) 262 + { 263 + struct crush_choose_arg_map *arg_map = NULL; 264 + u32 num_choose_arg_maps, num_buckets; 265 + int ret; 266 + 267 + ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); 268 + while (num_choose_arg_maps--) { 269 + arg_map = alloc_choose_arg_map(); 270 + if (!arg_map) { 271 + ret = -ENOMEM; 272 + goto fail; 273 + } 274 + 275 + ceph_decode_64_safe(p, end, arg_map->choose_args_index, 276 + 
e_inval); 277 + arg_map->size = c->max_buckets; 278 + arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), 279 + GFP_NOIO); 280 + if (!arg_map->args) { 281 + ret = -ENOMEM; 282 + goto fail; 283 + } 284 + 285 + ceph_decode_32_safe(p, end, num_buckets, e_inval); 286 + while (num_buckets--) { 287 + struct crush_choose_arg *arg; 288 + u32 bucket_index; 289 + 290 + ceph_decode_32_safe(p, end, bucket_index, e_inval); 291 + if (bucket_index >= arg_map->size) 292 + goto e_inval; 293 + 294 + arg = &arg_map->args[bucket_index]; 295 + ret = decode_choose_arg(p, end, arg); 296 + if (ret) 297 + goto fail; 298 + } 299 + 300 + insert_choose_arg_map(&c->choose_args, arg_map); 301 + } 302 + 303 + return 0; 304 + 305 + e_inval: 306 + ret = -EINVAL; 307 + fail: 308 + free_choose_arg_map(arg_map); 309 + return ret; 154 310 } 155 311 156 312 static void crush_finalize(struct crush_map *c) ··· 343 187 void **p = &pbyval; 344 188 void *start = pbyval; 345 189 u32 magic; 346 - u32 num_name_maps; 347 190 348 191 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 349 192 350 193 c = kzalloc(sizeof(*c), GFP_NOFS); 351 194 if (c == NULL) 352 195 return ERR_PTR(-ENOMEM); 196 + 197 + c->choose_args = RB_ROOT; 353 198 354 199 /* set tunables to default values */ 355 200 c->choose_local_tries = 2; ··· 510 353 } 511 354 } 512 355 513 - /* ignore trailing name maps. 
*/ 514 - for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { 515 - err = skip_name_map(p, end); 516 - if (err < 0) 517 - goto done; 518 - } 356 + ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */ 357 + ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */ 358 + ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ 519 359 520 360 /* tunables */ 521 361 ceph_decode_need(p, end, 3*sizeof(u32), done); ··· 545 391 dout("crush decode tunable chooseleaf_stable = %d\n", 546 392 c->chooseleaf_stable); 547 393 394 + if (*p != end) { 395 + /* class_map */ 396 + ceph_decode_skip_map(p, end, 32, 32, bad); 397 + /* class_name */ 398 + ceph_decode_skip_map(p, end, 32, string, bad); 399 + /* class_bucket */ 400 + ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); 401 + } 402 + 403 + if (*p != end) { 404 + err = decode_choose_args(p, end, c); 405 + if (err) 406 + goto bad; 407 + } 408 + 548 409 done: 549 410 crush_finalize(c); 550 411 dout("crush_decode success\n"); ··· 587 418 return 0; 588 419 } 589 420 421 + int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) 422 + { 423 + int ret; 424 + 425 + ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); 426 + if (ret) 427 + return ret; 428 + 429 + if (lhs->shard < rhs->shard) 430 + return -1; 431 + if (lhs->shard > rhs->shard) 432 + return 1; 433 + 434 + return 0; 435 + } 436 + 437 + static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) 438 + { 439 + struct ceph_pg_mapping *pg; 440 + 441 + pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); 442 + if (!pg) 443 + return NULL; 444 + 445 + RB_CLEAR_NODE(&pg->node); 446 + return pg; 447 + } 448 + 449 + static void free_pg_mapping(struct ceph_pg_mapping *pg) 450 + { 451 + WARN_ON(!RB_EMPTY_NODE(&pg->node)); 452 + 453 + kfree(pg); 454 + } 455 + 590 456 /* 591 457 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 592 458 * to a set of osds) and primary_temp (explicit primary setting) 593 459 */ 
594 - static int __insert_pg_mapping(struct ceph_pg_mapping *new, 595 - struct rb_root *root) 596 - { 597 - struct rb_node **p = &root->rb_node; 598 - struct rb_node *parent = NULL; 599 - struct ceph_pg_mapping *pg = NULL; 600 - int c; 601 - 602 - dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); 603 - while (*p) { 604 - parent = *p; 605 - pg = rb_entry(parent, struct ceph_pg_mapping, node); 606 - c = ceph_pg_compare(&new->pgid, &pg->pgid); 607 - if (c < 0) 608 - p = &(*p)->rb_left; 609 - else if (c > 0) 610 - p = &(*p)->rb_right; 611 - else 612 - return -EEXIST; 613 - } 614 - 615 - rb_link_node(&new->node, parent, p); 616 - rb_insert_color(&new->node, root); 617 - return 0; 618 - } 619 - 620 - static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, 621 - struct ceph_pg pgid) 622 - { 623 - struct rb_node *n = root->rb_node; 624 - struct ceph_pg_mapping *pg; 625 - int c; 626 - 627 - while (n) { 628 - pg = rb_entry(n, struct ceph_pg_mapping, node); 629 - c = ceph_pg_compare(&pgid, &pg->pgid); 630 - if (c < 0) { 631 - n = n->rb_left; 632 - } else if (c > 0) { 633 - n = n->rb_right; 634 - } else { 635 - dout("__lookup_pg_mapping %lld.%x got %p\n", 636 - pgid.pool, pgid.seed, pg); 637 - return pg; 638 - } 639 - } 640 - return NULL; 641 - } 642 - 643 - static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) 644 - { 645 - struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); 646 - 647 - if (pg) { 648 - dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed, 649 - pg); 650 - rb_erase(&pg->node, root); 651 - kfree(pg); 652 - return 0; 653 - } 654 - dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed); 655 - return -ENOENT; 656 - } 460 + DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, 461 + RB_BYPTR, const struct ceph_pg *, node) 657 462 658 463 /* 659 464 * rbtree of pg pool info ··· 825 682 *p += len; 826 683 } 827 684 685 + /* 686 + * last_force_op_resend_preluminous, 
will be overridden if the 687 + * map was encoded with RESEND_ON_SPLIT 688 + */ 828 689 if (ev >= 15) 829 690 pi->last_force_request_resend = ceph_decode_32(p); 830 691 else 831 692 pi->last_force_request_resend = 0; 693 + 694 + if (ev >= 16) 695 + *p += 4; /* skip min_read_recency_for_promote */ 696 + 697 + if (ev >= 17) 698 + *p += 8; /* skip expected_num_objects */ 699 + 700 + if (ev >= 19) 701 + *p += 4; /* skip cache_target_dirty_high_ratio_micro */ 702 + 703 + if (ev >= 20) 704 + *p += 4; /* skip min_write_recency_for_promote */ 705 + 706 + if (ev >= 21) 707 + *p += 1; /* skip use_gmt_hitset */ 708 + 709 + if (ev >= 22) 710 + *p += 1; /* skip fast_read */ 711 + 712 + if (ev >= 23) { 713 + *p += 4; /* skip hit_set_grade_decay_rate */ 714 + *p += 4; /* skip hit_set_search_last_n */ 715 + } 716 + 717 + if (ev >= 24) { 718 + /* skip opts */ 719 + *p += 1 + 1; /* versions */ 720 + len = ceph_decode_32(p); 721 + *p += len; 722 + } 723 + 724 + if (ev >= 25) 725 + pi->last_force_request_resend = ceph_decode_32(p); 832 726 833 727 /* ignore the rest */ 834 728 ··· 923 743 map->pool_max = -1; 924 744 map->pg_temp = RB_ROOT; 925 745 map->primary_temp = RB_ROOT; 746 + map->pg_upmap = RB_ROOT; 747 + map->pg_upmap_items = RB_ROOT; 926 748 mutex_init(&map->crush_workspace_mutex); 927 749 928 750 return map; ··· 939 757 struct ceph_pg_mapping *pg = 940 758 rb_entry(rb_first(&map->pg_temp), 941 759 struct ceph_pg_mapping, node); 942 - rb_erase(&pg->node, &map->pg_temp); 943 - kfree(pg); 760 + erase_pg_mapping(&map->pg_temp, pg); 761 + free_pg_mapping(pg); 944 762 } 945 763 while (!RB_EMPTY_ROOT(&map->primary_temp)) { 946 764 struct ceph_pg_mapping *pg = 947 765 rb_entry(rb_first(&map->primary_temp), 948 766 struct ceph_pg_mapping, node); 949 - rb_erase(&pg->node, &map->primary_temp); 767 + erase_pg_mapping(&map->primary_temp, pg); 768 + free_pg_mapping(pg); 769 + } 770 + while (!RB_EMPTY_ROOT(&map->pg_upmap)) { 771 + struct ceph_pg_mapping *pg = 772 + 
rb_entry(rb_first(&map->pg_upmap), 773 + struct ceph_pg_mapping, node); 774 + rb_erase(&pg->node, &map->pg_upmap); 775 + kfree(pg); 776 + } 777 + while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { 778 + struct ceph_pg_mapping *pg = 779 + rb_entry(rb_first(&map->pg_upmap_items), 780 + struct ceph_pg_mapping, node); 781 + rb_erase(&pg->node, &map->pg_upmap_items); 950 782 kfree(pg); 951 783 } 952 784 while (!RB_EMPTY_ROOT(&map->pg_pools)) { ··· 984 788 */ 985 789 static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 986 790 { 987 - u8 *state; 791 + u32 *state; 988 792 u32 *weight; 989 793 struct ceph_entity_addr *addr; 990 794 int i; ··· 1160 964 return __decode_pools(p, end, map, true); 1161 965 } 1162 966 1163 - static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, 1164 - bool incremental) 967 + typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); 968 + 969 + static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, 970 + decode_mapping_fn_t fn, bool incremental) 1165 971 { 1166 972 u32 n; 1167 973 974 + WARN_ON(!incremental && !fn); 975 + 1168 976 ceph_decode_32_safe(p, end, n, e_inval); 1169 977 while (n--) { 978 + struct ceph_pg_mapping *pg; 1170 979 struct ceph_pg pgid; 1171 - u32 len, i; 1172 980 int ret; 1173 981 1174 982 ret = ceph_decode_pgid(p, end, &pgid); 1175 983 if (ret) 1176 984 return ret; 1177 985 1178 - ceph_decode_32_safe(p, end, len, e_inval); 986 + pg = lookup_pg_mapping(mapping_root, &pgid); 987 + if (pg) { 988 + WARN_ON(!incremental); 989 + erase_pg_mapping(mapping_root, pg); 990 + free_pg_mapping(pg); 991 + } 1179 992 1180 - ret = __remove_pg_mapping(&map->pg_temp, pgid); 1181 - BUG_ON(!incremental && ret != -ENOENT); 993 + if (fn) { 994 + pg = fn(p, end, incremental); 995 + if (IS_ERR(pg)) 996 + return PTR_ERR(pg); 1182 997 1183 - if (!incremental || len > 0) { 1184 - struct ceph_pg_mapping *pg; 1185 - 1186 - ceph_decode_need(p, end, len*sizeof(u32), e_inval); 1187 - 
1188 - if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 1189 - return -EINVAL; 1190 - 1191 - pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); 1192 - if (!pg) 1193 - return -ENOMEM; 1194 - 1195 - pg->pgid = pgid; 1196 - pg->pg_temp.len = len; 1197 - for (i = 0; i < len; i++) 1198 - pg->pg_temp.osds[i] = ceph_decode_32(p); 1199 - 1200 - ret = __insert_pg_mapping(pg, &map->pg_temp); 1201 - if (ret) { 1202 - kfree(pg); 1203 - return ret; 998 + if (pg) { 999 + pg->pgid = pgid; /* struct */ 1000 + insert_pg_mapping(mapping_root, pg); 1204 1001 } 1205 1002 } 1206 1003 } ··· 1202 1013 1203 1014 e_inval: 1204 1015 return -EINVAL; 1016 + } 1017 + 1018 + static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, 1019 + bool incremental) 1020 + { 1021 + struct ceph_pg_mapping *pg; 1022 + u32 len, i; 1023 + 1024 + ceph_decode_32_safe(p, end, len, e_inval); 1025 + if (len == 0 && incremental) 1026 + return NULL; /* new_pg_temp: [] to remove */ 1027 + if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) 1028 + return ERR_PTR(-EINVAL); 1029 + 1030 + ceph_decode_need(p, end, len * sizeof(u32), e_inval); 1031 + pg = alloc_pg_mapping(len * sizeof(u32)); 1032 + if (!pg) 1033 + return ERR_PTR(-ENOMEM); 1034 + 1035 + pg->pg_temp.len = len; 1036 + for (i = 0; i < len; i++) 1037 + pg->pg_temp.osds[i] = ceph_decode_32(p); 1038 + 1039 + return pg; 1040 + 1041 + e_inval: 1042 + return ERR_PTR(-EINVAL); 1205 1043 } 1206 1044 1207 1045 static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) 1208 1046 { 1209 - return __decode_pg_temp(p, end, map, false); 1047 + return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, 1048 + false); 1210 1049 } 1211 1050 1212 1051 static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) 1213 1052 { 1214 - return __decode_pg_temp(p, end, map, true); 1053 + return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, 1054 + true); 1215 1055 } 1216 1056 1217 - static int __decode_primary_temp(void **p, 
void *end, struct ceph_osdmap *map, 1218 - bool incremental) 1057 + static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, 1058 + bool incremental) 1219 1059 { 1220 - u32 n; 1060 + struct ceph_pg_mapping *pg; 1061 + u32 osd; 1221 1062 1222 - ceph_decode_32_safe(p, end, n, e_inval); 1223 - while (n--) { 1224 - struct ceph_pg pgid; 1225 - u32 osd; 1226 - int ret; 1063 + ceph_decode_32_safe(p, end, osd, e_inval); 1064 + if (osd == (u32)-1 && incremental) 1065 + return NULL; /* new_primary_temp: -1 to remove */ 1227 1066 1228 - ret = ceph_decode_pgid(p, end, &pgid); 1229 - if (ret) 1230 - return ret; 1067 + pg = alloc_pg_mapping(0); 1068 + if (!pg) 1069 + return ERR_PTR(-ENOMEM); 1231 1070 1232 - ceph_decode_32_safe(p, end, osd, e_inval); 1233 - 1234 - ret = __remove_pg_mapping(&map->primary_temp, pgid); 1235 - BUG_ON(!incremental && ret != -ENOENT); 1236 - 1237 - if (!incremental || osd != (u32)-1) { 1238 - struct ceph_pg_mapping *pg; 1239 - 1240 - pg = kzalloc(sizeof(*pg), GFP_NOFS); 1241 - if (!pg) 1242 - return -ENOMEM; 1243 - 1244 - pg->pgid = pgid; 1245 - pg->primary_temp.osd = osd; 1246 - 1247 - ret = __insert_pg_mapping(pg, &map->primary_temp); 1248 - if (ret) { 1249 - kfree(pg); 1250 - return ret; 1251 - } 1252 - } 1253 - } 1254 - 1255 - return 0; 1071 + pg->primary_temp.osd = osd; 1072 + return pg; 1256 1073 1257 1074 e_inval: 1258 - return -EINVAL; 1075 + return ERR_PTR(-EINVAL); 1259 1076 } 1260 1077 1261 1078 static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) 1262 1079 { 1263 - return __decode_primary_temp(p, end, map, false); 1080 + return decode_pg_mapping(p, end, &map->primary_temp, 1081 + __decode_primary_temp, false); 1264 1082 } 1265 1083 1266 1084 static int decode_new_primary_temp(void **p, void *end, 1267 1085 struct ceph_osdmap *map) 1268 1086 { 1269 - return __decode_primary_temp(p, end, map, true); 1087 + return decode_pg_mapping(p, end, &map->primary_temp, 1088 + __decode_primary_temp, true); 1270 
1089 } 1271 1090 1272 1091 u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) ··· 1365 1168 return -EINVAL; 1366 1169 } 1367 1170 1171 + static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, 1172 + bool __unused) 1173 + { 1174 + return __decode_pg_temp(p, end, false); 1175 + } 1176 + 1177 + static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) 1178 + { 1179 + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, 1180 + false); 1181 + } 1182 + 1183 + static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) 1184 + { 1185 + return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, 1186 + true); 1187 + } 1188 + 1189 + static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) 1190 + { 1191 + return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); 1192 + } 1193 + 1194 + static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, 1195 + bool __unused) 1196 + { 1197 + struct ceph_pg_mapping *pg; 1198 + u32 len, i; 1199 + 1200 + ceph_decode_32_safe(p, end, len, e_inval); 1201 + if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) 1202 + return ERR_PTR(-EINVAL); 1203 + 1204 + ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); 1205 + pg = kzalloc(sizeof(*pg) + 2 * len * sizeof(u32), GFP_NOIO); 1206 + if (!pg) 1207 + return ERR_PTR(-ENOMEM); 1208 + 1209 + pg->pg_upmap_items.len = len; 1210 + for (i = 0; i < len; i++) { 1211 + pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); 1212 + pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); 1213 + } 1214 + 1215 + return pg; 1216 + 1217 + e_inval: 1218 + return ERR_PTR(-EINVAL); 1219 + } 1220 + 1221 + static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) 1222 + { 1223 + return decode_pg_mapping(p, end, &map->pg_upmap_items, 1224 + __decode_pg_upmap_items, false); 1225 + } 1226 + 1227 + static int decode_new_pg_upmap_items(void **p, void *end, 1228 + struct 
ceph_osdmap *map) 1229 + { 1230 + return decode_pg_mapping(p, end, &map->pg_upmap_items, 1231 + __decode_pg_upmap_items, true); 1232 + } 1233 + 1234 + static int decode_old_pg_upmap_items(void **p, void *end, 1235 + struct ceph_osdmap *map) 1236 + { 1237 + return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); 1238 + } 1239 + 1368 1240 /* 1369 1241 * decode a full map. 1370 1242 */ ··· 1484 1218 1485 1219 /* osd_state, osd_weight, osd_addrs->client_addr */ 1486 1220 ceph_decode_need(p, end, 3*sizeof(u32) + 1487 - map->max_osd*(1 + sizeof(*map->osd_weight) + 1221 + map->max_osd*((struct_v >= 5 ? sizeof(u32) : 1222 + sizeof(u8)) + 1223 + sizeof(*map->osd_weight) + 1488 1224 sizeof(*map->osd_addr)), e_inval); 1489 1225 1490 1226 if (ceph_decode_32(p) != map->max_osd) 1491 1227 goto e_inval; 1492 1228 1493 - ceph_decode_copy(p, map->osd_state, map->max_osd); 1229 + if (struct_v >= 5) { 1230 + for (i = 0; i < map->max_osd; i++) 1231 + map->osd_state[i] = ceph_decode_32(p); 1232 + } else { 1233 + for (i = 0; i < map->max_osd; i++) 1234 + map->osd_state[i] = ceph_decode_8(p); 1235 + } 1494 1236 1495 1237 if (ceph_decode_32(p) != map->max_osd) 1496 1238 goto e_inval; ··· 1531 1257 if (err) 1532 1258 goto bad; 1533 1259 } else { 1534 - /* XXX can this happen? 
*/ 1535 - kfree(map->osd_primary_affinity); 1536 - map->osd_primary_affinity = NULL; 1260 + WARN_ON(map->osd_primary_affinity); 1537 1261 } 1538 1262 1539 1263 /* crush */ ··· 1539 1267 err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); 1540 1268 if (err) 1541 1269 goto bad; 1270 + 1271 + *p += len; 1272 + if (struct_v >= 3) { 1273 + /* erasure_code_profiles */ 1274 + ceph_decode_skip_map_of_map(p, end, string, string, string, 1275 + bad); 1276 + } 1277 + 1278 + if (struct_v >= 4) { 1279 + err = decode_pg_upmap(p, end, map); 1280 + if (err) 1281 + goto bad; 1282 + 1283 + err = decode_pg_upmap_items(p, end, map); 1284 + if (err) 1285 + goto bad; 1286 + } else { 1287 + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); 1288 + WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); 1289 + } 1542 1290 1543 1291 /* ignore the rest */ 1544 1292 *p = end; ··· 1606 1314 * new_up_client: { osd=6, addr=... } # set osd_state and addr 1607 1315 * new_state: { osd=6, xorstate=EXISTS } # clear osd_state 1608 1316 */ 1609 - static int decode_new_up_state_weight(void **p, void *end, 1317 + static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, 1610 1318 struct ceph_osdmap *map) 1611 1319 { 1612 1320 void *new_up_client; ··· 1622 1330 1623 1331 new_state = *p; 1624 1332 ceph_decode_32_safe(p, end, len, e_inval); 1625 - len *= sizeof(u32) + sizeof(u8); 1333 + len *= sizeof(u32) + (struct_v >= 5 ? 
sizeof(u32) : sizeof(u8)); 1626 1334 ceph_decode_need(p, end, len, e_inval); 1627 1335 *p += len; 1628 1336 ··· 1658 1366 len = ceph_decode_32(p); 1659 1367 while (len--) { 1660 1368 s32 osd; 1661 - u8 xorstate; 1369 + u32 xorstate; 1662 1370 int ret; 1663 1371 1664 1372 osd = ceph_decode_32(p); 1665 - xorstate = ceph_decode_8(p); 1373 + if (struct_v >= 5) 1374 + xorstate = ceph_decode_32(p); 1375 + else 1376 + xorstate = ceph_decode_8(p); 1666 1377 if (xorstate == 0) 1667 1378 xorstate = CEPH_OSD_UP; 1668 1379 BUG_ON(osd >= map->max_osd); ··· 1799 1504 } 1800 1505 1801 1506 /* new_up_client, new_state, new_weight */ 1802 - err = decode_new_up_state_weight(p, end, map); 1507 + err = decode_new_up_state_weight(p, end, struct_v, map); 1803 1508 if (err) 1804 1509 goto bad; 1805 1510 ··· 1818 1523 /* new_primary_affinity */ 1819 1524 if (struct_v >= 2) { 1820 1525 err = decode_new_primary_affinity(p, end, map); 1526 + if (err) 1527 + goto bad; 1528 + } 1529 + 1530 + if (struct_v >= 3) { 1531 + /* new_erasure_code_profiles */ 1532 + ceph_decode_skip_map_of_map(p, end, string, string, string, 1533 + bad); 1534 + /* old_erasure_code_profiles */ 1535 + ceph_decode_skip_set(p, end, string, bad); 1536 + } 1537 + 1538 + if (struct_v >= 4) { 1539 + err = decode_new_pg_upmap(p, end, map); 1540 + if (err) 1541 + goto bad; 1542 + 1543 + err = decode_old_pg_upmap(p, end, map); 1544 + if (err) 1545 + goto bad; 1546 + 1547 + err = decode_new_pg_upmap_items(p, end, map); 1548 + if (err) 1549 + goto bad; 1550 + 1551 + err = decode_old_pg_upmap_items(p, end, map); 1821 1552 if (err) 1822 1553 goto bad; 1823 1554 } ··· 1868 1547 void ceph_oloc_copy(struct ceph_object_locator *dest, 1869 1548 const struct ceph_object_locator *src) 1870 1549 { 1871 - WARN_ON(!ceph_oloc_empty(dest)); 1872 - WARN_ON(dest->pool_ns); /* empty() only covers ->pool */ 1550 + ceph_oloc_destroy(dest); 1873 1551 1874 1552 dest->pool = src->pool; 1875 1553 if (src->pool_ns) 1876 1554 dest->pool_ns = 
ceph_get_string(src->pool_ns); 1555 + else 1556 + dest->pool_ns = NULL; 1877 1557 } 1878 1558 EXPORT_SYMBOL(ceph_oloc_copy); 1879 1559 ··· 1887 1565 void ceph_oid_copy(struct ceph_object_id *dest, 1888 1566 const struct ceph_object_id *src) 1889 1567 { 1890 - WARN_ON(!ceph_oid_empty(dest)); 1568 + ceph_oid_destroy(dest); 1891 1569 1892 1570 if (src->name != src->inline_name) { 1893 1571 /* very rare, see ceph_object_id definition */ 1894 1572 dest->name = kmalloc(src->name_len + 1, 1895 1573 GFP_NOIO | __GFP_NOFAIL); 1574 + } else { 1575 + dest->name = dest->inline_name; 1896 1576 } 1897 - 1898 1577 memcpy(dest->name, src->name, src->name_len + 1); 1899 1578 dest->name_len = src->name_len; 1900 1579 } ··· 2037 1714 dest->primary = src->primary; 2038 1715 } 2039 1716 2040 - static bool is_split(const struct ceph_pg *pgid, 2041 - u32 old_pg_num, 2042 - u32 new_pg_num) 1717 + bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, 1718 + u32 new_pg_num) 2043 1719 { 2044 1720 int old_bits = calc_bits_of(old_pg_num); 2045 1721 int old_mask = (1 << old_bits) - 1; ··· 2083 1761 !osds_equal(old_up, new_up) || 2084 1762 old_size != new_size || 2085 1763 old_min_size != new_min_size || 2086 - is_split(pgid, old_pg_num, new_pg_num) || 1764 + ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || 2087 1765 old_sort_bitwise != new_sort_bitwise; 2088 1766 } 2089 1767 ··· 2207 1885 * Should only be called with target_oid and target_oloc (as opposed to 2208 1886 * base_oid and base_oloc), since tiering isn't taken into account. 
2209 1887 */ 2210 - int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, 2211 - struct ceph_object_id *oid, 2212 - struct ceph_object_locator *oloc, 2213 - struct ceph_pg *raw_pgid) 1888 + int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, 1889 + const struct ceph_object_id *oid, 1890 + const struct ceph_object_locator *oloc, 1891 + struct ceph_pg *raw_pgid) 2214 1892 { 2215 - struct ceph_pg_pool_info *pi; 2216 - 2217 - pi = ceph_pg_pool_by_id(osdmap, oloc->pool); 2218 - if (!pi) 2219 - return -ENOENT; 1893 + WARN_ON(pi->id != oloc->pool); 2220 1894 2221 1895 if (!oloc->pool_ns) { 2222 1896 raw_pgid->pool = oloc->pool; ··· 2243 1925 raw_pgid->pool, raw_pgid->seed); 2244 1926 } 2245 1927 return 0; 1928 + } 1929 + 1930 + int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, 1931 + const struct ceph_object_id *oid, 1932 + const struct ceph_object_locator *oloc, 1933 + struct ceph_pg *raw_pgid) 1934 + { 1935 + struct ceph_pg_pool_info *pi; 1936 + 1937 + pi = ceph_pg_pool_by_id(osdmap, oloc->pool); 1938 + if (!pi) 1939 + return -ENOENT; 1940 + 1941 + return __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); 2246 1942 } 2247 1943 EXPORT_SYMBOL(ceph_object_locator_to_pg); 2248 1944 ··· 2302 1970 2303 1971 static int do_crush(struct ceph_osdmap *map, int ruleno, int x, 2304 1972 int *result, int result_max, 2305 - const __u32 *weight, int weight_max) 1973 + const __u32 *weight, int weight_max, 1974 + u64 choose_args_index) 2306 1975 { 1976 + struct crush_choose_arg_map *arg_map; 2307 1977 int r; 2308 1978 2309 1979 BUG_ON(result_max > CEPH_PG_MAX_SIZE); 2310 1980 1981 + arg_map = lookup_choose_arg_map(&map->crush->choose_args, 1982 + choose_args_index); 1983 + 2311 1984 mutex_lock(&map->crush_workspace_mutex); 2312 1985 r = crush_do_rule(map->crush, ruleno, x, result, result_max, 2313 - weight, weight_max, map->crush_workspace); 1986 + weight, weight_max, map->crush_workspace, 1987 + arg_map ? 
arg_map->args : NULL); 2314 1988 mutex_unlock(&map->crush_workspace_mutex); 2315 1989 2316 1990 return r; 2317 1991 } 2318 1992 1993 + static void remove_nonexistent_osds(struct ceph_osdmap *osdmap, 1994 + struct ceph_pg_pool_info *pi, 1995 + struct ceph_osds *set) 1996 + { 1997 + int i; 1998 + 1999 + if (ceph_can_shift_osds(pi)) { 2000 + int removed = 0; 2001 + 2002 + /* shift left */ 2003 + for (i = 0; i < set->size; i++) { 2004 + if (!ceph_osd_exists(osdmap, set->osds[i])) { 2005 + removed++; 2006 + continue; 2007 + } 2008 + if (removed) 2009 + set->osds[i - removed] = set->osds[i]; 2010 + } 2011 + set->size -= removed; 2012 + } else { 2013 + /* set dne devices to NONE */ 2014 + for (i = 0; i < set->size; i++) { 2015 + if (!ceph_osd_exists(osdmap, set->osds[i])) 2016 + set->osds[i] = CRUSH_ITEM_NONE; 2017 + } 2018 + } 2019 + } 2020 + 2319 2021 /* 2320 - * Calculate raw set (CRUSH output) for given PG. The result may 2321 - * contain nonexistent OSDs. ->primary is undefined for a raw set. 2022 + * Calculate raw set (CRUSH output) for given PG and filter out 2023 + * nonexistent OSDs. ->primary is undefined for a raw set. 2322 2024 * 2323 2025 * Placement seed (CRUSH input) is returned through @ppps. 
2324 2026 */ ··· 2386 2020 } 2387 2021 2388 2022 len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, 2389 - osdmap->osd_weight, osdmap->max_osd); 2023 + osdmap->osd_weight, osdmap->max_osd, pi->id); 2390 2024 if (len < 0) { 2391 2025 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", 2392 2026 len, ruleno, pi->id, pi->crush_ruleset, pi->type, ··· 2395 2029 } 2396 2030 2397 2031 raw->size = len; 2032 + remove_nonexistent_osds(osdmap, pi, raw); 2033 + } 2034 + 2035 + /* apply pg_upmap[_items] mappings */ 2036 + static void apply_upmap(struct ceph_osdmap *osdmap, 2037 + const struct ceph_pg *pgid, 2038 + struct ceph_osds *raw) 2039 + { 2040 + struct ceph_pg_mapping *pg; 2041 + int i, j; 2042 + 2043 + pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid); 2044 + if (pg) { 2045 + /* make sure targets aren't marked out */ 2046 + for (i = 0; i < pg->pg_upmap.len; i++) { 2047 + int osd = pg->pg_upmap.osds[i]; 2048 + 2049 + if (osd != CRUSH_ITEM_NONE && 2050 + osd < osdmap->max_osd && 2051 + osdmap->osd_weight[osd] == 0) { 2052 + /* reject/ignore explicit mapping */ 2053 + return; 2054 + } 2055 + } 2056 + for (i = 0; i < pg->pg_upmap.len; i++) 2057 + raw->osds[i] = pg->pg_upmap.osds[i]; 2058 + raw->size = pg->pg_upmap.len; 2059 + return; 2060 + } 2061 + 2062 + pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); 2063 + if (pg) { 2064 + /* 2065 + * Note: this approach does not allow a bidirectional swap, 2066 + * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. 
2067 + */ 2068 + for (i = 0; i < pg->pg_upmap_items.len; i++) { 2069 + int from = pg->pg_upmap_items.from_to[i][0]; 2070 + int to = pg->pg_upmap_items.from_to[i][1]; 2071 + int pos = -1; 2072 + bool exists = false; 2073 + 2074 + /* make sure replacement doesn't already appear */ 2075 + for (j = 0; j < raw->size; j++) { 2076 + int osd = raw->osds[j]; 2077 + 2078 + if (osd == to) { 2079 + exists = true; 2080 + break; 2081 + } 2082 + /* ignore mapping if target is marked out */ 2083 + if (osd == from && pos < 0 && 2084 + !(to != CRUSH_ITEM_NONE && 2085 + to < osdmap->max_osd && 2086 + osdmap->osd_weight[to] == 0)) { 2087 + pos = j; 2088 + } 2089 + } 2090 + if (!exists && pos >= 0) { 2091 + raw->osds[pos] = to; 2092 + return; 2093 + } 2094 + } 2095 + } 2398 2096 } 2399 2097 2400 2098 /* ··· 2581 2151 */ 2582 2152 static void get_temp_osds(struct ceph_osdmap *osdmap, 2583 2153 struct ceph_pg_pool_info *pi, 2584 - const struct ceph_pg *raw_pgid, 2154 + const struct ceph_pg *pgid, 2585 2155 struct ceph_osds *temp) 2586 2156 { 2587 - struct ceph_pg pgid; 2588 2157 struct ceph_pg_mapping *pg; 2589 2158 int i; 2590 2159 2591 - raw_pg_to_pg(pi, raw_pgid, &pgid); 2592 2160 ceph_osds_init(temp); 2593 2161 2594 2162 /* pg_temp? */ 2595 - pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 2163 + pg = lookup_pg_mapping(&osdmap->pg_temp, pgid); 2596 2164 if (pg) { 2597 2165 for (i = 0; i < pg->pg_temp.len; i++) { 2598 2166 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { ··· 2613 2185 } 2614 2186 2615 2187 /* primary_temp? */ 2616 - pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); 2188 + pg = lookup_pg_mapping(&osdmap->primary_temp, pgid); 2617 2189 if (pg) 2618 2190 temp->primary = pg->primary_temp.osd; 2619 2191 } ··· 2626 2198 * resend a request. 
2627 2199 */ 2628 2200 void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, 2201 + struct ceph_pg_pool_info *pi, 2629 2202 const struct ceph_pg *raw_pgid, 2630 2203 struct ceph_osds *up, 2631 2204 struct ceph_osds *acting) 2632 2205 { 2633 - struct ceph_pg_pool_info *pi; 2206 + struct ceph_pg pgid; 2634 2207 u32 pps; 2635 2208 2636 - pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); 2637 - if (!pi) { 2638 - ceph_osds_init(up); 2639 - ceph_osds_init(acting); 2640 - goto out; 2641 - } 2209 + WARN_ON(pi->id != raw_pgid->pool); 2210 + raw_pg_to_pg(pi, raw_pgid, &pgid); 2642 2211 2643 2212 pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); 2213 + apply_upmap(osdmap, &pgid, up); 2644 2214 raw_to_up_osds(osdmap, pi, up); 2645 2215 apply_primary_affinity(osdmap, pi, pps, up); 2646 - get_temp_osds(osdmap, pi, raw_pgid, acting); 2216 + get_temp_osds(osdmap, pi, &pgid, acting); 2647 2217 if (!acting->size) { 2648 2218 memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); 2649 2219 acting->size = up->size; 2650 2220 if (acting->primary == -1) 2651 2221 acting->primary = up->primary; 2652 2222 } 2653 - out: 2654 2223 WARN_ON(!osds_valid(up) || !osds_valid(acting)); 2224 + } 2225 + 2226 + bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, 2227 + struct ceph_pg_pool_info *pi, 2228 + const struct ceph_pg *raw_pgid, 2229 + struct ceph_spg *spgid) 2230 + { 2231 + struct ceph_pg pgid; 2232 + struct ceph_osds up, acting; 2233 + int i; 2234 + 2235 + WARN_ON(pi->id != raw_pgid->pool); 2236 + raw_pg_to_pg(pi, raw_pgid, &pgid); 2237 + 2238 + if (ceph_can_shift_osds(pi)) { 2239 + spgid->pgid = pgid; /* struct */ 2240 + spgid->shard = CEPH_SPG_NOSHARD; 2241 + return true; 2242 + } 2243 + 2244 + ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting); 2245 + for (i = 0; i < acting.size; i++) { 2246 + if (acting.osds[i] == acting.primary) { 2247 + spgid->pgid = pgid; /* struct */ 2248 + spgid->shard = i; 2249 + return true; 2250 + } 2251 + } 2252 + 2253 + return false; 
2655 2254 } 2656 2255 2657 2256 /* ··· 2687 2232 int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, 2688 2233 const struct ceph_pg *raw_pgid) 2689 2234 { 2235 + struct ceph_pg_pool_info *pi; 2690 2236 struct ceph_osds up, acting; 2691 2237 2692 - ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); 2238 + pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); 2239 + if (!pi) 2240 + return -1; 2241 + 2242 + ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting); 2693 2243 return acting.primary; 2694 2244 } 2695 2245 EXPORT_SYMBOL(ceph_pg_to_acting_primary);

Configure Feed

Configure Feed