Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
"This has a mix of bug fixes and cleanups.

Alex's patch fixes a rare race in RBD. Ilya's patches fix an ENOENT
check when a second rbd image is mapped and a couple of memory leaks.
Zheng fixes several issues with fragmented directories and multiple
MDSs. Josh fixes a spin/sleep issue, and Josh and Guangliang's
patches fix setting and unsetting RBD images read-only.

Naturally there are several other cleanups mixed in for good measure"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (23 commits)
rbd: only set disk to read-only once
rbd: move calls that may sleep out of spin lock range
rbd: add ioctl for rbd
ceph: use truncate_pagecache() instead of truncate_inode_pages()
ceph: include time stamp in every MDS request
rbd: fix ida/idr memory leak
rbd: use reference counts for image requests
rbd: fix osd_request memory leak in __rbd_dev_header_watch_sync()
rbd: make sure we have latest osdmap on 'rbd map'
libceph: add ceph_monc_wait_osdmap()
libceph: mon_get_version request infrastructure
libceph: recognize poolop requests in debugfs
ceph: refactor readpage_nounlock() to make the logic clearer
mds: check cap ID when handling cap export message
ceph: remember subtree root dirfrag's auth MDS
ceph: introduce ceph_fill_fragtree()
ceph: handle cap import atomically
ceph: pre-allocate ceph_cap struct for ceph_add_cap()
ceph: update inode fields according to issued caps
rbd: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO
...

+670 -286
+199 -47
drivers/block/rbd.c
··· 541 541 return -ENOENT; 542 542 543 543 (void) get_device(&rbd_dev->dev); 544 - set_device_ro(bdev, rbd_dev->mapping.read_only); 545 544 546 545 return 0; 547 546 } ··· 558 559 put_device(&rbd_dev->dev); 559 560 } 560 561 562 + static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) 563 + { 564 + int ret = 0; 565 + int val; 566 + bool ro; 567 + bool ro_changed = false; 568 + 569 + /* get_user() may sleep, so call it before taking rbd_dev->lock */ 570 + if (get_user(val, (int __user *)(arg))) 571 + return -EFAULT; 572 + 573 + ro = val ? true : false; 574 + /* Snapshot doesn't allow to write*/ 575 + if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) 576 + return -EROFS; 577 + 578 + spin_lock_irq(&rbd_dev->lock); 579 + /* prevent others open this device */ 580 + if (rbd_dev->open_count > 1) { 581 + ret = -EBUSY; 582 + goto out; 583 + } 584 + 585 + if (rbd_dev->mapping.read_only != ro) { 586 + rbd_dev->mapping.read_only = ro; 587 + ro_changed = true; 588 + } 589 + 590 + out: 591 + spin_unlock_irq(&rbd_dev->lock); 592 + /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ 593 + if (ret == 0 && ro_changed) 594 + set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); 595 + 596 + return ret; 597 + } 598 + 599 + static int rbd_ioctl(struct block_device *bdev, fmode_t mode, 600 + unsigned int cmd, unsigned long arg) 601 + { 602 + struct rbd_device *rbd_dev = bdev->bd_disk->private_data; 603 + int ret = 0; 604 + 605 + switch (cmd) { 606 + case BLKROSET: 607 + ret = rbd_ioctl_set_ro(rbd_dev, arg); 608 + break; 609 + default: 610 + ret = -ENOTTY; 611 + } 612 + 613 + return ret; 614 + } 615 + 616 + #ifdef CONFIG_COMPAT 617 + static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode, 618 + unsigned int cmd, unsigned long arg) 619 + { 620 + return rbd_ioctl(bdev, mode, cmd, arg); 621 + } 622 + #endif /* CONFIG_COMPAT */ 623 + 561 624 static const struct block_device_operations rbd_bd_ops = { 562 625 .owner = THIS_MODULE, 563 626 .open = rbd_open, 564 627 .release = rbd_release, 628 + .ioctl = rbd_ioctl, 629 + #ifdef CONFIG_COMPAT 630 + .compat_ioctl = rbd_compat_ioctl, 631 + #endif 565 632 }; 566 633 567 634 /* ··· 1447 1382 kref_put(&obj_request->kref, rbd_obj_request_destroy); 1448 1383 } 1449 1384 1385 + static void rbd_img_request_get(struct rbd_img_request *img_request) 1386 + { 1387 + dout("%s: img %p (was %d)\n", __func__, img_request, 1388 + atomic_read(&img_request->kref.refcount)); 1389 + kref_get(&img_request->kref); 1390 + } 1391 + 1450 1392 static bool img_request_child_test(struct rbd_img_request *img_request); 1451 1393 static void rbd_parent_request_destroy(struct kref *kref); 1452 1394 static void rbd_img_request_destroy(struct kref *kref); ··· 2214 2142 img_request->next_completion = which; 2215 2143 out: 2216 2144 spin_unlock_irq(&img_request->completion_lock); 2145 + rbd_img_request_put(img_request); 2217 2146 2218 2147 if (!more) 2219 2148 rbd_img_request_complete(img_request); ··· 2315 2242 goto out_unwind; 2316 2243 obj_request->osd_req = osd_req; 2317 2244 obj_request->callback = rbd_img_obj_callback; 2245 + rbd_img_request_get(img_request); 2318 2246 2319 2247 if (write_request) { 2320 2248 
osd_req_op_alloc_hint_init(osd_req, which, ··· 2946 2872 } 2947 2873 2948 2874 /* 2949 - * Request sync osd watch/unwatch. The value of "start" determines 2950 - * whether a watch request is being initiated or torn down. 2875 + * Initiate a watch request, synchronously. 2951 2876 */ 2952 - static int __rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, bool start) 2877 + static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 2953 2878 { 2954 2879 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2955 2880 struct rbd_obj_request *obj_request; 2956 2881 int ret; 2957 2882 2958 - rbd_assert(start ^ !!rbd_dev->watch_event); 2959 - rbd_assert(start ^ !!rbd_dev->watch_request); 2883 + rbd_assert(!rbd_dev->watch_event); 2884 + rbd_assert(!rbd_dev->watch_request); 2960 2885 2961 - if (start) { 2962 - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 2963 - &rbd_dev->watch_event); 2964 - if (ret < 0) 2965 - return ret; 2966 - rbd_assert(rbd_dev->watch_event != NULL); 2967 - } 2886 + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev, 2887 + &rbd_dev->watch_event); 2888 + if (ret < 0) 2889 + return ret; 2968 2890 2969 - ret = -ENOMEM; 2891 + rbd_assert(rbd_dev->watch_event); 2892 + 2970 2893 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2971 - OBJ_REQUEST_NODATA); 2972 - if (!obj_request) 2894 + OBJ_REQUEST_NODATA); 2895 + if (!obj_request) { 2896 + ret = -ENOMEM; 2973 2897 goto out_cancel; 2898 + } 2974 2899 2975 2900 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2976 2901 obj_request); 2977 - if (!obj_request->osd_req) 2978 - goto out_cancel; 2902 + if (!obj_request->osd_req) { 2903 + ret = -ENOMEM; 2904 + goto out_put; 2905 + } 2979 2906 2980 - if (start) 2981 - ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 2982 - else 2983 - ceph_osdc_unregister_linger_request(osdc, 2984 - rbd_dev->watch_request->osd_req); 2907 + ceph_osdc_set_request_linger(osdc, obj_request->osd_req); 2985 2908 
2986 2909 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2987 - rbd_dev->watch_event->cookie, 0, start ? 1 : 0); 2910 + rbd_dev->watch_event->cookie, 0, 1); 2988 2911 rbd_osd_req_format_write(obj_request); 2989 2912 2990 2913 ret = rbd_obj_request_submit(osdc, obj_request); 2991 2914 if (ret) 2992 - goto out_cancel; 2915 + goto out_linger; 2916 + 2993 2917 ret = rbd_obj_request_wait(obj_request); 2994 2918 if (ret) 2995 - goto out_cancel; 2919 + goto out_linger; 2920 + 2996 2921 ret = obj_request->result; 2997 2922 if (ret) 2998 - goto out_cancel; 2923 + goto out_linger; 2999 2924 3000 2925 /* 3001 2926 * A watch request is set to linger, so the underlying osd ··· 3004 2931 * it. We'll drop that reference (below) after we've 3005 2932 * unregistered it. 3006 2933 */ 3007 - if (start) { 3008 - rbd_dev->watch_request = obj_request; 2934 + rbd_dev->watch_request = obj_request; 3009 2935 3010 - return 0; 3011 - } 2936 + return 0; 3012 2937 3013 - /* We have successfully torn down the watch request */ 3014 - 3015 - rbd_obj_request_put(rbd_dev->watch_request); 3016 - rbd_dev->watch_request = NULL; 2938 + out_linger: 2939 + ceph_osdc_unregister_linger_request(osdc, obj_request->osd_req); 2940 + out_put: 2941 + rbd_obj_request_put(obj_request); 3017 2942 out_cancel: 3018 - /* Cancel the event if we're tearing down, or on error */ 3019 2943 ceph_osdc_cancel_event(rbd_dev->watch_event); 3020 2944 rbd_dev->watch_event = NULL; 3021 - if (obj_request) 3022 - rbd_obj_request_put(obj_request); 3023 2945 3024 2946 return ret; 3025 2947 } 3026 2948 3027 - static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev) 2949 + /* 2950 + * Tear down a watch request, synchronously. 
2951 + */ 2952 + static int __rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3028 2953 { 3029 - return __rbd_dev_header_watch_sync(rbd_dev, true); 2954 + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; 2955 + struct rbd_obj_request *obj_request; 2956 + int ret; 2957 + 2958 + rbd_assert(rbd_dev->watch_event); 2959 + rbd_assert(rbd_dev->watch_request); 2960 + 2961 + obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0, 2962 + OBJ_REQUEST_NODATA); 2963 + if (!obj_request) { 2964 + ret = -ENOMEM; 2965 + goto out_cancel; 2966 + } 2967 + 2968 + obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 2969 + obj_request); 2970 + if (!obj_request->osd_req) { 2971 + ret = -ENOMEM; 2972 + goto out_put; 2973 + } 2974 + 2975 + osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH, 2976 + rbd_dev->watch_event->cookie, 0, 0); 2977 + rbd_osd_req_format_write(obj_request); 2978 + 2979 + ret = rbd_obj_request_submit(osdc, obj_request); 2980 + if (ret) 2981 + goto out_put; 2982 + 2983 + ret = rbd_obj_request_wait(obj_request); 2984 + if (ret) 2985 + goto out_put; 2986 + 2987 + ret = obj_request->result; 2988 + if (ret) 2989 + goto out_put; 2990 + 2991 + /* We have successfully torn down the watch request */ 2992 + 2993 + ceph_osdc_unregister_linger_request(osdc, 2994 + rbd_dev->watch_request->osd_req); 2995 + rbd_obj_request_put(rbd_dev->watch_request); 2996 + rbd_dev->watch_request = NULL; 2997 + 2998 + out_put: 2999 + rbd_obj_request_put(obj_request); 3000 + out_cancel: 3001 + ceph_osdc_cancel_event(rbd_dev->watch_event); 3002 + rbd_dev->watch_event = NULL; 3003 + 3004 + return ret; 3030 3005 } 3031 3006 3032 3007 static void rbd_dev_header_unwatch_sync(struct rbd_device *rbd_dev) 3033 3008 { 3034 3009 int ret; 3035 3010 3036 - ret = __rbd_dev_header_watch_sync(rbd_dev, false); 3011 + ret = __rbd_dev_header_unwatch_sync(rbd_dev); 3037 3012 if (ret) { 3038 3013 rbd_warn(rbd_dev, "unable to tear down watch request: %d\n", 3039 
3014 ret); ··· 3179 3058 __releases(q->queue_lock) __acquires(q->queue_lock) 3180 3059 { 3181 3060 struct rbd_device *rbd_dev = q->queuedata; 3182 - bool read_only = rbd_dev->mapping.read_only; 3183 3061 struct request *rq; 3184 3062 int result; 3185 3063 ··· 3214 3094 3215 3095 if (write_request) { 3216 3096 result = -EROFS; 3217 - if (read_only) 3097 + if (rbd_dev->mapping.read_only) 3218 3098 goto end_request; 3219 3099 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); 3220 3100 } ··· 4803 4683 } 4804 4684 4805 4685 /* 4686 + * Return pool id (>= 0) or a negative error code. 4687 + */ 4688 + static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name) 4689 + { 4690 + u64 newest_epoch; 4691 + unsigned long timeout = rbdc->client->options->mount_timeout * HZ; 4692 + int tries = 0; 4693 + int ret; 4694 + 4695 + again: 4696 + ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name); 4697 + if (ret == -ENOENT && tries++ < 1) { 4698 + ret = ceph_monc_do_get_version(&rbdc->client->monc, "osdmap", 4699 + &newest_epoch); 4700 + if (ret < 0) 4701 + return ret; 4702 + 4703 + if (rbdc->client->osdc.osdmap->epoch < newest_epoch) { 4704 + ceph_monc_request_next_osdmap(&rbdc->client->monc); 4705 + (void) ceph_monc_wait_osdmap(&rbdc->client->monc, 4706 + newest_epoch, timeout); 4707 + goto again; 4708 + } else { 4709 + /* the osdmap we have is new enough */ 4710 + return -ENOENT; 4711 + } 4712 + } 4713 + 4714 + return ret; 4715 + } 4716 + 4717 + /* 4806 4718 * An rbd format 2 image has a unique identifier, distinct from the 4807 4719 * name given to it by the user. Internally, that identifier is 4808 4720 * what's used to specify the names of objects related to the image. ··· 4904 4752 4905 4753 image_id = ceph_extract_encoded_string(&p, p + ret, 4906 4754 NULL, GFP_NOIO); 4907 - ret = IS_ERR(image_id) ? 
PTR_ERR(image_id) : 0; 4755 + ret = PTR_ERR_OR_ZERO(image_id); 4908 4756 if (!ret) 4909 4757 rbd_dev->image_format = 2; 4910 4758 } else { ··· 5059 4907 if (ret) 5060 4908 goto err_out_disk; 5061 4909 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 4910 + set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5062 4911 5063 4912 ret = rbd_bus_add_dev(rbd_dev); 5064 4913 if (ret) ··· 5206 5053 struct rbd_options *rbd_opts = NULL; 5207 5054 struct rbd_spec *spec = NULL; 5208 5055 struct rbd_client *rbdc; 5209 - struct ceph_osd_client *osdc; 5210 5056 bool read_only; 5211 5057 int rc = -ENOMEM; 5212 5058 ··· 5227 5075 } 5228 5076 5229 5077 /* pick the pool */ 5230 - osdc = &rbdc->client->osdc; 5231 - rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name); 5078 + rc = rbd_add_get_pool_id(rbdc, spec->pool_name); 5232 5079 if (rc < 0) 5233 5080 goto err_out_client; 5234 5081 spec->pool_id = (u64)rc; ··· 5538 5387 5539 5388 static void __exit rbd_exit(void) 5540 5389 { 5390 + ida_destroy(&rbd_dev_id_ida); 5541 5391 rbd_sysfs_cleanup(); 5542 5392 if (single_major) 5543 5393 unregister_blkdev(rbd_major, RBD_DRV_NAME);
-6
fs/ceph/acl.c
··· 104 104 umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; 105 105 struct dentry *dentry; 106 106 107 - if (acl) { 108 - ret = posix_acl_valid(acl); 109 - if (ret < 0) 110 - goto out; 111 - } 112 - 113 107 switch (type) { 114 108 case ACL_TYPE_ACCESS: 115 109 name = POSIX_ACL_XATTR_ACCESS;
+7 -10
fs/ceph/addr.c
··· 211 211 SetPageError(page); 212 212 ceph_fscache_readpage_cancel(inode, page); 213 213 goto out; 214 - } else { 215 - if (err < PAGE_CACHE_SIZE) { 216 - /* zero fill remainder of page */ 217 - zero_user_segment(page, err, PAGE_CACHE_SIZE); 218 - } else { 219 - flush_dcache_page(page); 220 - } 221 214 } 222 - SetPageUptodate(page); 215 + if (err < PAGE_CACHE_SIZE) 216 + /* zero fill remainder of page */ 217 + zero_user_segment(page, err, PAGE_CACHE_SIZE); 218 + else 219 + flush_dcache_page(page); 223 220 224 - if (err >= 0) 225 - ceph_readpage_to_fscache(inode, page); 221 + SetPageUptodate(page); 222 + ceph_readpage_to_fscache(inode, page); 226 223 227 224 out: 228 225 return err < 0 ? err : 0;
+131 -113
fs/ceph/caps.c
··· 221 221 return 0; 222 222 } 223 223 224 - static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc, 225 - struct ceph_cap_reservation *ctx) 224 + struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, 225 + struct ceph_cap_reservation *ctx) 226 226 { 227 227 struct ceph_cap *cap = NULL; 228 228 ··· 508 508 * it is < 0. (This is so we can atomically add the cap and add an 509 509 * open file reference to it.) 510 510 */ 511 - int ceph_add_cap(struct inode *inode, 512 - struct ceph_mds_session *session, u64 cap_id, 513 - int fmode, unsigned issued, unsigned wanted, 514 - unsigned seq, unsigned mseq, u64 realmino, int flags, 515 - struct ceph_cap_reservation *caps_reservation) 511 + void ceph_add_cap(struct inode *inode, 512 + struct ceph_mds_session *session, u64 cap_id, 513 + int fmode, unsigned issued, unsigned wanted, 514 + unsigned seq, unsigned mseq, u64 realmino, int flags, 515 + struct ceph_cap **new_cap) 516 516 { 517 517 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 518 518 struct ceph_inode_info *ci = ceph_inode(inode); 519 - struct ceph_cap *new_cap = NULL; 520 519 struct ceph_cap *cap; 521 520 int mds = session->s_mds; 522 521 int actual_wanted; ··· 530 531 if (fmode >= 0) 531 532 wanted |= ceph_caps_for_mode(fmode); 532 533 533 - retry: 534 - spin_lock(&ci->i_ceph_lock); 535 534 cap = __get_cap_for_mds(ci, mds); 536 535 if (!cap) { 537 - if (new_cap) { 538 - cap = new_cap; 539 - new_cap = NULL; 540 - } else { 541 - spin_unlock(&ci->i_ceph_lock); 542 - new_cap = get_cap(mdsc, caps_reservation); 543 - if (new_cap == NULL) 544 - return -ENOMEM; 545 - goto retry; 546 - } 536 + cap = *new_cap; 537 + *new_cap = NULL; 547 538 548 539 cap->issued = 0; 549 540 cap->implemented = 0; ··· 551 562 session->s_nr_caps++; 552 563 spin_unlock(&session->s_cap_lock); 553 564 } else { 554 - if (new_cap) 555 - ceph_put_cap(mdsc, new_cap); 556 - 557 565 /* 558 566 * auth mds of the inode changed. 
we received the cap export 559 567 * message, but still haven't received the cap import message. ··· 612 626 ci->i_auth_cap = cap; 613 627 cap->mds_wanted = wanted; 614 628 } 615 - ci->i_cap_exporting_issued = 0; 616 629 } else { 617 630 WARN_ON(ci->i_auth_cap == cap); 618 631 } ··· 633 648 634 649 if (fmode >= 0) 635 650 __ceph_get_fmode(ci, fmode); 636 - spin_unlock(&ci->i_ceph_lock); 637 - wake_up_all(&ci->i_cap_wq); 638 - return 0; 639 651 } 640 652 641 653 /* ··· 667 685 */ 668 686 int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) 669 687 { 670 - int have = ci->i_snap_caps | ci->i_cap_exporting_issued; 688 + int have = ci->i_snap_caps; 671 689 struct ceph_cap *cap; 672 690 struct rb_node *p; 673 691 ··· 882 900 */ 883 901 static int __ceph_is_any_caps(struct ceph_inode_info *ci) 884 902 { 885 - return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_issued; 903 + return !RB_EMPTY_ROOT(&ci->i_caps); 886 904 } 887 905 888 906 int ceph_is_any_caps(struct inode *inode) ··· 2379 2397 * actually be a revocation if it specifies a smaller cap set.) 2380 2398 * 2381 2399 * caller holds s_mutex and i_ceph_lock, we drop both. 
2382 - * 2383 - * return value: 2384 - * 0 - ok 2385 - * 1 - check_caps on auth cap only (writeback) 2386 - * 2 - check_caps (ack revoke) 2387 2400 */ 2388 - static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, 2401 + static void handle_cap_grant(struct ceph_mds_client *mdsc, 2402 + struct inode *inode, struct ceph_mds_caps *grant, 2403 + void *snaptrace, int snaptrace_len, 2404 + struct ceph_buffer *xattr_buf, 2389 2405 struct ceph_mds_session *session, 2390 - struct ceph_cap *cap, 2391 - struct ceph_buffer *xattr_buf) 2392 - __releases(ci->i_ceph_lock) 2406 + struct ceph_cap *cap, int issued) 2407 + __releases(ci->i_ceph_lock) 2393 2408 { 2394 2409 struct ceph_inode_info *ci = ceph_inode(inode); 2395 2410 int mds = session->s_mds; 2396 2411 int seq = le32_to_cpu(grant->seq); 2397 2412 int newcaps = le32_to_cpu(grant->caps); 2398 - int issued, implemented, used, wanted, dirty; 2413 + int used, wanted, dirty; 2399 2414 u64 size = le64_to_cpu(grant->size); 2400 2415 u64 max_size = le64_to_cpu(grant->max_size); 2401 2416 struct timespec mtime, atime, ctime; 2402 2417 int check_caps = 0; 2403 - int wake = 0; 2404 - int writeback = 0; 2405 - int queue_invalidate = 0; 2406 - int deleted_inode = 0; 2407 - int queue_revalidate = 0; 2418 + bool wake = 0; 2419 + bool writeback = 0; 2420 + bool queue_trunc = 0; 2421 + bool queue_invalidate = 0; 2422 + bool queue_revalidate = 0; 2423 + bool deleted_inode = 0; 2408 2424 2409 2425 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2410 2426 inode, cap, mds, seq, ceph_cap_string(newcaps)); ··· 2446 2466 } 2447 2467 2448 2468 /* side effects now are allowed */ 2449 - 2450 - issued = __ceph_caps_issued(ci, &implemented); 2451 - issued |= implemented | __ceph_caps_dirty(ci); 2452 - 2453 2469 cap->cap_gen = session->s_cap_gen; 2454 2470 cap->seq = seq; 2455 2471 2456 2472 __check_cap_issue(ci, cap, newcaps); 2457 2473 2458 - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 2474 + if ((newcaps & 
CEPH_CAP_AUTH_SHARED) && 2475 + (issued & CEPH_CAP_AUTH_EXCL) == 0) { 2459 2476 inode->i_mode = le32_to_cpu(grant->mode); 2460 2477 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); 2461 2478 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ··· 2461 2484 from_kgid(&init_user_ns, inode->i_gid)); 2462 2485 } 2463 2486 2464 - if ((issued & CEPH_CAP_LINK_EXCL) == 0) { 2487 + if ((newcaps & CEPH_CAP_AUTH_SHARED) && 2488 + (issued & CEPH_CAP_LINK_EXCL) == 0) { 2465 2489 set_nlink(inode, le32_to_cpu(grant->nlink)); 2466 2490 if (inode->i_nlink == 0 && 2467 2491 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) ··· 2489 2511 if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) 2490 2512 queue_revalidate = 1; 2491 2513 2492 - /* size/ctime/mtime/atime? */ 2493 - ceph_fill_file_size(inode, issued, 2494 - le32_to_cpu(grant->truncate_seq), 2495 - le64_to_cpu(grant->truncate_size), size); 2496 - ceph_decode_timespec(&mtime, &grant->mtime); 2497 - ceph_decode_timespec(&atime, &grant->atime); 2498 - ceph_decode_timespec(&ctime, &grant->ctime); 2499 - ceph_fill_file_time(inode, issued, 2500 - le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, 2501 - &atime); 2514 + if (newcaps & CEPH_CAP_ANY_RD) { 2515 + /* ctime/mtime/atime? */ 2516 + ceph_decode_timespec(&mtime, &grant->mtime); 2517 + ceph_decode_timespec(&atime, &grant->atime); 2518 + ceph_decode_timespec(&ctime, &grant->ctime); 2519 + ceph_fill_file_time(inode, issued, 2520 + le32_to_cpu(grant->time_warp_seq), 2521 + &ctime, &mtime, &atime); 2522 + } 2502 2523 2503 - 2504 - /* file layout may have changed */ 2505 - ci->i_layout = grant->layout; 2506 - 2507 - /* max size increase? 
*/ 2508 - if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2509 - dout("max_size %lld -> %llu\n", ci->i_max_size, max_size); 2510 - ci->i_max_size = max_size; 2511 - if (max_size >= ci->i_wanted_max_size) { 2512 - ci->i_wanted_max_size = 0; /* reset */ 2513 - ci->i_requested_max_size = 0; 2524 + if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { 2525 + /* file layout may have changed */ 2526 + ci->i_layout = grant->layout; 2527 + /* size/truncate_seq? */ 2528 + queue_trunc = ceph_fill_file_size(inode, issued, 2529 + le32_to_cpu(grant->truncate_seq), 2530 + le64_to_cpu(grant->truncate_size), 2531 + size); 2532 + /* max size increase? */ 2533 + if (ci->i_auth_cap == cap && max_size != ci->i_max_size) { 2534 + dout("max_size %lld -> %llu\n", 2535 + ci->i_max_size, max_size); 2536 + ci->i_max_size = max_size; 2537 + if (max_size >= ci->i_wanted_max_size) { 2538 + ci->i_wanted_max_size = 0; /* reset */ 2539 + ci->i_requested_max_size = 0; 2540 + } 2541 + wake = 1; 2514 2542 } 2515 - wake = 1; 2516 2543 } 2517 2544 2518 2545 /* check cap bits */ ··· 2578 2595 2579 2596 spin_unlock(&ci->i_ceph_lock); 2580 2597 2598 + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2599 + down_write(&mdsc->snap_rwsem); 2600 + ceph_update_snap_trace(mdsc, snaptrace, 2601 + snaptrace + snaptrace_len, false); 2602 + downgrade_write(&mdsc->snap_rwsem); 2603 + kick_flushing_inode_caps(mdsc, session, inode); 2604 + up_read(&mdsc->snap_rwsem); 2605 + if (newcaps & ~issued) 2606 + wake = 1; 2607 + } 2608 + 2609 + if (queue_trunc) { 2610 + ceph_queue_vmtruncate(inode); 2611 + ceph_queue_revalidate(inode); 2612 + } else if (queue_revalidate) 2613 + ceph_queue_revalidate(inode); 2614 + 2581 2615 if (writeback) 2582 2616 /* 2583 2617 * queue inode for writeback: we can't actually call ··· 2606 2606 ceph_queue_invalidate(inode); 2607 2607 if (deleted_inode) 2608 2608 invalidate_aliases(inode); 2609 - if (queue_revalidate) 2610 - ceph_queue_revalidate(inode); 2611 2609 if 
(wake) 2612 2610 wake_up_all(&ci->i_cap_wq); 2613 2611 ··· 2782 2784 { 2783 2785 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2784 2786 struct ceph_mds_session *tsession = NULL; 2785 - struct ceph_cap *cap, *tcap; 2787 + struct ceph_cap *cap, *tcap, *new_cap = NULL; 2786 2788 struct ceph_inode_info *ci = ceph_inode(inode); 2787 2789 u64 t_cap_id; 2788 2790 unsigned mseq = le32_to_cpu(ex->migrate_seq); ··· 2805 2807 retry: 2806 2808 spin_lock(&ci->i_ceph_lock); 2807 2809 cap = __get_cap_for_mds(ci, mds); 2808 - if (!cap) 2810 + if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) 2809 2811 goto out_unlock; 2810 2812 2811 2813 if (target < 0) { ··· 2844 2846 } 2845 2847 __ceph_remove_cap(cap, false); 2846 2848 goto out_unlock; 2847 - } 2848 - 2849 - if (tsession) { 2850 - int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; 2851 - spin_unlock(&ci->i_ceph_lock); 2849 + } else if (tsession) { 2852 2850 /* add placeholder for the export tagert */ 2851 + int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; 2853 2852 ceph_add_cap(inode, tsession, t_cap_id, -1, issued, 0, 2854 - t_seq - 1, t_mseq, (u64)-1, flag, NULL); 2855 - goto retry; 2853 + t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); 2854 + 2855 + __ceph_remove_cap(cap, false); 2856 + goto out_unlock; 2856 2857 } 2857 2858 2858 2859 spin_unlock(&ci->i_ceph_lock); ··· 2870 2873 SINGLE_DEPTH_NESTING); 2871 2874 } 2872 2875 ceph_add_cap_releases(mdsc, tsession); 2876 + new_cap = ceph_get_cap(mdsc, NULL); 2873 2877 } else { 2874 2878 WARN_ON(1); 2875 2879 tsession = NULL; ··· 2885 2887 mutex_unlock(&tsession->s_mutex); 2886 2888 ceph_put_mds_session(tsession); 2887 2889 } 2890 + if (new_cap) 2891 + ceph_put_cap(mdsc, new_cap); 2888 2892 } 2889 2893 2890 2894 /* 2891 - * Handle cap IMPORT. If there are temp bits from an older EXPORT, 2892 - * clean them up. 2895 + * Handle cap IMPORT. 2893 2896 * 2894 - * caller holds s_mutex. 2897 + * caller holds s_mutex. 
acquires i_ceph_lock 2895 2898 */ 2896 2899 static void handle_cap_import(struct ceph_mds_client *mdsc, 2897 2900 struct inode *inode, struct ceph_mds_caps *im, 2898 2901 struct ceph_mds_cap_peer *ph, 2899 2902 struct ceph_mds_session *session, 2900 - void *snaptrace, int snaptrace_len) 2903 + struct ceph_cap **target_cap, int *old_issued) 2904 + __acquires(ci->i_ceph_lock) 2901 2905 { 2902 2906 struct ceph_inode_info *ci = ceph_inode(inode); 2903 - struct ceph_cap *cap; 2907 + struct ceph_cap *cap, *ocap, *new_cap = NULL; 2904 2908 int mds = session->s_mds; 2905 - unsigned issued = le32_to_cpu(im->caps); 2909 + int issued; 2910 + unsigned caps = le32_to_cpu(im->caps); 2906 2911 unsigned wanted = le32_to_cpu(im->wanted); 2907 2912 unsigned seq = le32_to_cpu(im->seq); 2908 2913 unsigned mseq = le32_to_cpu(im->migrate_seq); ··· 2925 2924 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n", 2926 2925 inode, ci, mds, mseq, peer); 2927 2926 2927 + retry: 2928 2928 spin_lock(&ci->i_ceph_lock); 2929 - cap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; 2930 - if (cap && cap->cap_id == p_cap_id) { 2929 + cap = __get_cap_for_mds(ci, mds); 2930 + if (!cap) { 2931 + if (!new_cap) { 2932 + spin_unlock(&ci->i_ceph_lock); 2933 + new_cap = ceph_get_cap(mdsc, NULL); 2934 + goto retry; 2935 + } 2936 + cap = new_cap; 2937 + } else { 2938 + if (new_cap) { 2939 + ceph_put_cap(mdsc, new_cap); 2940 + new_cap = NULL; 2941 + } 2942 + } 2943 + 2944 + __ceph_caps_issued(ci, &issued); 2945 + issued |= __ceph_caps_dirty(ci); 2946 + 2947 + ceph_add_cap(inode, session, cap_id, -1, caps, wanted, seq, mseq, 2948 + realmino, CEPH_CAP_FLAG_AUTH, &new_cap); 2949 + 2950 + ocap = peer >= 0 ? 
__get_cap_for_mds(ci, peer) : NULL; 2951 + if (ocap && ocap->cap_id == p_cap_id) { 2931 2952 dout(" remove export cap %p mds%d flags %d\n", 2932 - cap, peer, ph->flags); 2953 + ocap, peer, ph->flags); 2933 2954 if ((ph->flags & CEPH_CAP_FLAG_AUTH) && 2934 - (cap->seq != le32_to_cpu(ph->seq) || 2935 - cap->mseq != le32_to_cpu(ph->mseq))) { 2955 + (ocap->seq != le32_to_cpu(ph->seq) || 2956 + ocap->mseq != le32_to_cpu(ph->mseq))) { 2936 2957 pr_err("handle_cap_import: mismatched seq/mseq: " 2937 2958 "ino (%llx.%llx) mds%d seq %d mseq %d " 2938 2959 "importer mds%d has peer seq %d mseq %d\n", 2939 - ceph_vinop(inode), peer, cap->seq, 2940 - cap->mseq, mds, le32_to_cpu(ph->seq), 2960 + ceph_vinop(inode), peer, ocap->seq, 2961 + ocap->mseq, mds, le32_to_cpu(ph->seq), 2941 2962 le32_to_cpu(ph->mseq)); 2942 2963 } 2943 - ci->i_cap_exporting_issued = cap->issued; 2944 - __ceph_remove_cap(cap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); 2964 + __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); 2945 2965 } 2946 2966 2947 2967 /* make sure we re-request max_size, if necessary */ 2948 2968 ci->i_wanted_max_size = 0; 2949 2969 ci->i_requested_max_size = 0; 2950 - spin_unlock(&ci->i_ceph_lock); 2951 2970 2952 - down_write(&mdsc->snap_rwsem); 2953 - ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len, 2954 - false); 2955 - downgrade_write(&mdsc->snap_rwsem); 2956 - ceph_add_cap(inode, session, cap_id, -1, 2957 - issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, 2958 - NULL /* no caps context */); 2959 - kick_flushing_inode_caps(mdsc, session, inode); 2960 - up_read(&mdsc->snap_rwsem); 2961 - 2971 + *old_issued = issued; 2972 + *target_cap = cap; 2962 2973 } 2963 2974 2964 2975 /* ··· 2990 2977 struct ceph_mds_caps *h; 2991 2978 struct ceph_mds_cap_peer *peer = NULL; 2992 2979 int mds = session->s_mds; 2993 - int op; 2980 + int op, issued; 2994 2981 u32 seq, mseq; 2995 2982 struct ceph_vino vino; 2996 2983 u64 cap_id; ··· 3082 3069 3083 3070 case 
CEPH_CAP_OP_IMPORT: 3084 3071 handle_cap_import(mdsc, inode, h, peer, session, 3085 - snaptrace, snaptrace_len); 3072 + &cap, &issued); 3073 + handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3074 + msg->middle, session, cap, issued); 3075 + goto done_unlocked; 3086 3076 } 3087 3077 3088 3078 /* the rest require a cap */ ··· 3102 3086 switch (op) { 3103 3087 case CEPH_CAP_OP_REVOKE: 3104 3088 case CEPH_CAP_OP_GRANT: 3105 - case CEPH_CAP_OP_IMPORT: 3106 - handle_cap_grant(inode, h, session, cap, msg->middle); 3089 + __ceph_caps_issued(ci, &issued); 3090 + issued |= __ceph_caps_dirty(ci); 3091 + handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, 3092 + session, cap, issued); 3107 3093 goto done_unlocked; 3108 3094 3109 3095 case CEPH_CAP_OP_FLUSH_ACK:
+1 -1
fs/ceph/export.c
··· 169 169 return dentry; 170 170 } 171 171 172 - struct dentry *ceph_get_parent(struct dentry *child) 172 + static struct dentry *ceph_get_parent(struct dentry *child) 173 173 { 174 174 /* don't re-export snaps */ 175 175 if (ceph_snap(child->d_inode) != CEPH_NOSNAP)
+153 -92
fs/ceph/inode.c
··· 10 10 #include <linux/writeback.h> 11 11 #include <linux/vmalloc.h> 12 12 #include <linux/posix_acl.h> 13 + #include <linux/random.h> 13 14 14 15 #include "super.h" 15 16 #include "mds_client.h" ··· 180 179 * specified, copy the frag delegation info to the caller if 181 180 * it is present. 182 181 */ 183 - u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 184 - struct ceph_inode_frag *pfrag, 185 - int *found) 182 + static u32 __ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 183 + struct ceph_inode_frag *pfrag, int *found) 186 184 { 187 185 u32 t = ceph_frag_make(0, 0); 188 186 struct ceph_inode_frag *frag; ··· 191 191 if (found) 192 192 *found = 0; 193 193 194 - mutex_lock(&ci->i_fragtree_mutex); 195 194 while (1) { 196 195 WARN_ON(!ceph_frag_contains_value(t, v)); 197 196 frag = __ceph_find_frag(ci, t); ··· 219 220 } 220 221 dout("choose_frag(%x) = %x\n", v, t); 221 222 222 - mutex_unlock(&ci->i_fragtree_mutex); 223 223 return t; 224 + } 225 + 226 + u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v, 227 + struct ceph_inode_frag *pfrag, int *found) 228 + { 229 + u32 ret; 230 + mutex_lock(&ci->i_fragtree_mutex); 231 + ret = __ceph_choose_frag(ci, v, pfrag, found); 232 + mutex_unlock(&ci->i_fragtree_mutex); 233 + return ret; 224 234 } 225 235 226 236 /* ··· 245 237 u32 id = le32_to_cpu(dirinfo->frag); 246 238 int mds = le32_to_cpu(dirinfo->auth); 247 239 int ndist = le32_to_cpu(dirinfo->ndist); 240 + int diri_auth = -1; 248 241 int i; 249 242 int err = 0; 250 243 244 + spin_lock(&ci->i_ceph_lock); 245 + if (ci->i_auth_cap) 246 + diri_auth = ci->i_auth_cap->mds; 247 + spin_unlock(&ci->i_ceph_lock); 248 + 251 249 mutex_lock(&ci->i_fragtree_mutex); 252 - if (ndist == 0) { 250 + if (ndist == 0 && mds == diri_auth) { 253 251 /* no delegation info needed. 
*/ 254 252 frag = __ceph_find_frag(ci, id); 255 253 if (!frag) ··· 300 286 return err; 301 287 } 302 288 289 + static int ceph_fill_fragtree(struct inode *inode, 290 + struct ceph_frag_tree_head *fragtree, 291 + struct ceph_mds_reply_dirfrag *dirinfo) 292 + { 293 + struct ceph_inode_info *ci = ceph_inode(inode); 294 + struct ceph_inode_frag *frag; 295 + struct rb_node *rb_node; 296 + int i; 297 + u32 id, nsplits; 298 + bool update = false; 299 + 300 + mutex_lock(&ci->i_fragtree_mutex); 301 + nsplits = le32_to_cpu(fragtree->nsplits); 302 + if (nsplits) { 303 + i = prandom_u32() % nsplits; 304 + id = le32_to_cpu(fragtree->splits[i].frag); 305 + if (!__ceph_find_frag(ci, id)) 306 + update = true; 307 + } else if (!RB_EMPTY_ROOT(&ci->i_fragtree)) { 308 + rb_node = rb_first(&ci->i_fragtree); 309 + frag = rb_entry(rb_node, struct ceph_inode_frag, node); 310 + if (frag->frag != ceph_frag_make(0, 0) || rb_next(rb_node)) 311 + update = true; 312 + } 313 + if (!update && dirinfo) { 314 + id = le32_to_cpu(dirinfo->frag); 315 + if (id != __ceph_choose_frag(ci, id, NULL, NULL)) 316 + update = true; 317 + } 318 + if (!update) 319 + goto out_unlock; 320 + 321 + dout("fill_fragtree %llx.%llx\n", ceph_vinop(inode)); 322 + rb_node = rb_first(&ci->i_fragtree); 323 + for (i = 0; i < nsplits; i++) { 324 + id = le32_to_cpu(fragtree->splits[i].frag); 325 + frag = NULL; 326 + while (rb_node) { 327 + frag = rb_entry(rb_node, struct ceph_inode_frag, node); 328 + if (ceph_frag_compare(frag->frag, id) >= 0) { 329 + if (frag->frag != id) 330 + frag = NULL; 331 + else 332 + rb_node = rb_next(rb_node); 333 + break; 334 + } 335 + rb_node = rb_next(rb_node); 336 + rb_erase(&frag->node, &ci->i_fragtree); 337 + kfree(frag); 338 + frag = NULL; 339 + } 340 + if (!frag) { 341 + frag = __get_or_create_frag(ci, id); 342 + if (IS_ERR(frag)) 343 + continue; 344 + } 345 + frag->split_by = le32_to_cpu(fragtree->splits[i].by); 346 + dout(" frag %x split by %d\n", frag->frag, frag->split_by); 347 + } 348 + 
while (rb_node) { 349 + frag = rb_entry(rb_node, struct ceph_inode_frag, node); 350 + rb_node = rb_next(rb_node); 351 + rb_erase(&frag->node, &ci->i_fragtree); 352 + kfree(frag); 353 + } 354 + out_unlock: 355 + mutex_unlock(&ci->i_fragtree_mutex); 356 + return 0; 357 + } 303 358 304 359 /* 305 360 * initialize a newly allocated inode. ··· 424 341 INIT_LIST_HEAD(&ci->i_cap_snaps); 425 342 ci->i_head_snapc = NULL; 426 343 ci->i_snap_caps = 0; 427 - ci->i_cap_exporting_issued = 0; 428 344 429 345 for (i = 0; i < CEPH_FILE_MODE_NUM; i++) 430 346 ci->i_nr_by_mode[i] = 0; ··· 489 407 490 408 /* 491 409 * we may still have a snap_realm reference if there are stray 492 - * caps in i_cap_exporting_issued or i_snap_caps. 410 + * caps in i_snap_caps. 493 411 */ 494 412 if (ci->i_snap_realm) { 495 413 struct ceph_mds_client *mdsc = ··· 664 582 unsigned long ttl_from, int cap_fmode, 665 583 struct ceph_cap_reservation *caps_reservation) 666 584 { 585 + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 667 586 struct ceph_mds_reply_inode *info = iinfo->in; 668 587 struct ceph_inode_info *ci = ceph_inode(inode); 669 - int i; 670 - int issued = 0, implemented; 588 + int issued = 0, implemented, new_issued; 671 589 struct timespec mtime, atime, ctime; 672 - u32 nsplits; 673 - struct ceph_inode_frag *frag; 674 - struct rb_node *rb_node; 675 590 struct ceph_buffer *xattr_blob = NULL; 591 + struct ceph_cap *new_cap = NULL; 676 592 int err = 0; 677 - int queue_trunc = 0; 593 + bool wake = false; 594 + bool queue_trunc = false; 595 + bool new_version = false; 678 596 679 597 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", 680 598 inode, ceph_vinop(inode), le64_to_cpu(info->version), 681 599 ci->i_version); 600 + 601 + /* prealloc new cap struct */ 602 + if (info->cap.caps && ceph_snap(inode) == CEPH_NOSNAP) 603 + new_cap = ceph_get_cap(mdsc, caps_reservation); 682 604 683 605 /* 684 606 * prealloc xattr data, if it looks like we'll need it. 
only ··· 709 623 * 3 2 skip 710 624 * 3 3 update 711 625 */ 712 - if (le64_to_cpu(info->version) > 0 && 713 - (ci->i_version & ~1) >= le64_to_cpu(info->version)) 714 - goto no_change; 715 - 626 + if (ci->i_version == 0 || 627 + ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 628 + le64_to_cpu(info->version) > (ci->i_version & ~1))) 629 + new_version = true; 630 + 716 631 issued = __ceph_caps_issued(ci, &implemented); 717 632 issued |= implemented | __ceph_caps_dirty(ci); 633 + new_issued = ~issued & le32_to_cpu(info->cap.caps); 718 634 719 635 /* update inode */ 720 636 ci->i_version = le64_to_cpu(info->version); 721 637 inode->i_version++; 722 638 inode->i_rdev = le32_to_cpu(info->rdev); 639 + inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 723 640 724 - if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { 641 + if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && 642 + (issued & CEPH_CAP_AUTH_EXCL) == 0) { 725 643 inode->i_mode = le32_to_cpu(info->mode); 726 644 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); 727 645 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); ··· 734 644 from_kgid(&init_user_ns, inode->i_gid)); 735 645 } 736 646 737 - if ((issued & CEPH_CAP_LINK_EXCL) == 0) 647 + if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) && 648 + (issued & CEPH_CAP_LINK_EXCL) == 0) 738 649 set_nlink(inode, le32_to_cpu(info->nlink)); 739 650 740 - /* be careful with mtime, atime, size */ 741 - ceph_decode_timespec(&atime, &info->atime); 742 - ceph_decode_timespec(&mtime, &info->mtime); 743 - ceph_decode_timespec(&ctime, &info->ctime); 744 - queue_trunc = ceph_fill_file_size(inode, issued, 745 - le32_to_cpu(info->truncate_seq), 746 - le64_to_cpu(info->truncate_size), 747 - le64_to_cpu(info->size)); 748 - ceph_fill_file_time(inode, issued, 749 - le32_to_cpu(info->time_warp_seq), 750 - &ctime, &mtime, &atime); 651 + if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { 652 + /* be careful with mtime, atime, size */ 653 
+ ceph_decode_timespec(&atime, &info->atime); 654 + ceph_decode_timespec(&mtime, &info->mtime); 655 + ceph_decode_timespec(&ctime, &info->ctime); 656 + ceph_fill_file_time(inode, issued, 657 + le32_to_cpu(info->time_warp_seq), 658 + &ctime, &mtime, &atime); 659 + } 751 660 752 - ci->i_layout = info->layout; 753 - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 661 + if (new_version || 662 + (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { 663 + ci->i_layout = info->layout; 664 + queue_trunc = ceph_fill_file_size(inode, issued, 665 + le32_to_cpu(info->truncate_seq), 666 + le64_to_cpu(info->truncate_size), 667 + le64_to_cpu(info->size)); 668 + /* only update max_size on auth cap */ 669 + if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 670 + ci->i_max_size != le64_to_cpu(info->max_size)) { 671 + dout("max_size %lld -> %llu\n", ci->i_max_size, 672 + le64_to_cpu(info->max_size)); 673 + ci->i_max_size = le64_to_cpu(info->max_size); 674 + } 675 + } 754 676 755 677 /* xattrs */ 756 678 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. 
*/ ··· 847 745 dout(" marking %p complete (empty)\n", inode); 848 746 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 849 747 } 850 - no_change: 851 - /* only update max_size on auth cap */ 852 - if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) && 853 - ci->i_max_size != le64_to_cpu(info->max_size)) { 854 - dout("max_size %lld -> %llu\n", ci->i_max_size, 855 - le64_to_cpu(info->max_size)); 856 - ci->i_max_size = le64_to_cpu(info->max_size); 857 - } 858 - 859 - spin_unlock(&ci->i_ceph_lock); 860 - 861 - /* queue truncate if we saw i_size decrease */ 862 - if (queue_trunc) 863 - ceph_queue_vmtruncate(inode); 864 - 865 - /* populate frag tree */ 866 - /* FIXME: move me up, if/when version reflects fragtree changes */ 867 - nsplits = le32_to_cpu(info->fragtree.nsplits); 868 - mutex_lock(&ci->i_fragtree_mutex); 869 - rb_node = rb_first(&ci->i_fragtree); 870 - for (i = 0; i < nsplits; i++) { 871 - u32 id = le32_to_cpu(info->fragtree.splits[i].frag); 872 - frag = NULL; 873 - while (rb_node) { 874 - frag = rb_entry(rb_node, struct ceph_inode_frag, node); 875 - if (ceph_frag_compare(frag->frag, id) >= 0) { 876 - if (frag->frag != id) 877 - frag = NULL; 878 - else 879 - rb_node = rb_next(rb_node); 880 - break; 881 - } 882 - rb_node = rb_next(rb_node); 883 - rb_erase(&frag->node, &ci->i_fragtree); 884 - kfree(frag); 885 - frag = NULL; 886 - } 887 - if (!frag) { 888 - frag = __get_or_create_frag(ci, id); 889 - if (IS_ERR(frag)) 890 - continue; 891 - } 892 - frag->split_by = le32_to_cpu(info->fragtree.splits[i].by); 893 - dout(" frag %x split by %d\n", frag->frag, frag->split_by); 894 - } 895 - while (rb_node) { 896 - frag = rb_entry(rb_node, struct ceph_inode_frag, node); 897 - rb_node = rb_next(rb_node); 898 - rb_erase(&frag->node, &ci->i_fragtree); 899 - kfree(frag); 900 - } 901 - mutex_unlock(&ci->i_fragtree_mutex); 902 748 903 749 /* were we issued a capability? 
*/ 904 750 if (info->cap.caps) { ··· 859 809 le32_to_cpu(info->cap.seq), 860 810 le32_to_cpu(info->cap.mseq), 861 811 le64_to_cpu(info->cap.realm), 862 - info->cap.flags, 863 - caps_reservation); 812 + info->cap.flags, &new_cap); 813 + wake = true; 864 814 } else { 865 - spin_lock(&ci->i_ceph_lock); 866 815 dout(" %p got snap_caps %s\n", inode, 867 816 ceph_cap_string(le32_to_cpu(info->cap.caps))); 868 817 ci->i_snap_caps |= le32_to_cpu(info->cap.caps); 869 818 if (cap_fmode >= 0) 870 819 __ceph_get_fmode(ci, cap_fmode); 871 - spin_unlock(&ci->i_ceph_lock); 872 820 } 873 821 } else if (cap_fmode >= 0) { 874 822 pr_warn("mds issued no caps on %llx.%llx\n", 875 823 ceph_vinop(inode)); 876 824 __ceph_get_fmode(ci, cap_fmode); 877 825 } 826 + spin_unlock(&ci->i_ceph_lock); 827 + 828 + if (wake) 829 + wake_up_all(&ci->i_cap_wq); 830 + 831 + /* queue truncate if we saw i_size decrease */ 832 + if (queue_trunc) 833 + ceph_queue_vmtruncate(inode); 834 + 835 + /* populate frag tree */ 836 + if (S_ISDIR(inode->i_mode)) 837 + ceph_fill_fragtree(inode, &info->fragtree, dirinfo); 878 838 879 839 /* update delegation info? */ 880 840 if (dirinfo) 881 841 ceph_fill_dirfrag(inode, dirinfo); 882 842 883 843 err = 0; 884 - 885 844 out: 845 + if (new_cap) 846 + ceph_put_cap(mdsc, new_cap); 886 847 if (xattr_blob) 887 848 ceph_buffer_put(xattr_blob); 888 849 return err; ··· 1546 1485 orig_gen = ci->i_rdcache_gen; 1547 1486 spin_unlock(&ci->i_ceph_lock); 1548 1487 1549 - truncate_inode_pages(inode->i_mapping, 0); 1488 + truncate_pagecache(inode, 0); 1550 1489 1551 1490 spin_lock(&ci->i_ceph_lock); 1552 1491 if (orig_gen == ci->i_rdcache_gen && ··· 1649 1588 ci->i_truncate_pending, to); 1650 1589 spin_unlock(&ci->i_ceph_lock); 1651 1590 1652 - truncate_inode_pages(inode->i_mapping, to); 1591 + truncate_pagecache(inode, to); 1653 1592 1654 1593 spin_lock(&ci->i_ceph_lock); 1655 1594 if (to == ci->i_truncate_size) {
+8 -1
fs/ceph/mds_client.c
··· 1558 1558 init_completion(&req->r_safe_completion); 1559 1559 INIT_LIST_HEAD(&req->r_unsafe_item); 1560 1560 1561 + req->r_stamp = CURRENT_TIME; 1562 + 1561 1563 req->r_op = op; 1562 1564 req->r_direct_mode = mode; 1563 1565 return req; ··· 1785 1783 } 1786 1784 1787 1785 len = sizeof(*head) + 1788 - pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)); 1786 + pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) + 1787 + sizeof(struct timespec); 1789 1788 1790 1789 /* calculate (max) length for cap releases */ 1791 1790 len += sizeof(struct ceph_mds_request_release) * ··· 1803 1800 goto out_free2; 1804 1801 } 1805 1802 1803 + msg->hdr.version = 2; 1806 1804 msg->hdr.tid = cpu_to_le64(req->r_tid); 1807 1805 1808 1806 head = msg->front.iov_base; ··· 1839 1835 req->r_old_dentry->d_inode, 1840 1836 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); 1841 1837 head->num_releases = cpu_to_le16(releases); 1838 + 1839 + /* time stamp */ 1840 + ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1842 1841 1843 1842 BUG_ON(p > end); 1844 1843 msg->front.iov_len = p - msg->front.iov_base;
+1
fs/ceph/mds_client.h
··· 194 194 int r_fmode; /* file mode, if expecting cap */ 195 195 kuid_t r_uid; 196 196 kgid_t r_gid; 197 + struct timespec r_stamp; 197 198 198 199 /* for choosing which mds to send this request to */ 199 200 int r_direct_mode;
+7 -6
fs/ceph/super.h
··· 292 292 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or 293 293 dirty|flushing caps */ 294 294 unsigned i_snap_caps; /* cap bits for snapped files */ 295 - unsigned i_cap_exporting_issued; 296 295 297 296 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ 298 297 ··· 774 775 extern const char *ceph_cap_string(int c); 775 776 extern void ceph_handle_caps(struct ceph_mds_session *session, 776 777 struct ceph_msg *msg); 777 - extern int ceph_add_cap(struct inode *inode, 778 - struct ceph_mds_session *session, u64 cap_id, 779 - int fmode, unsigned issued, unsigned wanted, 780 - unsigned cap, unsigned seq, u64 realmino, int flags, 781 - struct ceph_cap_reservation *caps_reservation); 778 + extern struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, 779 + struct ceph_cap_reservation *ctx); 780 + extern void ceph_add_cap(struct inode *inode, 781 + struct ceph_mds_session *session, u64 cap_id, 782 + int fmode, unsigned issued, unsigned wanted, 783 + unsigned cap, unsigned seq, u64 realmino, int flags, 784 + struct ceph_cap **new_cap); 782 785 extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); 783 786 extern void ceph_put_cap(struct ceph_mds_client *mdsc, 784 787 struct ceph_cap *cap);
+2
include/linux/ceph/ceph_fs.h
··· 625 625 CEPH_CAP_LINK_EXCL | \ 626 626 CEPH_CAP_XATTR_EXCL | \ 627 627 CEPH_CAP_FILE_EXCL) 628 + #define CEPH_CAP_ANY_FILE_RD (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE | \ 629 + CEPH_CAP_FILE_SHARED) 628 630 #define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \ 629 631 CEPH_CAP_FILE_EXCL) 630 632 #define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+8 -3
include/linux/ceph/mon_client.h
··· 40 40 }; 41 41 42 42 /* 43 - * ceph_mon_generic_request is being used for the statfs and poolop requests 44 - * which are bening done a bit differently because we need to get data back 45 - * to the caller 43 + * ceph_mon_generic_request is being used for the statfs, poolop and 44 + * mon_get_version requests which are being done a bit differently 45 + * because we need to get data back to the caller 46 46 */ 47 47 struct ceph_mon_generic_request { 48 48 struct kref kref; ··· 104 104 extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); 105 105 106 106 extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); 107 + extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 108 + unsigned long timeout); 107 109 108 110 extern int ceph_monc_do_statfs(struct ceph_mon_client *monc, 109 111 struct ceph_statfs *buf); 112 + 113 + extern int ceph_monc_do_get_version(struct ceph_mon_client *monc, 114 + const char *what, u64 *newest); 110 115 111 116 extern int ceph_monc_open_session(struct ceph_mon_client *monc); 112 117
+2
net/ceph/ceph_common.c
··· 72 72 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack"; 73 73 case CEPH_MSG_STATFS: return "statfs"; 74 74 case CEPH_MSG_STATFS_REPLY: return "statfs_reply"; 75 + case CEPH_MSG_MON_GET_VERSION: return "mon_get_version"; 76 + case CEPH_MSG_MON_GET_VERSION_REPLY: return "mon_get_version_reply"; 75 77 case CEPH_MSG_MDS_MAP: return "mds_map"; 76 78 case CEPH_MSG_CLIENT_SESSION: return "client_session"; 77 79 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+6 -2
net/ceph/debugfs.c
··· 126 126 req = rb_entry(rp, struct ceph_mon_generic_request, node); 127 127 op = le16_to_cpu(req->request->hdr.type); 128 128 if (op == CEPH_MSG_STATFS) 129 - seq_printf(s, "%lld statfs\n", req->tid); 129 + seq_printf(s, "%llu statfs\n", req->tid); 130 + else if (op == CEPH_MSG_POOLOP) 131 + seq_printf(s, "%llu poolop\n", req->tid); 132 + else if (op == CEPH_MSG_MON_GET_VERSION) 133 + seq_printf(s, "%llu mon_get_version", req->tid); 130 134 else 131 - seq_printf(s, "%lld unknown\n", req->tid); 135 + seq_printf(s, "%llu unknown\n", req->tid); 132 136 } 133 137 134 138 mutex_unlock(&monc->mutex);
+145 -5
net/ceph/mon_client.c
··· 296 296 __send_subscribe(monc); 297 297 mutex_unlock(&monc->mutex); 298 298 } 299 + EXPORT_SYMBOL(ceph_monc_request_next_osdmap); 300 + 301 + int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, 302 + unsigned long timeout) 303 + { 304 + unsigned long started = jiffies; 305 + int ret; 306 + 307 + mutex_lock(&monc->mutex); 308 + while (monc->have_osdmap < epoch) { 309 + mutex_unlock(&monc->mutex); 310 + 311 + if (timeout != 0 && time_after_eq(jiffies, started + timeout)) 312 + return -ETIMEDOUT; 313 + 314 + ret = wait_event_interruptible_timeout(monc->client->auth_wq, 315 + monc->have_osdmap >= epoch, timeout); 316 + if (ret < 0) 317 + return ret; 318 + 319 + mutex_lock(&monc->mutex); 320 + } 321 + 322 + mutex_unlock(&monc->mutex); 323 + return 0; 324 + } 325 + EXPORT_SYMBOL(ceph_monc_wait_osdmap); 299 326 300 327 /* 301 328 * ··· 504 477 return m; 505 478 } 506 479 507 - static int do_generic_request(struct ceph_mon_client *monc, 508 - struct ceph_mon_generic_request *req) 480 + static int __do_generic_request(struct ceph_mon_client *monc, u64 tid, 481 + struct ceph_mon_generic_request *req) 509 482 { 510 483 int err; 511 484 512 485 /* register request */ 513 - mutex_lock(&monc->mutex); 514 - req->tid = ++monc->last_tid; 486 + req->tid = tid != 0 ? 
tid : ++monc->last_tid; 515 487 req->request->hdr.tid = cpu_to_le64(req->tid); 516 488 __insert_generic_request(monc, req); 517 489 monc->num_generic_requests++; ··· 522 496 mutex_lock(&monc->mutex); 523 497 rb_erase(&req->node, &monc->generic_request_tree); 524 498 monc->num_generic_requests--; 525 - mutex_unlock(&monc->mutex); 526 499 527 500 if (!err) 528 501 err = req->result; 502 + return err; 503 + } 504 + 505 + static int do_generic_request(struct ceph_mon_client *monc, 506 + struct ceph_mon_generic_request *req) 507 + { 508 + int err; 509 + 510 + mutex_lock(&monc->mutex); 511 + err = __do_generic_request(monc, 0, req); 512 + mutex_unlock(&monc->mutex); 513 + 529 514 return err; 530 515 } 531 516 ··· 615 578 return err; 616 579 } 617 580 EXPORT_SYMBOL(ceph_monc_do_statfs); 581 + 582 + static void handle_get_version_reply(struct ceph_mon_client *monc, 583 + struct ceph_msg *msg) 584 + { 585 + struct ceph_mon_generic_request *req; 586 + u64 tid = le64_to_cpu(msg->hdr.tid); 587 + void *p = msg->front.iov_base; 588 + void *end = p + msg->front_alloc_len; 589 + u64 handle; 590 + 591 + dout("%s %p tid %llu\n", __func__, msg, tid); 592 + 593 + ceph_decode_need(&p, end, 2*sizeof(u64), bad); 594 + handle = ceph_decode_64(&p); 595 + if (tid != 0 && tid != handle) 596 + goto bad; 597 + 598 + mutex_lock(&monc->mutex); 599 + req = __lookup_generic_req(monc, handle); 600 + if (req) { 601 + *(u64 *)req->buf = ceph_decode_64(&p); 602 + req->result = 0; 603 + get_generic_request(req); 604 + } 605 + mutex_unlock(&monc->mutex); 606 + if (req) { 607 + complete_all(&req->completion); 608 + put_generic_request(req); 609 + } 610 + 611 + return; 612 + bad: 613 + pr_err("corrupt mon_get_version reply\n"); 614 + ceph_msg_dump(msg); 615 + } 616 + 617 + /* 618 + * Send MMonGetVersion and wait for the reply. 
619 + * 620 + * @what: one of "mdsmap", "osdmap" or "monmap" 621 + */ 622 + int ceph_monc_do_get_version(struct ceph_mon_client *monc, const char *what, 623 + u64 *newest) 624 + { 625 + struct ceph_mon_generic_request *req; 626 + void *p, *end; 627 + u64 tid; 628 + int err; 629 + 630 + req = kzalloc(sizeof(*req), GFP_NOFS); 631 + if (!req) 632 + return -ENOMEM; 633 + 634 + kref_init(&req->kref); 635 + req->buf = newest; 636 + req->buf_len = sizeof(*newest); 637 + init_completion(&req->completion); 638 + 639 + req->request = ceph_msg_new(CEPH_MSG_MON_GET_VERSION, 640 + sizeof(u64) + sizeof(u32) + strlen(what), 641 + GFP_NOFS, true); 642 + if (!req->request) { 643 + err = -ENOMEM; 644 + goto out; 645 + } 646 + 647 + req->reply = ceph_msg_new(CEPH_MSG_MON_GET_VERSION_REPLY, 1024, 648 + GFP_NOFS, true); 649 + if (!req->reply) { 650 + err = -ENOMEM; 651 + goto out; 652 + } 653 + 654 + p = req->request->front.iov_base; 655 + end = p + req->request->front_alloc_len; 656 + 657 + /* fill out request */ 658 + mutex_lock(&monc->mutex); 659 + tid = ++monc->last_tid; 660 + ceph_encode_64(&p, tid); /* handle */ 661 + ceph_encode_string(&p, end, what, strlen(what)); 662 + 663 + err = __do_generic_request(monc, tid, req); 664 + 665 + mutex_unlock(&monc->mutex); 666 + out: 667 + kref_put(&req->kref, release_generic_request); 668 + return err; 669 + } 670 + EXPORT_SYMBOL(ceph_monc_do_get_version); 618 671 619 672 /* 620 673 * pool ops ··· 1108 981 handle_statfs_reply(monc, msg); 1109 982 break; 1110 983 984 + case CEPH_MSG_MON_GET_VERSION_REPLY: 985 + handle_get_version_reply(monc, msg); 986 + break; 987 + 1111 988 case CEPH_MSG_POOLOP_REPLY: 1112 989 handle_poolop_reply(monc, msg); 1113 990 break; ··· 1160 1029 case CEPH_MSG_AUTH_REPLY: 1161 1030 m = ceph_msg_get(monc->m_auth_reply); 1162 1031 break; 1032 + case CEPH_MSG_MON_GET_VERSION_REPLY: 1033 + if (le64_to_cpu(hdr->tid) != 0) 1034 + return get_generic_reply(con, hdr, skip); 1035 + 1036 + /* 1037 + * Older OSDs don't set reply 
tid even if the original 1038 + * request had a non-zero tid. Work around this weirdness 1039 + * by falling through to the allocate case. 1040 + */ 1163 1041 case CEPH_MSG_MON_MAP: 1164 1042 case CEPH_MSG_MDS_MAP: 1165 1043 case CEPH_MSG_OSD_MAP: