Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
"The two main changes are aio support in CephFS, and a series that
fixes several issues in the authentication key timeout/renewal code.

On top of that are a variety of cleanups and minor bug fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
libceph: remove outdated comment
libceph: kill off ceph_x_ticket_handler::validity
libceph: invalidate AUTH in addition to a service ticket
libceph: fix authorizer invalidation, take 2
libceph: clear messenger auth_retry flag if we fault
libceph: fix ceph_msg_revoke()
libceph: use list_for_each_entry_safe
ceph: use i_size_{read,write} to get/set i_size
ceph: re-send AIO write request when getting -EOLDSNAP error
ceph: Asynchronous IO support
ceph: Avoid to propagate the invalid page point
ceph: fix double page_unlock() in page_mkwrite()
rbd: delete an unnecessary check before rbd_dev_destroy()
libceph: use list_next_entry instead of list_entry_next
ceph: ceph_frag_contains_value can be boolean
ceph: remove unused functions in ceph_frag.h
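A recurring change in the diffs below is replacing direct inode->i_size access with the VFS accessors. A minimal sketch of why the accessors matter (this is standard VFS behavior; would_extend_file() and grow_file() are hypothetical helpers, not from this series):

#include <linux/fs.h>	/* i_size_read() / i_size_write() */

/* Hypothetical reader-side helper: i_size_read() retries on a
 * seqcount, so a concurrent i_size_write() cannot produce a torn
 * 64-bit read on 32-bit SMP kernels. */
static bool would_extend_file(struct inode *inode, loff_t pos, size_t count)
{
    return pos + (loff_t)count > i_size_read(inode);
}

/* Hypothetical writer-side helper: updates must go through
 * i_size_write() so the seqcount is bumped; callers are expected
 * to serialize writers (e.g. via the inode lock). */
static void grow_file(struct inode *inode, loff_t new_size)
{
    i_size_write(inode, new_size);
}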

+500 -239
+1 -2
drivers/block/rbd.c
···
 
 out_err:
     rbd_dev_unparent(rbd_dev);
-    if (parent)
-        rbd_dev_destroy(parent);
+    rbd_dev_destroy(parent);
     return ret;
 }
 
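The rbd change drops a caller-side NULL check on the assumption that the destructor tolerates NULL, the same convention kfree() follows. A sketch of that convention (illustrative only; the real rbd_dev_destroy() body is elided):

#include <linux/slab.h>

struct rbd_device;	/* opaque here; real definition in rbd.c */

/* Illustrative NULL-tolerant destructor: callers need no
 * "if (ptr)" guard before invoking it. */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
    if (!rbd_dev)	/* tolerate NULL, like kfree(NULL) */
        return;
    /* ... drop references, free sub-objects ... */
    kfree(rbd_dev);
}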
+6 -8
fs/ceph/addr.c
···
     return 0;
 
     /* past end of file? */
-    i_size = inode->i_size;   /* caller holds i_mutex */
+    i_size = i_size_read(inode);
 
     if (page_off >= i_size ||
         (pos_in_page == 0 && (pos+len) >= i_size &&
···
     page = grab_cache_page_write_begin(mapping, index, 0);
     if (!page)
         return -ENOMEM;
-    *pagep = page;
 
     dout("write_begin file %p inode %p page %p %d~%d\n", file,
          inode, page, (int)pos, (int)len);
···
         zero_user_segment(page, from+copied, len);
 
     /* did file size increase? */
-    /* (no need for i_size_read(); we caller holds i_mutex */
-    if (pos+copied > inode->i_size)
+    if (pos+copied > i_size_read(inode))
         check_cap = ceph_inode_set_size(inode, pos+copied);
 
     if (!PageUptodate(page))
···
 
     ret = VM_FAULT_NOPAGE;
     if ((off > size) ||
-        (page->mapping != inode->i_mapping))
+        (page->mapping != inode->i_mapping)) {
+        unlock_page(page);
         goto out;
+    }
 
     ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-    if (ret == 0) {
+    if (ret >= 0) {
         /* success. we'll keep the page locked. */
         set_page_dirty(page);
         ret = VM_FAULT_LOCKED;
···
         ret = VM_FAULT_SIGBUS;
     }
 out:
-    if (ret != VM_FAULT_LOCKED)
-        unlock_page(page);
     if (ret == VM_FAULT_LOCKED ||
         ci->i_inline_version != CEPH_INLINE_NONE) {
         int dirty;
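The page_mkwrite hunks above fix a double unlock by making each error path drop the page lock exactly once. A condensed sketch of the locking contract being enforced (hypothetical handler; the (vma, vmf) signature is the 4.4-era one):

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Hypothetical ->page_mkwrite: return VM_FAULT_LOCKED with the page
 * still locked; every other exit must unlock exactly once. */
static int sketch_page_mkwrite(struct vm_area_struct *vma,
                               struct vm_fault *vmf)
{
    struct page *page = vmf->page;
    struct inode *inode = file_inode(vma->vm_file);

    lock_page(page);
    if (page->mapping != inode->i_mapping) {
        unlock_page(page);	/* lost a race with truncate */
        return VM_FAULT_NOPAGE;	/* non-LOCKED exit: page unlocked */
    }
    set_page_dirty(page);
    return VM_FAULT_LOCKED;	/* page stays locked for the caller */
}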
+3 -5
fs/ceph/cache.c
···
 
     memset(&aux, 0, sizeof(aux));
     aux.mtime = inode->i_mtime;
-    aux.size = inode->i_size;
+    aux.size = i_size_read(inode);
 
     memcpy(buffer, &aux, sizeof(aux));
 
···
                                        uint64_t *size)
 {
     const struct ceph_inode_info* ci = cookie_netfs_data;
-    const struct inode* inode = &ci->vfs_inode;
-
-    *size = inode->i_size;
+    *size = i_size_read(&ci->vfs_inode);
 }
 
 static enum fscache_checkaux ceph_fscache_inode_check_aux(
···
 
     memset(&aux, 0, sizeof(aux));
     aux.mtime = inode->i_mtime;
-    aux.size = inode->i_size;
+    aux.size = i_size_read(inode);
 
     if (memcmp(data, &aux, sizeof(aux)) != 0)
         return FSCACHE_CHECKAUX_OBSOLETE;
+375 -132
fs/ceph/file.c
···
 }
 
 enum {
-    CHECK_EOF = 1,
-    READ_INLINE = 2,
+    HAVE_RETRIED = 1,
+    CHECK_EOF = 2,
+    READ_INLINE = 3,
 };
 
 /*
···
 static int striped_read(struct inode *inode,
                         u64 off, u64 len,
                         struct page **pages, int num_pages,
-                        int *checkeof, bool o_direct,
-                        unsigned long buf_align)
+                        int *checkeof)
 {
     struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
     struct ceph_inode_info *ci = ceph_inode(inode);
     u64 pos, this_len, left;
-    int io_align, page_align;
-    int pages_left;
-    int read;
+    loff_t i_size;
+    int page_align, pages_left;
+    int read, ret;
     struct page **page_pos;
-    int ret;
     bool hit_stripe, was_short;
 
     /*
···
     page_pos = pages;
     pages_left = num_pages;
     read = 0;
-    io_align = off & ~PAGE_MASK;
 
 more:
-    if (o_direct)
-        page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-    else
-        page_align = pos & ~PAGE_MASK;
+    page_align = pos & ~PAGE_MASK;
     this_len = left;
     ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
                               &ci->i_layout, pos, &this_len,
···
     dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
          ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
+    i_size = i_size_read(inode);
     if (ret >= 0) {
         int didpages;
-        if (was_short && (pos + ret < inode->i_size)) {
-            int zlen = min(this_len - ret,
-                           inode->i_size - pos - ret);
-            int zoff = (o_direct ? buf_align : io_align) +
-                       read + ret;
+        if (was_short && (pos + ret < i_size)) {
+            int zlen = min(this_len - ret, i_size - pos - ret);
+            int zoff = (off & ~PAGE_MASK) + read + ret;
             dout(" zero gap %llu to %llu\n",
                  pos + ret, pos + ret + zlen);
             ceph_zero_page_vector_range(zoff, zlen, pages);
···
         pages_left -= didpages;
 
         /* hit stripe and need continue*/
-        if (left && hit_stripe && pos < inode->i_size)
+        if (left && hit_stripe && pos < i_size)
             goto more;
     }
 
     if (read > 0) {
         ret = read;
         /* did we bounce off eof? */
-        if (pos + left > inode->i_size)
+        if (pos + left > i_size)
             *checkeof = CHECK_EOF;
     }
 
···
     if (ret < 0)
         return ret;
 
-    if (iocb->ki_flags & IOCB_DIRECT) {
-        while (iov_iter_count(i)) {
-            size_t start;
-            ssize_t n;
+    num_pages = calc_pages_for(off, len);
+    pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+    if (IS_ERR(pages))
+        return PTR_ERR(pages);
+    ret = striped_read(inode, off, len, pages,
+                       num_pages, checkeof);
+    if (ret > 0) {
+        int l, k = 0;
+        size_t left = ret;
 
-            n = dio_get_pagev_size(i);
-            pages = dio_get_pages_alloc(i, n, &start, &num_pages);
-            if (IS_ERR(pages))
-                return PTR_ERR(pages);
-
-            ret = striped_read(inode, off, n,
-                               pages, num_pages, checkeof,
-                               1, start);
-
-            ceph_put_page_vector(pages, num_pages, true);
-
-            if (ret <= 0)
-                break;
-            off += ret;
-            iov_iter_advance(i, ret);
-            if (ret < n)
+        while (left) {
+            size_t page_off = off & ~PAGE_MASK;
+            size_t copy = min_t(size_t, left,
+                                PAGE_SIZE - page_off);
+            l = copy_page_to_iter(pages[k++], page_off, copy, i);
+            off += l;
+            left -= l;
+            if (l < copy)
                 break;
         }
-    } else {
-        num_pages = calc_pages_for(off, len);
-        pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-        if (IS_ERR(pages))
-            return PTR_ERR(pages);
-        ret = striped_read(inode, off, len, pages,
-                           num_pages, checkeof, 0, 0);
-        if (ret > 0) {
-            int l, k = 0;
-            size_t left = ret;
-
-            while (left) {
-                size_t page_off = off & ~PAGE_MASK;
-                size_t copy = min_t(size_t,
-                                    PAGE_SIZE - page_off, left);
-                l = copy_page_to_iter(pages[k++], page_off,
-                                      copy, i);
-                off += l;
-                left -= l;
-                if (l < copy)
-                    break;
-            }
-        }
-        ceph_release_page_vector(pages, num_pages);
     }
+    ceph_release_page_vector(pages, num_pages);
 
     if (off > iocb->ki_pos) {
         ret = off - iocb->ki_pos;
···
 
     dout("sync_read result %d\n", ret);
     return ret;
+}
+
+struct ceph_aio_request {
+    struct kiocb *iocb;
+    size_t total_len;
+    int write;
+    int error;
+    struct list_head osd_reqs;
+    unsigned num_reqs;
+    atomic_t pending_reqs;
+    struct timespec mtime;
+    struct ceph_cap_flush *prealloc_cf;
+};
+
+struct ceph_aio_work {
+    struct work_struct work;
+    struct ceph_osd_request *req;
+};
+
+static void ceph_aio_retry_work(struct work_struct *work);
+
+static void ceph_aio_complete(struct inode *inode,
+                              struct ceph_aio_request *aio_req)
+{
+    struct ceph_inode_info *ci = ceph_inode(inode);
+    int ret;
+
+    if (!atomic_dec_and_test(&aio_req->pending_reqs))
+        return;
+
+    ret = aio_req->error;
+    if (!ret)
+        ret = aio_req->total_len;
+
+    dout("ceph_aio_complete %p rc %d\n", inode, ret);
+
+    if (ret >= 0 && aio_req->write) {
+        int dirty;
+
+        loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+        if (endoff > i_size_read(inode)) {
+            if (ceph_inode_set_size(inode, endoff))
+                ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+        }
+
+        spin_lock(&ci->i_ceph_lock);
+        ci->i_inline_version = CEPH_INLINE_NONE;
+        dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+                                       &aio_req->prealloc_cf);
+        spin_unlock(&ci->i_ceph_lock);
+        if (dirty)
+            __mark_inode_dirty(inode, dirty);
+
+    }
+
+    ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+                                            CEPH_CAP_FILE_RD));
+
+    aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);
+
+    ceph_free_cap_flush(aio_req->prealloc_cf);
+    kfree(aio_req);
+}
+
+static void ceph_aio_complete_req(struct ceph_osd_request *req,
+                                  struct ceph_msg *msg)
+{
+    int rc = req->r_result;
+    struct inode *inode = req->r_inode;
+    struct ceph_aio_request *aio_req = req->r_priv;
+    struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+    int num_pages = calc_pages_for((u64)osd_data->alignment,
+                                   osd_data->length);
+
+    dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
+         inode, rc, osd_data->length);
+
+    if (rc == -EOLDSNAPC) {
+        struct ceph_aio_work *aio_work;
+        BUG_ON(!aio_req->write);
+
+        aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+        if (aio_work) {
+            INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+            aio_work->req = req;
+            queue_work(ceph_inode_to_client(inode)->wb_wq,
+                       &aio_work->work);
+            return;
+        }
+        rc = -ENOMEM;
+    } else if (!aio_req->write) {
+        if (rc == -ENOENT)
+            rc = 0;
+        if (rc >= 0 && osd_data->length > rc) {
+            int zoff = osd_data->alignment + rc;
+            int zlen = osd_data->length - rc;
+            /*
+             * If read is satisfied by single OSD request,
+             * it can pass EOF. Otherwise read is within
+             * i_size.
+             */
+            if (aio_req->num_reqs == 1) {
+                loff_t i_size = i_size_read(inode);
+                loff_t endoff = aio_req->iocb->ki_pos + rc;
+                if (endoff < i_size)
+                    zlen = min_t(size_t, zlen,
+                                 i_size - endoff);
+                aio_req->total_len = rc + zlen;
+            }
+
+            if (zlen > 0)
+                ceph_zero_page_vector_range(zoff, zlen,
+                                            osd_data->pages);
+        }
+    }
+
+    ceph_put_page_vector(osd_data->pages, num_pages, false);
+    ceph_osdc_put_request(req);
+
+    if (rc < 0)
+        cmpxchg(&aio_req->error, 0, rc);
+
+    ceph_aio_complete(inode, aio_req);
+    return;
+}
+
+static void ceph_aio_retry_work(struct work_struct *work)
+{
+    struct ceph_aio_work *aio_work =
+        container_of(work, struct ceph_aio_work, work);
+    struct ceph_osd_request *orig_req = aio_work->req;
+    struct ceph_aio_request *aio_req = orig_req->r_priv;
+    struct inode *inode = orig_req->r_inode;
+    struct ceph_inode_info *ci = ceph_inode(inode);
+    struct ceph_snap_context *snapc;
+    struct ceph_osd_request *req;
+    int ret;
+
+    spin_lock(&ci->i_ceph_lock);
+    if (__ceph_have_pending_cap_snap(ci)) {
+        struct ceph_cap_snap *capsnap =
+            list_last_entry(&ci->i_cap_snaps,
+                            struct ceph_cap_snap,
+                            ci_item);
+        snapc = ceph_get_snap_context(capsnap->context);
+    } else {
+        BUG_ON(!ci->i_head_snapc);
+        snapc = ceph_get_snap_context(ci->i_head_snapc);
+    }
+    spin_unlock(&ci->i_ceph_lock);
+
+    req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+                                  false, GFP_NOFS);
+    if (IS_ERR(req)) {
+        ret = PTR_ERR(req);
+        req = orig_req;
+        goto out;
+    }
+
+    req->r_flags = CEPH_OSD_FLAG_ORDERSNAP |
+                   CEPH_OSD_FLAG_ONDISK |
+                   CEPH_OSD_FLAG_WRITE;
+    req->r_base_oloc = orig_req->r_base_oloc;
+    req->r_base_oid = orig_req->r_base_oid;
+
+    req->r_ops[0] = orig_req->r_ops[0];
+    osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+
+    ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
+                            snapc, CEPH_NOSNAP, &aio_req->mtime);
+
+    ceph_put_snap_context(snapc);
+    ceph_osdc_put_request(orig_req);
+
+    req->r_callback = ceph_aio_complete_req;
+    req->r_inode = inode;
+    req->r_priv = aio_req;
+
+    ret = ceph_osdc_start_request(req->r_osdc, req, false);
+out:
+    if (ret < 0) {
+        BUG_ON(ret == -EOLDSNAPC);
+        req->r_result = ret;
+        ceph_aio_complete_req(req, NULL);
+    }
+
+    kfree(aio_work);
 }
 
 /*
···
 }
 
-/*
- * Synchronous write, straight from __user pointer or user pages.
- *
- * If write spans object boundary, just do multiple writes.  (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
-                       struct ceph_snap_context *snapc)
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+                       struct ceph_snap_context *snapc,
+                       struct ceph_cap_flush **pcf)
 {
     struct file *file = iocb->ki_filp;
     struct inode *inode = file_inode(file);
···
     struct ceph_vino vino;
     struct ceph_osd_request *req;
     struct page **pages;
-    int num_pages;
-    int written = 0;
+    struct ceph_aio_request *aio_req = NULL;
+    int num_pages = 0;
     int flags;
-    int check_caps = 0;
     int ret;
     struct timespec mtime = CURRENT_TIME;
-    size_t count = iov_iter_count(from);
+    size_t count = iov_iter_count(iter);
+    loff_t pos = iocb->ki_pos;
+    bool write = iov_iter_rw(iter) == WRITE;
 
-    if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+    if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
         return -EROFS;
 
-    dout("sync_direct_write on file %p %lld~%u\n", file, pos,
-         (unsigned)count);
+    dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
+         (write ? "write" : "read"), file, pos, (unsigned)count);
 
     ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
     if (ret < 0)
         return ret;
 
-    ret = invalidate_inode_pages2_range(inode->i_mapping,
-                                        pos >> PAGE_CACHE_SHIFT,
-                                        (pos + count) >> PAGE_CACHE_SHIFT);
-    if (ret < 0)
-        dout("invalidate_inode_pages2_range returned %d\n", ret);
+    if (write) {
+        ret = invalidate_inode_pages2_range(inode->i_mapping,
+                                    pos >> PAGE_CACHE_SHIFT,
+                                    (pos + count) >> PAGE_CACHE_SHIFT);
+        if (ret < 0)
+            dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-    flags = CEPH_OSD_FLAG_ORDERSNAP |
-            CEPH_OSD_FLAG_ONDISK |
-            CEPH_OSD_FLAG_WRITE;
+        flags = CEPH_OSD_FLAG_ORDERSNAP |
+                CEPH_OSD_FLAG_ONDISK |
+                CEPH_OSD_FLAG_WRITE;
+    } else {
+        flags = CEPH_OSD_FLAG_READ;
+    }
 
-    while (iov_iter_count(from) > 0) {
-        u64 len = dio_get_pagev_size(from);
-        size_t start;
-        ssize_t n;
+    while (iov_iter_count(iter) > 0) {
+        u64 size = dio_get_pagev_size(iter);
+        size_t start = 0;
+        ssize_t len;
 
         vino = ceph_vino(inode);
         req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-                                    vino, pos, &len, 0,
-                                    2,/*include a 'startsync' command*/
-                                    CEPH_OSD_OP_WRITE, flags, snapc,
+                                    vino, pos, &size, 0,
+                                    /*include a 'startsync' command*/
+                                    write ? 2 : 1,
+                                    write ? CEPH_OSD_OP_WRITE :
+                                            CEPH_OSD_OP_READ,
+                                    flags, snapc,
                                     ci->i_truncate_seq,
                                     ci->i_truncate_size,
                                     false);
···
             break;
         }
 
-        osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
-
-        n = len;
-        pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+        len = size;
+        pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
         if (IS_ERR(pages)) {
             ceph_osdc_put_request(req);
             ret = PTR_ERR(pages);
···
         }
 
         /*
-         * throw out any page cache pages in this range. this
-         * may block.
+         * To simplify error handling, allow AIO when IO within i_size
+         * or IO can be satisfied by single OSD request.
          */
-        truncate_inode_pages_range(inode->i_mapping, pos,
-                                   (pos+n) | (PAGE_CACHE_SIZE-1));
-        osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
-                                         false, false);
+        if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+            (len == count || pos + count <= i_size_read(inode))) {
+            aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+            if (aio_req) {
+                aio_req->iocb = iocb;
+                aio_req->write = write;
+                INIT_LIST_HEAD(&aio_req->osd_reqs);
+                if (write) {
+                    aio_req->mtime = mtime;
+                    swap(aio_req->prealloc_cf, *pcf);
+                }
+            }
+            /* ignore error */
+        }
 
-        /* BUG_ON(vino.snap != CEPH_NOSNAP); */
+        if (write) {
+            /*
+             * throw out any page cache pages in this range. this
+             * may block.
+             */
+            truncate_inode_pages_range(inode->i_mapping, pos,
+                                       (pos+len) | (PAGE_CACHE_SIZE - 1));
+
+            osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+        }
+
+
+        osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
+                                         false, false);
+
         ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
 
-        ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+        if (aio_req) {
+            aio_req->total_len += len;
+            aio_req->num_reqs++;
+            atomic_inc(&aio_req->pending_reqs);
+
+            req->r_callback = ceph_aio_complete_req;
+            req->r_inode = inode;
+            req->r_priv = aio_req;
+            list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+
+            pos += len;
+            iov_iter_advance(iter, len);
+            continue;
+        }
+
+        ret = ceph_osdc_start_request(req->r_osdc, req, false);
         if (!ret)
             ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
+
+        size = i_size_read(inode);
+        if (!write) {
+            if (ret == -ENOENT)
+                ret = 0;
+            if (ret >= 0 && ret < len && pos + ret < size) {
+                int zlen = min_t(size_t, len - ret,
+                                 size - pos - ret);
+                ceph_zero_page_vector_range(start + ret, zlen,
+                                            pages);
+                ret += zlen;
+            }
+            if (ret >= 0)
+                len = ret;
+        }
 
         ceph_put_page_vector(pages, num_pages, false);
 
         ceph_osdc_put_request(req);
-        if (ret)
+        if (ret < 0)
             break;
-        pos += n;
-        written += n;
-        iov_iter_advance(from, n);
 
-        if (pos > i_size_read(inode)) {
-            check_caps = ceph_inode_set_size(inode, pos);
-            if (check_caps)
+        pos += len;
+        iov_iter_advance(iter, len);
+
+        if (!write && pos >= size)
+            break;
+
+        if (write && pos > size) {
+            if (ceph_inode_set_size(inode, pos))
                 ceph_check_caps(ceph_inode(inode),
                                 CHECK_CAPS_AUTHONLY,
                                 NULL);
         }
     }
 
-    if (ret != -EOLDSNAPC && written > 0) {
+    if (aio_req) {
+        if (aio_req->num_reqs == 0) {
+            kfree(aio_req);
+            return ret;
+        }
+
+        ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+                                      CEPH_CAP_FILE_RD);
+
+        while (!list_empty(&aio_req->osd_reqs)) {
+            req = list_first_entry(&aio_req->osd_reqs,
+                                   struct ceph_osd_request,
+                                   r_unsafe_item);
+            list_del_init(&req->r_unsafe_item);
+            if (ret >= 0)
+                ret = ceph_osdc_start_request(req->r_osdc,
+                                              req, false);
+            if (ret < 0) {
+                BUG_ON(ret == -EOLDSNAPC);
+                req->r_result = ret;
+                ceph_aio_complete_req(req, NULL);
+            }
+        }
+        return -EIOCBQUEUED;
+    }
+
+    if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+        ret = pos - iocb->ki_pos;
         iocb->ki_pos = pos;
-        ret = written;
     }
     return ret;
 }
-
 
 /*
  * Synchronous write, straight from __user pointer or user pages.
···
          ceph_cap_string(got));
 
     if (ci->i_inline_version == CEPH_INLINE_NONE) {
-        /* hmm, this isn't really async... */
-        ret = ceph_sync_read(iocb, to, &retry_op);
+        if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+            ret = ceph_direct_read_write(iocb, to,
+                                         NULL, NULL);
+            if (ret >= 0 && ret < len)
+                retry_op = CHECK_EOF;
+        } else {
+            ret = ceph_sync_read(iocb, to, &retry_op);
+        }
     } else {
         retry_op = READ_INLINE;
     }
···
         pinned_page = NULL;
     }
     ceph_put_cap_refs(ci, got);
-    if (retry_op && ret >= 0) {
+    if (retry_op > HAVE_RETRIED && ret >= 0) {
         int statret;
         struct page *page = NULL;
         loff_t i_size;
···
     if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
         ret < len) {
         dout("sync_read hit hole, ppos %lld < size %lld"
-             ", reading more\n", iocb->ki_pos,
-             inode->i_size);
+             ", reading more\n", iocb->ki_pos, i_size);
 
         read += ret;
         len -= ret;
-        retry_op = 0;
+        retry_op = HAVE_RETRIED;
         goto again;
     }
 }
···
     }
 
     dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
-         inode, ceph_vinop(inode), pos, count, inode->i_size);
+         inode, ceph_vinop(inode), pos, count, i_size_read(inode));
     if (fi->fmode & CEPH_FILE_MODE_LAZY)
         want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
     else
···
         /* we might need to revert back to that point */
         data = *from;
         if (iocb->ki_flags & IOCB_DIRECT)
-            written = ceph_sync_direct_write(iocb, &data, pos,
-                                             snapc);
+            written = ceph_direct_read_write(iocb, &data, snapc,
+                                             &prealloc_cf);
         else
             written = ceph_sync_write(iocb, &data, pos, snapc);
         if (written == -EOLDSNAPC) {
···
             iov_iter_advance(from, written);
         ceph_put_snap_context(snapc);
     } else {
-        loff_t old_size = inode->i_size;
+        loff_t old_size = i_size_read(inode);
         /*
          * No need to acquire the i_truncate_mutex. Because
          * the MDS revokes Fwb caps before sending truncate
···
         written = generic_perform_write(file, from, pos);
         if (likely(written >= 0))
             iocb->ki_pos = pos + written;
-        if (inode->i_size > old_size)
+        if (i_size_read(inode) > old_size)
             ceph_fscache_update_objectsize(inode);
         inode_unlock(inode);
     }
···
 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 {
     struct inode *inode = file->f_mapping->host;
+    loff_t i_size;
     int ret;
 
     inode_lock(inode);
···
         }
     }
 
+    i_size = i_size_read(inode);
     switch (whence) {
     case SEEK_END:
-        offset += inode->i_size;
+        offset += i_size;
         break;
     case SEEK_CUR:
         /*
···
         offset += file->f_pos;
         break;
     case SEEK_DATA:
-        if (offset >= inode->i_size) {
+        if (offset >= i_size) {
             ret = -ENXIO;
             goto out;
         }
         break;
     case SEEK_HOLE:
-        if (offset >= inode->i_size) {
+        if (offset >= i_size) {
             ret = -ENXIO;
             goto out;
         }
-        offset = inode->i_size;
+        offset = i_size;
         break;
     }
 
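The new direct-IO path above fans one iocb out into several OSD requests and completes it from whichever callback finishes last. A condensed sketch of that completion-counting pattern (toy types; the real code also defers -EOLDSNAPC retries to a workqueue and returns -EIOCBQUEUED to the submitter):

#include <linux/atomic.h>
#include <linux/fs.h>	/* struct kiocb */

/* Toy fan-out state: one counter covers all in-flight sub-requests.
 * The submitter bumps "pending" once per sub-request before starting
 * any of them, so the count cannot hit zero prematurely. */
struct fanout {
    struct kiocb *iocb;
    atomic_t pending;	/* one ref per submitted sub-request */
    int error;		/* first failure wins */
    size_t total_len;
};

static void fanout_sub_done(struct fanout *f, int rc)
{
    if (rc < 0)
        cmpxchg(&f->error, 0, rc);	/* record only the first error */

    /* whoever drops the count to zero completes the user's iocb */
    if (atomic_dec_and_test(&f->pending))
        f->iocb->ki_complete(f->iocb,
                             f->error ? f->error : f->total_len, 0);
}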
+4 -4
fs/ceph/inode.c
···
     if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
         (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
         dout("size %lld -> %llu\n", inode->i_size, size);
-        inode->i_size = size;
+        i_size_write(inode, size);
         inode->i_blocks = (size + (1<<9) - 1) >> 9;
         ci->i_reported_size = size;
         if (truncate_seq != ci->i_truncate_seq) {
···
     spin_unlock(&ci->i_ceph_lock);
 
     err = -EINVAL;
-    if (WARN_ON(symlen != inode->i_size))
+    if (WARN_ON(symlen != i_size_read(inode)))
         goto out;
 
     err = -ENOMEM;
···
 
     spin_lock(&ci->i_ceph_lock);
     dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
-    inode->i_size = size;
+    i_size_write(inode, size);
     inode->i_blocks = (size + (1 << 9) - 1) >> 9;
 
     /* tell the MDS if we are approaching max_size */
···
          inode->i_size, attr->ia_size);
     if ((issued & CEPH_CAP_FILE_EXCL) &&
         attr->ia_size > inode->i_size) {
-        inode->i_size = attr->ia_size;
+        i_size_write(inode, attr->ia_size);
         inode->i_blocks =
             (attr->ia_size + (1 << 9) - 1) >> 9;
         inode->i_ctime = attr->ia_ctime;
+1 -36
include/linux/ceph/ceph_frag.h
···
     return 24 - ceph_frag_bits(f);
 }
 
-static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+static inline bool ceph_frag_contains_value(__u32 f, __u32 v)
 {
     return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
 }
-static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
-{
-    /* is sub as specific as us, and contained by us? */
-    return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
-           (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
-}
 
-static inline __u32 ceph_frag_parent(__u32 f)
-{
-    return ceph_frag_make(ceph_frag_bits(f) - 1,
-                          ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
-}
-static inline int ceph_frag_is_left_child(__u32 f)
-{
-    return ceph_frag_bits(f) > 0 &&
-           (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
-}
-static inline int ceph_frag_is_right_child(__u32 f)
-{
-    return ceph_frag_bits(f) > 0 &&
-           (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
-}
-static inline __u32 ceph_frag_sibling(__u32 f)
-{
-    return ceph_frag_make(ceph_frag_bits(f),
-                          ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
-}
-static inline __u32 ceph_frag_left_child(__u32 f)
-{
-    return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
-}
-static inline __u32 ceph_frag_right_child(__u32 f)
-{
-    return ceph_frag_make(ceph_frag_bits(f)+1,
-                          ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
-}
 static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
 {
     int newbits = ceph_frag_bits(f) + by;
+1 -1
include/linux/ceph/messenger.h
···
     struct ceph_entity_addr actual_peer_addr;
 
     /* message out temps */
+    struct ceph_msg_header out_hdr;
     struct ceph_msg *out_msg;        /* sending message (== tail of
                                         out_sent) */
     bool out_msg_done;
···
     int out_kvec_left;   /* kvec's left in out_kvec */
     int out_skip;        /* skip this many bytes */
     int out_kvec_bytes;  /* total bytes left */
-    bool out_kvec_is_msg; /* kvec refers to out_msg */
     int out_more;        /* there is more data after the kvecs */
     __le64 out_temp_ack; /* for writing an ack */
     struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2
+38 -11
net/ceph/auth_x.c
···
     void *ticket_buf = NULL;
     void *tp, *tpend;
     void **ptp;
-    struct ceph_timespec new_validity;
     struct ceph_crypto_key new_session_key;
     struct ceph_buffer *new_ticket_blob;
     unsigned long new_expires, new_renew_after;
···
     if (ret)
         goto out;
 
-    ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
-    ceph_decode_timespec(&validity, &new_validity);
+    ceph_decode_timespec(&validity, dp);
+    dp += sizeof(struct ceph_timespec);
     new_expires = get_seconds() + validity.tv_sec;
     new_renew_after = new_expires - (validity.tv_sec / 4);
     dout(" expires=%lu renew_after=%lu\n", new_expires,
···
         ceph_buffer_put(th->ticket_blob);
     th->session_key = new_session_key;
     th->ticket_blob = new_ticket_blob;
-    th->validity = new_validity;
     th->secret_id = new_secret_id;
     th->expires = new_expires;
     th->renew_after = new_renew_after;
+    th->have_key = true;
     dout(" got ticket service %d (%s) secret_id %lld len %d\n",
          type, ceph_entity_type_name(type), th->secret_id,
          (int)th->ticket_blob->vec.iov_len);
···
     return -ERANGE;
 }
 
+static bool need_key(struct ceph_x_ticket_handler *th)
+{
+    if (!th->have_key)
+        return true;
+
+    return get_seconds() >= th->renew_after;
+}
+
+static bool have_key(struct ceph_x_ticket_handler *th)
+{
+    if (th->have_key) {
+        if (get_seconds() >= th->expires)
+            th->have_key = false;
+    }
+
+    return th->have_key;
+}
+
 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
 {
     int want = ac->want_keys;
···
             continue;
 
         th = get_ticket_handler(ac, service);
-
         if (IS_ERR(th)) {
             *pneed |= service;
             continue;
         }
 
-        if (get_seconds() >= th->renew_after)
+        if (need_key(th))
             *pneed |= service;
-        if (get_seconds() >= th->expires)
+        if (!have_key(th))
             xi->have_keys &= ~service;
     }
 }
-
 
 static int ceph_x_build_request(struct ceph_auth_client *ac,
                                 void *buf, void *end)
···
     ac->private = NULL;
 }
 
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
-                                         int peer_type)
+static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
 {
     struct ceph_x_ticket_handler *th;
 
     th = get_ticket_handler(ac, peer_type);
     if (!IS_ERR(th))
-        memset(&th->validity, 0, sizeof(th->validity));
+        th->have_key = false;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+                                         int peer_type)
+{
+    /*
+     * We are to invalidate a service ticket in the hopes of
+     * getting a new, hopefully more valid, one.  But, we won't get
+     * it unless our AUTH ticket is good, so invalidate AUTH ticket
+     * as well, just in case.
+     */
+    invalidate_ticket(ac, peer_type);
+    invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
 }
 
 static int calcu_signature(struct ceph_x_authorizer *au,
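The auth_x changes above replace the raw validity timespec with a have_key flag derived from two deadlines computed at decode time: renew_after falls at 3/4 of the ticket lifetime and expires at its end. A toy restatement of that logic (illustrative types; the real struct is ceph_x_ticket_handler):

#include <linux/types.h>

struct ticket {
    bool have_key;
    unsigned long renew_after;	/* soft deadline: start renewing */
    unsigned long expires;	/* hard deadline: key is dead */
};

static bool ticket_have_key(struct ticket *t, unsigned long now)
{
    if (t->have_key && now >= t->expires)
        t->have_key = false;	/* expire lazily, on lookup */
    return t->have_key;
}

static bool ticket_need_key(struct ticket *t, unsigned long now)
{
    /* renew early so the old key is still valid during the exchange */
    return !t->have_key || now >= t->renew_after;
}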
+1 -1
net/ceph/auth_x.h
···
     unsigned int service;
 
     struct ceph_crypto_key session_key;
-    struct ceph_timespec validity;
+    bool have_key;
 
     u64 secret_id;
     struct ceph_buffer *ticket_blob;
+70 -35
net/ceph/messenger.c
···
 #include <linux/ceph/pagelist.h>
 #include <linux/export.h>
 
-#define list_entry_next(pos, member)                    \
-    list_entry(pos->member.next, typeof(*pos), member)
-
 /*
  * Ceph uses the messenger to exchange ceph_msg messages with other
  * hosts in the system.  The messenger provides ordered and reliable
···
     }
     con->in_seq = 0;
     con->in_seq_acked = 0;
+
+    con->out_skip = 0;
 }
 
 /*
···
 
 static void con_out_kvec_reset(struct ceph_connection *con)
 {
+    BUG_ON(con->out_skip);
+
     con->out_kvec_left = 0;
     con->out_kvec_bytes = 0;
     con->out_kvec_cur = &con->out_kvec[0];
···
 static void con_out_kvec_add(struct ceph_connection *con,
                              size_t size, void *data)
 {
-    int index;
+    int index = con->out_kvec_left;
 
-    index = con->out_kvec_left;
+    BUG_ON(con->out_skip);
     BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
 
     con->out_kvec[index].iov_len = size;
     con->out_kvec[index].iov_base = data;
     con->out_kvec_left++;
     con->out_kvec_bytes += size;
+}
+
+/*
+ * Chop off a kvec from the end.  Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+    int off = con->out_kvec_cur - con->out_kvec;
+    int skip = 0;
+
+    if (con->out_kvec_bytes > 0) {
+        skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+        BUG_ON(con->out_kvec_bytes < skip);
+        BUG_ON(!con->out_kvec_left);
+        con->out_kvec_bytes -= skip;
+        con->out_kvec_left--;
+    }
+
+    return skip;
 }
 
 #ifdef CONFIG_BLOCK
···
     /* Move on to the next page */
 
     BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
-    cursor->page = list_entry_next(cursor->page, lru);
+    cursor->page = list_next_entry(cursor->page, lru);
     cursor->last_piece = cursor->resid <= PAGE_SIZE;
 
     return true;
···
     if (!cursor->resid && cursor->total_resid) {
         WARN_ON(!cursor->last_piece);
         BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
-        cursor->data = list_entry_next(cursor->data, links);
+        cursor->data = list_next_entry(cursor->data, links);
         __ceph_msg_data_cursor_init(cursor);
         new_piece = true;
     }
···
         m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
 
     dout("prepare_write_message_footer %p\n", con);
-    con->out_kvec_is_msg = true;
     con->out_kvec[v].iov_base = &m->footer;
     if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
         if (con->ops->sign_message)
···
     u32 crc;
 
     con_out_kvec_reset(con);
-    con->out_kvec_is_msg = true;
     con->out_msg_done = false;
 
     /* Sneak an ack in there first?  If we can get it into the same
···
 
     /* tag + hdr + front + middle */
     con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
-    con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+    con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
     con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
 
     if (m->middle)
         con_out_kvec_add(con, m->middle->vec.iov_len,
                          m->middle->vec.iov_base);
 
-    /* fill in crc (except data pages), footer */
+    /* fill in hdr crc and finalize hdr */
     crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
     con->out_msg->hdr.crc = cpu_to_le32(crc);
-    con->out_msg->footer.flags = 0;
+    memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
 
+    /* fill in front and middle crc, footer */
     crc = crc32c(0, m->front.iov_base, m->front.iov_len);
     con->out_msg->footer.front_crc = cpu_to_le32(crc);
     if (m->middle) {
···
     dout("%s front_crc %u middle_crc %u\n", __func__,
          le32_to_cpu(con->out_msg->footer.front_crc),
          le32_to_cpu(con->out_msg->footer.middle_crc));
+    con->out_msg->footer.flags = 0;
 
     /* is there a data payload? */
     con->out_msg->footer.data_crc = 0;
···
         }
     }
     con->out_kvec_left = 0;
-    con->out_kvec_is_msg = false;
     ret = 1;
 out:
     dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
···
 {
     int ret;
 
+    dout("%s %p %d left\n", __func__, con, con->out_skip);
     while (con->out_skip > 0) {
         size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
 
···
 
 more_kvec:
     /* kvec data queued? */
-    if (con->out_skip) {
-        ret = write_partial_skip(con);
+    if (con->out_kvec_left) {
+        ret = write_partial_kvec(con);
         if (ret <= 0)
             goto out;
     }
-    if (con->out_kvec_left) {
-        ret = write_partial_kvec(con);
+    if (con->out_skip) {
+        ret = write_partial_skip(con);
         if (ret <= 0)
             goto out;
     }
···
 
 static void con_fault_finish(struct ceph_connection *con)
 {
+    dout("%s %p\n", __func__, con);
+
     /*
      * in case we faulted due to authentication, invalidate our
      * current tickets so that we can get new ones.
      */
-    if (con->auth_retry && con->ops->invalidate_authorizer) {
-        dout("calling invalidate_authorizer()\n");
-        con->ops->invalidate_authorizer(con);
+    if (con->auth_retry) {
+        dout("auth_retry %d, invalidating\n", con->auth_retry);
+        if (con->ops->invalidate_authorizer)
+            con->ops->invalidate_authorizer(con);
+        con->auth_retry = 0;
     }
 
     if (con->ops->fault)
···
         ceph_msg_put(msg);
     }
     if (con->out_msg == msg) {
-        dout("%s %p msg %p - was sending\n", __func__, con, msg);
-        con->out_msg = NULL;
-        if (con->out_kvec_is_msg) {
-            con->out_skip = con->out_kvec_bytes;
-            con->out_kvec_is_msg = false;
+        BUG_ON(con->out_skip);
+        /* footer */
+        if (con->out_msg_done) {
+            con->out_skip += con_out_kvec_skip(con);
+        } else {
+            BUG_ON(!msg->data_length);
+            if (con->peer_features & CEPH_FEATURE_MSG_AUTH)
+                con->out_skip += sizeof(msg->footer);
+            else
+                con->out_skip += sizeof(msg->old_footer);
         }
-        msg->hdr.seq = 0;
+        /* data, middle, front */
+        if (msg->data_length)
+            con->out_skip += msg->cursor.total_resid;
+        if (msg->middle)
+            con->out_skip += con_out_kvec_skip(con);
+        con->out_skip += con_out_kvec_skip(con);
 
+        dout("%s %p msg %p - was sending, will write %d skip %d\n",
+             __func__, con, msg, con->out_kvec_bytes, con->out_skip);
+        msg->hdr.seq = 0;
+        con->out_msg = NULL;
         ceph_msg_put(msg);
     }
+
     mutex_unlock(&con->mutex);
 }
···
 static void ceph_msg_release(struct kref *kref)
 {
     struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-    LIST_HEAD(data);
-    struct list_head *links;
-    struct list_head *next;
+    struct ceph_msg_data *data, *next;
 
     dout("%s %p\n", __func__, m);
     WARN_ON(!list_empty(&m->list_head));
···
         m->middle = NULL;
     }
 
-    list_splice_init(&m->data, &data);
-    list_for_each_safe(links, next, &data) {
-        struct ceph_msg_data *data;
-
-        data = list_entry(links, struct ceph_msg_data, links);
-        list_del_init(links);
+    list_for_each_entry_safe(data, next, &m->data, links) {
+        list_del_init(&data->links);
         ceph_msg_data_destroy(data);
     }
     m->data_length = 0;
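Two of the messenger cleanups swap open-coded list walking for the standard helpers (list_next_entry(), list_for_each_entry_safe()). A self-contained sketch of the _safe variant used in ceph_msg_release(), which caches the next node so the current entry can be freed mid-walk (toy struct item for illustration):

#include <linux/list.h>
#include <linux/slab.h>

struct item {
    struct list_head links;
    int payload;
};

static void drain(struct list_head *head)
{
    struct item *it, *next;

    /* "next" is fetched before the body runs, so unlinking and
     * freeing "it" inside the loop is safe */
    list_for_each_entry_safe(it, next, head, links) {
        list_del_init(&it->links);
        kfree(it);
    }
}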
-4
net/ceph/mon_client.c
···
     return monc->client->have_fsid && monc->auth->global_id > 0;
 }
 
-/*
- * The monitor responds with mount ack indicate mount success.  The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
 static void ceph_monc_handle_map(struct ceph_mon_client *monc,
                                  struct ceph_msg *msg)
 {