Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'vfs-6.15-rc1.ceph' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs ceph updates from Christian Brauner:
"This contains the work to remove access to page->index from ceph
and fixes the test failure observed for ceph with generic/421 by
refactoring ceph_writepages_start()"

* tag 'vfs-6.15-rc1.ceph' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
fscrypt: Change fscrypt_encrypt_pagecache_blocks() to take a folio
ceph: Fix error handling in fill_readdir_cache()
fs: Remove page_mkwrite_check_truncate()
ceph: Pass a folio to ceph_allocate_page_array()
ceph: Convert ceph_move_dirty_page_in_page_array() to move_dirty_folio_in_page_array()
ceph: Remove uses of page from ceph_process_folio_batch()
ceph: Convert ceph_check_page_before_write() to use a folio
ceph: Convert writepage_nounlock() to write_folio_nounlock()
ceph: Convert ceph_readdir_cache_control to store a folio
ceph: Convert ceph_find_incompatible() to take a folio
ceph: Use a folio in ceph_page_mkwrite()
ceph: Remove ceph_writepage()
ceph: fix generic/421 test failure
ceph: introduce ceph_submit_write() method
ceph: introduce ceph_process_folio_batch() method
ceph: extend ceph_writeback_ctl for ceph_writepages_start() refactoring

+854 -547
+795 -476
fs/ceph/addr.c
··· 82 82 { 83 83 struct inode *inode = mapping->host; 84 84 struct ceph_client *cl = ceph_inode_to_client(inode); 85 + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 85 86 struct ceph_inode_info *ci; 86 87 struct ceph_snap_context *snapc; 87 88 ··· 92 91 VM_BUG_ON_FOLIO(!folio_test_private(folio), folio); 93 92 return false; 94 93 } 94 + 95 + atomic64_inc(&mdsc->dirty_folios); 95 96 96 97 ci = ceph_inode(inode); 97 98 ··· 571 568 u64 truncate_size; 572 569 u32 truncate_seq; 573 570 bool size_stable; 571 + 574 572 bool head_snapc; 573 + struct ceph_snap_context *snapc; 574 + struct ceph_snap_context *last_snapc; 575 + 576 + bool done; 577 + bool should_loop; 578 + bool range_whole; 579 + pgoff_t start_index; 580 + pgoff_t index; 581 + pgoff_t end; 582 + xa_mark_t tag; 583 + 584 + pgoff_t strip_unit_end; 585 + unsigned int wsize; 586 + unsigned int nr_folios; 587 + unsigned int max_pages; 588 + unsigned int locked_pages; 589 + 590 + int op_idx; 591 + int num_ops; 592 + u64 offset; 593 + u64 len; 594 + 595 + struct folio_batch fbatch; 596 + unsigned int processed_in_fbatch; 597 + 598 + bool from_pool; 599 + struct page **pages; 600 + struct page **data_pages; 575 601 }; 576 602 577 603 /* ··· 698 666 } 699 667 700 668 /* 701 - * Write a single page, but leave the page locked. 669 + * Write a folio, but leave it locked. 702 670 * 703 671 * If we get a write error, mark the mapping for error, but still adjust the 704 - * dirty page accounting (i.e., page is no longer dirty). 672 + * dirty page accounting (i.e., folio is no longer dirty). 705 673 */ 706 - static int writepage_nounlock(struct page *page, struct writeback_control *wbc) 674 + static int write_folio_nounlock(struct folio *folio, 675 + struct writeback_control *wbc) 707 676 { 708 - struct folio *folio = page_folio(page); 709 - struct inode *inode = page->mapping->host; 677 + struct page *page = &folio->page; 678 + struct inode *inode = folio->mapping->host; 710 679 struct ceph_inode_info *ci = ceph_inode(inode); 711 680 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 712 681 struct ceph_client *cl = fsc->client; 713 682 struct ceph_snap_context *snapc, *oldest; 714 - loff_t page_off = page_offset(page); 683 + loff_t page_off = folio_pos(folio); 715 684 int err; 716 - loff_t len = thp_size(page); 685 + loff_t len = folio_size(folio); 717 686 loff_t wlen; 718 687 struct ceph_writeback_ctl ceph_wbc; 719 688 struct ceph_osd_client *osdc = &fsc->client->osdc; ··· 722 689 bool caching = ceph_is_cache_enabled(inode); 723 690 struct page *bounce_page = NULL; 724 691 725 - doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, 726 - page->index); 692 + doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio, 693 + folio->index); 727 694 728 695 if (ceph_inode_is_shutdown(inode)) 729 696 return -EIO; 730 697 731 698 /* verify this is a writeable snap context */ 732 - snapc = page_snap_context(page); 699 + snapc = page_snap_context(&folio->page); 733 700 if (!snapc) { 734 - doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), 735 - page); 701 + doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode), 702 + folio); 736 703 return 0; 737 704 } 738 705 oldest = get_oldest_context(inode, &ceph_wbc, snapc); 739 706 if (snapc->seq > oldest->seq) { 740 - doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", 741 - ceph_vinop(inode), page, snapc); 707 + doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n", 708 + ceph_vinop(inode), folio, snapc); 742 709 /* we should only noop if called by kswapd */ 743 710 WARN_ON(!(current->flags & PF_MEMALLOC)); 744 711 ceph_put_snap_context(oldest); 745 - redirty_page_for_writepage(wbc, page); 712 + folio_redirty_for_writepage(wbc, folio); 746 713 return 0; 747 714 } 748 715 ceph_put_snap_context(oldest); ··· 759 726 len = ceph_wbc.i_size - page_off; 760 727 761 728 wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; 762 - doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", 763 - ceph_vinop(inode), page, page->index, page_off, wlen, snapc, 729 + doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n", 730 + ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc, 764 731 snapc->seq); 765 732 766 733 if (atomic_long_inc_return(&fsc->writeback_count) > ··· 773 740 ceph_wbc.truncate_seq, 774 741 ceph_wbc.truncate_size, true); 775 742 if (IS_ERR(req)) { 776 - redirty_page_for_writepage(wbc, page); 743 + folio_redirty_for_writepage(wbc, folio); 777 744 return PTR_ERR(req); 778 745 } 779 746 780 747 if (wlen < len) 781 748 len = wlen; 782 749 783 - set_page_writeback(page); 750 + folio_start_writeback(folio); 784 751 if (caching) 785 - ceph_set_page_fscache(page); 752 + ceph_set_page_fscache(&folio->page); 786 753 ceph_fscache_write_to_cache(inode, page_off, len, caching); 787 754 788 755 if (IS_ENCRYPTED(inode)) { 789 - bounce_page = fscrypt_encrypt_pagecache_blocks(page, 756 + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, 790 757 CEPH_FSCRYPT_BLOCK_SIZE, 0, 791 758 GFP_NOFS); 792 759 if (IS_ERR(bounce_page)) { 793 - redirty_page_for_writepage(wbc, page); 794 - end_page_writeback(page); 760 + folio_redirty_for_writepage(wbc, folio); 761 + folio_end_writeback(folio); 795 762 ceph_osdc_put_request(req); 796 763 return PTR_ERR(bounce_page); 797 764 } 798 765 } 799 766 800 767 /* it may be a short write due to an object boundary */ 801 - WARN_ON_ONCE(len > thp_size(page)); 768 + WARN_ON_ONCE(len > folio_size(folio)); 802 769 osd_req_op_extent_osd_data_pages(req, 0, 803 770 bounce_page ? &bounce_page : &page, wlen, 0, 804 771 false, false); ··· 824 791 if (err == -ERESTARTSYS) { 825 792 /* killed by SIGKILL */ 826 793 doutc(cl, "%llx.%llx interrupted page %p\n", 827 - ceph_vinop(inode), page); 828 - redirty_page_for_writepage(wbc, page); 829 - end_page_writeback(page); 794 + ceph_vinop(inode), folio); 795 + folio_redirty_for_writepage(wbc, folio); 796 + folio_end_writeback(folio); 830 797 return err; 831 798 } 832 799 if (err == -EBLOCKLISTED) 833 800 fsc->blocklisted = true; 834 - doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", 835 - ceph_vinop(inode), err, page); 801 + doutc(cl, "%llx.%llx setting mapping error %d %p\n", 802 + ceph_vinop(inode), err, folio); 836 803 mapping_set_error(&inode->i_data, err); 837 804 wbc->pages_skipped++; 838 805 } else { 839 806 doutc(cl, "%llx.%llx cleaned page %p\n", 840 - ceph_vinop(inode), page); 807 + ceph_vinop(inode), folio); 841 808 err = 0; /* vfs expects us to return 0 */ 842 809 } 843 - oldest = detach_page_private(page); 810 + oldest = folio_detach_private(folio); 844 811 WARN_ON_ONCE(oldest != snapc); 845 - end_page_writeback(page); 812 + folio_end_writeback(folio); 846 813 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 847 814 ceph_put_snap_context(snapc); /* page's reference */ 848 815 ··· 850 817 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb)) 851 818 fsc->write_congested = false; 852 819 853 - return err; 854 - } 855 - 856 - static int ceph_writepage(struct page *page, struct writeback_control *wbc) 857 - { 858 - int err; 859 - struct inode *inode = page->mapping->host; 860 - BUG_ON(!inode); 861 - ihold(inode); 862 - 863 - if (wbc->sync_mode == WB_SYNC_NONE && 864 - ceph_inode_to_fs_client(inode)->write_congested) { 865 - redirty_page_for_writepage(wbc, page); 866 - return AOP_WRITEPAGE_ACTIVATE; 867 - } 868 - 869 - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ 870 - 871 - err = writepage_nounlock(page, wbc); 872 - if (err == -ERESTARTSYS) { 873 - /* direct memory reclaimer was killed by SIGKILL. return 0 874 - * to prevent caller from setting mapping/page error */ 875 - err = 0; 876 - } 877 - unlock_page(page); 878 - iput(inode); 879 820 return err; 880 821 } 881 822 ··· 872 865 struct ceph_snap_context *snapc = req->r_snapc; 873 866 struct address_space *mapping = inode->i_mapping; 874 867 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 868 + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); 875 869 unsigned int len = 0; 876 870 bool remove_page; 877 871 ··· 928 920 929 921 ceph_put_snap_context(detach_page_private(page)); 930 922 end_page_writeback(page); 923 + 924 + if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) { 925 + wake_up_all(&mdsc->flush_end_wq); 926 + WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0); 927 + } 928 + 931 929 doutc(cl, "unlocking %p\n", page); 932 930 933 931 if (remove_page) ··· 963 949 ceph_dec_osd_stopping_blocker(fsc->mdsc); 964 950 } 965 951 966 - /* 967 - * initiate async writeback 968 - */ 969 - static int ceph_writepages_start(struct address_space *mapping, 970 - struct writeback_control *wbc) 952 + static inline 953 + bool is_forced_umount(struct address_space *mapping) 971 954 { 972 955 struct inode *inode = mapping->host; 973 956 struct ceph_inode_info *ci = ceph_inode(inode); 974 957 struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 975 958 struct ceph_client *cl = fsc->client; 976 - struct ceph_vino vino = ceph_vino(inode); 977 - pgoff_t index, start_index, end = -1; 978 - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; 979 - struct folio_batch fbatch; 980 - int rc = 0; 981 - unsigned int wsize = i_blocksize(inode); 982 - struct ceph_osd_request *req = NULL; 983 - struct ceph_writeback_ctl ceph_wbc; 984 - bool should_loop, range_whole = false; 985 - bool done = false; 986 - bool caching = ceph_is_cache_enabled(inode); 987 - xa_mark_t tag; 988 - 989 - if (wbc->sync_mode == WB_SYNC_NONE && 990 - fsc->write_congested) 991 - return 0; 992 - 993 - doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), 994 - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 995 - (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 996 959 997 960 if (ceph_inode_is_shutdown(inode)) { 998 961 if (ci->i_wrbuffer_ref > 0) { ··· 978 987 ceph_vinop(inode), ceph_ino(inode)); 979 988 } 980 989 mapping_set_error(mapping, -EIO); 981 - return -EIO; /* we're in a forced umount, don't write! */ 990 + return true; 982 991 } 992 + 993 + return false; 994 + } 995 + 996 + static inline 997 + unsigned int ceph_define_write_size(struct address_space *mapping) 998 + { 999 + struct inode *inode = mapping->host; 1000 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1001 + unsigned int wsize = i_blocksize(inode); 1002 + 983 1003 if (fsc->mount_options->wsize < wsize) 984 1004 wsize = fsc->mount_options->wsize; 985 1005 986 - folio_batch_init(&fbatch); 1006 + return wsize; 1007 + } 987 1008 988 - start_index = wbc->range_cyclic ? mapping->writeback_index : 0; 989 - index = start_index; 1009 + static inline 1010 + void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc) 1011 + { 1012 + folio_batch_init(&ceph_wbc->fbatch); 1013 + ceph_wbc->processed_in_fbatch = 0; 1014 + } 1015 + 1016 + static inline 1017 + void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc) 1018 + { 1019 + folio_batch_release(&ceph_wbc->fbatch); 1020 + ceph_folio_batch_init(ceph_wbc); 1021 + } 1022 + 1023 + static inline 1024 + void ceph_init_writeback_ctl(struct address_space *mapping, 1025 + struct writeback_control *wbc, 1026 + struct ceph_writeback_ctl *ceph_wbc) 1027 + { 1028 + ceph_wbc->snapc = NULL; 1029 + ceph_wbc->last_snapc = NULL; 1030 + 1031 + ceph_wbc->strip_unit_end = 0; 1032 + ceph_wbc->wsize = ceph_define_write_size(mapping); 1033 + 1034 + ceph_wbc->nr_folios = 0; 1035 + ceph_wbc->max_pages = 0; 1036 + ceph_wbc->locked_pages = 0; 1037 + 1038 + ceph_wbc->done = false; 1039 + ceph_wbc->should_loop = false; 1040 + ceph_wbc->range_whole = false; 1041 + 1042 + ceph_wbc->start_index = wbc->range_cyclic ? mapping->writeback_index : 0; 1043 + ceph_wbc->index = ceph_wbc->start_index; 1044 + ceph_wbc->end = -1; 990 1045 991 1046 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { 992 - tag = PAGECACHE_TAG_TOWRITE; 1047 + ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; 993 1048 } else { 994 - tag = PAGECACHE_TAG_DIRTY; 1049 + ceph_wbc->tag = PAGECACHE_TAG_DIRTY; 995 1050 } 996 - retry: 1051 + 1052 + ceph_wbc->op_idx = -1; 1053 + ceph_wbc->num_ops = 0; 1054 + ceph_wbc->offset = 0; 1055 + ceph_wbc->len = 0; 1056 + ceph_wbc->from_pool = false; 1057 + 1058 + ceph_folio_batch_init(ceph_wbc); 1059 + 1060 + ceph_wbc->pages = NULL; 1061 + ceph_wbc->data_pages = NULL; 1062 + } 1063 + 1064 + static inline 1065 + int ceph_define_writeback_range(struct address_space *mapping, 1066 + struct writeback_control *wbc, 1067 + struct ceph_writeback_ctl *ceph_wbc) 1068 + { 1069 + struct inode *inode = mapping->host; 1070 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1071 + struct ceph_client *cl = fsc->client; 1072 + 997 1073 /* find oldest snap context with dirty data */ 998 - snapc = get_oldest_context(inode, &ceph_wbc, NULL); 999 - if (!snapc) { 1074 + ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL); 1075 + if (!ceph_wbc->snapc) { 1000 1076 /* hmm, why does writepages get called when there 1001 1077 is no dirty data? */ 1002 1078 doutc(cl, " no snap context with dirty data?\n"); 1003 - goto out; 1079 + return -ENODATA; 1004 1080 } 1005 - doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, 1006 - snapc->seq, snapc->num_snaps); 1007 1081 1008 - should_loop = false; 1009 - if (ceph_wbc.head_snapc && snapc != last_snapc) { 1082 + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", 1083 + ceph_wbc->snapc, ceph_wbc->snapc->seq, 1084 + ceph_wbc->snapc->num_snaps); 1085 + 1086 + ceph_wbc->should_loop = false; 1087 + 1088 + if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) { 1010 1089 /* where to start/end? */ 1011 1090 if (wbc->range_cyclic) { 1012 - index = start_index; 1013 - end = -1; 1014 - if (index > 0) 1015 - should_loop = true; 1016 - doutc(cl, " cyclic, start at %lu\n", index); 1091 + ceph_wbc->index = ceph_wbc->start_index; 1092 + ceph_wbc->end = -1; 1093 + if (ceph_wbc->index > 0) 1094 + ceph_wbc->should_loop = true; 1095 + doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index); 1017 1096 } else { 1018 - index = wbc->range_start >> PAGE_SHIFT; 1019 - end = wbc->range_end >> PAGE_SHIFT; 1097 + ceph_wbc->index = wbc->range_start >> PAGE_SHIFT; 1098 + ceph_wbc->end = wbc->range_end >> PAGE_SHIFT; 1020 1099 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 1021 - range_whole = true; 1022 - doutc(cl, " not cyclic, %lu to %lu\n", index, end); 1100 + ceph_wbc->range_whole = true; 1101 + doutc(cl, " not cyclic, %lu to %lu\n", 1102 + ceph_wbc->index, ceph_wbc->end); 1023 1103 } 1024 - } else if (!ceph_wbc.head_snapc) { 1104 + } else if (!ceph_wbc->head_snapc) { 1025 1105 /* Do not respect wbc->range_{start,end}. Dirty pages 1026 1106 * in that range can be associated with newer snapc. 1027 1107 * They are not writeable until we write all dirty pages 1028 1108 * associated with 'snapc' get written */ 1029 - if (index > 0) 1030 - should_loop = true; 1109 + if (ceph_wbc->index > 0) 1110 + ceph_wbc->should_loop = true; 1031 1111 doutc(cl, " non-head snapc, range whole\n"); 1032 1112 } 1033 1113 1034 - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 1035 - tag_pages_for_writeback(mapping, index, end); 1114 + ceph_put_snap_context(ceph_wbc->last_snapc); 1115 + ceph_wbc->last_snapc = ceph_wbc->snapc; 1036 1116 1037 - ceph_put_snap_context(last_snapc); 1038 - last_snapc = snapc; 1117 + return 0; 1118 + } 1039 1119 1040 - while (!done && index <= end) { 1041 - int num_ops = 0, op_idx; 1042 - unsigned i, nr_folios, max_pages, locked_pages = 0; 1043 - struct page **pages = NULL, **data_pages; 1044 - struct page *page; 1045 - pgoff_t strip_unit_end = 0; 1046 - u64 offset = 0, len = 0; 1047 - bool from_pool = false; 1120 + static inline 1121 + bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) 1122 + { 1123 + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; 1124 + } 1048 1125 1049 - max_pages = wsize >> PAGE_SHIFT; 1126 + static inline 1127 + bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, 1128 + unsigned index) 1129 + { 1130 + return index < ceph_wbc->nr_folios && 1131 + ceph_wbc->locked_pages < ceph_wbc->max_pages; 1132 + } 1050 1133 1051 - get_more_pages: 1052 - nr_folios = filemap_get_folios_tag(mapping, &index, 1053 - end, tag, &fbatch); 1054 - doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); 1055 - if (!nr_folios && !locked_pages) 1056 - break; 1057 - for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { 1058 - struct folio *folio = fbatch.folios[i]; 1134 + static 1135 + int ceph_check_page_before_write(struct address_space *mapping, 1136 + struct writeback_control *wbc, 1137 + struct ceph_writeback_ctl *ceph_wbc, 1138 + struct folio *folio) 1139 + { 1140 + struct inode *inode = mapping->host; 1141 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1142 + struct ceph_client *cl = fsc->client; 1143 + struct ceph_snap_context *pgsnapc; 1059 1144 1060 - page = &folio->page; 1061 - doutc(cl, "? %p idx %lu\n", page, page->index); 1062 - if (locked_pages == 0) 1063 - lock_page(page); /* first page */ 1064 - else if (!trylock_page(page)) 1065 - break; 1145 + /* only dirty folios, or our accounting breaks */ 1146 + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { 1147 + doutc(cl, "!dirty or !mapping %p\n", folio); 1148 + return -ENODATA; 1149 + } 1066 1150 1067 - /* only dirty pages, or our accounting breaks */ 1068 - if (unlikely(!PageDirty(page)) || 1069 - unlikely(page->mapping != mapping)) { 1070 - doutc(cl, "!dirty or !mapping %p\n", page); 1071 - unlock_page(page); 1072 - continue; 1073 - } 1074 - /* only if matching snap context */ 1075 - pgsnapc = page_snap_context(page); 1076 - if (pgsnapc != snapc) { 1077 - doutc(cl, "page snapc %p %lld != oldest %p %lld\n", 1078 - pgsnapc, pgsnapc->seq, snapc, snapc->seq); 1079 - if (!should_loop && 1080 - !ceph_wbc.head_snapc && 1081 - wbc->sync_mode != WB_SYNC_NONE) 1082 - should_loop = true; 1083 - unlock_page(page); 1084 - continue; 1085 - } 1086 - if (page_offset(page) >= ceph_wbc.i_size) { 1087 - doutc(cl, "folio at %lu beyond eof %llu\n", 1088 - folio->index, ceph_wbc.i_size); 1089 - if ((ceph_wbc.size_stable || 1090 - folio_pos(folio) >= i_size_read(inode)) && 1091 - folio_clear_dirty_for_io(folio)) 1092 - folio_invalidate(folio, 0, 1093 - folio_size(folio)); 1094 - folio_unlock(folio); 1095 - continue; 1096 - } 1097 - if (strip_unit_end && (page->index > strip_unit_end)) { 1098 - doutc(cl, "end of strip unit %p\n", page); 1099 - unlock_page(page); 1100 - break; 1101 - } 1102 - if (folio_test_writeback(folio) || 1103 - folio_test_private_2(folio) /* [DEPRECATED] */) { 1104 - if (wbc->sync_mode == WB_SYNC_NONE) { 1105 - doutc(cl, "%p under writeback\n", folio); 1106 - folio_unlock(folio); 1107 - continue; 1108 - } 1109 - doutc(cl, "waiting on writeback %p\n", folio); 1110 - folio_wait_writeback(folio); 1111 - folio_wait_private_2(folio); /* [DEPRECATED] */ 1112 - } 1151 + /* only if matching snap context */ 1152 + pgsnapc = page_snap_context(&folio->page); 1153 + if (pgsnapc != ceph_wbc->snapc) { 1154 + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", 1155 + pgsnapc, pgsnapc->seq, 1156 + ceph_wbc->snapc, ceph_wbc->snapc->seq); 1113 1157 1114 - if (!clear_page_dirty_for_io(page)) { 1115 - doutc(cl, "%p !clear_page_dirty_for_io\n", page); 1116 - unlock_page(page); 1117 - continue; 1118 - } 1158 + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && 1159 + wbc->sync_mode != WB_SYNC_NONE) 1160 + ceph_wbc->should_loop = true; 1119 1161 1120 - /* 1121 - * We have something to write. If this is 1122 - * the first locked page this time through, 1123 - * calculate max possinle write size and 1124 - * allocate a page array 1125 - */ 1126 - if (locked_pages == 0) { 1127 - u64 objnum; 1128 - u64 objoff; 1129 - u32 xlen; 1162 + return -ENODATA; 1163 + } 1130 1164 1131 - /* prepare async write request */ 1132 - offset = (u64)page_offset(page); 1133 - ceph_calc_file_object_mapping(&ci->i_layout, 1134 - offset, wsize, 1135 - &objnum, &objoff, 1136 - &xlen); 1137 - len = xlen; 1165 + if (folio_pos(folio) >= ceph_wbc->i_size) { 1166 + doutc(cl, "folio at %lu beyond eof %llu\n", 1167 + folio->index, ceph_wbc->i_size); 1138 1168 1139 - num_ops = 1; 1140 - strip_unit_end = page->index + 1141 - ((len - 1) >> PAGE_SHIFT); 1169 + if ((ceph_wbc->size_stable || 1170 + folio_pos(folio) >= i_size_read(inode)) && 1171 + folio_clear_dirty_for_io(folio)) 1172 + folio_invalidate(folio, 0, folio_size(folio)); 1142 1173 1143 - BUG_ON(pages); 1144 - max_pages = calc_pages_for(0, (u64)len); 1145 - pages = kmalloc_array(max_pages, 1146 - sizeof(*pages), 1147 - GFP_NOFS); 1148 - if (!pages) { 1149 - from_pool = true; 1150 - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); 1151 - BUG_ON(!pages); 1152 - } 1174 + return -ENODATA; 1175 + } 1153 1176 1154 - len = 0; 1155 - } else if (page->index != 1156 - (offset + len) >> PAGE_SHIFT) { 1157 - if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : 1158 - CEPH_OSD_MAX_OPS)) { 1159 - redirty_page_for_writepage(wbc, page); 1160 - unlock_page(page); 1161 - break; 1162 - } 1177 + if (ceph_wbc->strip_unit_end && 1178 + (folio->index > ceph_wbc->strip_unit_end)) { 1179 + doutc(cl, "end of strip unit %p\n", folio); 1180 + return -E2BIG; 1181 + } 1163 1182 1164 - num_ops++; 1165 - offset = (u64)page_offset(page); 1166 - len = 0; 1167 - } 1183 + return 0; 1184 + } 1168 1185 1169 - /* note position of first page in fbatch */ 1170 - doutc(cl, "%llx.%llx will write page %p idx %lu\n", 1171 - ceph_vinop(inode), page, page->index); 1186 + static inline 1187 + void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, 1188 + unsigned int max_pages) 1189 + { 1190 + ceph_wbc->pages = kmalloc_array(max_pages, 1191 + sizeof(*ceph_wbc->pages), 1192 + GFP_NOFS); 1193 + if (!ceph_wbc->pages) { 1194 + ceph_wbc->from_pool = true; 1195 + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); 1196 + BUG_ON(!ceph_wbc->pages); 1197 + } 1198 + } 1172 1199 1173 - if (atomic_long_inc_return(&fsc->writeback_count) > 1174 - CONGESTION_ON_THRESH( 1175 - fsc->mount_options->congestion_kb)) 1176 - fsc->write_congested = true; 1200 + static inline 1201 + void ceph_allocate_page_array(struct address_space *mapping, 1202 + struct ceph_writeback_ctl *ceph_wbc, 1203 + struct folio *folio) 1204 + { 1205 + struct inode *inode = mapping->host; 1206 + struct ceph_inode_info *ci = ceph_inode(inode); 1207 + u64 objnum; 1208 + u64 objoff; 1209 + u32 xlen; 1177 1210 1178 - if (IS_ENCRYPTED(inode)) { 1179 - pages[locked_pages] = 1180 - fscrypt_encrypt_pagecache_blocks(page, 1181 - PAGE_SIZE, 0, 1182 - locked_pages ? GFP_NOWAIT : GFP_NOFS); 1183 - if (IS_ERR(pages[locked_pages])) { 1184 - if (PTR_ERR(pages[locked_pages]) == -EINVAL) 1185 - pr_err_client(cl, 1186 - "inode->i_blkbits=%hhu\n", 1187 - inode->i_blkbits); 1188 - /* better not fail on first page! */ 1189 - BUG_ON(locked_pages == 0); 1190 - pages[locked_pages] = NULL; 1191 - redirty_page_for_writepage(wbc, page); 1192 - unlock_page(page); 1193 - break; 1194 - } 1195 - ++locked_pages; 1196 - } else { 1197 - pages[locked_pages++] = page; 1211 + /* prepare async write request */ 1212 + ceph_wbc->offset = (u64)folio_pos(folio); 1213 + ceph_calc_file_object_mapping(&ci->i_layout, 1214 + ceph_wbc->offset, ceph_wbc->wsize, 1215 + &objnum, &objoff, &xlen); 1216 + 1217 + ceph_wbc->num_ops = 1; 1218 + ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); 1219 + 1220 + BUG_ON(ceph_wbc->pages); 1221 + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); 1222 + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); 1223 + 1224 + ceph_wbc->len = 0; 1225 + } 1226 + 1227 + static inline 1228 + bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, 1229 + const struct folio *folio) 1230 + { 1231 + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; 1232 + } 1233 + 1234 + static inline 1235 + bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) 1236 + { 1237 + return ceph_wbc->num_ops >= 1238 + (ceph_wbc->from_pool ? CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); 1239 + } 1240 + 1241 + static inline 1242 + bool is_write_congestion_happened(struct ceph_fs_client *fsc) 1243 + { 1244 + return atomic_long_inc_return(&fsc->writeback_count) > 1245 + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); 1246 + } 1247 + 1248 + static inline int move_dirty_folio_in_page_array(struct address_space *mapping, 1249 + struct writeback_control *wbc, 1250 + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) 1251 + { 1252 + struct inode *inode = mapping->host; 1253 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1254 + struct ceph_client *cl = fsc->client; 1255 + struct page **pages = ceph_wbc->pages; 1256 + unsigned int index = ceph_wbc->locked_pages; 1257 + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; 1258 + 1259 + if (IS_ENCRYPTED(inode)) { 1260 + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, 1261 + PAGE_SIZE, 1262 + 0, 1263 + gfp_flags); 1264 + if (IS_ERR(pages[index])) { 1265 + if (PTR_ERR(pages[index]) == -EINVAL) { 1266 + pr_err_client(cl, "inode->i_blkbits=%hhu\n", 1267 + inode->i_blkbits); 1198 1268 } 1199 1269 1200 - fbatch.folios[i] = NULL; 1201 - len += thp_size(page); 1270 + /* better not fail on first page! */ 1271 + BUG_ON(ceph_wbc->locked_pages == 0); 1272 + 1273 + pages[index] = NULL; 1274 + return PTR_ERR(pages[index]); 1275 + } 1276 + } else { 1277 + pages[index] = &folio->page; 1278 + } 1279 + 1280 + ceph_wbc->locked_pages++; 1281 + 1282 + return 0; 1283 + } 1284 + 1285 + static 1286 + int ceph_process_folio_batch(struct address_space *mapping, 1287 + struct writeback_control *wbc, 1288 + struct ceph_writeback_ctl *ceph_wbc) 1289 + { 1290 + struct inode *inode = mapping->host; 1291 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1292 + struct ceph_client *cl = fsc->client; 1293 + struct folio *folio = NULL; 1294 + unsigned i; 1295 + int rc = 0; 1296 + 1297 + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { 1298 + folio = ceph_wbc->fbatch.folios[i]; 1299 + 1300 + if (!folio) 1301 + continue; 1302 + 1303 + doutc(cl, "? %p idx %lu, folio_test_writeback %#x, " 1304 + "folio_test_dirty %#x, folio_test_locked %#x\n", 1305 + folio, folio->index, folio_test_writeback(folio), 1306 + folio_test_dirty(folio), 1307 + folio_test_locked(folio)); 1308 + 1309 + if (folio_test_writeback(folio) || 1310 + folio_test_private_2(folio) /* [DEPRECATED] */) { 1311 + doutc(cl, "waiting on writeback %p\n", folio); 1312 + folio_wait_writeback(folio); 1313 + folio_wait_private_2(folio); /* [DEPRECATED] */ 1314 + continue; 1202 1315 } 1203 1316 1204 - /* did we get anything? */ 1205 - if (!locked_pages) 1206 - goto release_folios; 1207 - if (i) { 1208 - unsigned j, n = 0; 1209 - /* shift unused page to beginning of fbatch */ 1210 - for (j = 0; j < nr_folios; j++) { 1211 - if (!fbatch.folios[j]) 1212 - continue; 1213 - if (n < j) 1214 - fbatch.folios[n] = fbatch.folios[j]; 1215 - n++; 1216 - } 1217 - fbatch.nr = n; 1317 + if (ceph_wbc->locked_pages == 0) 1318 + folio_lock(folio); 1319 + else if (!folio_trylock(folio)) 1320 + break; 1218 1321 1219 - if (nr_folios && i == nr_folios && 1220 - locked_pages < max_pages) { 1322 + rc = ceph_check_page_before_write(mapping, wbc, 1323 + ceph_wbc, folio); 1324 + if (rc == -ENODATA) { 1325 + rc = 0; 1326 + folio_unlock(folio); 1327 + ceph_wbc->fbatch.folios[i] = NULL; 1328 + continue; 1329 + } else if (rc == -E2BIG) { 1330 + rc = 0; 1331 + folio_unlock(folio); 1332 + ceph_wbc->fbatch.folios[i] = NULL; 1333 + break; 1334 + } 1335 + 1336 + if (!folio_clear_dirty_for_io(folio)) { 1337 + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); 1338 + folio_unlock(folio); 1339 + ceph_wbc->fbatch.folios[i] = NULL; 1340 + continue; 1341 + } 1342 + 1343 + /* 1344 + * We have something to write. If this is 1345 + * the first locked page this time through, 1346 + * calculate max possible write size and 1347 + * allocate a page array 1348 + */ 1349 + if (ceph_wbc->locked_pages == 0) { 1350 + ceph_allocate_page_array(mapping, ceph_wbc, folio); 1351 + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { 1352 + if (is_num_ops_too_big(ceph_wbc)) { 1353 + folio_redirty_for_writepage(wbc, folio); 1354 + folio_unlock(folio); 1355 + break; 1356 + } 1357 + 1358 + ceph_wbc->num_ops++; 1359 + ceph_wbc->offset = (u64)folio_pos(folio); 1360 + ceph_wbc->len = 0; 1361 + } 1362 + 1363 + /* note position of first page in fbatch */ 1364 + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", 1365 + ceph_vinop(inode), folio, folio->index); 1366 + 1367 + fsc->write_congested = is_write_congestion_happened(fsc); 1368 + 1369 + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, 1370 + folio); 1371 + if (rc) { 1372 + folio_redirty_for_writepage(wbc, folio); 1373 + folio_unlock(folio); 1374 + break; 1375 + } 1376 + 1377 + ceph_wbc->fbatch.folios[i] = NULL; 1378 + ceph_wbc->len += folio_size(folio); 1379 + } 1380 + 1381 + ceph_wbc->processed_in_fbatch = i; 1382 + 1383 + return rc; 1384 + } 1385 + 1386 + static inline 1387 + void ceph_shift_unused_folios_left(struct folio_batch *fbatch) 1388 + { 1389 + unsigned j, n = 0; 1390 + 1391 + /* shift unused page to beginning of fbatch */ 1392 + for (j = 0; j < folio_batch_count(fbatch); j++) { 1393 + if (!fbatch->folios[j]) 1394 + continue; 1395 + 1396 + if (n < j) { 1397 + fbatch->folios[n] = fbatch->folios[j]; 1398 + } 1399 + 1400 + n++; 1401 + } 1402 + 1403 + fbatch->nr = n; 1404 + } 1405 + 1406 + static 1407 + int ceph_submit_write(struct address_space *mapping, 1408 + struct writeback_control *wbc, 1409 + struct ceph_writeback_ctl *ceph_wbc) 1410 + { 1411 + struct inode *inode = mapping->host; 1412 + struct ceph_inode_info *ci = ceph_inode(inode); 1413 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1414 + struct ceph_client *cl = fsc->client; 1415 + struct ceph_vino vino = ceph_vino(inode); 1416 + struct ceph_osd_request *req = NULL; 1417 + struct page *page = NULL; 1418 + bool caching = ceph_is_cache_enabled(inode); 1419 + u64 offset; 1420 + u64 len; 1421 + unsigned i; 1422 + 1423 + new_request: 1424 + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); 1425 + len = ceph_wbc->wsize; 1426 + 1427 + req = ceph_osdc_new_request(&fsc->client->osdc, 1428 + &ci->i_layout, vino, 1429 + offset, &len, 0, ceph_wbc->num_ops, 1430 + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, 1431 + ceph_wbc->snapc, ceph_wbc->truncate_seq, 1432 + ceph_wbc->truncate_size, false); 1433 + if (IS_ERR(req)) { 1434 + req = ceph_osdc_new_request(&fsc->client->osdc, 1435 + &ci->i_layout, vino, 1436 + offset, &len, 0, 1437 + min(ceph_wbc->num_ops, 1438 + CEPH_OSD_SLAB_OPS), 1439 + CEPH_OSD_OP_WRITE, 1440 + CEPH_OSD_FLAG_WRITE, 1441 + ceph_wbc->snapc, 1442 + ceph_wbc->truncate_seq, 1443 + ceph_wbc->truncate_size, 1444 + true); 1445 + BUG_ON(IS_ERR(req)); 1446 + } 1447 + 1448 + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; 1449 + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); 1450 + 1451 + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { 1452 + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { 1453 + struct folio *folio = ceph_wbc->fbatch.folios[i]; 1454 + 1455 + if (!folio) 1456 + continue; 1457 + 1458 + page = &folio->page; 1459 + redirty_page_for_writepage(wbc, page); 1460 + unlock_page(page); 1461 + } 1462 + 1463 + for (i = 0; i < ceph_wbc->locked_pages; i++) { 1464 + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); 1465 + 1466 + if (!page) 1467 + continue; 1468 + 1469 + redirty_page_for_writepage(wbc, page); 1470 + unlock_page(page); 1471 + } 1472 + 1473 + ceph_osdc_put_request(req); 1474 + return -EIO; 1475 + } 1476 + 1477 + req->r_callback = writepages_finish; 1478 + req->r_inode = inode; 1479 + 1480 + /* Format the osd request message and submit the write */ 1481 + len = 0; 1482 + ceph_wbc->data_pages = ceph_wbc->pages; 1483 + ceph_wbc->op_idx = 0; 1484 + for (i = 0; i < ceph_wbc->locked_pages; i++) { 1485 + u64 cur_offset; 1486 + 1487 + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); 1488 + cur_offset = page_offset(page); 1489 + 1490 + /* 1491 + * Discontinuity in page range? Ceph can handle that by just passing 1492 + * multiple extents in the write op. 1493 + */ 1494 + if (offset + len != cur_offset) { 1495 + /* If it's full, stop here */ 1496 + if (ceph_wbc->op_idx + 1 == req->r_num_ops) 1497 + break; 1498 + 1499 + /* Kick off an fscache write with what we have so far. */ 1500 + ceph_fscache_write_to_cache(inode, offset, len, caching); 1501 + 1502 + /* Start a new extent */ 1503 + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, 1504 + cur_offset - offset); 1505 + 1506 + doutc(cl, "got pages at %llu~%llu\n", offset, len); 1507 + 1508 + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, 1509 + ceph_wbc->data_pages, 1510 + len, 0, 1511 + ceph_wbc->from_pool, 1512 + false); 1513 + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); 1514 + 1515 + len = 0; 1516 + offset = cur_offset; 1517 + ceph_wbc->data_pages = ceph_wbc->pages + i; 1518 + ceph_wbc->op_idx++; 1519 + } 1520 + 1521 + set_page_writeback(page); 1522 + 1523 + if (caching) 1524 + ceph_set_page_fscache(page); 1525 + 1526 + len += thp_size(page); 1527 + } 1528 + 1529 + ceph_fscache_write_to_cache(inode, offset, len, caching); 1530 + 1531 + if (ceph_wbc->size_stable) { 1532 + len = min(len, ceph_wbc->i_size - offset); 1533 + } else if (i == ceph_wbc->locked_pages) { 1534 + /* writepages_finish() clears writeback pages 1535 + * according to the data length, so make sure 1536 + * data length covers all locked pages */ 1537 + u64 min_len = len + 1 - thp_size(page); 1538 + len = get_writepages_data_length(inode, 1539 + ceph_wbc->pages[i - 1], 1540 + offset); 1541 + len = max(len, min_len); 1542 + } 1543 + 1544 + if (IS_ENCRYPTED(inode)) 1545 + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); 1546 + 1547 + doutc(cl, "got pages at %llu~%llu\n", offset, len); 1548 + 1549 + if (IS_ENCRYPTED(inode) && 1550 + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { 1551 + pr_warn_client(cl, 1552 + "bad encrypted write offset=%lld len=%llu\n", 1553 + offset, len); 1554 + } 1555 + 1556 + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, 1557 + ceph_wbc->data_pages, len, 1558 + 0, ceph_wbc->from_pool, false); 1559 + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); 1560 + 1561 + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); 1562 + 1563 + ceph_wbc->from_pool = false; 1564 + if (i < ceph_wbc->locked_pages) { 1565 + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); 1566 + ceph_wbc->num_ops -= req->r_num_ops; 1567 + ceph_wbc->locked_pages -= i; 1568 + 1569 + /* allocate new pages array for next request */ 1570 + ceph_wbc->data_pages = ceph_wbc->pages; 1571 + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); 1572 + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, 1573 + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); 1574 + memset(ceph_wbc->data_pages + i, 0, 1575 + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); 1576 + } else { 1577 + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); 1578 + /* request message now owns the pages array */ 1579 + ceph_wbc->pages = NULL; 1580 + } 1581 + 1582 + req->r_mtime = inode_get_mtime(inode); 1583 + ceph_osdc_start_request(&fsc->client->osdc, req); 1584 + req = NULL; 1585 + 1586 + wbc->nr_to_write -= i; 1587 + if (ceph_wbc->pages) 1588 + goto new_request; 1589 + 1590 + return 0; 1591 + } 1592 + 1593 + static 1594 + void ceph_wait_until_current_writes_complete(struct address_space *mapping, 1595 + struct writeback_control *wbc, 1596 + struct ceph_writeback_ctl *ceph_wbc) 1597 + { 1598 + struct page *page; 1599 + unsigned i, nr; 1600 + 1601 + if (wbc->sync_mode != WB_SYNC_NONE && 1602 + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ 1603 + !ceph_wbc->head_snapc) { 1604 + ceph_wbc->index = 0; 1605 + 1606 + while ((ceph_wbc->index <= ceph_wbc->end) && 1607 + (nr = filemap_get_folios_tag(mapping, 1608 + &ceph_wbc->index, 1609 + (pgoff_t)-1, 1610 + PAGECACHE_TAG_WRITEBACK, 1611 + &ceph_wbc->fbatch))) { 1612 + for (i = 0; i < nr; i++) { 1613 + page = &ceph_wbc->fbatch.folios[i]->page; 1614 + if (page_snap_context(page) != ceph_wbc->snapc) 1615 + continue; 1616 + wait_on_page_writeback(page); 1617 + } 1618 + 1619 + folio_batch_release(&ceph_wbc->fbatch); 1620 + cond_resched(); 1621 + } 1622 + } 1623 + } 1624 + 1625 + /* 1626 + * initiate async writeback 1627 + */ 1628 + static int ceph_writepages_start(struct address_space *mapping, 1629 + struct writeback_control *wbc) 1630 + { 1631 + struct inode *inode = mapping->host; 1632 + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); 1633 + struct ceph_client *cl = fsc->client; 1634 + struct ceph_writeback_ctl ceph_wbc; 1635 + int rc = 0; 1636 + 1637 + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) 1638 + return 0; 1639 + 1640 + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), 1641 + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 1642 + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 1643 + 1644 + if (is_forced_umount(mapping)) { 1645 + /* we're in a forced umount, don't write! */ 1646 + return -EIO; 1647 + } 1648 + 1649 + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); 1650 + 1651 + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { 1652 + rc = -EIO; 1653 + goto out; 1654 + } 1655 + 1656 + retry: 1657 + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); 1658 + if (rc == -ENODATA) { 1659 + /* hmm, why does writepages get called when there 1660 + is no dirty data? */ 1661 + rc = 0; 1662 + goto dec_osd_stopping_blocker; 1663 + } 1664 + 1665 + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 1666 + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); 1667 + 1668 + while (!has_writeback_done(&ceph_wbc)) { 1669 + ceph_wbc.locked_pages = 0; 1670 + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; 1671 + 1672 + get_more_pages: 1673 + ceph_folio_batch_reinit(&ceph_wbc); 1674 + 1675 + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, 1676 + &ceph_wbc.index, 1677 + ceph_wbc.end, 1678 + ceph_wbc.tag, 1679 + &ceph_wbc.fbatch); 1680 + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", 1681 + ceph_wbc.tag, ceph_wbc.nr_folios); 1682 + 1683 + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) 1684 + break; 1685 + 1686 + process_folio_batch: 1687 + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); 1688 + if (rc) 1689 + goto release_folios; 1690 + 1691 + /* did we get anything? */ 1692 + if (!ceph_wbc.locked_pages) 1693 + goto release_folios; 1694 + 1695 + if (ceph_wbc.processed_in_fbatch) { 1696 + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); 1697 + 1698 + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && 1699 + ceph_wbc.locked_pages < ceph_wbc.max_pages) { 1221 1700 doutc(cl, "reached end fbatch, trying for more\n"); 1222 - folio_batch_release(&fbatch); 1223 1701 goto get_more_pages; 1224 1702 } 1225 1703 } 1226 1704 1227 - new_request: 1228 - offset = ceph_fscrypt_page_offset(pages[0]); 1229 - len = wsize; 1230 - 1231 - req = ceph_osdc_new_request(&fsc->client->osdc, 1232 - &ci->i_layout, vino, 1233 - offset, &len, 0, num_ops, 1234 - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, 1235 - snapc, ceph_wbc.truncate_seq, 1236 - ceph_wbc.truncate_size, false); 1237 - if (IS_ERR(req)) { 1238 - req = ceph_osdc_new_request(&fsc->client->osdc, 1239 - &ci->i_layout, vino, 1240 - offset, &len, 0, 1241 - min(num_ops, 1242 - CEPH_OSD_SLAB_OPS), 1243 - CEPH_OSD_OP_WRITE, 1244 - CEPH_OSD_FLAG_WRITE, 1245 - snapc, ceph_wbc.truncate_seq, 1246 - ceph_wbc.truncate_size, true); 1247 - BUG_ON(IS_ERR(req)); 1248 - } 1249 - BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + 1250 - thp_size(pages[locked_pages - 1]) - offset); 1251 - 1252 - if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { 1253 - rc = -EIO; 1705 + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); 1706 + if (rc) 1254 1707 goto release_folios; 1708 + 1709 + ceph_wbc.locked_pages = 0; 1710 + ceph_wbc.strip_unit_end = 0; 1711 + 1712 + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { 1713 + ceph_wbc.nr_folios = 1714 + folio_batch_count(&ceph_wbc.fbatch); 1715 + goto process_folio_batch; 1255 1716 } 1256 - req->r_callback = writepages_finish; 1257 - req->r_inode = inode; 1258 - 1259 - /* Format the osd request message and submit the write */ 1260 - len = 0; 1261 - data_pages = pages; 1262 - op_idx = 0; 1263 - for (i = 0; i < locked_pages; i++) { 1264 - struct page *page = ceph_fscrypt_pagecache_page(pages[i]); 1265 - 1266 - u64 cur_offset = page_offset(page); 1267 - /* 1268 - * Discontinuity in page range? Ceph can handle that by just passing 1269 - * multiple extents in the write op. 1270 - */ 1271 - if (offset + len != cur_offset) { 1272 - /* If it's full, stop here */ 1273 - if (op_idx + 1 == req->r_num_ops) 1274 - break; 1275 - 1276 - /* Kick off an fscache write with what we have so far. */ 1277 - ceph_fscache_write_to_cache(inode, offset, len, caching); 1278 - 1279 - /* Start a new extent */ 1280 - osd_req_op_extent_dup_last(req, op_idx, 1281 - cur_offset - offset); 1282 - doutc(cl, "got pages at %llu~%llu\n", offset, 1283 - len); 1284 - osd_req_op_extent_osd_data_pages(req, op_idx, 1285 - data_pages, len, 0, 1286 - from_pool, false); 1287 - osd_req_op_extent_update(req, op_idx, len); 1288 - 1289 - len = 0; 1290 - offset = cur_offset; 1291 - data_pages = pages + i; 1292 - op_idx++; 1293 - } 1294 - 1295 - set_page_writeback(page); 1296 - if (caching) 1297 - ceph_set_page_fscache(page); 1298 - len += thp_size(page); 1299 - } 1300 - ceph_fscache_write_to_cache(inode, offset, len, caching); 1301 - 1302 - if (ceph_wbc.size_stable) { 1303 - len = min(len, ceph_wbc.i_size - offset); 1304 - } else if (i == locked_pages) { 1305 - /* writepages_finish() clears writeback pages 1306 - * according to the data length, so make sure 1307 - * data length covers all locked pages */ 1308 - u64 min_len = len + 1 - thp_size(page); 1309 - len = get_writepages_data_length(inode, pages[i - 1], 1310 - offset); 1311 - len = max(len, min_len); 1312 - } 1313 - if (IS_ENCRYPTED(inode)) 1314 - len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); 1315 - 1316 - doutc(cl, "got pages at %llu~%llu\n", offset, len); 1317 - 1318 - if (IS_ENCRYPTED(inode) && 1319 - ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) 1320 - pr_warn_client(cl, 1321 - "bad encrypted write offset=%lld len=%llu\n", 1322 - offset, len); 1323 - 1324 - osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, 1325 - 0, from_pool, false); 1326 - osd_req_op_extent_update(req, op_idx, len); 1327 - 1328 - BUG_ON(op_idx + 1 != req->r_num_ops); 1329 - 1330 - from_pool = false; 1331 - if (i < locked_pages) { 1332 - BUG_ON(num_ops <= req->r_num_ops); 1333 - num_ops -= req->r_num_ops; 1334 - locked_pages -= i; 1335 - 1336 - /* allocate new pages array for next request */ 1337 - data_pages = pages; 1338 - pages = kmalloc_array(locked_pages, sizeof(*pages), 1339 - GFP_NOFS); 1340 - if (!pages) { 1341 - from_pool = true; 1342 - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); 1343 - BUG_ON(!pages); 1344 - } 1345 - memcpy(pages, data_pages + i, 1346 - locked_pages * sizeof(*pages)); 1347 - memset(data_pages + i, 0, 1348 - locked_pages * sizeof(*pages)); 1349 - } else { 1350 - BUG_ON(num_ops != req->r_num_ops); 1351 - index = pages[i - 1]->index + 1; 1352 - /* request message now owns the pages array */ 1353 - pages = NULL; 1354 - } 1355 - 1356 - req->r_mtime = inode_get_mtime(inode); 1357 - ceph_osdc_start_request(&fsc->client->osdc, req); 1358 - req = NULL; 1359 - 1360 - wbc->nr_to_write -= i; 1361 - if (pages) 1362 - goto new_request; 1363 1717 1364 1718 /* 1365 1719 * We stop writing back only if we are not doing ··· 1713 1377 * we tagged for writeback prior to entering this loop. 1714 1378 */ 1715 1379 if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) 1716 - done = true; 1380 + ceph_wbc.done = true; 1717 1381 1718 1382 release_folios: 1719 1383 doutc(cl, "folio_batch release on %d folios (%p)\n", 1720 - (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); 1721 - folio_batch_release(&fbatch); 1384 + (int)ceph_wbc.fbatch.nr, 1385 + ceph_wbc.fbatch.nr ? ceph_wbc.fbatch.folios[0] : NULL); 1386 + folio_batch_release(&ceph_wbc.fbatch); 1722 1387 } 1723 1388 1724 - if (should_loop && !done) { 1389 + if (ceph_wbc.should_loop && !ceph_wbc.done) { 1725 1390 /* more to do; loop back to beginning of file */ 1726 1391 doutc(cl, "looping back to beginning of file\n"); 1727 - end = start_index - 1; /* OK even when start_index == 0 */ 1392 + /* OK even when start_index == 0 */ 1393 + ceph_wbc.end = ceph_wbc.start_index - 1; 1728 1394 1729 1395 /* to write dirty pages associated with next snapc, 1730 1396 * we need to wait until current writes complete */ 1731 - if (wbc->sync_mode != WB_SYNC_NONE && 1732 - start_index == 0 && /* all dirty pages were checked */ 1733 - !ceph_wbc.head_snapc) { 1734 - struct page *page; 1735 - unsigned i, nr; 1736 - index = 0; 1737 - while ((index <= end) && 1738 - (nr = filemap_get_folios_tag(mapping, &index, 1739 - (pgoff_t)-1, 1740 - PAGECACHE_TAG_WRITEBACK, 1741 - &fbatch))) { 1742 - for (i = 0; i < nr; i++) { 1743 - page = &fbatch.folios[i]->page; 1744 - if (page_snap_context(page) != snapc) 1745 - continue; 1746 - wait_on_page_writeback(page); 1747 - } 1748 - folio_batch_release(&fbatch); 1749 - cond_resched(); 1750 - } 1751 - } 1397 + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); 1752 1398 1753 - start_index = 0; 1754 - index = 0; 1399 + ceph_wbc.start_index = 0; 1400 + ceph_wbc.index = 0; 1755 1401 goto retry; 1756 1402 } 1757 1403 1758 - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 1759 - mapping->writeback_index = index; 1404 + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) 1405 + mapping->writeback_index = ceph_wbc.index; 1406 + 1407 + dec_osd_stopping_blocker: 1408 + ceph_dec_osd_stopping_blocker(fsc->mdsc); 1760 1409 1761 1410 out: 1762 - ceph_osdc_put_request(req); 1763 - ceph_put_snap_context(last_snapc); 1411 + ceph_put_snap_context(ceph_wbc.last_snapc); 1764 1412 doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), 1765 1413 rc); 1414 + 1766 1415 return rc; 1767 1416 } 1768 - 1769 - 1770 1417 1771 1418 /* 1772 1419 * See if a given @snapc is either writeable, or already written. ··· 1766 1447 1767 1448 /** 1768 1449 * ceph_find_incompatible - find an incompatible context and return it 1769 - * @page: page being dirtied 1450 + * @folio: folio being dirtied 1770 1451 * 1771 - * We are only allowed to write into/dirty a page if the page is 1452 + * We are only allowed to write into/dirty a folio if the folio is 1772 1453 * clean, or already dirty within the same snap context. Returns a 1773 1454 * conflicting context if there is one, NULL if there isn't, or a 1774 1455 * negative error code on other errors. 1775 1456 * 1776 - * Must be called with page lock held. 1457 + * Must be called with folio lock held. 1777 1458 */ 1778 1459 static struct ceph_snap_context * 1779 - ceph_find_incompatible(struct page *page) 1460 + ceph_find_incompatible(struct folio *folio) 1780 1461 { 1781 - struct inode *inode = page->mapping->host; 1462 + struct inode *inode = folio->mapping->host; 1782 1463 struct ceph_client *cl = ceph_inode_to_client(inode); 1783 1464 struct ceph_inode_info *ci = ceph_inode(inode); 1784 1465 1785 1466 if (ceph_inode_is_shutdown(inode)) { 1786 - doutc(cl, " %llx.%llx page %p is shutdown\n", 1787 - ceph_vinop(inode), page); 1467 + doutc(cl, " %llx.%llx folio %p is shutdown\n", 1468 + ceph_vinop(inode), folio); 1788 1469 return ERR_PTR(-ESTALE); 1789 1470 } 1790 1471 1791 1472 for (;;) { 1792 1473 struct ceph_snap_context *snapc, *oldest; 1793 1474 1794 - wait_on_page_writeback(page); 1475 + folio_wait_writeback(folio); 1795 1476 1796 - snapc = page_snap_context(page); 1477 + snapc = page_snap_context(&folio->page); 1797 1478 if (!snapc || snapc == ci->i_head_snapc) 1798 1479 break; 1799 1480 1800 1481 /* 1801 - * this page is already dirty in another (older) snap 1482 + * this folio is already dirty in another (older) snap 1802 1483 * context! is it writeable now? 1803 1484 */ 1804 1485 oldest = get_oldest_context(inode, NULL, NULL); 1805 1486 if (snapc->seq > oldest->seq) { 1806 1487 /* not writeable -- return it for the caller to deal with */ 1807 1488 ceph_put_snap_context(oldest); 1808 - doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", 1809 - ceph_vinop(inode), page, snapc); 1489 + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", 1490 + ceph_vinop(inode), folio, snapc); 1810 1491 return ceph_get_snap_context(snapc); 1811 1492 } 1812 1493 ceph_put_snap_context(oldest); 1813 1494 1814 - /* yay, writeable, do it now (without dropping page lock) */ 1815 - doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", 1816 - ceph_vinop(inode), page, snapc); 1817 - if (clear_page_dirty_for_io(page)) { 1818 - int r = writepage_nounlock(page, NULL); 1495 + /* yay, writeable, do it now (without dropping folio lock) */ 1496 + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", 1497 + ceph_vinop(inode), folio, snapc); 1498 + if (folio_clear_dirty_for_io(folio)) { 1499 + int r = write_folio_nounlock(folio, NULL); 1819 1500 if (r < 0) 1820 1501 return ERR_PTR(r); 1821 1502 } ··· 1830 1511 struct ceph_inode_info *ci = ceph_inode(inode); 1831 1512 struct ceph_snap_context *snapc; 1832 1513 1833 - snapc = ceph_find_incompatible(folio_page(*foliop, 0)); 1514 + snapc = ceph_find_incompatible(*foliop); 1834 1515 if (snapc) { 1835 1516 int r; 1836 1517 ··· 1913 1594 const struct address_space_operations ceph_aops = { 1914 1595 .read_folio = netfs_read_folio, 1915 1596 .readahead = netfs_readahead, 1916 - .writepage = ceph_writepage, 1917 1597 .writepages = ceph_writepages_start, 1918 1598 .write_begin = ceph_write_begin, 1919 1599 .write_end = ceph_write_end, ··· 1920 1602 .invalidate_folio = ceph_invalidate_folio, 1921 1603 .release_folio = netfs_release_folio, 1922 1604 .direct_IO = noop_direct_IO, 1605 + .migrate_folio = filemap_migrate_folio, 1923 1606 }; 1924 1607 1925 1608 static void ceph_block_sigs(sigset_t *oldset) ··· 2037 1718 struct ceph_inode_info *ci = ceph_inode(inode); 2038 1719 struct ceph_file_info *fi = vma->vm_file->private_data; 2039 1720 struct ceph_cap_flush *prealloc_cf; 2040 - struct page *page = vmf->page; 2041 - loff_t off = page_offset(page); 1721 + struct folio *folio = page_folio(vmf->page); 1722 + loff_t off = folio_pos(folio); 2042 1723 loff_t size = i_size_read(inode); 2043 1724 size_t len; 2044 1725 int want, got, err; ··· 2055 1736 sb_start_pagefault(inode->i_sb); 2056 1737 ceph_block_sigs(&oldset); 2057 1738 2058 - if (off + thp_size(page) <= size) 2059 - len = thp_size(page); 1739 + if (off + folio_size(folio) <= size) 1740 + len = folio_size(folio); 2060 1741 else 2061 - len = offset_in_thp(page, size); 1742 + len = offset_in_folio(folio, size); 2062 1743 2063 1744 doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", 2064 1745 ceph_vinop(inode), off, len, size); ··· 2075 1756 doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), 2076 1757 off, len, ceph_cap_string(got)); 2077 1758 2078 - /* Update time before taking page lock */ 1759 + /* Update time before taking folio lock */ 2079 1760 file_update_time(vma->vm_file); 2080 1761 inode_inc_iversion_raw(inode); 2081 1762 2082 1763 do { 2083 1764 struct ceph_snap_context *snapc; 2084 1765 2085 - lock_page(page); 1766 + folio_lock(folio); 2086 1767 2087 - if (page_mkwrite_check_truncate(page, inode) < 0) { 2088 - unlock_page(page); 1768 + if (folio_mkwrite_check_truncate(folio, inode) < 0) { 1769 + folio_unlock(folio); 2089 1770 ret = VM_FAULT_NOPAGE; 2090 1771 break; 2091 1772 } 2092 1773 2093 - snapc = ceph_find_incompatible(page); 1774 + snapc = ceph_find_incompatible(folio); 2094 1775 if (!snapc) { 2095 - /* success. we'll keep the page locked. */ 2096 - set_page_dirty(page); 1776 + /* success. we'll keep the folio locked. */ 1777 + folio_mark_dirty(folio); 2097 1778 ret = VM_FAULT_LOCKED; 2098 1779 break; 2099 1780 } 2100 1781 2101 - unlock_page(page); 1782 + folio_unlock(folio); 2102 1783 2103 1784 if (IS_ERR(snapc)) { 2104 1785 ret = VM_FAULT_SIGBUS;
+8 -7
fs/ceph/dir.c
··· 141 141 if (ptr_pos >= i_size_read(dir)) 142 142 return NULL; 143 143 144 - if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) { 144 + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { 145 145 ceph_readdir_cache_release(cache_ctl); 146 - cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); 147 - if (!cache_ctl->page) { 148 - doutc(cl, " page %lu not found\n", ptr_pgoff); 146 + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); 147 + if (IS_ERR(cache_ctl->folio)) { 148 + cache_ctl->folio = NULL; 149 + doutc(cl, " folio %lu not found\n", ptr_pgoff); 149 150 return ERR_PTR(-EAGAIN); 150 151 } 151 152 /* reading/filling the cache are serialized by 152 - i_rwsem, no need to use page lock */ 153 - unlock_page(cache_ctl->page); 154 - cache_ctl->dentries = kmap(cache_ctl->page); 153 + i_rwsem, no need to use folio lock */ 154 + folio_unlock(cache_ctl->folio); 155 + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); 155 156 } 156 157 157 158 cache_ctl->index = idx & idx_mask;
+18 -13
fs/ceph/inode.c
··· 1845 1845 1846 1846 void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) 1847 1847 { 1848 - if (ctl->page) { 1849 - kunmap(ctl->page); 1850 - put_page(ctl->page); 1851 - ctl->page = NULL; 1848 + if (ctl->folio) { 1849 + folio_release_kmap(ctl->folio, ctl->dentries); 1850 + ctl->folio = NULL; 1852 1851 } 1853 1852 } 1854 1853 ··· 1861 1862 unsigned idx = ctl->index % nsize; 1862 1863 pgoff_t pgoff = ctl->index / nsize; 1863 1864 1864 - if (!ctl->page || pgoff != ctl->page->index) { 1865 + if (!ctl->folio || pgoff != ctl->folio->index) { 1865 1866 ceph_readdir_cache_release(ctl); 1867 + fgf_t fgf = FGP_LOCK; 1868 + 1866 1869 if (idx == 0) 1867 - ctl->page = grab_cache_page(&dir->i_data, pgoff); 1868 - else 1869 - ctl->page = find_lock_page(&dir->i_data, pgoff); 1870 - if (!ctl->page) { 1870 + fgf |= FGP_ACCESSED | FGP_CREAT; 1871 + 1872 + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, 1873 + fgf, mapping_gfp_mask(&dir->i_data)); 1874 + if (IS_ERR(ctl->folio)) { 1875 + int err = PTR_ERR(ctl->folio); 1876 + 1877 + ctl->folio = NULL; 1871 1878 ctl->index = -1; 1872 - return idx == 0 ? -ENOMEM : 0; 1879 + return idx == 0 ? err : 0; 1873 1880 } 1874 1881 /* reading/filling the cache are serialized by 1875 - * i_rwsem, no need to use page lock */ 1876 - unlock_page(ctl->page); 1877 - ctl->dentries = kmap(ctl->page); 1882 + * i_rwsem, no need to use folio lock */ 1883 + folio_unlock(ctl->folio); 1884 + ctl->dentries = kmap_local_folio(ctl->folio, 0); 1878 1885 if (idx == 0) 1879 1886 memset(ctl->dentries, 0, PAGE_SIZE); 1880 1887 }
+2
fs/ceph/mds_client.c
··· 5489 5489 spin_lock_init(&mdsc->stopping_lock); 5490 5490 atomic_set(&mdsc->stopping_blockers, 0); 5491 5491 init_completion(&mdsc->stopping_waiter); 5492 + atomic64_set(&mdsc->dirty_folios, 0); 5493 + init_waitqueue_head(&mdsc->flush_end_wq); 5492 5494 init_waitqueue_head(&mdsc->session_close_wq); 5493 5495 INIT_LIST_HEAD(&mdsc->waiting_for_map); 5494 5496 mdsc->quotarealms_inodes = RB_ROOT;
+3
fs/ceph/mds_client.h
··· 458 458 atomic_t stopping_blockers; 459 459 struct completion stopping_waiter; 460 460 461 + atomic64_t dirty_folios; 462 + wait_queue_head_t flush_end_wq; 463 + 461 464 atomic64_t quotarealms_count; /* # realms with quota */ 462 465 /* 463 466 * We keep a list of inodes we don't see in the mountpoint but that we
+11
fs/ceph/super.c
··· 1563 1563 */ 1564 1564 sync_filesystem(s); 1565 1565 1566 + if (atomic64_read(&mdsc->dirty_folios) > 0) { 1567 + wait_queue_head_t *wq = &mdsc->flush_end_wq; 1568 + long timeleft = wait_event_killable_timeout(*wq, 1569 + atomic64_read(&mdsc->dirty_folios) <= 0, 1570 + fsc->client->options->mount_timeout); 1571 + if (!timeleft) /* timed out */ 1572 + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); 1573 + else if (timeleft < 0) /* killed */ 1574 + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); 1575 + } 1576 + 1566 1577 spin_lock(&mdsc->stopping_lock); 1567 1578 mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; 1568 1579 wait = !!atomic_read(&mdsc->stopping_blockers);
+1 -1
fs/ceph/super.h
··· 903 903 } 904 904 905 905 struct ceph_readdir_cache_control { 906 - struct page *page; 906 + struct folio *folio; 907 907 struct dentry **dentries; 908 908 int index; 909 909 };
+10 -12
fs/crypto/crypto.c
··· 153 153 } 154 154 155 155 /** 156 - * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page 157 - * @page: the locked pagecache page containing the data to encrypt 156 + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio 157 + * @folio: the locked pagecache folio containing the data to encrypt 158 158 * @len: size of the data to encrypt, in bytes 159 159 * @offs: offset within @page of the data to encrypt, in bytes 160 160 * @gfp_flags: memory allocation flags; see details below ··· 177 177 * 178 178 * Return: the new encrypted bounce page on success; an ERR_PTR() on failure 179 179 */ 180 - struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, 181 - unsigned int len, 182 - unsigned int offs, 183 - gfp_t gfp_flags) 184 - 180 + struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, 181 + size_t len, size_t offs, gfp_t gfp_flags) 185 182 { 186 - const struct inode *inode = page->mapping->host; 183 + const struct inode *inode = folio->mapping->host; 187 184 const struct fscrypt_inode_info *ci = inode->i_crypt_info; 188 185 const unsigned int du_bits = ci->ci_data_unit_bits; 189 186 const unsigned int du_size = 1U << du_bits; 190 187 struct page *ciphertext_page; 191 - u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + 188 + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + 192 189 (offs >> du_bits); 193 190 unsigned int i; 194 191 int err; 195 192 196 - if (WARN_ON_ONCE(!PageLocked(page))) 193 + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); 194 + if (WARN_ON_ONCE(!folio_test_locked(folio))) 197 195 return ERR_PTR(-EINVAL); 198 196 199 197 if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) ··· 203 205 204 206 for (i = offs; i < offs + len; i += du_size, index++) { 205 207 err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, 206 - page, ciphertext_page, 208 + &folio->page, ciphertext_page, 207 209 du_size, i, gfp_flags); 208 210 if (err) { 209 211 fscrypt_free_bounce_page(ciphertext_page); ··· 211 213 } 212 214 } 213 215 SetPagePrivate(ciphertext_page); 214 - set_page_private(ciphertext_page, (unsigned long)page); 216 + set_page_private(ciphertext_page, (unsigned long)folio); 215 217 return ciphertext_page; 216 218 } 217 219 EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks);
+1 -1
fs/ext4/page-io.c
··· 522 522 if (io->io_bio) 523 523 gfp_flags = GFP_NOWAIT | __GFP_NOWARN; 524 524 retry_encrypt: 525 - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, 525 + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, 526 526 enc_bytes, 0, gfp_flags); 527 527 if (IS_ERR(bounce_page)) { 528 528 ret = PTR_ERR(bounce_page);
+1 -1
fs/f2fs/data.c
··· 2500 2500 return 0; 2501 2501 2502 2502 retry_encrypt: 2503 - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, 2503 + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), 2504 2504 PAGE_SIZE, 0, gfp_flags); 2505 2505 if (IS_ERR(fio->encrypted_page)) { 2506 2506 /* flush pending IOs and wait for a while in the ENOMEM case */
+4 -8
include/linux/fscrypt.h
··· 310 310 /* crypto.c */ 311 311 void fscrypt_enqueue_decrypt_work(struct work_struct *); 312 312 313 - struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, 314 - unsigned int len, 315 - unsigned int offs, 316 - gfp_t gfp_flags); 313 + struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, 314 + size_t len, size_t offs, gfp_t gfp_flags); 317 315 int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, 318 316 unsigned int len, unsigned int offs, 319 317 u64 lblk_num, gfp_t gfp_flags); ··· 478 480 { 479 481 } 480 482 481 - static inline struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, 482 - unsigned int len, 483 - unsigned int offs, 484 - gfp_t gfp_flags) 483 + static inline struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, 484 + size_t len, size_t offs, gfp_t gfp_flags) 485 485 { 486 486 return ERR_PTR(-EOPNOTSUPP); 487 487 }
-28
include/linux/pagemap.h
··· 1605 1605 } 1606 1606 1607 1607 /** 1608 - * page_mkwrite_check_truncate - check if page was truncated 1609 - * @page: the page to check 1610 - * @inode: the inode to check the page against 1611 - * 1612 - * Returns the number of bytes in the page up to EOF, 1613 - * or -EFAULT if the page was truncated. 1614 - */ 1615 - static inline int page_mkwrite_check_truncate(struct page *page, 1616 - struct inode *inode) 1617 - { 1618 - loff_t size = i_size_read(inode); 1619 - pgoff_t index = size >> PAGE_SHIFT; 1620 - int offset = offset_in_page(size); 1621 - 1622 - if (page->mapping != inode->i_mapping) 1623 - return -EFAULT; 1624 - 1625 - /* page is wholly inside EOF */ 1626 - if (page->index < index) 1627 - return PAGE_SIZE; 1628 - /* page is wholly past EOF */ 1629 - if (page->index > index || !offset) 1630 - return -EFAULT; 1631 - /* page is partially inside EOF */ 1632 - return offset; 1633 - } 1634 - 1635 - /** 1636 1608 * i_blocks_per_folio - How many blocks fit in this folio. 1637 1609 * @inode: The inode which contains the blocks. 1638 1610 * @folio: The folio.