Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs

Pull bcachefs fixes from Kent Overstreet:
"The data corruption in the buffered write path is troubling; inode
lock should not have been needed there...

- Fix a rare data corruption in the rebalance path, caught as a nonce
inconsistency on encrypted filesystems

- Revert lockless buffered write path

- Mark more errors as autofix"

* tag 'bcachefs-2024-08-21' of https://github.com/koverstreet/bcachefs:
bcachefs: Mark more errors as autofix
bcachefs: Revert lockless buffered IO path
bcachefs: Fix bch2_extents_match() false positive
bcachefs: Fix failure to return error in data_update_index_update()

+68 -116
+1
fs/bcachefs/data_update.c
··· 337 337 printbuf_exit(&buf); 338 338 339 339 bch2_fatal_error(c); 340 + ret = -EIO; 340 341 goto out; 341 342 } 342 343
-1
fs/bcachefs/errcode.h
··· 257 257 x(BCH_ERR_nopromote, nopromote_in_flight) \ 258 258 x(BCH_ERR_nopromote, nopromote_no_writes) \ 259 259 x(BCH_ERR_nopromote, nopromote_enomem) \ 260 - x(0, need_inode_lock) \ 261 260 x(0, invalid_snapshot_node) \ 262 261 x(0, option_needs_open_fs) 263 262
+22 -1
fs/bcachefs/extents.c
··· 929 929 bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) 930 930 if (p1.ptr.dev == p2.ptr.dev && 931 931 p1.ptr.gen == p2.ptr.gen && 932 + 933 + /* 934 + * This checks that the two pointers point 935 + * to the same region on disk - adjusting 936 + * for the difference in where the extents 937 + * start, since one may have been trimmed: 938 + */ 932 939 (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == 933 - (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) 940 + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k) && 941 + 942 + /* 943 + * This additionally checks that the 944 + * extents overlap on disk, since the 945 + * previous check may trigger spuriously 946 + * when one extent is immediately partially 947 + * overwritten with another extent (so that 948 + * on disk they are adjacent) and 949 + * compression is in use: 950 + */ 951 + ((p1.ptr.offset >= p2.ptr.offset && 952 + p1.ptr.offset < p2.ptr.offset + p2.crc.compressed_size) || 953 + (p2.ptr.offset >= p1.ptr.offset && 954 + p2.ptr.offset < p1.ptr.offset + p1.crc.compressed_size))) 934 955 return true; 935 956 936 957 return false;
+40 -109
fs/bcachefs/fs-io-buffered.c
··· 802 802 static int __bch2_buffered_write(struct bch_inode_info *inode, 803 803 struct address_space *mapping, 804 804 struct iov_iter *iter, 805 - loff_t pos, unsigned len, 806 - bool inode_locked) 805 + loff_t pos, unsigned len) 807 806 { 808 807 struct bch_fs *c = inode->v.i_sb->s_fs_info; 809 808 struct bch2_folio_reservation res; ··· 825 826 goto out; 826 827 827 828 BUG_ON(!fs.nr); 828 - 829 - /* 830 - * If we're not using the inode lock, we need to lock all the folios for 831 - * atomiticity of writes vs. other writes: 832 - */ 833 - if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { 834 - ret = -BCH_ERR_need_inode_lock; 835 - goto out; 836 - } 837 829 838 830 f = darray_first(fs); 839 831 if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ··· 922 932 end = pos + copied; 923 933 924 934 spin_lock(&inode->v.i_lock); 925 - if (end > inode->v.i_size) { 926 - BUG_ON(!inode_locked); 935 + if (end > inode->v.i_size) 927 936 i_size_write(&inode->v, end); 928 - } 929 937 spin_unlock(&inode->v.i_lock); 930 938 931 939 f_pos = pos; ··· 967 979 struct file *file = iocb->ki_filp; 968 980 struct address_space *mapping = file->f_mapping; 969 981 struct bch_inode_info *inode = file_bch_inode(file); 970 - loff_t pos; 971 - bool inode_locked = false; 972 - ssize_t written = 0, written2 = 0, ret = 0; 973 - 974 - /* 975 - * We don't take the inode lock unless i_size will be changing. Folio 976 - * locks provide exclusion with other writes, and the pagecache add lock 977 - * provides exclusion with truncate and hole punching. 978 - * 979 - * There is one nasty corner case where atomicity would be broken 980 - * without great care: when copying data from userspace to the page 981 - * cache, we do that with faults disable - a page fault would recurse 982 - * back into the filesystem, taking filesystem locks again, and 983 - * deadlock; so it's done with faults disabled, and we fault in the user 984 - * buffer when we aren't holding locks. 
985 - * 986 - * If we do part of the write, but we then race and in the userspace 987 - * buffer have been evicted and are no longer resident, then we have to 988 - * drop our folio locks to re-fault them in, breaking write atomicity. 989 - * 990 - * To fix this, we restart the write from the start, if we weren't 991 - * holding the inode lock. 992 - * 993 - * There is another wrinkle after that; if we restart the write from the 994 - * start, and then get an unrecoverable error, we _cannot_ claim to 995 - * userspace that we did not write data we actually did - so we must 996 - * track (written2) the most we ever wrote. 997 - */ 998 - 999 - if ((iocb->ki_flags & IOCB_APPEND) || 1000 - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { 1001 - inode_lock(&inode->v); 1002 - inode_locked = true; 1003 - } 1004 - 1005 - ret = generic_write_checks(iocb, iter); 1006 - if (ret <= 0) 1007 - goto unlock; 1008 - 1009 - ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0); 1010 - if (ret) { 1011 - if (!inode_locked) { 1012 - inode_lock(&inode->v); 1013 - inode_locked = true; 1014 - ret = file_remove_privs_flags(file, 0); 1015 - } 1016 - if (ret) 1017 - goto unlock; 1018 - } 1019 - 1020 - ret = file_update_time(file); 1021 - if (ret) 1022 - goto unlock; 1023 - 1024 - pos = iocb->ki_pos; 982 + loff_t pos = iocb->ki_pos; 983 + ssize_t written = 0; 984 + int ret = 0; 1025 985 1026 986 bch2_pagecache_add_get(inode); 1027 - 1028 - if (!inode_locked && 1029 - (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) 1030 - goto get_inode_lock; 1031 987 1032 988 do { 1033 989 unsigned offset = pos & (PAGE_SIZE - 1); ··· 997 1065 } 998 1066 } 999 1067 1000 - if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) 1001 - goto get_inode_lock; 1002 - 1003 1068 if (unlikely(fatal_signal_pending(current))) { 1004 1069 ret = -EINTR; 1005 1070 break; 1006 1071 } 1007 1072 1008 - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, 
inode_locked); 1009 - if (ret == -BCH_ERR_need_inode_lock) 1010 - goto get_inode_lock; 1073 + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); 1011 1074 if (unlikely(ret < 0)) 1012 1075 break; 1013 1076 ··· 1023 1096 } 1024 1097 pos += ret; 1025 1098 written += ret; 1026 - written2 = max(written, written2); 1027 - 1028 - if (ret != bytes && !inode_locked) 1029 - goto get_inode_lock; 1030 1099 ret = 0; 1031 1100 1032 1101 balance_dirty_pages_ratelimited(mapping); 1033 - 1034 - if (0) { 1035 - get_inode_lock: 1036 - bch2_pagecache_add_put(inode); 1037 - inode_lock(&inode->v); 1038 - inode_locked = true; 1039 - bch2_pagecache_add_get(inode); 1040 - 1041 - iov_iter_revert(iter, written); 1042 - pos -= written; 1043 - written = 0; 1044 - ret = 0; 1045 - } 1046 1102 } while (iov_iter_count(iter)); 1103 + 1047 1104 bch2_pagecache_add_put(inode); 1048 - unlock: 1049 - if (inode_locked) 1050 - inode_unlock(&inode->v); 1051 1105 1052 - iocb->ki_pos += written; 1053 - 1054 - ret = max(written, written2) ?: ret; 1055 - if (ret > 0) 1056 - ret = generic_write_sync(iocb, ret); 1057 - return ret; 1106 + return written ? written : ret; 1058 1107 } 1059 1108 1060 - ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) 1109 + ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) 1061 1110 { 1062 - ssize_t ret = iocb->ki_flags & IOCB_DIRECT 1063 - ? 
bch2_direct_write(iocb, iter) 1064 - : bch2_buffered_write(iocb, iter); 1111 + struct file *file = iocb->ki_filp; 1112 + struct bch_inode_info *inode = file_bch_inode(file); 1113 + ssize_t ret; 1065 1114 1115 + if (iocb->ki_flags & IOCB_DIRECT) { 1116 + ret = bch2_direct_write(iocb, from); 1117 + goto out; 1118 + } 1119 + 1120 + inode_lock(&inode->v); 1121 + 1122 + ret = generic_write_checks(iocb, from); 1123 + if (ret <= 0) 1124 + goto unlock; 1125 + 1126 + ret = file_remove_privs(file); 1127 + if (ret) 1128 + goto unlock; 1129 + 1130 + ret = file_update_time(file); 1131 + if (ret) 1132 + goto unlock; 1133 + 1134 + ret = bch2_buffered_write(iocb, from); 1135 + if (likely(ret > 0)) 1136 + iocb->ki_pos += ret; 1137 + unlock: 1138 + inode_unlock(&inode->v); 1139 + 1140 + if (ret > 0) 1141 + ret = generic_write_sync(iocb, ret); 1142 + out: 1066 1143 return bch2_err_class(ret); 1067 1144 } 1068 1145
+5 -5
fs/bcachefs/sb-errors_format.h
··· 23 23 x(jset_past_bucket_end, 9, 0) \ 24 24 x(jset_seq_blacklisted, 10, 0) \ 25 25 x(journal_entries_missing, 11, 0) \ 26 - x(journal_entry_replicas_not_marked, 12, 0) \ 26 + x(journal_entry_replicas_not_marked, 12, FSCK_AUTOFIX) \ 27 27 x(journal_entry_past_jset_end, 13, 0) \ 28 28 x(journal_entry_replicas_data_mismatch, 14, 0) \ 29 29 x(journal_entry_bkey_u64s_0, 15, 0) \ ··· 288 288 x(invalid_btree_id, 274, 0) \ 289 289 x(alloc_key_io_time_bad, 275, 0) \ 290 290 x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ 291 - x(accounting_key_junk_at_end, 277, 0) \ 292 - x(accounting_key_replicas_nr_devs_0, 278, 0) \ 293 - x(accounting_key_replicas_nr_required_bad, 279, 0) \ 294 - x(accounting_key_replicas_devs_unsorted, 280, 0) \ 291 + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ 292 + x(accounting_key_replicas_nr_devs_0, 278, FSCK_AUTOFIX) \ 293 + x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ 294 + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ 295 295 296 296 enum bch_sb_error_id { 297 297 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,