Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge tag 'bcachefs-2025-07-11' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet.

* tag 'bcachefs-2025-07-11' of git://evilpiepirate.org/bcachefs:
bcachefs: Don't set BCH_FS_error on transaction restart
bcachefs: Fix additional misalignment in journal space calculations
bcachefs: Don't schedule non persistent passes persistently
bcachefs: Fix bch2_btree_transactions_read() synchronization
bcachefs: btree read retry fixes
bcachefs: btree node scan no longer uses btree cache
bcachefs: Tweak btree cache helpers for use by btree node scan
bcachefs: Fix btree for nonexistent tree depth
bcachefs: Fix bch2_io_failures_to_text()
bcachefs: bch2_fpunch_snapshot()

+138 -108
+13 -13
fs/bcachefs/btree_cache.c
··· 85 85 six_unlock_intent(&b->c.lock); 86 86 } 87 87 88 - static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) 88 + void __btree_node_data_free(struct btree *b) 89 89 { 90 90 BUG_ON(!list_empty(&b->list)); 91 91 BUG_ON(btree_node_hashed(b)); ··· 112 112 munmap(b->aux_data, btree_aux_data_bytes(b)); 113 113 #endif 114 114 b->aux_data = NULL; 115 - 116 - btree_node_to_freedlist(bc, b); 117 115 } 118 116 119 117 static void btree_node_data_free(struct btree_cache *bc, struct btree *b) 120 118 { 121 119 BUG_ON(list_empty(&b->list)); 122 120 list_del_init(&b->list); 121 + 122 + __btree_node_data_free(b); 123 + 123 124 --bc->nr_freeable; 124 - __btree_node_data_free(bc, b); 125 + btree_node_to_freedlist(bc, b); 125 126 } 126 127 127 128 static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ··· 186 185 187 186 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) 188 187 { 189 - struct btree_cache *bc = &c->btree_cache; 190 - struct btree *b; 191 - 192 - b = __btree_node_mem_alloc(c, GFP_KERNEL); 188 + struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); 193 189 if (!b) 194 190 return NULL; 195 191 ··· 196 198 } 197 199 198 200 bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); 199 - 200 - __bch2_btree_node_to_freelist(bc, b); 201 201 return b; 202 202 } 203 203 ··· 520 524 --touched;; 521 525 } else if (!btree_node_reclaim(c, b)) { 522 526 __bch2_btree_node_hash_remove(bc, b); 523 - __btree_node_data_free(bc, b); 527 + __btree_node_data_free(b); 528 + btree_node_to_freedlist(bc, b); 524 529 525 530 freed++; 526 531 bc->nr_freed++; ··· 649 652 650 653 bch2_recalc_btree_reserve(c); 651 654 652 - for (i = 0; i < bc->nr_reserve; i++) 653 - if (!__bch2_btree_node_mem_alloc(c)) 655 + for (i = 0; i < bc->nr_reserve; i++) { 656 + struct btree *b = __bch2_btree_node_mem_alloc(c); 657 + if (!b) 654 658 goto err; 659 + __bch2_btree_node_to_freelist(bc, b); 660 + } 655 661 656 662 list_splice_init(&bc->live[0].list, &bc->freeable); 657 663
+1
fs/bcachefs/btree_cache.h
··· 30 30 void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); 31 31 int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); 32 32 33 + void __btree_node_data_free(struct btree *); 33 34 struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); 34 35 struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); 35 36
+3 -5
fs/bcachefs/btree_io.c
··· 568 568 bch2_mark_btree_validate_failure(failed, ca->dev_idx); 569 569 570 570 struct extent_ptr_decoded pick; 571 - have_retry = !bch2_bkey_pick_read_device(c, 571 + have_retry = bch2_bkey_pick_read_device(c, 572 572 bkey_i_to_s_c(&b->key), 573 - failed, &pick, -1); 573 + failed, &pick, -1) == 1; 574 574 } 575 575 576 576 if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) ··· 615 615 goto out; 616 616 case -BCH_ERR_btree_node_read_err_bad_node: 617 617 prt_str(&out, ", "); 618 - ret = __bch2_topology_error(c, &out); 619 618 break; 620 619 } 621 620 ··· 643 644 goto out; 644 645 case -BCH_ERR_btree_node_read_err_bad_node: 645 646 prt_str(&out, ", "); 646 - ret = __bch2_topology_error(c, &out); 647 647 break; 648 648 } 649 649 print: ··· 1406 1408 ret = bch2_bkey_pick_read_device(c, 1407 1409 bkey_i_to_s_c(&b->key), 1408 1410 &failed, &rb->pick, -1); 1409 - if (ret) { 1411 + if (ret <= 0) { 1410 1412 set_btree_node_read_error(b); 1411 1413 break; 1412 1414 }
+41 -43
fs/bcachefs/btree_node_scan.c
··· 75 75 } 76 76 } 77 77 78 - static bool found_btree_node_is_readable(struct btree_trans *trans, 79 - struct found_btree_node *f) 80 - { 81 - struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp; 82 - 83 - found_btree_node_to_key(&tmp.k, f); 84 - 85 - struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false); 86 - bool ret = !IS_ERR_OR_NULL(b); 87 - if (!ret) 88 - return ret; 89 - 90 - f->sectors_written = b->written; 91 - f->journal_seq = le64_to_cpu(b->data->keys.journal_seq); 92 - 93 - struct bkey_s_c k; 94 - struct bkey unpacked; 95 - struct btree_node_iter iter; 96 - for_each_btree_node_key_unpack(b, k, &iter, &unpacked) 97 - f->journal_seq = max(f->journal_seq, bkey_journal_seq(k)); 98 - 99 - six_unlock_read(&b->c.lock); 100 - 101 - /* 102 - * We might update this node's range; if that happens, we need the node 103 - * to be re-read so the read path can trim keys that are no longer in 104 - * this node 105 - */ 106 - if (b != btree_node_root(trans->c, b)) 107 - bch2_btree_node_evict(trans, &tmp.k); 108 - return ret; 109 - } 110 - 111 78 static int found_btree_node_cmp_cookie(const void *_l, const void *_r) 112 79 { 113 80 const struct found_btree_node *l = _l; ··· 126 159 }; 127 160 128 161 static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, 129 - struct bio *bio, struct btree_node *bn, u64 offset) 162 + struct btree *b, struct bio *bio, u64 offset) 130 163 { 131 164 struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes); 165 + struct btree_node *bn = b->data; 132 166 133 167 bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 134 168 bio->bi_iter.bi_sector = offset; 135 - bch2_bio_map(bio, bn, PAGE_SIZE); 169 + bch2_bio_map(bio, b->data, c->opts.block_size); 136 170 137 171 u64 submit_time = local_clock(); 138 172 submit_bio_wait(bio); 139 - 140 173 bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); 141 174 142 175 if (bio->bi_status) { ··· 168 201 if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX) 169 202 return; 170 203 204 + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ); 205 + bio->bi_iter.bi_sector = offset; 206 + bch2_bio_map(bio, b->data, c->opts.btree_node_size); 207 + 208 + submit_time = local_clock(); 209 + submit_bio_wait(bio); 210 + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); 211 + 171 212 rcu_read_lock(); 172 213 struct found_btree_node n = { 173 214 .btree_id = BTREE_NODE_ID(bn), ··· 192 217 }; 193 218 rcu_read_unlock(); 194 219 195 - if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) { 220 + found_btree_node_to_key(&b->key, &n); 221 + 222 + CLASS(printbuf, buf)(); 223 + if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) { 224 + /* read_done will swap out b->data for another buffer */ 225 + bn = b->data; 226 + /* 227 + * Grab journal_seq here because we want the max journal_seq of 228 + * any bset; read_done sorts down to a single set and picks the 229 + * max journal_seq 230 + */ 231 + n.journal_seq = le64_to_cpu(bn->keys.journal_seq), 232 + n.sectors_written = b->written; 233 + 196 234 mutex_lock(&f->lock); 197 235 if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) { 198 236 bch_err(c, "try_read_btree_node() can't handle endian conversion"); ··· 225 237 struct find_btree_nodes_worker *w = p; 226 238 struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes); 227 239 struct bch_dev *ca = w->ca; 228 - void *buf = (void *) __get_free_page(GFP_KERNEL); 229 - struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL); 230 240 unsigned long last_print = jiffies; 241 + struct btree *b = NULL; 242 + struct bio *bio = NULL; 231 243 232 - if (!buf || !bio) { 233 - bch_err(c, "read_btree_nodes_worker: error allocating bio/buf"); 244 + b = __bch2_btree_node_mem_alloc(c); 245 + if (!b) { 246 + bch_err(c, "read_btree_nodes_worker: error allocating buf"); 247 + w->f->ret = -ENOMEM; 248 + goto err; 249 + } 250 + 251 + bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL); 252 + if (!bio) { 253 + bch_err(c, "read_btree_nodes_worker: error allocating bio"); 234 254 w->f->ret = -ENOMEM; 235 255 goto err; 236 256 } ··· 262 266 !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c))) 263 267 continue; 264 268 265 - try_read_btree_node(w->f, ca, bio, buf, sector); 269 + try_read_btree_node(w->f, ca, b, bio, sector); 266 270 } 267 271 err: 272 + if (b) 273 + __btree_node_data_free(b); 274 + kfree(b); 268 275 bio_put(bio); 269 - free_page((unsigned long) buf); 270 276 enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); 271 277 closure_put(w->cl); 272 278 kfree(w);
+9 -2
fs/bcachefs/debug.c
··· 153 153 c->verify_data = __bch2_btree_node_mem_alloc(c); 154 154 if (!c->verify_data) 155 155 goto out; 156 - 157 - list_del_init(&c->verify_data->list); 158 156 } 159 157 160 158 BUG_ON(b->nsets != 1); ··· 584 586 i->ubuf = buf; 585 587 i->size = size; 586 588 i->ret = 0; 589 + 590 + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); 587 591 restart: 588 592 seqmutex_lock(&c->btree_trans_lock); 589 593 list_sort(&c->btree_trans_list, list_ptr_order_cmp); ··· 598 598 599 599 if (!closure_get_not_zero(&trans->ref)) 600 600 continue; 601 + 602 + if (!trans->srcu_held) { 603 + closure_put(&trans->ref); 604 + continue; 605 + } 601 606 602 607 u32 seq = seqmutex_unlock(&c->btree_trans_lock); 603 608 ··· 625 620 } 626 621 seqmutex_unlock(&c->btree_trans_lock); 627 622 unlocked: 623 + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); 624 + 628 625 if (i->buf.allocation_failure) 629 626 ret = -ENOMEM; 630 627
-1
fs/bcachefs/errcode.h
··· 282 282 x(EIO, sb_not_downgraded) \ 283 283 x(EIO, btree_node_write_all_failed) \ 284 284 x(EIO, btree_node_read_error) \ 285 - x(EIO, btree_node_read_validate_error) \ 286 285 x(EIO, btree_need_topology_repair) \ 287 286 x(EIO, bucket_ref_update) \ 288 287 x(EIO, trigger_alloc) \
+4 -2
fs/bcachefs/error.c
··· 103 103 return bch_err_throw(c, btree_need_topology_repair); 104 104 } else { 105 105 return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: 106 - bch_err_throw(c, btree_node_read_validate_error); 106 + bch_err_throw(c, btree_need_topology_repair); 107 107 } 108 108 } 109 109 ··· 633 633 * log_fsck_err()s: that would require us to track for every error type 634 634 * which recovery pass corrects it, to get the fsck exit status correct: 635 635 */ 636 - if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { 636 + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { 637 + /* nothing */ 638 + } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { 637 639 set_bit(BCH_FS_errors_fixed, &c->flags); 638 640 } else { 639 641 set_bit(BCH_FS_errors_not_fixed, &c->flags);
+8 -8
fs/bcachefs/extents.c
··· 50 50 struct bch_io_failures *failed) 51 51 { 52 52 static const char * const error_types[] = { 53 - "io", "checksum", "ec reconstruct", NULL 53 + "btree validate", "io", "checksum", "ec reconstruct", NULL 54 54 }; 55 55 56 56 for (struct bch_dev_io_failures *f = failed->devs; 57 57 f < failed->devs + failed->nr; 58 58 f++) { 59 59 unsigned errflags = 60 - ((!!f->failed_io) << 0) | 61 - ((!!f->failed_csum_nr) << 1) | 62 - ((!!f->failed_ec) << 2); 63 - 64 - if (!errflags) 65 - continue; 60 + ((!!f->failed_btree_validate) << 0) | 61 + ((!!f->failed_io) << 1) | 62 + ((!!f->failed_csum_nr) << 2) | 63 + ((!!f->failed_ec) << 3); 66 64 67 65 bch2_printbuf_make_room(out, 1024); 68 66 out->atomic++; ··· 75 77 76 78 prt_char(out, ' '); 77 79 78 - if (is_power_of_2(errflags)) { 80 + if (!errflags) { 81 + prt_str(out, "no error - confused"); 82 + } else if (is_power_of_2(errflags)) { 79 83 prt_bitflags(out, error_types, errflags); 80 84 prt_str(out, " error"); 81 85 } else {
+6 -27
fs/bcachefs/fsck.c
··· 12 12 #include "fs.h" 13 13 #include "fsck.h" 14 14 #include "inode.h" 15 + #include "io_misc.h" 15 16 #include "keylist.h" 16 17 #include "namei.h" 17 18 #include "recovery_passes.h" ··· 1920 1919 "extent type past end of inode %llu:%u, i_size %llu\n%s", 1921 1920 i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, 1922 1921 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { 1923 - struct bkey_i *whiteout = bch2_trans_kmalloc(trans, sizeof(*whiteout)); 1924 - ret = PTR_ERR_OR_ZERO(whiteout); 1925 - if (ret) 1926 - goto err; 1927 - 1928 - bkey_init(&whiteout->k); 1929 - whiteout->k.p = SPOS(k.k->p.inode, 1930 - last_block, 1931 - i->inode.bi_snapshot); 1932 - bch2_key_resize(&whiteout->k, 1933 - min(KEY_SIZE_MAX & (~0 << c->block_bits), 1934 - U64_MAX - whiteout->k.p.offset)); 1935 - 1936 - 1937 - /* 1938 - * Need a normal (not BTREE_ITER_all_snapshots) 1939 - * iterator, if we're deleting in a different 1940 - * snapshot and need to emit a whiteout 1941 - */ 1942 - struct btree_iter iter2; 1943 - bch2_trans_iter_init(trans, &iter2, BTREE_ID_extents, 1944 - bkey_start_pos(&whiteout->k), 1945 - BTREE_ITER_intent); 1946 - ret = bch2_btree_iter_traverse(trans, &iter2) ?: 1947 - bch2_trans_update(trans, &iter2, whiteout, 1948 - BTREE_UPDATE_internal_snapshot_node); 1949 - bch2_trans_iter_exit(trans, &iter2); 1922 + ret = bch2_fpunch_snapshot(trans, 1923 + SPOS(i->inode.bi_inum, 1924 + last_block, 1925 + i->inode.bi_snapshot), 1926 + POS(i->inode.bi_inum, U64_MAX)); 1950 1927 if (ret) 1951 1928 goto err; 1952 1929
+27
fs/bcachefs/io_misc.c
··· 135 135 return ret; 136 136 } 137 137 138 + /* For fsck */ 139 + int bch2_fpunch_snapshot(struct btree_trans *trans, struct bpos start, struct bpos end) 140 + { 141 + u32 restart_count = trans->restart_count; 142 + struct bch_fs *c = trans->c; 143 + struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); 144 + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); 145 + struct bkey_i delete; 146 + 147 + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, 148 + start, end, 0, k, 149 + &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ 150 + bkey_init(&delete.k); 151 + delete.k.p = iter.pos; 152 + 153 + /* create the biggest key we can */ 154 + bch2_key_resize(&delete.k, max_sectors); 155 + bch2_cut_back(end, &delete); 156 + 157 + bch2_extent_trim_atomic(trans, &iter, &delete) ?: 158 + bch2_trans_update(trans, &iter, &delete, 0); 159 + })); 160 + 161 + bch2_disk_reservation_put(c, &disk_res); 162 + return ret ?: trans_was_restarted(trans, restart_count); 163 + } 164 + 138 165 /* 139 166 * Returns -BCH_ERR_transacton_restart if we had to drop locks: 140 167 */
+2
fs/bcachefs/io_misc.h
··· 5 5 int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, 6 6 u64, struct bch_io_opts, s64 *, 7 7 struct write_point_specifier); 8 + 9 + int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); 8 10 int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, 9 11 subvol_inum, u64, s64 *); 10 12 int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *);
+6
fs/bcachefs/journal_reclaim.c
··· 170 170 return (struct journal_space) { 0, 0 }; 171 171 172 172 /* 173 + * It's possible for bucket size to be misaligned w.r.t. the filesystem 174 + * block size: 175 + */ 176 + min_bucket_size = round_down(min_bucket_size, block_sectors(c)); 177 + 178 + /* 173 179 * We sorted largest to smallest, and we want the smallest out of the 174 180 * @nr_devs_want largest devices: 175 181 */
+17 -6
fs/bcachefs/recovery.c
··· 273 273 goto out; 274 274 275 275 struct btree_path *path = btree_iter_path(trans, &iter); 276 - if (unlikely(!btree_path_node(path, k->level) && 277 - !k->allocated)) { 276 + if (unlikely(!btree_path_node(path, k->level))) { 278 277 struct bch_fs *c = trans->c; 278 + 279 + CLASS(printbuf, buf)(); 280 + prt_str(&buf, "btree="); 281 + bch2_btree_id_to_text(&buf, k->btree_id); 282 + prt_printf(&buf, " level=%u ", k->level); 283 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); 279 284 280 285 if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| 281 286 BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { 282 - bch_err(c, "have key in journal replay for btree depth that does not exist, confused"); 287 + bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", 288 + buf.buf); 283 289 ret = -EINVAL; 284 290 } 285 - #if 0 291 + 292 + if (!k->allocated) { 293 + bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", 294 + buf.buf); 295 + k->overwritten = true; 296 + goto out; 297 + } 298 + 286 299 bch2_trans_iter_exit(trans, &iter); 287 300 bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, 288 301 BTREE_MAX_DEPTH, 0, iter_flags); 289 302 ret = bch2_btree_iter_traverse(trans, &iter) ?: 290 303 bch2_btree_increase_depth(trans, iter.path, 0) ?: 291 304 -BCH_ERR_transaction_restart_nested; 292 - #endif 293 - k->overwritten = true; 294 305 goto out; 295 306 } 296 307
+1 -1
fs/bcachefs/recovery_passes.c
··· 360 360 !(r->passes_complete & BIT_ULL(pass)); 361 361 bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; 362 362 363 - if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { 363 + if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { 364 364 struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); 365 365 __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); 366 366 }